# Loading and looking at the data in R
# Reads the taxi cancellation case data into the data frame `taxi`.
# NOTE(review): the path is absolute and machine-specific; adjust it before
# running the script on another computer.
taxi <- read.csv("C:/Users/doh1/Desktop/INFO 5550 QM/Datasets/2ndTerm2021_taxi-cancellation-case.csv")
library(dplyr)
library(ggplot2)
library(adabag)
library(rpart)
library(caret)
library(randomForest)
# The percentage of car cancellation
# Frequency and percentage share of each Car_Cancellation value, side by side.
Car_Cancellation.Frequency <- table(taxi$Car_Cancellation)
Car_Cancellation.Percent <- prop.table(Car_Cancellation.Frequency) * 100
Car_Cancellation.Table <- cbind(Car_Cancellation.Frequency, Car_Cancellation.Percent)
Car_Cancellation.Table
class(taxi$Car_Cancellation)
# hist() requires a numeric vector, so this has to run while Car_Cancellation
# is still numeric (i.e. before any conversion to a factor).
hist(
  taxi$Car_Cancellation,
  main = "Car Cancellation Histogram",
  xlab = "Car Cancellation"
)
# Variable handling
# Show the class of the outcome column, convert it to a factor, and show the
# class again to confirm the conversion.
class(taxi[["Car_Cancellation"]])
taxi[["Car_Cancellation"]] <- as.factor(taxi[["Car_Cancellation"]])
class(taxi[["Car_Cancellation"]])
# Use to_lat, to_long, from_lat and from_long to compute the straight-line
# (Euclidean) distance between pick-up and drop-off points, measured directly
# on the coordinate values. Note: this is NOT a Manhattan distance, which
# would be the sum of the absolute coordinate differences.
# Coerce the coordinates to numeric first, so entries that cannot be parsed
# become NA before the complete-row check below.
taxi$to_lat <- as.numeric(taxi$to_lat)
taxi$to_long <- as.numeric(taxi$to_long)
taxi$from_lat <- as.numeric(taxi$from_lat)
taxi$from_long <- as.numeric(taxi$from_long)
taxi$distance <- NA
# Rows in which all four coordinates are available.
nonna <- which(!is.na(taxi$to_lat) & !is.na(taxi$to_long) & !is.na(taxi$from_lat) & !is.na(taxi$from_long))
taxi$distance[nonna] <- sqrt(
  (taxi$to_long[nonna] - taxi$from_long[nonna])^2 +
    (taxi$to_lat[nonna] - taxi$from_lat[nonna])^2
)
# Rows without a computable distance get the mean of the computed distances.
taxi$distance[is.na(taxi$distance)] <- mean(taxi$distance, na.rm = TRUE)
taxi$distance
# Let's see how booking method influences cancellation
# Label every booking with its method. A row flagged as both mobile-site and
# online ends up "Online" because the later assignment overwrites the earlier
# one; rows with neither flag set to 1 are labelled "Traditional".
taxi$booking_method <- NA
taxi$booking_method[which(taxi$mobile_site_booking == 1)] <- "Mobile Site"
taxi$booking_method[which(taxi$online_booking == 1)] <- "Online"
taxi$booking_method[is.na(taxi$booking_method)] <- "Traditional"
# Share of rides with Car_Cancellation equal to 1, per booking method.
method <- taxi %>%
  group_by(booking_method) %>%
  summarise(Cancellation = length(which(Car_Cancellation == 1)) / length(Car_Cancellation))
# Cancellation rate by booking method: bar plot with one bar per booking method
ggplot(method, aes(y = Cancellation, x = booking_method, fill = booking_method)) +
  geom_bar(stat = "identity", position = "stack") +
  scale_fill_brewer(palette = "Set1") +
  theme_light() +
  ggtitle("Cancellation rate by method of booking")
# Vehicle_model_id
table(taxi$vehicle_model_id)
# Cancellation rate per vehicle model. The outcome is recoded to a numeric 0/1
# indicator on a temporary copy, because mean() of a factor returns NA with a
# warning; comparing against 1 works for both a numeric and a factor column.
aggregate(
  Car_Cancellation ~ vehicle_model_id,
  transform(taxi, Car_Cancellation = as.numeric(Car_Cancellation == 1)),
  mean
)
# The script previously printed taxi$vehicle_avg_cancelation here; no visible
# statement creates that column, so the lookup only returned NULL and was dropped.
# Collapse the listed model ids into group 100 ...
taxi$vehicle_model_id[taxi$vehicle_model_id %in% c(1, 13, 17, 30, 36, 70, 91)] <- 100
table(taxi$vehicle_model_id)
# ... then fold group 100 together with further ids into group 101.
taxi$vehicle_model_id[taxi$vehicle_model_id %in% c(10, 23, 54, 64, 86, 100)] <- 101
table(taxi$vehicle_model_id)
length(taxi$vehicle_model_id)
colnames(taxi)
# Travel_type_id -> dummy variables
# Inspect the raw column first: value counts, length, and the values themselves.
table(taxi$travel_type_id)
length(taxi$travel_type_id)
taxi$travel_type_id
# Longdistance is 1 where travel_type_id equals 1 and 0 otherwise.
taxi$Longdistance <- as.numeric(taxi$travel_type_id == 1)
taxi$Longdistance
# PointToPoint is 1 where travel_type_id equals 2 and 0 otherwise.
taxi$PointToPoint <- as.numeric(taxi$travel_type_id == 2)
taxi$PointToPoint
str(taxi)
# Cancellation rate by type of car
# Per vehicle model: number of rides, number of cancellations, and the
# cancellation rate (share of rides with Car_Cancellation equal to 1).
car <- taxi %>%
  group_by(vehicle_model_id) %>%
  summarise(
    Rides = length(vehicle_model_id),
    Cancellations = sum(Car_Cancellation == 1),
    Cancellation_rate = mean(Car_Cancellation == 1)
  ) %>%
  ungroup()
car <- dplyr::filter(car, Rides >= 100) # Filter out cars with less than 100 rides
# Treat the model id as a category so each id gets its own bar.
car$vehicle_model_id <- factor(car$vehicle_model_id)
ggplot(car, aes(y = Cancellation_rate, x = vehicle_model_id)) +
  geom_bar(stat = "identity", position = "stack") +
  theme_light() +
  ggtitle("Cancellation rate by vehicle (min 100 rides)") +
  xlab("vehicle id") +
  ylab("Average cancellation rate")
# The summary table is only needed for the plot.
rm(car)
#### Suggested predictive models ####
# Random 60/40 split of the rows into training and validation sets; the seed
# is fixed so the split is reproducible.
set.seed(1)
train.index <- sample(seq_len(nrow(taxi)), nrow(taxi) * 0.6)
train.df <- taxi[train.index, ]
valid.df <- taxi[-train.index, ]
# Decision tree
# Classification tree on all other columns, with complexity parameter 0.00001
# and 5 cross-validation folds, scored on the validation set.
# NOTE(review): the formula uses every column of train.df as a predictor,
# including the derived ones - confirm that is intended.
tr <- rpart(
  Car_Cancellation ~ .,
  data = train.df,
  method = "class",
  cp = 0.00001,
  xval = 5
)
pred <- predict(tr, newdata = valid.df, type = "class")
confusionMatrix(data = pred, reference = valid.df$Car_Cancellation)
# Bagging
# Bagged trees on all other columns; the predicted classes come back as a
# character vector in pred$class, so they are converted to a factor before
# being compared with the validation outcome.
bag <- bagging(
  Car_Cancellation ~ .,
  data = train.df
)
pred <- predict(bag, newdata = valid.df, type = "class")
confusionMatrix(
  data = as.factor(pred$class),
  reference = valid.df$Car_Cancellation
)
# Boosting
# AdaBoost ensemble on all other columns, scored on the validation set.
# The number of boosting iterations (trees) is controlled by `mfinal` in
# adabag::boosting; `n.tree` is not one of its parameters, so the previous
# call did not request 200 trees as intended.
boost <- boosting(Car_Cancellation ~ ., data = train.df, mfinal = 200)
pred <- predict(boost, valid.df, type = "class")
# pred$class holds the predicted labels; convert to a factor for caret.
confusionMatrix(as.factor(pred$class), valid.df$Car_Cancellation)
# Random forest
# Forest of 200 trees, trying 4 variables at each split, with a minimum
# terminal node size of 5 and variable importance recorded; scored on the
# validation set.
# NOTE(review): randomForest stops on missing predictor values by default -
# confirm train.df has none.
rf <- randomForest(
  as.factor(Car_Cancellation) ~ .,
  data = train.df,
  ntree = 200,
  mtry = 4,
  nodesize = 5,
  importance = TRUE
)
rf.pred <- predict(rf, newdata = valid.df)
confusionMatrix(
  data = rf.pred,
  reference = as.factor(valid.df$Car_Cancellation)
)