# Taxi cancellation case: data loading, exploration, and predictive models

# Loading and Looking at the Data in R

# Read the taxi cancellation case data into the data frame `taxi`.
# (The original line began with the console prompt ">", which is a syntax
# error when the file is run as a script.)
# NOTE(review): absolute, machine-specific path -- adjust before running
# on another machine.
taxi <- read.csv("C:/Users/doh1/Desktop/INFO 5550 QM/Datasets/2ndTerm2021_taxi-cancellation-case.csv")

library(dplyr)

library(ggplot2)

library(adabag)

library(rpart)

library(caret)

library(randomForest)

# The percentage of Car Cancellation

# Counts and percentages of each Car_Cancellation value, bound side by side
# into a two-column table (one row per distinct value).
Car_Cancellation.Frequency <- table(taxi$Car_Cancellation)
Car_Cancellation.Percent <- prop.table(Car_Cancellation.Frequency) * 100
Car_Cancellation.Table <- cbind(
  Car_Cancellation.Frequency,
  Car_Cancellation.Percent
)
Car_Cancellation.Table

class(taxi$Car_Cancellation)

# hist() needs a numeric vector, so this has to run while Car_Cancellation
# is still numeric (i.e. before any conversion to factor).
# The typographic quotes in the original call were replaced with plain
# double quotes so the line parses.
hist(
  taxi$Car_Cancellation,
  main = "Car Cancellation Histogram",
  xlab = "Car Cancellation"
)

# Variable Handling

# Show the class of the response before and after turning it into a factor,
# so that the classification models treat it as a categorical outcome.
class(taxi[["Car_Cancellation"]])
taxi[["Car_Cancellation"]] <- as.factor(taxi[["Car_Cancellation"]])
class(taxi[["Car_Cancellation"]])

# Use to_lat, to_long, from_lat and from_long to compute the distance between
# the pick-up and drop-off points.
# The formula below is the straight-line (Euclidean) distance in raw
# coordinate units, not a Manhattan distance as the original comment said.
# NOTE(review): coordinates are presumably decimal degrees, so the result is
# not in kilometres -- confirm before interpreting magnitudes.

# Coerce the coordinates to numeric first. Going through as.character() keeps
# the real values even if a column was read in as a factor (as.numeric() on a
# factor returns the level codes instead).
coord_cols <- c("to_lat", "to_long", "from_lat", "from_long")
taxi[coord_cols] <- lapply(
  taxi[coord_cols],
  function(x) as.numeric(as.character(x))
)

# Rows where all four coordinates are available (computed after coercion so
# values that could not be parsed are excluded as well).
nonna <- which(complete.cases(taxi[coord_cols]))

taxi$distance <- NA_real_
# The original used typographic en-dashes here, which R cannot parse; they are
# replaced with the minus operator.
taxi$distance[nonna] <- sqrt(
  (taxi$to_long[nonna] - taxi$from_long[nonna])^2 +
    (taxi$to_lat[nonna] - taxi$from_lat[nonna])^2
)

# Rows without a computable distance get the mean of the computed distances.
taxi$distance[is.na(taxi$distance)] <- mean(taxi$distance, na.rm = TRUE)

taxi$distance

# Let's see how booking method influences cancellation

# Collapse the two booking flags into a single label. online_booking is
# applied last, so it wins if both flags are 1; rows with neither flag set
# to 1 are labelled "Traditional".
taxi$booking_method <- NA_character_
taxi$booking_method[which(taxi$mobile_site_booking == 1)] <- "Mobile Site"
taxi$booking_method[which(taxi$online_booking == 1)] <- "Online"
taxi$booking_method[is.na(taxi$booking_method)] <- "Traditional"

# Share of rides with Car_Cancellation == 1 per booking method.
# The "<- taxi %>%" part of this pipeline was missing in the original
# ("method % group_by(...)"), which does not parse; it is restored here.
method <- taxi %>%
  group_by(booking_method) %>%
  summarise(
    Cancellation = sum(Car_Cancellation == 1, na.rm = TRUE) / n()
  )

# Cancellation rate by booking method: bar plot that shows the cancellation
# rate for each type of booking method (typographic quotes replaced with
# plain double quotes).
ggplot(
  method,
  aes(y = Cancellation, x = booking_method, fill = booking_method)
) +
  geom_bar(stat = "identity", position = "stack") +
  scale_fill_brewer(palette = "Set1") +
  theme_light() +
  ggtitle("Cancellation rate by method of booking")

# Vehicle_model_id

table(taxi$vehicle_model_id)

# Cancellation rate per vehicle model.
# Car_Cancellation has been converted to a factor earlier in the script, and
# mean() of a factor is NA (with a warning), so the rate is computed on a
# 0/1 indicator of Car_Cancellation == 1 instead. The output column keeps the
# name Car_Cancellation.
aggregate(
  Car_Cancellation ~ vehicle_model_id,
  data = data.frame(
    Car_Cancellation = as.numeric(taxi$Car_Cancellation == 1),
    vehicle_model_id = taxi$vehicle_model_id
  ),
  FUN = mean
)

# NOTE(review): no column named vehicle_avg_cancelation is created anywhere in
# the visible script, so this prints NULL -- confirm whether it can be removed.
taxi$vehicle_avg_cancelation

# Merge several vehicle models into pooled model ids.
# Step 1: models 1, 13, 17, 30, 36, 70 and 91 become pooled id 100.
taxi$vehicle_model_id[taxi$vehicle_model_id %in% c(1, 13, 17, 30, 36, 70, 91)] <- 100
table(taxi$vehicle_model_id)

# Step 2: models 10, 23, 54, 64 and 86 become pooled id 101.
# NOTE(review): the list also contains 100, so everything pooled in step 1 is
# moved into 101 as well and no rows keep id 100 -- confirm this is intended.
taxi$vehicle_model_id[taxi$vehicle_model_id %in% c(10, 23, 54, 64, 86, 100)] <- 101
table(taxi$vehicle_model_id)

length(taxi$vehicle_model_id)
colnames(taxi)

# Travel_type_id -> dummy variables

# Inspect the raw travel type codes first.
table(taxi[["travel_type_id"]])
length(taxi[["travel_type_id"]])
taxi[["travel_type_id"]]

# 1/0 indicator for travel_type_id == 1, stored as Longdistance.
taxi[["Longdistance"]] <- as.numeric(taxi[["travel_type_id"]] == 1)
taxi[["Longdistance"]]

# 1/0 indicator for travel_type_id == 2, stored as PointToPoint.
taxi[["PointToPoint"]] <- as.numeric(taxi[["travel_type_id"]] == 2)
taxi[["PointToPoint"]]

str(taxi)

# Cancellation rate by type of car

# Per vehicle model: number of rides, number of cancelled rides, and the
# cancellation rate. The "<- taxi %>%" part of this pipeline was missing in
# the original ("car % group_by(...)"); it is restored here.
car <- taxi %>%
  group_by(vehicle_model_id) %>%
  summarise(
    Rides = length(vehicle_model_id),
    Cancellations = sum(Car_Cancellation == 1),
    Cancellation_rate = mean(Car_Cancellation == 1)
  ) %>%
  ungroup()

# Filter out cars with less than 100 rides.
# The original line was reduced to "car = 100)"; the statement is
# reconstructed from that remnant, this comment and the plot title
# ("min 100 rides"). dplyr::filter is spelled out so that stats::filter can
# never be picked up by accident.
car <- car %>% dplyr::filter(Rides >= 100)

# Treat the model id as a category so each model gets its own bar.
car$vehicle_model_id <- factor(car$vehicle_model_id)

ggplot(car, aes(y = Cancellation_rate, x = vehicle_model_id)) +
  geom_bar(stat = "identity", position = "stack") +
  theme_light() +
  ggtitle("Cancellation rate by vehicle (min 100 rides)") +
  xlab("vehicle id") +
  ylab("Average cancellation rate")

# The summary table is only needed for the plot.
rm(car)

#### Suggested predictive models ####

# Reproducible 60/40 split: 60% of the row numbers are drawn at random for
# training, the remaining rows form the validation set.
set.seed(1)
train.index <- sample(seq_len(nrow(taxi)), nrow(taxi) * 0.6)
valid.df <- taxi[-train.index, ]
train.df <- taxi[train.index, ]

# Decision Trees

# Classification tree on the training partition with every other column as a
# predictor; the complexity parameter and the number of cross-validations are
# supplied through rpart.control().
tr <- rpart(
  Car_Cancellation ~ .,
  data = train.df,
  method = "class",
  control = rpart.control(cp = 0.00001, xval = 5)
)

# Predicted classes for the validation partition, compared with the actual
# outcomes.
pred <- predict(tr, newdata = valid.df, type = "class")
confusionMatrix(data = pred, reference = valid.df$Car_Cancellation)

# Bagging

# Bagged classification trees on the training partition, using every other
# column as a predictor.
bag <- bagging(Car_Cancellation ~ ., data = train.df)

pred <- predict(bag, valid.df, type = "class")

# pred$class is a plain vector of predicted labels. Building the factor with
# the reference levels keeps both factors aligned even when the model never
# predicts one of the classes (as.factor() would silently drop that level).
confusionMatrix(
  factor(pred$class, levels = levels(valid.df$Car_Cancellation)),
  valid.df$Car_Cancellation
)

# Boosting

# Boosted classification trees on the training partition.
# adabag::boosting() takes the number of boosting iterations (trees) as
# `mfinal`; the original passed `n.tree = 200`, which is not an argument of
# boosting() and therefore left the default number of iterations in place.
boost <- boosting(Car_Cancellation ~ ., data = train.df, mfinal = 200)

pred <- predict(boost, valid.df, type = "class")

# pred$class is a plain vector of predicted labels. Building the factor with
# the reference levels keeps both factors aligned even when the model never
# predicts one of the classes (as.factor() would silently drop that level).
confusionMatrix(
  factor(pred$class, levels = levels(valid.df$Car_Cancellation)),
  valid.df$Car_Cancellation
)

# Random Forest

# Random forest classifier on the training partition.
# The response is written as the bare column name: with a wrapped response
# such as as.factor(Car_Cancellation) ~ ., the "." still expands to every
# column of the data, including Car_Cancellation itself, so the outcome would
# be used as a predictor. Car_Cancellation is already a factor at this point,
# so no conversion is needed for a classification forest.
# NOTE(review): randomForest() stops on missing values by default; check that
# none of the predictor columns in train.df still contain NA.
rf <- randomForest(
  Car_Cancellation ~ .,
  data = train.df,
  ntree = 200,
  mtry = 4,
  nodesize = 5,
  importance = TRUE
)

# Predicted classes for the validation partition, compared with the actual
# outcomes.
rf.pred <- predict(rf, valid.df)
confusionMatrix(rf.pred, as.factor(valid.df$Car_Cancellation))