{"id":78810,"date":"2021-12-02T00:13:39","date_gmt":"2021-12-02T00:13:39","guid":{"rendered":"https:\/\/papersspot.com\/blog\/2021\/12\/02\/loading-and-looking-at-the-data-in-r-taxi-read-csvc-users-doh1-desktop-info\/"},"modified":"2021-12-02T00:13:39","modified_gmt":"2021-12-02T00:13:39","slug":"loading-and-looking-at-the-data-in-r-taxi-read-csvc-users-doh1-desktop-info","status":"publish","type":"post","link":"https:\/\/papersspot.com\/blog\/2021\/12\/02\/loading-and-looking-at-the-data-in-r-taxi-read-csvc-users-doh1-desktop-info\/","title":{"rendered":"#Loading and Looking at the Data in R &gt; taxi &lt;- read.csv(&quot;C:\/Users\/doh1\/Desktop\/INFO"},"content":{"rendered":"<p>#Loading and Looking at the Data in R<\/p>\n<p> &gt; taxi &lt;- read.csv(&quot;C:\/Users\/doh1\/Desktop\/INFO 5550 QM\/Datasets\/2ndTerm2021_taxi-cancellation-case.csv&quot;)<\/p>\n<p> library(dplyr)<\/p>\n<p> library(ggplot2)<\/p>\n<p> library(adabag)<\/p>\n<p> library(rpart) <\/p>\n<p> library(caret)<\/p>\n<p> library(randomForest)<\/p>\n<p> #The percentage of Car Cancellation<\/p>\n<p> Car_Cancellation.Frequency &lt;- table(taxi$Car_Cancellation)<\/p>\n<p> Car_Cancellation.Percent &lt;- prop.table(table(taxi$Car_Cancellation)) * 100<\/p>\n<p> Car_Cancellation.Table &lt;- cbind(Car_Cancellation.Frequency, Car_Cancellation.Percent)<\/p>\n<p> Car_Cancellation.Table<\/p>\n<p> class(taxi$Car_Cancellation)<\/p>\n<p> hist(taxi$Car_Cancellation,main =&quot;Car Cancellation Histogram&quot;, xlab=&quot;Car Cancellation&quot;)<\/p>\n<p> #Variable Handling <\/p>\n<p> class(taxi$Car_Cancellation)<\/p>\n<p> taxi$Car_Cancellation &lt;- as.factor(taxi$Car_Cancellation)<\/p>\n<p> class(taxi$Car_Cancellation)<\/p>\n<p> # Used the values of to_lat, to_long, from_lat and from_long to calculate effective distance between #pick up and drop off points using the concept of Euclidean distance (straight-line distance)<\/p>\n<p> taxi$distance &lt;- NA<\/p>\n<p> nonna &lt;- which(!is.na(taxi$to_lat) &amp; !is.na(taxi$to_long) 
&amp; !is.na(taxi$from_lat) &amp; !is.na(taxi$from_long))<\/p>\n<p> taxi$to_lat &lt;- as.numeric(taxi$to_lat)<\/p>\n<p> taxi$to_long &lt;- as.numeric(taxi$to_long)<\/p>\n<p> taxi$from_lat &lt;- as.numeric(taxi$from_lat)<\/p>\n<p> taxi$from_long &lt;- as.numeric(taxi$from_long)<\/p>\n<p> taxi$distance[nonna] &lt;- sqrt((taxi$to_long[nonna] - taxi$from_long[nonna])^2 + (taxi$to_lat[nonna] - taxi$from_lat[nonna])^2)<\/p>\n<p> taxi$distance[is.na(taxi$distance)] &lt;- mean(taxi$distance, na.rm = T)<\/p>\n<p> taxi$distance<\/p>\n<p> #let&#8217;s see how booking method influences cancellation<\/p>\n<p> taxi$booking_method &lt;- NA<\/p>\n<p> taxi$booking_method[which(taxi$mobile_site_booking == 1)] &lt;- &quot;Mobile Site&quot;<\/p>\n<p> taxi$booking_method[which(taxi$online_booking == 1)] &lt;- &quot;Online&quot;<\/p>\n<p> taxi$booking_method[is.na(taxi$booking_method)] &lt;- &quot;Traditional&quot;<\/p>\n<p> method &lt;- taxi %&gt;% group_by(booking_method) %&gt;% summarise(Cancellation = length(which(Car_Cancellation == 1))\/length(Car_Cancellation))<\/p>\n<p> # Cancellation rate by booking method, Bar plot that shows cancellation rate for each type of booking method<\/p>\n<p> ggplot(method, aes(y = Cancellation, x = booking_method, fill = booking_method)) + geom_bar(stat = &quot;identity&quot;, position = &quot;Stack&quot;) + scale_fill_brewer(palette = &quot;Set1&quot;) + theme_light() + ggtitle(&quot;Cancellation rate by method of booking&quot;)<\/p>\n<p> #Vehicle_model_id<\/p>\n<p> table(taxi$vehicle_model_id)<\/p>\n<p> aggregate(Car_Cancellation ~ vehicle_model_id, taxi, mean)<\/p>\n<p> taxi$vehicle_avg_cancelation<\/p>\n<p> taxi$vehicle_model_id[ taxi$vehicle_model_id %in% c(1,13,17,30,36,70,91) ] &lt;- 100<\/p>\n<p> table(taxi$vehicle_model_id)<\/p>\n<p> taxi$vehicle_model_id[ taxi$vehicle_model_id %in% c(10,23,54,64,86,100) ] &lt;- 101<\/p>\n<p> table(taxi$vehicle_model_id)<\/p>\n<p> length(taxi$vehicle_model_id)<\/p>\n<p> colnames(taxi)<\/p>\n<p> 
#Travel_type_id -&gt; Dummy variable<\/p>\n<p> table(taxi$travel_type_id)<\/p>\n<p> length(taxi$travel_type_id)<\/p>\n<p> taxi$travel_type_id<\/p>\n<p> taxi$Longdistance &lt;- ifelse(taxi$travel_type_id==1,1,0)<\/p>\n<p> taxi$Longdistance<\/p>\n<p> taxi$PointToPoint &lt;- ifelse(taxi$travel_type_id==2,1,0)<\/p>\n<p> taxi$PointToPoint<\/p>\n<p> str(taxi)<\/p>\n<p> #Cancellation rate by type of car<\/p>\n<p> car &lt;- taxi %&gt;% group_by(vehicle_model_id) %&gt;% summarise(Rides = length(vehicle_model_id), Cancellations = sum(Car_Cancellation == 1), Cancellation_rate = mean(Car_Cancellation == 1)) %&gt;% ungroup()<\/p>\n<p> car &lt;- filter(car, Rides &gt;= 100) #Filter out cars with less than 100 rides<\/p>\n<p> car$vehicle_model_id = factor(car$vehicle_model_id)<\/p>\n<p> ggplot(car, aes(y = Cancellation_rate, x = vehicle_model_id)) + geom_bar(stat = &quot;identity&quot;, position = &quot;Stack&quot;) + theme_light() + ggtitle(&quot;Cancellation rate by vehicle (min 100 rides)&quot;) + xlab(&quot;vehicle id&quot;) + ylab(&quot;Average cancellation rate&quot;)<\/p>\n<p> rm(car)<\/p>\n<p> ####Suggested predictive models#########<\/p>\n<p> set.seed(1) <\/p>\n<p> train.index &lt;- sample(c(1:dim(taxi)[1]), dim(taxi)[1]*0.6) <\/p>\n<p> train.df &lt;- taxi[train.index, ]<\/p>\n<p> valid.df &lt;- taxi[-train.index, ]<\/p>\n<p> # Decision Trees<\/p>\n<p> tr &lt;- rpart(Car_Cancellation ~ .,data=train.df, method=&quot;class&quot;,cp=0.00001,xval=5)<\/p>\n<p> pred &lt;- predict(tr, valid.df, type = &quot;class&quot;)<\/p>\n<p> confusionMatrix(pred, valid.df$ Car_Cancellation) <\/p>\n<p> # Bagging<\/p>\n<p> bag &lt;- bagging(Car_Cancellation ~ ., data = train.df)<\/p>\n<p> pred &lt;- predict(bag, valid.df, type = &quot;class&quot;)<\/p>\n<p> confusionMatrix(as.factor(pred$class), valid.df$ Car_Cancellation) <\/p>\n<p> # Boosting<\/p>\n<p> boost &lt;- boosting(Car_Cancellation ~ ., data = train.df, n.tree=200)<\/p>\n<p> pred &lt;- predict(boost, valid.df, type = &quot;class&quot;)<\/p>\n<p> 
confusionMatrix(as.factor(pred$class), valid.df$ Car_Cancellation) <\/p>\n<p> # Random Forest<\/p>\n<p> rf &lt;- randomForest(as.factor(Car_Cancellation) ~ ., data = train.df, ntree = 200, mtry = 4, nodesize = 5, importance = TRUE) <\/p>\n<p> rf.pred &lt;- predict(rf, valid.df)<\/p>\n<p> confusionMatrix(rf.pred, as.factor(valid.df$ Car_Cancellation))<\/p>\n","protected":false},"excerpt":{"rendered":"<p>#Loading and Looking at the Data in R &gt; taxi &lt;- read.csv(&quot;C:\/Users\/doh1\/Desktop\/INFO 5550 QM\/Datasets\/2ndTerm2021_taxi-cancellation-case.csv&quot;) library(dplyr) library(ggplot2) library(adabag) library(rpart) library(caret) library(randomForest) #The percentage of Car Cancellation Car_Cancellation.Frequency &lt;- table(taxi$Car_Cancellation) Car_Cancellation.Percent &lt;- prop.table(table(taxi$Car_Cancellation)) * 100 Car_Cancellation.Table &lt;- cbind(Car_Cancellation.Frequency, Car_Cancellation.Percent) Car_Cancellation.Table class(taxi$Car_Cancellation) hist(taxi$Car_Cancellation,main =&#8221;Car Cancellation Histogram&#8221;, xlab=&#8221;Car Cancellation&#8221;) #Variable Handling class(taxi$Car_Cancellation) taxi$Car_Cancellation &lt;- as.factor(taxi$Car_Cancellation) class(taxi$Car_Cancellation) # Used the 
[&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"closed","ping_status":"closed","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[1],"tags":[10],"class_list":["post-78810","post","type-post","status-publish","format-standard","hentry","category-research-paper-writing","tag-writing"],"_links":{"self":[{"href":"https:\/\/papersspot.com\/blog\/wp-json\/wp\/v2\/posts\/78810","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/papersspot.com\/blog\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/papersspot.com\/blog\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/papersspot.com\/blog\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/papersspot.com\/blog\/wp-json\/wp\/v2\/comments?post=78810"}],"version-history":[{"count":0,"href":"https:\/\/papersspot.com\/blog\/wp-json\/wp\/v2\/posts\/78810\/revisions"}],"wp:attachment":[{"href":"https:\/\/papersspot.com\/blog\/wp-json\/wp\/v2\/media?parent=78810"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/papersspot.com\/blog\/wp-json\/wp\/v2\/categories?post=78810"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/papersspot.com\/blog\/wp-json\/wp\/v2\/tags?post=78810"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}