# FINAL PROJECT: predicting car insurance purchases ----
#
# Loads CarInsurance.csv, compares customers who did and did not buy car
# insurance, fits a logistic regression on a 60/40 train/test split, and
# reports classification metrics and ROC curves for both partitions.
#
# Packages are assumed to be installed already; run install.packages() once
# from the console rather than on every execution of this script.

library(dplyr)
library(ggplot2)
library(chron)
library(caret)
library(caTools)
library(pROC)

# Tunable settings ----
max_missing <- 500          # drop variables with more NAs than this
train_ratio <- 6 / 10       # share of observations used for training
significance_level <- 0.10  # keep coefficients significant at this level
prob_cutoff <- 0.5          # predicted probability >= cutoff => class 1

# Helpers ----

# Bar chart of one group-level mean per CarInsurance group.
#   data    : data frame with a CarInsurance column and the column `y_var`
#   y_var   : name (string) of the column plotted on the y axis
#   fill    : bar fill colour
#   title   : plot title
#   y_label : y axis label
# Returns the ggplot object (drawn when auto-printed at top level).
plot_group_mean <- function(data, y_var, fill, title, y_label) {
  label_style <- element_text(size = 8, face = "italic", color = "Black")
  ggplot(data) +
    aes(.data[["CarInsurance"]], .data[[y_var]]) +
    geom_bar(stat = "identity", fill = fill, position = "dodge") +
    labs(title = title, x = "Car Insurance", y = y_label) +
    theme(
      plot.title = element_text(size = 12, color = "Black", face = "bold"),
      axis.text.x = label_style,
      axis.text.y = label_style
    )
}

# Classification table and rates for 0/1 outcomes.
#   actual    : observed 0/1 values
#   predicted : predicted 0/1 values
# Returns a list with the 2x2 table (rows = actual, columns = predicted),
# the four cell counts, the hit rate, sensitivity (TPR), specificity (TNR)
# and the false positive rate (FPR).
classification_metrics <- function(actual, predicted) {
  # Fixing the levels keeps the table 2x2 even if one class is never predicted.
  confusion <- table(
    Actual = factor(actual, levels = c(0, 1)),
    Predicted = factor(predicted, levels = c(0, 1))
  )
  tn <- confusion["0", "0"]
  fp <- confusion["0", "1"]
  fn <- confusion["1", "0"]
  tp <- confusion["1", "1"]
  tnr <- tn / (tn + fp)
  list(
    table = confusion,
    TN = tn,
    TP = tp,
    FN = fn,
    FP = fp,
    HitRate = (tn + tp) / sum(confusion),
    TPR = tp / (tp + fn),
    TNR = tnr,
    FPR = 1 - tnr
  )
}

# Load the data ----
# Download the dataset 'CarInsurance.csv' and read it into CarInsurance.
CarInsurance <- read.csv(
  "~/Downloads/CarInsurance.csv",
  stringsAsFactors = TRUE
)
names(CarInsurance)

# Descriptive statistics by group ----
# Preliminary descriptive statistics for the two groups: customers who
# purchased car insurance and customers who did not.
# Only numeric columns are averaged; a mean is undefined for factors, and
# na.rm = TRUE stops a single missing value from blanking a group mean.
CarInsurance1 <- CarInsurance %>%
  group_by(CarInsurance) %>%
  summarise(across(where(is.numeric), function(x) mean(x, na.rm = TRUE)))

if (interactive()) {
  View(CarInsurance1)
}
str(CarInsurance1)
dim(CarInsurance1)
summary(CarInsurance1)
names(CarInsurance1)

# Charts ----
# Three charts visualising differences between the two groups above.
plot_group_mean(
  CarInsurance1, "Balance", "blue",
  "Bar Chart of Car Insurance Purchase vs Balance", "Balance"
)
plot_group_mean(
  CarInsurance1, "Age", "red",
  "Bar Chart of Car Insurance Purchase vs Age", "Age"
)
plot_group_mean(
  CarInsurance1, "PrevAttempts", "darkgreen",
  "Bar Chart of Car Insurance Purchase vs Previous Attempts",
  "Previous Attempts"
)

# Convert variables to the necessary formats ----
# CallStart and CallEnd are read as factors; convert them to time values with
# the chron package, then derive CallDuration as CallEnd - CallStart.
# as.character() makes the conversion independent of whether the columns were
# read as factors or as strings.
CarInsurance$CallStart <- chron(times = as.character(CarInsurance$CallStart))
CarInsurance$CallEnd <- chron(times = as.character(CarInsurance$CallEnd))
# chron stores times as fractions of a day, so the duration is in days
# (multiply by 24 * 60 for minutes). Stored as a plain numeric.
CarInsurance$CallDuration <- as.numeric(
  CarInsurance$CallEnd - CarInsurance$CallStart
)
if (interactive()) {
  View(CarInsurance)
}

# Dummy variables for the remaining factor variables.
dummy <- dummyVars(~ ., data = CarInsurance, fullRank = TRUE)
dummy
CarInsurance1 <- data.frame(predict(dummy, newdata = CarInsurance))

# Missing values ----
# Count the missing values for every variable (after creating the dummy
# variables) and remove any variable with more than `max_missing` NAs.
# The sparse variables are dropped BEFORE incomplete rows are removed;
# otherwise na.omit() would discard most of the observations instead.
missing_counts <- colSums(is.na(CarInsurance1))
missing_counts
sparse_vars <- names(missing_counts)[missing_counts > max_missing]
sparse_vars
CarInsurance2 <- CarInsurance1[
  , !(names(CarInsurance1) %in% sparse_vars),
  drop = FALSE
]
CarInsurance2 <- na.omit(CarInsurance2)
names(CarInsurance2)

# Train/test split ----
# Predict whether customers contacted during the current campaign will buy
# car insurance (the 'CarInsurance' variable). The train set holds 60 percent
# of the observations and the test set the remaining 40 percent.
set.seed(999)
sample1 <- sample.split(CarInsurance2$CarInsurance, SplitRatio = train_ratio)
CarInsurance_Train <- subset(CarInsurance2, sample1)
CarInsurance_Test <- subset(CarInsurance2, !sample1)
names(CarInsurance_Train)
table(CarInsurance_Train$CarInsurance)

# Model ----
# Full model on the train set. Id is a row identifier, not a customer
# attribute, so it is excluded from the candidate predictors.
candidate_vars <- setdiff(names(CarInsurance_Train), c("CarInsurance", "Id"))
logit <- glm(
  reformulate(candidate_vars, response = "CarInsurance"),
  data = CarInsurance_Train,
  family = "binomial"
)
summary(logit)

# Best-fitting model: retain the variables whose coefficients are significant
# at the 10% level in the full model. This replaces a hand-maintained
# exclusion list, which goes stale whenever the data preparation changes.
# This is a single pass over the full model's p-values, not a stepwise search.
p_values <- summary(logit)$coefficients[, "Pr(>|z|)"]
significant_vars <- setdiff(
  names(p_values)[p_values < significance_level],
  "(Intercept)"
)
if (length(significant_vars) == 0) {
  stop("No predictor is significant at the chosen level.", call. = FALSE)
}
logit2 <- glm(
  reformulate(significant_vars, response = "CarInsurance"),
  data = CarInsurance_Train,
  family = "binomial"
)
summary(logit2)

# Predicted probabilities ----
# Probability of buying car insurance for both train and test data sets.
CarInsurance_Train$ProbCarIns <- predict(
  logit2,
  newdata = CarInsurance_Train,
  type = "response"
)
CarInsurance_Test$ProbCarIns <- predict(
  logit2,
  newdata = CarInsurance_Test,
  type = "response"
)

# Classification tables ----
# Cross tabulation of CarInsurance against its predicted value, with the hit
# rate, sensitivity and specificity for both train and test data sets. All
# figures are computed from the table rather than typed in by hand.
CarInsurance_Train$CarInsPred <- as.numeric(
  CarInsurance_Train$ProbCarIns >= prob_cutoff
)
train_metrics <- classification_metrics(
  CarInsurance_Train$CarInsurance,
  CarInsurance_Train$CarInsPred
)
train_metrics$table
HitRateTrain <- train_metrics$HitRate
unlist(train_metrics[c("HitRate", "TPR", "TNR", "FPR")])

CarInsurance_Test$CarInsPred <- as.numeric(
  CarInsurance_Test$ProbCarIns >= prob_cutoff
)
test_metrics <- classification_metrics(
  CarInsurance_Test$CarInsurance,
  CarInsurance_Test$CarInsPred
)
test_metrics$table
HitRateTest <- test_metrics$HitRate
unlist(test_metrics[c("HitRate", "TPR", "TNR", "FPR")])

# ROC curves ----
# ROC curves for both train and test data sets. Levels and direction are
# stated explicitly (0 = control, 1 = case; higher probability means case)
# so pROC does not have to guess them.
roc_train <- roc(
  CarInsurance_Train$CarInsurance,
  CarInsurance_Train$ProbCarIns,
  levels = c(0, 1),
  direction = "<",
  plot = TRUE,
  print.auc = TRUE
)
roc_train

roc_test <- roc(
  CarInsurance_Test$CarInsurance,
  CarInsurance_Test$ProbCarIns,
  levels = c(0, 1),
  direction = "<",
  plot = TRUE,
  print.auc = TRUE
)
roc_test

# Discussion ----
# Discuss the findings from the model (e.g. which factors are important in
# predicting whether a customer buys car insurance, the fit of the model).
#
## In the author's original run the area under the curve was 0.909 for train
## and 0.8903 for test. These numbers are close to one, which means it is a
## good model. The curve is also above the diagonal line, indicating it is a
## good model. Re-run the script to refresh these figures after any change
## to the data preparation or the model.