#FINAL PROJECT


# Download the dataset 'CarInsurance.csv' and load it into a data frame called
# CarInsurance. Character columns are read in as factors.
CarInsurance <- read.csv(
  file = "~/Downloads/CarInsurance.csv",
  stringsAsFactors = TRUE
)
colnames(CarInsurance)

# Report preliminary descriptive statistics for two groups - customers who
# purchased Car Insurance and those who did NOT.
library(dplyr)

# Per-group means of the numeric columns only. Taking mean() of a factor
# column returns NA with a warning, so non-numeric columns are left out
# instead of producing all-NA columns. na.rm = TRUE keeps a single missing
# value from turning a whole group mean into NA.
CarInsurance1 <- CarInsurance %>%
  group_by(CarInsurance) %>%
  summarise(across(where(is.numeric), function(x) mean(x, na.rm = TRUE)))

# View() needs an interactive session; skip it when the script is run in batch.
if (interactive()) {
  View(CarInsurance1)
}
str(CarInsurance1)
dim(CarInsurance1)
summary(CarInsurance1)
names(CarInsurance1)
# Create suitable charts (at least 3 charts) to visualize differences between
# the two groups summarised above (purchased Car Insurance vs. not).
library(ggplot2)

# Title and axis-text styling shared by all three bar charts.
group_chart_theme <- theme(
  plot.title = element_text(size = 12, color = "Black", face = "bold"),
  axis.text.x = element_text(size = 8, face = "italic", color = "Black"),
  axis.text.y = element_text(size = 8, face = "italic", color = "Black")
)

# Mean account balance per group.
ggplot(CarInsurance1, aes(x = CarInsurance, y = Balance)) +
  geom_col(fill = "blue", position = "dodge") +
  labs(
    title = "Bar Chart of Car Insurance Purchase vs Balance",
    x = "Car Insurance",
    y = "Balance"
  ) +
  group_chart_theme

# Mean age per group.
ggplot(CarInsurance1, aes(x = CarInsurance, y = Age)) +
  geom_col(fill = "red", position = "dodge") +
  labs(
    title = "Bar Chart of Car Insurance Purchase vs Age",
    x = "Car Insurance",
    y = "Age"
  ) +
  group_chart_theme

# Mean number of previous attempts per group.
ggplot(CarInsurance1, aes(x = CarInsurance, y = PrevAttempts)) +
  geom_col(fill = "darkgreen", position = "dodge") +
  labs(
    title = "Bar Chart of Car Insurance Purchase vs Previous Attempts",
    x = "Car Insurance",
    y = "Previous Attempts"
  ) +
  group_chart_theme

# Convert variables to the necessary formats:
# CallStart and CallEnd are Factor variables. They are converted into time
# variables using the chron package, and CallDuration is then created as the
# difference between CallEnd and CallStart.

# Install chron only when it is missing, so re-running the script does not
# reinstall the package (and hit the network) every time.
if (!requireNamespace("chron", quietly = TRUE)) {
  install.packages("chron")
}
library(chron)

# chron()'s argument is spelled `times.`; it is written out in full here
# rather than relying on partial matching of `times`. as.character() turns the
# factor into its "hh:mm:ss" labels before parsing.
CarInsurance$CallStart <- chron(times. = as.character(CarInsurance$CallStart))
CarInsurance$CallEnd <- chron(times. = as.character(CarInsurance$CallEnd))

# NOTE(review): the difference of two chron times is presumably expressed as a
# fraction of a day - confirm before interpreting CallDuration coefficients.
CarInsurance$CallDuration <- c(CarInsurance$CallEnd - CarInsurance$CallStart)

# View() needs an interactive session; skip it when the script is run in batch.
if (interactive()) {
  View(CarInsurance)
}

# You may also have to create dummy variables for other factor variables.
library(caret)

# Encoder for every column of CarInsurance; fullRank = TRUE is passed so that
# dummyVars() builds a full-rank parameterisation.
dummy <- dummyVars(
  formula = ~ .,
  data = CarInsurance,
  fullRank = TRUE
)
dummy

# Apply the encoder and store the result as a data frame. Note that this
# overwrites the per-group summary previously held in CarInsurance1.
CarInsurance1 <- data.frame(
  predict(dummy, newdata = CarInsurance)
)


# Count the number of missing values for all variables (in the data set after
# creating dummy variables). Remove any variable for which the number of missing
# values (i.e. NAs) are more than 500.
vapply(CarInsurance1, function(column) sum(is.na(column)), integer(1))

# NOTE(review): the step above asks for *variables* with more than 500 NAs to
# be removed, but na.omit() drops every *row* that contains any NA and keeps
# all columns. The model formula and the results reported further down depend
# on this behaviour, so it is left unchanged here - confirm which is intended.
CarInsurance2 <- na.omit(CarInsurance1)
colnames(CarInsurance2)
# Predict whether customers who were contacted during the current campaign will
# buy car insurance or not (i.e. the 'CarInsurance' variable).
# Make sure to split the data into train and test data sets. Train data set
# should have 60 percent of observations and test dataset remaining 40 percent.
library(caTools)

# Fixed seed so the split is reproducible.
set.seed(999)

# Logical vector: TRUE marks rows assigned to the training set.
sample1 <- sample.split(CarInsurance2$CarInsurance, SplitRatio = 0.6)

CarInsurance_Train <- subset(CarInsurance2, subset = sample1)
CarInsurance_Test <- subset(CarInsurance2, subset = !sample1)

colnames(CarInsurance_Train)
# Class balance of the outcome in the training set.
table(CarInsurance_Train$CarInsurance)
# Use Train data set for building the model. Be free to create more variables
# (using the existing variables) that you think may be useful in improving the
# fit of the model.

# Logistic regression of CarInsurance on every other column of the training
# set; its summary is used below to decide which terms to keep.
logit <- glm(
  formula = CarInsurance ~ .,
  family = "binomial",
  data = CarInsurance_Train
)
summary(logit)

# Get a best fitting model - Retain variables for which the coefficients are
# significant at 10%.

# Reduced logistic regression: start from all columns (`.`) and subtract the
# terms that are being dropped. Exclusions are grouped by variable family and
# each term is listed exactly once.
logit2 <- glm(
  CarInsurance ~ . -
    Id - Age - Default - Balance - CarLoan -
    Job.blue.collar - Job.housemaid - Job.management - Job.retired -
    Job.self.employed - Job.services - Job.student - Job.technician -
    Job.unemployed -
    Marital.married - Marital.single -
    Education.secondary - Education.tertiary -
    Communication.telephone -
    LastContactMonth.jan - LastContactMonth.mar - LastContactMonth.may -
    LastContactMonth.nov -
    NoOfContacts - PrevAttempts - DaysPassed - Outcome.other -
    CallDuration,
  data = CarInsurance_Train,
  family = "binomial"
)
summary(logit2)

# After finalizing the model, predict the probability of customers buying Car
# Insurance for both train and test data sets.

# type = "response" requests predictions on the probability scale; they are
# stored in a new ProbCarIns column on each data set.
CarInsurance_Train[["ProbCarIns"]] <- predict(
  logit2,
  newdata = CarInsurance_Train,
  type = "response"
)
CarInsurance_Test[["ProbCarIns"]] <- predict(
  logit2,
  newdata = CarInsurance_Test,
  type = "response"
)

# Get the classification table (cross tabulation of CarInsurance and the
# predicted value of the same variable) and calculate the Hit Rates,
# Sensitivity and Specificity for both Train and Test data sets.

# Cross-tabulate actual vs. predicted classes and return the table together
# with its four cells (TN, FP, FN, TP).
# Both vectors are turned into factors with levels 0 and 1 so the table is
# always 2 x 2, even if one class is never predicted.
# NOTE(review): assumes CarInsurance is coded 0 / 1 - confirm.
confusion_counts <- function(actual, predicted) {
  classes <- c(0, 1)
  cm <- table(
    Actual = factor(actual, levels = classes),
    Predicted = factor(predicted, levels = classes)
  )
  list(
    table = cm,
    TN = cm["0", "0"],
    FP = cm["0", "1"],
    FN = cm["1", "0"],
    TP = cm["1", "1"]
  )
}

# ---- Train data set ----
# Classify as a purchase when the predicted probability is at least 0.5.
CarInsurance_Train$CarInsPred <- as.numeric(CarInsurance_Train$ProbCarIns >= 0.5)
train_cm <- confusion_counts(
  CarInsurance_Train$CarInsurance,
  CarInsurance_Train$CarInsPred
)
train_cm$table

# Cell counts are read from the table instead of being typed in by hand, so
# they stay correct if the data, the seed or the model changes.
# (The original run recorded TN = 179, TP = 268, FN = 50, FP = 48.)
TN <- train_cm$TN
TP <- train_cm$TP
FN <- train_cm$FN
FP <- train_cm$FP

HitRateTrain <- (TN + TP) / (TN + FN + TP + FP)
HitRateTrain # original run: 0.820

# Sensitivity (true positive rate).
TPR <- TP / (TP + FN)
TPR # original run: 0.843

# Specificity (true negative rate).
TNR <- TN / (TN + FP)
TNR # original run: 0.7885

FPR <- 1 - TNR
FPR # original run: 0.211

# ---- Test data set ----
CarInsurance_Test$CarInsPred <- as.numeric(CarInsurance_Test$ProbCarIns >= 0.5)
test_cm <- confusion_counts(
  CarInsurance_Test$CarInsurance,
  CarInsurance_Test$CarInsPred
)
test_cm$table

# (The original run recorded TN = 107, TP = 176, FN = 45, FP = 34.)
TN <- test_cm$TN
TP <- test_cm$TP
FN <- test_cm$FN
FP <- test_cm$FP

HitRateTest <- (TN + TP) / (TN + FN + TP + FP)
HitRateTest # original run: 0.782

TPR <- TP / (TP + FN)
TPR # original run: 0.796

TNR <- TN / (TN + FP)
TNR # original run: 0.759

FPR <- 1 - TNR
FPR # original run: 0.241
# Also, get the ROC curves for both Train and Test data sets.
library(pROC)

# Each ROC object is computed once, then plotted (with the AUC printed on the
# plot) and printed to the console.

# Train data set.
roc_obj <- roc(CarInsurance_Train$CarInsurance, CarInsurance_Train$ProbCarIns)
plot(roc_obj, print.auc = TRUE)
roc_obj

# Test data set (roc_obj is reused, so it holds the test ROC afterwards).
roc_obj <- roc(CarInsurance_Test$CarInsurance, CarInsurance_Test$ProbCarIns)
plot(roc_obj, print.auc = TRUE)
roc_obj

#Discuss the findings from the model (e.g. discuss which factors are important 
#in predicting ‘whether a customer buys car insurance’; fit of the model etc.).
  
  ## The area under the ROC curve (AUC) is 0.909 for the train set and 0.8903
  ## for the test set. Both values are close to 1, which indicates a good model.
  ## Both curves also lie above the diagonal line, which again indicates that
  ## it is a good model.