We’ll use ebeer and telco data sets from before. We’ll drop the ID column, and select customers that received a mailing only.
rm(list=ls())
library(tree)
library(dplyr)
library(janitor)
library(car)
library(pROC)
library(ranger)
library(glmnet)
library(readr)
options("scipen"=200, "digits"=3)
Ebeer
# set working directory using however you want to folder where data is stored. I'll use
ebeer <- read_csv("ebeer.csv")
# load ebeer, remove account number column
ebeer<-ebeer[-c(1)]
Training-Test Samples:
# drop the ID column, select customers that received a mailing only
ebeer_test<-subset(ebeer, mailing ==1)
# create ebeer rollout data
ebeer_rollout<-subset(ebeer, mailing ==0)
# rename ebeer_test ebeer
ebeer<-ebeer_test
Telco
# load telco
telco <- read_csv("telco.csv")
telco <- strings2factors(telco)
##
## The following character variables were converted to factors
## gender Partner Dependents PhoneService MultipleLines InternetService OnlineSecurity OnlineBackup DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod Churn
# drop ID column, divide Total charges by 1000
telco<-subset(telco, select=-customerID)
telco$TotalCharges<-telco$TotalCharges/1000
Training-Test:
# create 70% test and 30% holdout sample
set.seed(19103)
n <- nrow(telco)
sample <- sample(c(TRUE, FALSE), n, replace=TRUE, prob=c(0.7, 0.3))
telco.test <- telco[sample, ]
telco.holdout <- telco[!sample, ]
#call test telco, and full data set telco.all
telco.all<-telco
telco<-telco.test
The biggest model we consider is with all of the variables in churn
plus all the two-way interactions with the variable
tenure
.
full <- glm(Churn ~ . + tenure:(.), data=telco, family = "binomial")
summary(full)
##
## Call:
## glm(formula = Churn ~ . + tenure:(.), family = "binomial", data = telco)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.041 -0.679 -0.270 0.663 3.389
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.11968 1.60886 1.32 0.18767
## genderMale -0.19315 0.11501 -1.68 0.09307 .
## SeniorCitizen 0.28498 0.15931 1.79 0.07364 .
## PartnerYes -0.03859 0.14764 -0.26 0.79383
## DependentsYes -0.30125 0.16618 -1.81 0.06987 .
## tenure -0.07764 0.04432 -1.75 0.07980 .
## PhoneServiceYes 1.23723 1.29950 0.95 0.34106
## MultipleLinesYes 0.77555 0.35704 2.17 0.02984 *
## InternetServiceFiber optic 2.77993 1.60230 1.73 0.08275 .
## InternetServiceNo -2.76954 1.61687 -1.71 0.08673 .
## OnlineSecurityYes 0.02269 0.35709 0.06 0.94933
## OnlineBackupYes 0.07578 0.35252 0.21 0.82979
## DeviceProtectionYes 0.67997 0.35579 1.91 0.05599 .
## TechSupportYes 0.04285 0.35803 0.12 0.90474
## StreamingTVYes 0.89952 0.65311 1.38 0.16842
## StreamingMoviesYes 0.92049 0.65598 1.40 0.16055
## ContractOne year -1.22765 0.29419 -4.17 0.00003 ***
## ContractTwo year -2.96166 0.85564 -3.46 0.00054 ***
## PaperlessBillingYes 0.35041 0.12769 2.74 0.00606 **
## PaymentMethodCredit card (automatic) -0.02210 0.23438 -0.09 0.92488
## PaymentMethodElectronic check 0.12510 0.18612 0.67 0.50147
## PaymentMethodMailed check -0.06152 0.20751 -0.30 0.76689
## MonthlyCharges -0.07838 0.06377 -1.23 0.21907
## TotalCharges 0.13415 0.65472 0.20 0.83765
## genderMale:tenure 0.00755 0.00370 2.04 0.04133 *
## SeniorCitizen:tenure -0.00335 0.00465 -0.72 0.47063
## PartnerYes:tenure 0.00554 0.00459 1.21 0.22744
## DependentsYes:tenure 0.00437 0.00479 0.91 0.36116
## tenure:PhoneServiceYes -0.03561 0.03594 -0.99 0.32178
## tenure:MultipleLinesYes -0.00858 0.01024 -0.84 0.40193
## tenure:InternetServiceFiber optic -0.03176 0.04395 -0.72 0.46984
## tenure:InternetServiceNo 0.02563 0.04552 0.56 0.57345
## tenure:OnlineSecurityYes -0.00703 0.00997 -0.71 0.48047
## tenure:OnlineBackupYes -0.00321 0.00987 -0.32 0.74531
## tenure:DeviceProtectionYes -0.01604 0.00987 -1.63 0.10406
## tenure:TechSupportYes -0.00870 0.00989 -0.88 0.37902
## tenure:StreamingTVYes -0.00543 0.01807 -0.30 0.76387
## tenure:StreamingMoviesYes -0.00726 0.01802 -0.40 0.68687
## tenure:ContractOne year 0.01247 0.00692 1.80 0.07134 .
## tenure:ContractTwo year 0.02349 0.01438 1.63 0.10229
## tenure:PaperlessBillingYes -0.00424 0.00437 -0.97 0.33244
## tenure:PaymentMethodCredit card (automatic) -0.00270 0.00609 -0.44 0.65695
## tenure:PaymentMethodElectronic check 0.00386 0.00514 0.75 0.45336
## tenure:PaymentMethodMailed check -0.01138 0.00869 -1.31 0.19039
## tenure:MonthlyCharges 0.00105 0.00186 0.56 0.57317
## tenure:TotalCharges 0.00369 0.00183 2.02 0.04315 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5697.6 on 4948 degrees of freedom
## Residual deviance: 3997.5 on 4903 degrees of freedom
## AIC: 4090
##
## Number of Fisher Scoring iterations: 7
We do forward selection. The algorithm uses AIC, but we could modify
it by including k=0
to consider deviance (with no
penalty).
You can see it going through all the models. The
<none>
in the output below shows the best model
(lowest AIC) at the current step.
# intercept only
null <- glm(Churn ~ 1, data=telco, family = "binomial")
start_time <- Sys.time()
fwd.model <- step(null, direction = 'forward', scope=formula(full), keep = function(model, aic) list(model = model, aic = aic))
## Start: AIC=5700
## Churn ~ 1
##
## Df Deviance AIC
## + Contract 2 4693 4699
## + tenure 1 5020 5024
## + InternetService 2 5160 5166
## + PaymentMethod 3 5275 5283
## + TotalCharges 1 5483 5487
## + MonthlyCharges 1 5512 5516
## + PaperlessBilling 1 5519 5523
## + TechSupport 1 5536 5540
## + OnlineSecurity 1 5539 5543
## + Dependents 1 5561 5565
## + SeniorCitizen 1 5590 5594
## + Partner 1 5599 5603
## + OnlineBackup 1 5655 5659
## + StreamingTV 1 5673 5677
## + StreamingMovies 1 5678 5682
## + DeviceProtection 1 5679 5683
## + MultipleLines 1 5690 5694
## <none> 5698 5700
## + gender 1 5697 5701
## + PhoneService 1 5697 5701
##
## Step: AIC=4699
## Churn ~ Contract
##
## Df Deviance AIC
## + InternetService 2 4413 4423
## + MonthlyCharges 1 4503 4511
## + PaymentMethod 3 4548 4560
## + PaperlessBilling 1 4611 4619
## + StreamingTV 1 4614 4622
## + StreamingMovies 1 4618 4626
## + tenure 1 4630 4638
## + SeniorCitizen 1 4649 4657
## + MultipleLines 1 4655 4663
## + OnlineSecurity 1 4658 4666
## + Dependents 1 4661 4669
## + TechSupport 1 4671 4679
## + DeviceProtection 1 4686 4694
## + OnlineBackup 1 4690 4698
## + Partner 1 4691 4699
## <none> 4693 4699
## + TotalCharges 1 4692 4700
## + PhoneService 1 4693 4701
## + gender 1 4693 4701
##
## Step: AIC=4423
## Churn ~ Contract + InternetService
##
## Df Deviance AIC
## + tenure 1 4226 4238
## + TotalCharges 1 4286 4298
## + OnlineSecurity 1 4358 4370
## + PaymentMethod 3 4360 4376
## + TechSupport 1 4373 4385
## + OnlineBackup 1 4375 4387
## + Dependents 1 4395 4407
## + PaperlessBilling 1 4398 4410
## + MonthlyCharges 1 4401 4413
## + Partner 1 4402 4414
## + PhoneService 1 4403 4415
## + StreamingTV 1 4405 4417
## + SeniorCitizen 1 4406 4418
## + StreamingMovies 1 4406 4418
## + DeviceProtection 1 4410 4422
## <none> 4413 4423
## + MultipleLines 1 4412 4424
## + gender 1 4413 4425
##
## Step: AIC=4238
## Churn ~ Contract + InternetService + tenure
##
## Df Deviance AIC
## + PaymentMethod 3 4190 4208
## + StreamingMovies 1 4197 4211
## + OnlineSecurity 1 4198 4212
## + StreamingTV 1 4198 4212
## + TechSupport 1 4203 4217
## + tenure:Contract 2 4202 4218
## + PaperlessBilling 1 4205 4219
## + TotalCharges 1 4209 4223
## + SeniorCitizen 1 4210 4224
## + PhoneService 1 4211 4225
## + Dependents 1 4214 4228
## + MultipleLines 1 4215 4229
## + tenure:InternetService 2 4219 4235
## + OnlineBackup 1 4221 4235
## <none> 4226 4238
## + DeviceProtection 1 4224 4238
## + MonthlyCharges 1 4225 4239
## + Partner 1 4226 4240
## + gender 1 4226 4240
##
## Step: AIC=4208
## Churn ~ Contract + InternetService + tenure + PaymentMethod
##
## Df Deviance AIC
## + OnlineSecurity 1 4166 4186
## + StreamingMovies 1 4168 4188
## + StreamingTV 1 4168 4188
## + tenure:Contract 2 4167 4189
## + TechSupport 1 4171 4191
## + TotalCharges 1 4172 4192
## + PaperlessBilling 1 4173 4193
## + PhoneService 1 4178 4198
## + SeniorCitizen 1 4178 4198
## + tenure:PaymentMethod 3 4175 4199
## + MultipleLines 1 4180 4200
## + Dependents 1 4181 4201
## + tenure:InternetService 2 4181 4203
## + OnlineBackup 1 4186 4206
## <none> 4190 4208
## + DeviceProtection 1 4188 4208
## + MonthlyCharges 1 4189 4209
## + gender 1 4190 4210
## + Partner 1 4190 4210
##
## Step: AIC=4186
## Churn ~ Contract + InternetService + tenure + PaymentMethod +
## OnlineSecurity
##
## Df Deviance AIC
## + TotalCharges 1 4144 4166
## + tenure:Contract 2 4143 4167
## + StreamingMovies 1 4145 4167
## + StreamingTV 1 4146 4168
## + TechSupport 1 4150 4172
## + PaperlessBilling 1 4151 4173
## + SeniorCitizen 1 4156 4178
## + PhoneService 1 4156 4178
## + tenure:PaymentMethod 3 4152 4178
## + MultipleLines 1 4156 4178
## + tenure:InternetService 2 4156 4180
## + Dependents 1 4158 4180
## + MonthlyCharges 1 4163 4185
## + OnlineBackup 1 4163 4185
## <none> 4166 4186
## + tenure:OnlineSecurity 1 4165 4187
## + DeviceProtection 1 4165 4187
## + Partner 1 4166 4188
## + gender 1 4166 4188
##
## Step: AIC=4166
## Churn ~ Contract + InternetService + tenure + PaymentMethod +
## OnlineSecurity + TotalCharges
##
## Df Deviance AIC
## + TechSupport 1 4121 4145
## + PhoneService 1 4126 4150
## + tenure:TotalCharges 1 4129 4153
## + PaperlessBilling 1 4129 4153
## + tenure:Contract 2 4128 4154
## + StreamingMovies 1 4133 4157
## + SeniorCitizen 1 4133 4157
## + StreamingTV 1 4133 4157
## + Dependents 1 4136 4160
## + OnlineBackup 1 4137 4161
## + MultipleLines 1 4138 4162
## + tenure:PaymentMethod 3 4137 4165
## <none> 4144 4166
## + tenure:InternetService 2 4140 4166
## + tenure:OnlineSecurity 1 4143 4167
## + MonthlyCharges 1 4144 4168
## + DeviceProtection 1 4144 4168
## + Partner 1 4144 4168
## + gender 1 4144 4168
##
## Step: AIC=4145
## Churn ~ Contract + InternetService + tenure + PaymentMethod +
## OnlineSecurity + TotalCharges + TechSupport
##
## Df Deviance AIC
## + PhoneService 1 4103 4129
## + PaperlessBilling 1 4106 4132
## + tenure:Contract 2 4106 4134
## + StreamingMovies 1 4108 4134
## + StreamingTV 1 4109 4135
## + tenure:TotalCharges 1 4110 4136
## + SeniorCitizen 1 4112 4138
## + OnlineBackup 1 4114 4140
## + Dependents 1 4114 4140
## + tenure:InternetService 2 4114 4142
## + MultipleLines 1 4116 4142
## <none> 4121 4145
## + tenure:PaymentMethod 3 4115 4145
## + MonthlyCharges 1 4120 4146
## + tenure:OnlineSecurity 1 4120 4146
## + DeviceProtection 1 4120 4146
## + tenure:TechSupport 1 4120 4146
## + gender 1 4120 4146
## + Partner 1 4121 4147
##
## Step: AIC=4129
## Churn ~ Contract + InternetService + tenure + PaymentMethod +
## OnlineSecurity + TotalCharges + TechSupport + PhoneService
##
## Df Deviance AIC
## + tenure:PhoneService 1 4088 4116
## + PaperlessBilling 1 4089 4117
## + tenure:Contract 2 4089 4119
## + MonthlyCharges 1 4092 4120
## + StreamingTV 1 4093 4121
## + tenure:InternetService 2 4091 4121
## + StreamingMovies 1 4094 4122
## + MultipleLines 1 4095 4123
## + OnlineBackup 1 4095 4123
## + tenure:TotalCharges 1 4095 4123
## + SeniorCitizen 1 4096 4124
## + Dependents 1 4097 4125
## <none> 4103 4129
## + tenure:PaymentMethod 3 4098 4130
## + tenure:OnlineSecurity 1 4103 4131
## + DeviceProtection 1 4103 4131
## + tenure:TechSupport 1 4103 4131
## + Partner 1 4103 4131
## + gender 1 4103 4131
##
## Step: AIC=4116
## Churn ~ Contract + InternetService + tenure + PaymentMethod +
## OnlineSecurity + TotalCharges + TechSupport + PhoneService +
## tenure:PhoneService
##
## Df Deviance AIC
## + PaperlessBilling 1 4075 4105
## + tenure:Contract 2 4076 4108
## + MultipleLines 1 4079 4109
## + OnlineBackup 1 4079 4109
## + SeniorCitizen 1 4082 4112
## + tenure:TotalCharges 1 4082 4112
## + tenure:InternetService 2 4080 4112
## + StreamingMovies 1 4083 4113
## + Dependents 1 4083 4113
## + StreamingTV 1 4083 4113
## + MonthlyCharges 1 4083 4113
## <none> 4088 4116
## + DeviceProtection 1 4088 4118
## + tenure:TechSupport 1 4088 4118
## + Partner 1 4088 4118
## + tenure:OnlineSecurity 1 4088 4118
## + gender 1 4088 4118
## + tenure:PaymentMethod 3 4085 4119
##
## Step: AIC=4105
## Churn ~ Contract + InternetService + tenure + PaymentMethod +
## OnlineSecurity + TotalCharges + TechSupport + PhoneService +
## PaperlessBilling + tenure:PhoneService
##
## Df Deviance AIC
## + OnlineBackup 1 4066 4098
## + tenure:Contract 2 4064 4098
## + MultipleLines 1 4067 4099
## + tenure:TotalCharges 1 4069 4101
## + SeniorCitizen 1 4070 4102
## + StreamingMovies 1 4071 4103
## + Dependents 1 4071 4103
## + tenure:InternetService 2 4069 4103
## + StreamingTV 1 4071 4103
## + MonthlyCharges 1 4071 4103
## <none> 4075 4105
## + tenure:PaperlessBilling 1 4074 4106
## + DeviceProtection 1 4075 4107
## + tenure:TechSupport 1 4075 4107
## + Partner 1 4075 4107
## + tenure:OnlineSecurity 1 4075 4107
## + gender 1 4075 4107
## + tenure:PaymentMethod 3 4072 4108
##
## Step: AIC=4098
## Churn ~ Contract + InternetService + tenure + PaymentMethod +
## OnlineSecurity + TotalCharges + TechSupport + PhoneService +
## PaperlessBilling + OnlineBackup + tenure:PhoneService
##
## Df Deviance AIC
## + tenure:Contract 2 4055 4091
## + MultipleLines 1 4057 4091
## + MonthlyCharges 1 4058 4092
## + tenure:InternetService 2 4057 4093
## + SeniorCitizen 1 4060 4094
## + tenure:TotalCharges 1 4060 4094
## + Dependents 1 4061 4095
## + StreamingMovies 1 4062 4096
## + StreamingTV 1 4062 4096
## <none> 4066 4098
## + tenure:OnlineBackup 1 4065 4099
## + tenure:PaperlessBilling 1 4065 4099
## + tenure:TechSupport 1 4065 4099
## + DeviceProtection 1 4066 4100
## + Partner 1 4066 4100
## + gender 1 4066 4100
## + tenure:OnlineSecurity 1 4066 4100
## + tenure:PaymentMethod 3 4063 4101
##
## Step: AIC=4091
## Churn ~ Contract + InternetService + tenure + PaymentMethod +
## OnlineSecurity + TotalCharges + TechSupport + PhoneService +
## PaperlessBilling + OnlineBackup + tenure:PhoneService + Contract:tenure
##
## Df Deviance AIC
## + MonthlyCharges 1 4043 4081
## + MultipleLines 1 4045 4083
## + StreamingMovies 1 4049 4087
## + SeniorCitizen 1 4049 4087
## + StreamingTV 1 4049 4087
## + tenure:InternetService 2 4049 4089
## + Dependents 1 4051 4089
## <none> 4055 4091
## + tenure:TechSupport 1 4054 4092
## + tenure:PaperlessBilling 1 4054 4092
## + tenure:TotalCharges 1 4054 4092
## + tenure:OnlineBackup 1 4055 4093
## + tenure:OnlineSecurity 1 4055 4093
## + Partner 1 4055 4093
## + gender 1 4055 4093
## + DeviceProtection 1 4055 4093
## + tenure:PaymentMethod 3 4051 4093
##
## Step: AIC=4081
## Churn ~ Contract + InternetService + tenure + PaymentMethod +
## OnlineSecurity + TotalCharges + TechSupport + PhoneService +
## PaperlessBilling + OnlineBackup + MonthlyCharges + tenure:PhoneService +
## Contract:tenure
##
## Df Deviance AIC
## + SeniorCitizen 1 4038 4078
## + tenure:TotalCharges 1 4038 4078
## + MultipleLines 1 4039 4079
## + Dependents 1 4040 4080
## <none> 4043 4081
## + DeviceProtection 1 4041 4081
## + tenure:OnlineBackup 1 4042 4082
## + tenure:PaymentMethod 3 4039 4083
## + tenure:TechSupport 1 4043 4083
## + tenure:PaperlessBilling 1 4043 4083
## + Partner 1 4043 4083
## + tenure:OnlineSecurity 1 4043 4083
## + tenure:MonthlyCharges 1 4043 4083
## + gender 1 4043 4083
## + StreamingMovies 1 4043 4083
## + StreamingTV 1 4043 4083
## + tenure:InternetService 2 4043 4085
##
## Step: AIC=4078
## Churn ~ Contract + InternetService + tenure + PaymentMethod +
## OnlineSecurity + TotalCharges + TechSupport + PhoneService +
## PaperlessBilling + OnlineBackup + MonthlyCharges + SeniorCitizen +
## tenure:PhoneService + Contract:tenure
##
## Df Deviance AIC
## + tenure:TotalCharges 1 4033 4075
## + MultipleLines 1 4034 4076
## + Dependents 1 4036 4078
## <none> 4038 4078
## + DeviceProtection 1 4036 4078
## + tenure:OnlineBackup 1 4036 4078
## + tenure:PaymentMethod 3 4033 4079
## + SeniorCitizen:tenure 1 4037 4079
## + tenure:TechSupport 1 4037 4079
## + tenure:PaperlessBilling 1 4037 4079
## + tenure:OnlineSecurity 1 4038 4080
## + Partner 1 4038 4080
## + StreamingTV 1 4038 4080
## + gender 1 4038 4080
## + tenure:MonthlyCharges 1 4038 4080
## + StreamingMovies 1 4038 4080
## + tenure:InternetService 2 4037 4081
##
## Step: AIC=4075
## Churn ~ Contract + InternetService + tenure + PaymentMethod +
## OnlineSecurity + TotalCharges + TechSupport + PhoneService +
## PaperlessBilling + OnlineBackup + MonthlyCharges + SeniorCitizen +
## tenure:PhoneService + Contract:tenure + tenure:TotalCharges
##
## Df Deviance AIC
## + MultipleLines 1 4028 4072
## + Dependents 1 4030 4074
## <none> 4033 4075
## + DeviceProtection 1 4031 4075
## + tenure:PaymentMethod 3 4028 4076
## + SeniorCitizen:tenure 1 4032 4076
## + tenure:OnlineBackup 1 4032 4076
## + tenure:TechSupport 1 4032 4076
## + tenure:PaperlessBilling 1 4032 4076
## + tenure:OnlineSecurity 1 4032 4076
## + Partner 1 4033 4077
## + StreamingTV 1 4033 4077
## + gender 1 4033 4077
## + tenure:MonthlyCharges 1 4033 4077
## + StreamingMovies 1 4033 4077
## + tenure:InternetService 2 4032 4078
##
## Step: AIC=4072
## Churn ~ Contract + InternetService + tenure + PaymentMethod +
## OnlineSecurity + TotalCharges + TechSupport + PhoneService +
## PaperlessBilling + OnlineBackup + MonthlyCharges + SeniorCitizen +
## MultipleLines + tenure:PhoneService + Contract:tenure + tenure:TotalCharges
##
## Df Deviance AIC
## + Dependents 1 4026 4072
## <none> 4028 4072
## + DeviceProtection 1 4027 4073
## + SeniorCitizen:tenure 1 4027 4073
## + StreamingTV 1 4027 4073
## + tenure:PaymentMethod 3 4023 4073
## + tenure:TechSupport 1 4028 4074
## + tenure:OnlineBackup 1 4028 4074
## + tenure:PaperlessBilling 1 4028 4074
## + StreamingMovies 1 4028 4074
## + tenure:MultipleLines 1 4028 4074
## + tenure:OnlineSecurity 1 4028 4074
## + Partner 1 4028 4074
## + gender 1 4028 4074
## + tenure:MonthlyCharges 1 4028 4074
## + tenure:InternetService 2 4028 4076
##
## Step: AIC=4072
## Churn ~ Contract + InternetService + tenure + PaymentMethod +
## OnlineSecurity + TotalCharges + TechSupport + PhoneService +
## PaperlessBilling + OnlineBackup + MonthlyCharges + SeniorCitizen +
## MultipleLines + Dependents + tenure:PhoneService + Contract:tenure +
## tenure:TotalCharges
##
## Df Deviance AIC
## + Dependents:tenure 1 4024 4072
## <none> 4026 4072
## + DeviceProtection 1 4025 4073
## + Partner 1 4025 4073
## + SeniorCitizen:tenure 1 4025 4073
## + StreamingTV 1 4025 4073
## + tenure:PaymentMethod 3 4021 4073
## + tenure:TechSupport 1 4025 4073
## + tenure:OnlineBackup 1 4026 4074
## + tenure:PaperlessBilling 1 4026 4074
## + tenure:MultipleLines 1 4026 4074
## + StreamingMovies 1 4026 4074
## + tenure:OnlineSecurity 1 4026 4074
## + gender 1 4026 4074
## + tenure:MonthlyCharges 1 4026 4074
## + tenure:InternetService 2 4026 4076
##
## Step: AIC=4072
## Churn ~ Contract + InternetService + tenure + PaymentMethod +
## OnlineSecurity + TotalCharges + TechSupport + PhoneService +
## PaperlessBilling + OnlineBackup + MonthlyCharges + SeniorCitizen +
## MultipleLines + Dependents + tenure:PhoneService + Contract:tenure +
## tenure:TotalCharges + tenure:Dependents
##
## Df Deviance AIC
## <none> 4024 4072
## + Partner 1 4023 4073
## + DeviceProtection 1 4023 4073
## + tenure:PaymentMethod 3 4019 4073
## + StreamingTV 1 4023 4073
## + tenure:TechSupport 1 4023 4073
## + tenure:PaperlessBilling 1 4023 4073
## + tenure:OnlineBackup 1 4023 4073
## + SeniorCitizen:tenure 1 4023 4073
## + StreamingMovies 1 4023 4073
## + tenure:OnlineSecurity 1 4023 4073
## + tenure:MultipleLines 1 4024 4074
## + gender 1 4024 4074
## + tenure:MonthlyCharges 1 4024 4074
## + tenure:InternetService 2 4024 4076
end_time <- Sys.time()
t <- end_time-start_time
Here is the table showing the variables added per step.
fwd.model$anova
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 NA NA 4948 5698 5700
## 2 + Contract -2 1004.41 4946 4693 4699
## 3 + InternetService -2 279.93 4944 4413 4423
## 4 + tenure -1 187.48 4943 4226 4238
## 5 + PaymentMethod -3 35.71 4940 4190 4208
## 6 + OnlineSecurity -1 23.61 4939 4166 4186
## 7 + TotalCharges -1 22.61 4938 4144 4166
## 8 + TechSupport -1 23.36 4937 4121 4145
## 9 + PhoneService -1 17.64 4936 4103 4129
## 10 + tenure:PhoneService -1 14.74 4935 4088 4116
## 11 + PaperlessBilling -1 12.70 4934 4075 4105
## 12 + OnlineBackup -1 9.76 4933 4066 4098
## 13 + tenure:Contract -2 10.69 4931 4055 4091
## 14 + MonthlyCharges -1 11.57 4930 4043 4081
## 15 + SeniorCitizen -1 5.59 4929 4038 4078
## 16 + tenure:TotalCharges -1 5.24 4928 4033 4075
## 17 + MultipleLines -1 4.51 4927 4028 4072
## 18 + Dependents -1 2.01 4926 4026 4072
## 19 + Dependents:tenure -1 2.25 4925 4024 4072
Note not all coefficients were added.
## Coeficients included in Forward Selection: 24
## Total number of coeficients: 46
Now let’s test the sequence of models with varying numbers of covariates using cross validation. We focus on OOS R2:
M <- dim(fwd.model$keep)[2]
OOS=data.frame(R2=rep(NA,M), rank=rep(NA, M))
## pred must be probabilities (0<pred<1) for binomial
deviance <- function(y, pred, family=c("gaussian","binomial")){
family <- match.arg(family)
if(family=="gaussian"){
return( sum( (y-pred)^2 ) )
}else{
if(is.factor(y)) y <- as.numeric(y)>1
return( -2*sum( y*log(pred) + (1-y)*log(1-pred) ) )
}
}
## get null devaince too, and return R2
R2 <- function(y, pred, family=c("gaussian","binomial")){
fam <- match.arg(family)
if(fam=="binomial"){
if(is.factor(y)){ y <- as.numeric(y)>1 }
}
dev <- deviance(y, pred, family=fam)
dev0 <- deviance(y, mean(y), family=fam)
return(1-dev/dev0)
}
for(k in 1:M){
pred = predict(fwd.model$keep[["model",k]], newdata=telco.holdout, type = "response")
OOS$R2[k]<-R2(y = telco.holdout$Churn,pred=pred, family="binomial")
OOS$rank[k]<-fwd.model$keep[["model",k]]$rank
}
ax=c(1:max(OOS$rank))
par(mai=c(.9,.8,.2,.2))
plot(x=OOS$rank, y = OOS$R2, type="b", ylab=expression(paste("Out-of-Sample R"^"2")), xlab="# of model parameters estimated (rank)", xaxt="n")
axis(1, at=ax, labels=ax)
max.idx <- which.max(OOS$R2)
OOS$rank[max.idx]
## [1] 23
abline(v=OOS$rank[max.idx], lty=3)
model <- fwd.model$keep[["model",max.idx]]
From the forward selection model path, the model with 23 coefficients has the best OOS R2.
We then “choose” this model, and re-estimate it using the entire dataset. The resulting coefficients are:
model_full_data<-glm(model$formula, data = telco.all, family = binomial(link = "logit"))
summary(model_full_data)
##
## Call:
## glm(formula = model$formula, family = binomial(link = "logit"),
## data = telco.all)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.001 -0.691 -0.285 0.684 3.404
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.52328 0.23680 -2.21 0.0271 *
## ContractOne year -1.31286 0.24392 -5.38 0.0000000735218 ***
## ContractTwo year -2.58618 0.61182 -4.23 0.0000236777268 ***
## InternetServiceFiber optic 0.34007 0.15409 2.21 0.0273 *
## InternetServiceNo -0.24861 0.19895 -1.25 0.2114
## tenure -0.04705 0.00662 -7.11 0.0000000000012 ***
## PaymentMethodCredit card (automatic) -0.09320 0.11373 -0.82 0.4125
## PaymentMethodElectronic check 0.29825 0.09460 3.15 0.0016 **
## PaymentMethodMailed check -0.06041 0.11586 -0.52 0.6021
## OnlineSecurityYes -0.47927 0.08646 -5.54 0.0000000297124 ***
## TotalCharges -0.00294 0.14259 -0.02 0.9836
## TechSupportYes -0.44905 0.09063 -4.96 0.0000007232129 ***
## PhoneServiceYes -0.81891 0.20049 -4.08 0.0000441469786 ***
## PaperlessBillingYes 0.33734 0.07491 4.50 0.0000066918015 ***
## OnlineBackupYes -0.25933 0.08017 -3.23 0.0012 **
## MonthlyCharges 0.02049 0.00520 3.94 0.0000810908098 ***
## SeniorCitizen 0.22900 0.08408 2.72 0.0065 **
## MultipleLinesYes 0.22056 0.08502 2.59 0.0095 **
## DependentsYes -0.12190 0.08184 -1.49 0.1363
## tenure:PhoneServiceYes -0.01037 0.00658 -1.58 0.1150
## ContractOne year:tenure 0.01695 0.00566 3.00 0.0027 **
## ContractTwo year:tenure 0.02054 0.01045 1.97 0.0492 *
## tenure:TotalCharges 0.00354 0.00134 2.64 0.0084 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 8143.4 on 7031 degrees of freedom
## Residual deviance: 5791.8 on 7009 degrees of freedom
## AIC: 5838
##
## Number of Fisher Scoring iterations: 6
We have to create our own matrix of dummy variables for factors. So we do that using the model.matrix() command.
# all the factor variables
xfactors<- model.matrix(Churn ~ SeniorCitizen + Partner + Dependents + PhoneService + MultipleLines + InternetService + OnlineSecurity + OnlineBackup + DeviceProtection + TechSupport + StreamingTV + StreamingMovies + Contract + PaperlessBilling + PaymentMethod, data = telco)
# remove intercept
xfactors<-xfactors[,-1]
# all continuous variables
x<-as.matrix(data.frame(telco$tenure, telco$MonthlyCharges, telco$TotalCharges, xfactors))
We then attach the continuous variables for that and run the model. \(alpha = 1\) means that we are running LASSO. \(nlambda = 200\) means we are selecting 200 grid points for \(\lambda\).We will decrease \(\lambda\) slowly until changes are small or reach 200.
lasso_telco<-glmnet(x, y=as.factor(telco$Churn), alpha = 1, family = "binomial", nlambda = 100)
LASSO gives us a path of possible models.
par(mai=c(.9,.8,.8,.8))
par(mfrow=c(1,1))
plot(lasso_telco, xvar="lambda", label = TRUE, )
Here are the dimnames to interpret the graph. We can see that 9 is Fiber optic cable, for example, which is one of the first variables with a non-zero coefficient.
dimnames(x)[2]
## [[1]]
## [1] "telco.tenure" "telco.MonthlyCharges" "telco.TotalCharges" "SeniorCitizen" "PartnerYes"
## [6] "DependentsYes" "PhoneServiceYes" "MultipleLinesYes" "InternetServiceFiber.optic" "InternetServiceNo"
## [11] "OnlineSecurityYes" "OnlineBackupYes" "DeviceProtectionYes" "TechSupportYes" "StreamingTVYes"
## [16] "StreamingMoviesYes" "ContractOne.year" "ContractTwo.year" "PaperlessBillingYes" "PaymentMethodCredit.card..automatic."
## [21] "PaymentMethodElectronic.check" "PaymentMethodMailed.check"
Here’s the printed sequence of non-zero coefficients, \(R^2\) in terms of deviance, and \(\lambda\)
print(lasso_telco)
##
## Call: glmnet(x = x, y = as.factor(telco$Churn), family = "binomial", alpha = 1, nlambda = 100)
##
## Df %Dev Lambda
## 1 0 0.00 0.1550
## 2 1 1.85 0.1420
## 3 2 3.95 0.1290
## 4 2 6.61 0.1180
## 5 3 9.04 0.1070
## 6 3 11.13 0.0976
## 7 3 12.90 0.0889
## 8 3 14.40 0.0810
## 9 4 15.71 0.0738
## 10 5 16.96 0.0673
## 11 5 18.10 0.0613
## 12 5 19.08 0.0558
## 13 5 19.94 0.0509
## 14 7 20.82 0.0464
## 15 7 21.65 0.0422
## 16 7 22.37 0.0385
## 17 9 22.99 0.0351
## 18 10 23.66 0.0320
## 19 10 24.23 0.0291
## 20 10 24.71 0.0265
## 21 10 25.13 0.0242
## 22 13 25.56 0.0220
## 23 13 25.95 0.0201
## 24 13 26.29 0.0183
## 25 13 26.57 0.0167
## 26 13 26.82 0.0152
## 27 14 27.04 0.0138
## 28 15 27.25 0.0126
## 29 15 27.44 0.0115
## 30 16 27.61 0.0105
## 31 16 27.75 0.0095
## 32 16 27.88 0.0087
## 33 16 27.98 0.0079
## 34 16 28.07 0.0072
## 35 16 28.14 0.0066
## 36 16 28.21 0.0060
## 37 16 28.26 0.0055
## 38 16 28.30 0.0050
## 39 17 28.34 0.0045
## 40 17 28.37 0.0041
## 41 17 28.40 0.0038
## 42 17 28.42 0.0034
## 43 17 28.44 0.0031
## 44 18 28.46 0.0028
## 45 20 28.50 0.0026
## 46 20 28.56 0.0024
## 47 20 28.60 0.0022
## 48 20 28.65 0.0020
## 49 20 28.68 0.0018
## 50 20 28.71 0.0016
## 51 20 28.74 0.0015
## 52 20 28.76 0.0014
## 53 20 28.78 0.0012
## 54 20 28.79 0.0011
## 55 20 28.80 0.0010
## 56 20 28.81 0.0009
## 57 20 28.82 0.0008
## 58 20 28.83 0.0008
## 59 20 28.84 0.0007
## 60 20 28.84 0.0006
## 61 21 28.85 0.0006
## 62 21 28.85 0.0005
## 63 21 28.85 0.0005
## 64 21 28.86 0.0004
## 65 21 28.86 0.0004
## 66 21 28.86 0.0004
## 67 21 28.86 0.0003
## 68 21 28.86 0.0003
## 69 21 28.87 0.0003
## 70 21 28.87 0.0003
You can also look at this in terms of \(R^2\), or deviance explained.
plot(lasso_telco, xvar = "dev", label = TRUE)
We use K-fold cross-validation to “tune” \(\lambda\), in other words, to choose the right penalty weight \(\lambda\) that minimizes validation error.
lasso_cv <- cv.glmnet(x, y=telco$Churn, family = "binomial", type.measure = "deviance")
plot(lasso_cv)
The coefficients associated with the \(\lambda\) that minimizes error are:
coef(lasso_cv, s = "lambda.min")
## 23 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 0.19753
## telco.tenure -0.05534
## telco.MonthlyCharges .
## telco.TotalCharges 0.27960
## SeniorCitizen 0.18331
## PartnerYes 0.06915
## DependentsYes -0.20694
## PhoneServiceYes -0.57916
## MultipleLinesYes 0.23567
## InternetServiceFiber.optic 0.69623
## InternetServiceNo -0.82512
## OnlineSecurityYes -0.41733
## OnlineBackupYes -0.22499
## DeviceProtectionYes -0.00803
## TechSupportYes -0.45742
## StreamingTVYes 0.22753
## StreamingMoviesYes 0.19424
## ContractOne.year -0.76049
## ContractTwo.year -1.46237
## PaperlessBillingYes 0.28493
## PaymentMethodCredit.card..automatic. -0.09086
## PaymentMethodElectronic.check 0.24163
## PaymentMethodMailed.check -0.15133
Same but with telco: Here we can apply that model to a holdout data set
# use holdout telco data
xfactors<- model.matrix(Churn ~ SeniorCitizen + Partner + Dependents + PhoneService + MultipleLines + InternetService + OnlineSecurity + OnlineBackup + DeviceProtection + TechSupport + StreamingTV + StreamingMovies + Contract + PaperlessBilling + PaymentMethod, data = telco.holdout)
# remove intercept
xfactors<-xfactors[,-1]
# all continuous variables
x<-as.matrix(data.frame(telco.holdout$tenure, telco.holdout$MonthlyCharges, telco.holdout$TotalCharges, xfactors))
pred <- predict(lasso_cv, newx=x, s = "lambda.min", type = "response")
churn <- as.numeric(telco.holdout$Churn)-1
head(cbind(churn, pred))
## churn lambda.min
## 1 0 0.04427
## 2 0 0.30151
## 3 0 0.20451
## 4 1 0.43055
## 5 0 0.00892
## 6 0 0.54018
par(mfrow=c(1,1))
par(mai=c(.9,.8,.2,.2))
plot(roc(churn, pred), print.auc=TRUE, ylim=c(0,1), levels=c("Churn", "Stay"),
col="black", lwd=1, main="ROC curve", xlab="Specificity: true negative rate", ylab="Sensitivity: true positive rate", xlim=c(1,0), direction="<")