####################################################################################################
##
## Project: Lasso and Ridge regression models
## Script purpose: Practical exercise using the Malawi 2019 dataset to run a lasso regression
## Encoding: UTF-8
## Packages: tidyverse, plyr, data.table, caret, Hmisc, elasticnet, corrplot, Amelia, skimr
## Date: 6 December 2022
## Author: Michelle González Amador
## Version: R ‘3.6.2’ // Rstudio ‘2022.7.2.576’
## Notes v00:
##
####################################################################################################
rm(list=ls())
setwd("/Users/michellegonzalez/Documents/GitHub/Machine-Learning-for-Public-Policy")
### 0. Libraries ####
#install.packages(c("plyr", "tidyverse", "data.table", "caret", "Hmisc", "elasticnet", "corrplot", "Amelia", "skimr"))
library(plyr)
library(tidyverse)
library(data.table)
library(caret)
library(Hmisc)
library(elasticnet) # works in conjunction with caret for lasso models
library(corrplot)
library(Amelia)
library(skimr)
### 1. Upload data and subset data ####
malawi <- fread("~/Desktop/MachineLearning4PP 2/Machine-Learning-for-Public-Policy/malawi.csv")
column_names <- c("reside","hhsize","hh_b05a","sumConsumption","hh_s01","hh_b03","hh_c24","hh_d10","hh_d11","hh_d12_1","hh_f19","hh_f34","hh_t08","hh_t01","hh_t14")
malawi2 <- malawi[,column_names, with=FALSE]
### 2. Data pre-processing: missing values and zero-variance predictors ####
# Overview with skimr package: for small dataframes
skim(malawi2)
# Overview with Amelia package: for large dataframes
missmap(malawi)
# Let's add some extra variables to our main dataset from Module E
column_names <- c("reside","hhsize","hh_b05a","sumConsumption","hh_s01","hh_b03","hh_c24","hh_d10","hh_d11","hh_d12_1","hh_f19","hh_f34","hh_t08","hh_t01","hh_t14", "hh_e70", "hh_e67", "hh_e66", "hh_e05", "hh_e06")
malawi <- malawi[,column_names, with=FALSE]
rm(malawi2)
###
# Missing-value cleaning
# While skimr is a great pkg, I love the simplicity of colSums(is.na(x)) right before I clean a df.
colSums(is.na(malawi))
row_miss <- which(is.na(malawi$sumConsumption))
malawi <- malawi[-row_miss,]
malawi <- na.omit(malawi)
# This line deletes ALL rows with missing values from the entire data set. In this specific case, it only applied to two variables, since we had already gotten rid of all the missing values in the target variable. Take a glimpse:
colSums(is.na(malawi))
###
# Some quick visualisations
malawi_continuous <- malawi %>% select_if(~is.integer(.) | is.numeric(.))
hist.data.frame(malawi_continuous)
summary(malawi$hh_d10) #Amt [NAME] spent in the past 4 weeks for all illnesses and injuries?
unique(malawi$hh_d10) # variable distribution is legitimate.
ill_people_spentCash <- length(which(malawi$hh_d10!=0))
cat(sprintf("%.0f people/households spent money on an illness, out of 41,430, in the past 4 weeks.", ill_people_spentCash))
cat("Variables from the H section of the Household module all follow similar distributions due to the type of question asked.")
names(malawi)[names(malawi) == "hh_d10"] <- "total_spent_on_illness"
names(malawi)[names(malawi) == "hh_d11"] <- "total_spent_on_non_illness_medicalcare"
names(malawi)[names(malawi) == "hh_d12_1"] <- "total_spent_on_medicalinsurance"
malawi_factor <- malawi %>% select_if(~is.character(.)) # subset of the dataframe containing only the character (categorical) variables
malawi_factor <- as.data.frame(unclass(malawi_factor), stringsAsFactors = TRUE) # convert the character vectors to factors
llply(.data = malawi_factor, .fun = table) # create tables of all the variables in the dataframe using the plyr package
# Variables with missing values: hh_c09, hh_e70 (too many, delete var), hh_t14, hh_t01, hh_t08
malawi <- as.data.frame(unclass(malawi),stringsAsFactors=TRUE) # Convert character vectors into factors in malawi dataframe
malawi <- malawi[,-which(colnames(malawi)=="hh_e70")]
# In the Challenge Solution tab of Regression I we used the command droplevels() to delete the level recognised as an empty space from each variable. This is another way of doing this which will also allow us to keep track of how many values we delete.
length(which(malawi$hh_t14=="")) # 3 empty spaces
levels(malawi$hh_t14)[levels(malawi$hh_t14)==""] <- NA
sum(is.na(malawi$hh_t14)) # 3 missing values
length(which(malawi$hh_t01=="")) # 3 empty spaces
levels(malawi$hh_t01)[levels(malawi$hh_t01)==""] <- NA
sum(is.na(malawi$hh_t01)) # 3 missing values
length(which(malawi$hh_t08=="")) # 3 empty spaces
levels(malawi$hh_t08)[levels(malawi$hh_t08)==""] <- NA
sum(is.na(malawi$hh_t08)) # 3 missing values
colSums(is.na(malawi)) # Some of these missings are overlapping.
malawi <- na.omit(malawi)
colSums(is.na(malawi))
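# A more compact alternative (a sketch, not part of the original clean-up): recode the
# empty-string level to NA across every factor column in one pass, instead of handling
# hh_t14, hh_t01 and hh_t08 one by one. Left commented out since the data are already clean.
# malawi[] <- lapply(malawi, function(x) {
#   if (is.factor(x)) levels(x)[levels(x) == ""] <- NA
#   x
# })
# malawi <- na.omit(malawi)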
# correlation matrix
malawi_corr <- as.data.frame(lapply(malawi,as.numeric)) # coerce dataframe to numeric, as the cor() command only takes in numeric types
M <- cor(malawi_corr, method = "spearman") # use Spearman (rather than the default Pearson) due to the presence of ordinal/factor variables
corrplot(M, method="circle", addCoef.col ="black", number.cex = 0.8) # visualise it in a nice way
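# The section title also mentions zero-variance predictors; a quick check with caret's
# nearZeroVar() flags predictors with (near) zero variance. This is a sketch added for
# completeness and was not part of the original pre-processing steps.
nearZeroVar(malawi, saveMetrics = TRUE) # inspect the zeroVar and nzv columns
# Similarly, findCorrelation() can flag highly correlated predictors in the matrix M:
# findCorrelation(M, cutoff = 0.75) # column indices with pairwise correlation above 0.75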
### 3. Lasso model with caret() ####
# 3.1 Create test and train data with the caret package
# create new target variable (read: Challenge Solution tab in Regression I)
malawi$Cons_pcpd <- (malawi$sumConsumption / malawi$hhsize) / 7 # per capita, per day: weekly household consumption / household size / 7 days
# Now let's get rid of household food consumption, weekly
malawi <- malawi[,-which(colnames(malawi)=="sumConsumption")]
set.seed(12345) # use any number you want
train_idx <- createDataPartition(malawi$Cons_pcpd, p = .8, list = FALSE, times = 1)
Train_df <- malawi[ train_idx,]
Test_df <- malawi[-train_idx,]
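# Quick sanity check (a small sketch, not in the original script): the partition should
# leave the target variable with a similar distribution in both subsets.
summary(Train_df$Cons_pcpd)
summary(Test_df$Cons_pcpd)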
# 3.2 use the train() function from the caret package to create a lasso model
model_lasso <- train(
  Cons_pcpd ~ ., # our new target variable; sumConsumption was dropped above
  data = Train_df,
  method = 'lasso',
  preProcess = c("center", "scale") # this will scale and center all relevant variables in the model
)
print(model_lasso)
# Let's see our model
plot(model_lasso)
cat("The x-axis is the fraction of the full solution (i.e., ordinary least squares with no penalty)")
# did the Lasso method get rid of some variables?
plot(varImp(model_lasso))
cat(" We can clearly see the non-relevance of certain variables, and that the variable district was deleted / it's coefficient was shrunk to 0")
cat(" Recall that the optimal model had a fraction of .9, so we knew (!=1) that at least one variable was deleted.")
# 3.3 Make predictions with the test data set
test_features <- subset(Test_df, select=-c(Cons_pcpd))
test_target <- subset(Test_df, select=Cons_pcpd)[,1]
lasso_predictions <- predict(model_lasso, newdata = test_features)
# RMSE
sqrt(mean((test_target - lasso_predictions)^2))
# R^2
cor(test_target, lasso_predictions) ^ 2
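# caret bundles these test-set metrics in a single call; postResample() returns RMSE,
# Rsquared and MAE and should match the manual calculations above.
postResample(pred = lasso_predictions, obs = test_target)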
### 4. Cross-validation: k-fold with caret() ####
cat("Cross-validation: beyond training and testing dataframes, what a k-fold (commonly 10-fold) cross validation does is resample the data to find an optimal (λ)lambda/penalisation for the lasso model.")
cat("Recall: The optimal λ(lambda)/penalisation minimizes the out-of-sample (or test) mean prediction error.")
set.seed(12345)
cv_10fold <- trainControl(
  method = "cv", # cross-validation
  number = 10    # k-fold = 10-fold (split the data into 10 similar-sized samples)
)
set.seed(12345)
lasso_kfold <- train(
  Cons_pcpd ~ ., # same target variable as before
  data = Train_df,
  method = 'lasso',
  preProcess = c("center", "scale"),
  trControl = cv_10fold
)
print(lasso_kfold)
plot(lasso_kfold)
plot(varImp(lasso_kfold))
lasso_10kpredictions <- predict(lasso_kfold, newdata = test_features)
# RMSE
sqrt(mean((test_target - lasso_10kpredictions)^2))
# R2
cor(test_target, lasso_10kpredictions) ^ 2
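# Side-by-side summary of both models on the held-out test set (a small recap sketch;
# the individual metrics were already computed above).
data.frame(
  model = c("lasso_default_boot", "lasso_10fold_cv"),
  RMSE  = c(sqrt(mean((test_target - lasso_predictions)^2)),
            sqrt(mean((test_target - lasso_10kpredictions)^2))),
  R2    = c(cor(test_target, lasso_predictions)^2,
            cor(test_target, lasso_10kpredictions)^2)
)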