Lead Scoring.r
### Code (C) Matthew Theisen 2018. Contact at [email protected].
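## Lead scoring on the ISLR Caravan data: a lasso-penalized logistic regression fit with glmnet,
## with lambda values compared by F1 score and by a simple profit model over probability cutoffs.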
library(ISLR)   ## Provides the Caravan dataset.
library(glmnet) ## Regularized (lasso/ridge) GLMs.
library(Matrix) ## Matrix() is called directly below, so attach it explicitly rather than relying on glmnet's dependency.
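## Caravan (from ISLR): 5,822 customers, 85 predictors, and a binary 'Purchase' response
## indicating whether the customer bought caravan insurance.
## split_data: randomly holds out roughly test_frac of the rows as a test set and
## standardizes the predictors using training-set statistics only.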
split_data <- function(df,test_frac) {
mask <- runif(nrow(df))<test_frac ## TRUE rows become the test set.
train_x = scale( df[!mask,colnames(df)!='Purchase'] ) ## Scaling so that regularization penalizes all predictors on a comparable scale.
test_x = scale( df[mask,colnames(df)!='Purchase'],
center=attr(train_x,"scaled:center"),
scale=attr(train_x,"scaled:scale")) ## Scale the test set with the training set's center and scale; computing scaling statistics on the pooled data would leak test-set information into training.
ret <- list(train_x=train_x,
train_y=df[!mask,c('Purchase')],
test_x=test_x,
test_y=df[mask,c('Purchase')])
return (ret)
}
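## expit: inverse-logit; maps log-odds to probabilities in (0,1).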
expit <- function(x) {
return ( 1/(1+exp(-x)) )
}
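## logit: log-odds; used to translate probability cutoffs onto glmnet's link scale.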
logit <- function(x) {
return( log( x/(1-x) ) )
}
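## matrix_from_df: glmnet expects a matrix rather than a data.frame.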
matrix_from_df <- function(x) {
return (Matrix(as.matrix(x), sparse = FALSE))
}
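## test_splits: sanity checks on the split -- row counts, plus column means and SDs
## (training columns should be ~0 and ~1 after scaling; test columns only approximately so).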
test_splits <- function(splits) {
print("Training Rows")
print(nrow(splits$train_x))
print("Testing Rows")
print(nrow(splits$test_x))
print("Training Means")
print(colMeans(splits$train_x) )
print("Test Means")
print(colMeans(splits$test_x) )
print("Test SDs")
print(apply(splits$test_x,2,sd))
print("Train SDs")
print(apply(splits$train_x,2,sd))
}
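## get_confusion_matrix: tabulates the actual Purchase value ('No'/'Yes') against the TRUE/FALSE prediction.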
get_confusion_matrix <- function(test,pred) {
df=data.frame(test=test,pred=pred)
df$pred <- factor(df$pred,levels=c(FALSE,TRUE)) ## Fix both levels so the table is always 2x2, even when only one class is predicted.
return( table(df) )
}
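## get_conf_stats: precision, recall, F1, and accuracy from the confusion matrix, plus expected
## profit assuming each contacted (predicted-positive) lead costs $1 and each converted lead returns $10.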
get_conf_stats <- function(cm) {
df <- data.frame(cm)
precision <- df[( (df$test=="Yes") & (df$pred==TRUE) ),]$Freq / sum( df[( (df$pred==TRUE) ),]$Freq )
recall <- df[( (df$test=="Yes") & (df$pred==TRUE) ),]$Freq / sum( df[( (df$test=="Yes") ),]$Freq )
f1 <- 2/(1/precision + 1/recall)
accuracy <- sum(df[( (df$test=="Yes") == (df$pred==TRUE) ),]$Freq) / sum(df$Freq)
cost_of_one <- 1
profit_of_one <- 10
overall_profit <- profit_of_one*df[( (df$test=="Yes") & (df$pred==TRUE) ),]$Freq - cost_of_one*sum( df[( (df$pred==TRUE) ),]$Freq )
return (list(precision=precision, recall=recall, f1=f1, accuracy=accuracy, profit=overall_profit))
}
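## log_reg_fit: fits a lasso-penalized logistic regression over glmnet's lambda path, compares
## train and test F1 across the path at a 0.1 probability cutoff, prints the nonzero coefficients
## at a strongly penalized fit, and compares profit across probability cutoffs for three lambdas.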
log_reg_fit <- function(splits) {
#### Fit & Predict
fit <- glmnet( matrix_from_df(splits$train_x), splits$train_y, family='binomial')
train_predict <- predict(fit,newx=matrix_from_df(splits$train_x) )
test_predict <- predict(fit,newx=matrix_from_df(splits$test_x) )
#### Quick first checks (wrapped in print() so they display from inside the function):
#### the confusion matrix at the 100th (weakest-penalty) lambda with a 0.5 probability cutoff (link value 0),
#### and the number of positive predictions at each lambda.
print( get_confusion_matrix(splits$test_y,test_predict[,100]>0) )
print( colSums( test_predict > 0 ) )
#### Find good lambdas at a probability cutoff of 0.1
num_pts <- 91 ## Columns 10 through 100 of the lambda path.
graphing_df <- data.frame(lambda=rep(0,num_pts), train_f1=rep(0,num_pts), test_f1=rep(0,num_pts) )
i=0
for ( row in 10:100 ) {
i=i+1
test_pred_y <- test_predict[,row] > logit(0.1)
test_conf_mat <- get_confusion_matrix( splits$test_y,test_pred_y )
test_conf_stats <- get_conf_stats(test_conf_mat)
train_pred_y <- train_predict[,row] > logit(0.1)
train_conf_mat <- get_confusion_matrix( splits$train_y,train_pred_y )
train_conf_stats <- get_conf_stats(train_conf_mat)
graphing_df$lambda[i] <- fit$lambda[row]
graphing_df$test_f1[i] <- test_conf_stats[['f1']]
graphing_df$train_f1[i] <- train_conf_stats[['f1']]
}
plot(graphing_df$lambda,graphing_df$test_f1,xlim=c(0,0.015),ylim=c(0.15,0.3),xlab='Lambda', ylab='Test F1',
type="l")
#### Most influential predictors: nonzero coefficients at a strongly penalized fit (10th lambda on the path).
row <- 10
sig_names <- rownames( coef(fit) ) [ coef(fit)[,row]!=0 ]
coefs <- coef(fit)[,row][sig_names]
print(coefs)
#### Find Most Profitable Lambdas By Cutoff
fit_rows <- c(100,50,10) ## Low, middle, and high regularization (lambda increases as the column index decreases).
cutoffs <- (1:20)/50 ## Probability cutoffs from 0.02 to 0.40.
data_storage <- list(low=data.frame(cut=cutoffs,profit=rep(0,20)),
middle=data.frame(cut=cutoffs,profit=rep(0,20)),
high=data.frame(cut=cutoffs,profit=rep(0,20)) )
list_labels <- c('low','middle','high')
for (i in 1:20) {
cutoff <- cutoffs[i]
for (k in 1:3) {
row <- fit_rows[k]
test_pred_y <- test_predict[,row] > logit(cutoff)
test_conf_mat <- get_confusion_matrix( splits$test_y,test_pred_y )
test_conf_stats <- get_conf_stats(test_conf_mat)
data_storage[[list_labels[k]]]$profit[i] <- test_conf_stats[['profit']]
}
}
colors = list(low='red',middle='blue',high='green')
plot(1,type='n',xlim=c(0,0.4),ylim=c(-200,200),xlab='Probability Cutoff',ylab='Profit ($)')
for (label in list_labels) {
lines( data_storage[[label]]$cut, data_storage[[label]]$profit, col=colors[[label]] )
}
legend(0.3,y=200,legend=c('low','middle','high'),fill=c('red','blue','green'))
}
set.seed(1) ## Seed chosen arbitrarily here; makes the random train/test split reproducible.
splits <- split_data(Caravan,0.2)
test_splits(splits)
log_reg_fit(splits)