# seedwise_prediction.R
# Load Libraries
library(ggplot2)
library(ggthemes)
library(pscl) # pR2() pseudo R-squared for the logistic regression models
library(ROCR) # ROC curves and AUC
#Load Data
seasons <- read.csv("D:/My Stuff/Kaggle/march ml mania 2017/Seasons.csv")
fullSeasonData <- read.csv("D:/My Stuff/Kaggle/march ml mania 2017/RegularSeasonDetailedResults.csv")
compactSeasonData <- read.csv("D:/My Stuff/Kaggle/march ml mania 2017/RegularSeasonCompactResults.csv")
teams <- read.csv("D:/My Stuff/Kaggle/march ml mania 2017/Teams.csv")
tourneyData <- read.csv("D:/My Stuff/Kaggle/march ml mania 2017/TourneyDetailedResults.csv")
compactTourneyData <- read.csv("D:/My Stuff/Kaggle/march ml mania 2017/TourneyCompactResults.csv")
seeds <- read.csv("D:/My Stuff/Kaggle/march ml mania 2017/TourneySeeds.csv")
submission <- read.csv("D:/My Stuff/Kaggle/march ml mania 2017/sample_submission.csv")
seeds$pureSeed <- as.numeric(substr(seeds$Seed, 2, 3)) # Characters 2-3 of the seed string are the numeric seed; store them as 'pureSeed'
seeds$region <- as.character(substr(seeds$Seed, 1, 1)) # Keep the region letter too; we'll need it later to work out which round two teams could meet in
compactTourneyData$team1 <- ifelse(compactTourneyData$Wteam > compactTourneyData$Lteam, compactTourneyData$Lteam, compactTourneyData$Wteam) # team1 is always the team with the lower ID, whether it won or lost
compactTourneyData$team2 <- ifelse(compactTourneyData$Wteam > compactTourneyData$Lteam, compactTourneyData$Wteam, compactTourneyData$Lteam) # team2 is the team with the higher ID
compactTourneyData$team1Victory <- ifelse(compactTourneyData$Wteam == compactTourneyData$team1, 1, 0) # Binary outcome: did team1 win?
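# Optional sanity check (not in the original script): by construction team1 always carries the strictly lower ID
stopifnot(all(compactTourneyData$team1 < compactTourneyData$team2))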
# The first step is parsing the detailed regular season results into team stats per game
winnerHistory <- fullSeasonData[,c("Season","Wteam","Daynum","Wscore","Numot","Wfgm","Wfga","Wfgm3","Wfga3","Wftm","Wfta","Wor","Wdr","Wast","Wto","Wstl","Wblk","Wpf")]
winnerHistory$Victory <- 1
loserHistory <- fullSeasonData[,c("Season","Lteam","Daynum","Lscore","Numot","Lfgm","Lfga","Lfgm3","Lfga3","Lftm","Lfta","Lor","Ldr","Last","Lto","Lstl","Lblk","Lpf")]
loserHistory$Victory <- 0
# Now we normalize the column names before combining the two dataframes
names(winnerHistory) <- c("season","team","daynum","score","numot","fgmade","fgattempt","fgm3","fga3","ftmade","ftattempt","offreb","defreb","ast","turnover","steal","block","pfoul","victory")
names(loserHistory) <- c("season","team","daynum","score","numot","fgmade","fgattempt","fgm3","fga3","ftmade","ftattempt","offreb","defreb","ast","turnover","steal","block","pfoul","victory")
teamHistory <- rbind(winnerHistory, loserHistory)
# We'll reuse this teamHistory archive for several things; for now, pull each team's season-long average for every stat
teamAverages <- aggregate(teamHistory, by=list(teamHistory$season,teamHistory$team), FUN=mean, na.rm=TRUE)
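# An equivalent formula-interface call, shown only as an alternative sketch (the call above is the one used):
# it groups by season and team and averages every remaining column, without the extra Group.1/Group.2
# columns that aggregate(..., by = list(...)) carries along and which are simply ignored downstream.
# teamAverages <- aggregate(. ~ season + team, data = teamHistory, FUN = mean)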
#We'll start by adding both teams' season average stats to each tourney match
train1 <- merge(compactTourneyData[,c("Season","Daynum","team1","team2","team1Victory")],teamAverages[,c("season","team","score","numot","fgmade","fgattempt","fgm3","fga3","ftmade","ftattempt","offreb","defreb","ast","turnover","steal","block","pfoul","victory")], by.x = c("Season","team1"), by.y = c("season","team"))
names(train1)[6:21] <- paste("team1avg",names(train1)[6:21],sep = "")
train1 <- merge(train1,teamAverages[,c("season","team","score","numot","fgmade","fgattempt","fgm3","fga3","ftmade","ftattempt","offreb","defreb","ast","turnover","steal","block","pfoul","victory")], by.x = c("Season","team2"), by.y = c("season","team"))
names(train1)[22:37] <- paste("team2avg",names(train1)[22:37],sep = "")
# Let's also pull in each team's seed, since I suspect the most important independent variable will involve seeds
train1 <- merge(train1, seeds[,c("Season","Team","pureSeed","region")], by.x = c("Season","team1"), by.y = c("Season","Team"))
colnames(train1)[colnames(train1)=="pureSeed"] <- "seed1"
colnames(train1)[colnames(train1)=="region"] <- "region1"
train1 <- merge(train1, seeds[,c("Season","Team","pureSeed", "region")], by.x = c("Season","team2"), by.y = c("Season","Team"))
colnames(train1)[colnames(train1)=="region"] <- "region2"
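# Note: after this second merge the un-renamed 'pureSeed' column holds team2's seed (team1's copy was renamed to seed1 above)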
# Now comes my primary focus: the difference between both teams' seed values, which should have a powerful predictive effect
train1$seedDelta <- train1$seed1 - train1$pureSeed
# While we're at it, it might not hurt to calculate deltas for all of the stats
train1$scoreDiff <- train1$team1avgscore - train1$team2avgscore
train1$numotDiff <- train1$team1avgnumot - train1$team2avgnumot
train1$fgmadeDiff <- train1$team1avgfgmade - train1$team2avgfgmade
train1$fgattemptDiff <- train1$team1avgfgattempt - train1$team2avgfgattempt
train1$fgm3Diff <- train1$team1avgfgm3 - train1$team2avgfgm3
train1$fga3Diff <- train1$team1avgfga3 - train1$team2avgfga3
train1$ftmadeDiff <- train1$team1avgftmade - train1$team2avgftmade
train1$ftattemptDiff <- train1$team1avgftattempt - train1$team2avgftattempt
train1$offrebDiff <- train1$team1avgoffreb - train1$team2avgoffreb
train1$defrebDiff <- train1$team1avgdefreb - train1$team2avgdefreb
train1$astDiff <- train1$team1avgast - train1$team2avgast
train1$turnoverDiff <- train1$team1avgturnover - train1$team2avgturnover
train1$stealDiff <- train1$team1avgsteal - train1$team2avgsteal
train1$blockDiff <- train1$team1avgblock - train1$team2avgblock
train1$pfoulDiff <- train1$team1avgpfoul - train1$team2avgpfoul
train1$victoryDiff <- train1$team1avgvictory - train1$team2avgvictory
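# The sixteen assignments above could also be generated in a loop; an equivalent sketch, left
# commented out since the explicit version above is the one actually used:
# statNames <- c("score","numot","fgmade","fgattempt","fgm3","fga3","ftmade","ftattempt",
#                "offreb","defreb","ast","turnover","steal","block","pfoul","victory")
# for (s in statNames) {
#   train1[[paste0(s, "Diff")]] <- train1[[paste0("team1avg", s)]] - train1[[paste0("team2avg", s)]]
# }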
# One last bit of data housekeeping: We need to convert Daynum into a broader measure of the tournament round, since we can't forecast which specific day any two teams will play during rounds 1-4
roundConverter <- function(x){
  return(switch(as.character(x),
                '134'=0, '135'=0, '136'=1, '137'=1, '138'=2, '139'=2, '143'=3, '144'=3,
                '145'=4, '146'=4, '152'=5, '154'=6))
}
train1$round <- sapply(train1$Daynum, roundConverter)
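# For example, days 136 and 137 map to the round of 64, day 152 to the Final Four and
# day 154 to the championship (values read straight off the switch above):
roundConverter(136) # 1
roundConverter(152) # 5
roundConverter(154) # 6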
upsets <- subset(train1, seedDelta > 0 & team1Victory == TRUE) #This identifies upsets where team 1 is the victorious underdog
upsets <- rbind(upsets, subset(train1, seedDelta < 0 & team1Victory == FALSE)) #This identifies upsets where team 2 is the victorious underdog
1 - dim(upsets)[1] / dim(train1)[1] # 1 - (upsets / total tourney games) = accuracy of a pure "chalk" bracket that always picks the better-seeded team
ggplot(data = upsets, aes(x = Daynum, y = abs(seedDelta))) +
  geom_point(position = position_jitter(width = .6, height = .4))
train <- train1[which(train1$Season <= 2012),]
test <- train1[which(train1$Season >= 2013),]
lr_model <- glm(team1Victory ~ pureSeed + seed1
, family=binomial(link = 'logit'), data = train)
summary(lr_model)
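# pscl is loaded above; its pR2() reports McFadden's pseudo R-squared as an optional extra
# goodness-of-fit check (a side note, not part of the original evaluation flow)
pR2(lr_model)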
#Let's visualize the accuracy of this analysis by plotting true positive rate against false positive rate
train_prediction <- predict(lr_model, type = 'response') # in-sample predicted probabilities
ROCRpred <- prediction(train_prediction, train$team1Victory)
ROCRperf <- performance(ROCRpred, 'tpr','fpr')
plot(ROCRperf, colorize = TRUE) # The area under this curve (AUC) summarizes how well the model separates wins from losses
acc <- performance(ROCRpred, measure = "auc")
acc <- acc@y.values[[1]]
acc
#Test it on the test dataset
test_prediction <- predict(lr_model, test, type = 'response')
#Compute same acc measure as above
ROCRpred <- prediction(test_prediction, test$team1Victory)
acc <- performance(ROCRpred, measure = "auc")
acc <- acc@y.values[[1]]
acc
#Compute practical accuracy measure (betting on a win or a loss)
test$guess <- round(test_prediction, digits=0)
test$correct <- ifelse(test$guess == test$team1Victory, 1, 0)
mean(test$correct) # fraction of held-out tourney games called correctly
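# The competition itself is scored on log loss rather than raw accuracy, so here is a small
# hedged sketch of that metric on the held-out seasons (logLoss is a helper defined here,
# not part of the original script); the same call can be repeated after each later model:
logLoss <- function(actual, predicted, eps = 1e-15){
  predicted <- pmin(pmax(predicted, eps), 1 - eps) # clip to avoid log(0)
  -mean(actual * log(predicted) + (1 - actual) * log(1 - predicted))
}
logLoss(test$team1Victory, test_prediction)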
lr_model <- glm(team1Victory ~ pureSeed + seed1 + round
, family=binomial(link = 'logit'), data = train)
summary(lr_model)
# Compute the training-set AUC for this model (no ROC plot this time)
train_prediction <- predict(lr_model, type = 'response')
ROCRpred <- prediction(train_prediction, train$team1Victory)
acc <- performance(ROCRpred, measure = "auc")
acc <- acc@y.values[[1]]
acc
#Test it on the test dataset
test_prediction <- predict(lr_model, test, type = 'response')
#Compute same acc measure as above
ROCRpred <- prediction(test_prediction, test$team1Victory)
acc <- performance(ROCRpred, measure = "auc")
acc <- acc@y.values[[1]]
acc
#Compute practical accuracy measure (betting on a win or a loss)
test$guess <- round(test_prediction, digits=0)
test$correct <- ifelse(test$guess == test$team1Victory, 1, 0)
mean(test$correct) # fraction of held-out tourney games called correctly
lr_model <- glm(team1Victory ~ pureSeed + seed1 + round + team1avgscore + team1avgvictory + team2avgscore + team2avgvictory
, family=binomial(link = 'logit'), data = train)
summary(lr_model)
# Compute the training-set AUC for this model
train_prediction <- predict(lr_model, type = 'response')
ROCRpred <- prediction(train_prediction, train$team1Victory)
acc <- performance(ROCRpred, measure = "auc")
acc <- acc@y.values[[1]]
acc
#Test it on the test dataset
test_prediction <- predict(lr_model, test, type = 'response')
#Compute same acc measure as above
ROCRpred <- prediction(test_prediction, test$team1Victory)
acc <- performance(ROCRpred, measure = "auc")
acc <- acc@y.values[[1]]
acc
#Compute practical accuracy measure (betting on a win or a loss)
test$guess <- round(test_prediction, digits=0)
test$correct <- ifelse(test$guess == test$team1Victory, 1, 0)
mean(test$correct) # fraction of held-out tourney games called correctly
lr_model <- glm(team1Victory ~ pureSeed + seed1 + round + team1avgscore + team1avgvictory + team2avgscore +
team2avgvictory + team1avgnumot + team1avgfgmade + team1avgfgattempt + team1avgfgm3 +
team1avgfga3 + team1avgftmade + team1avgftattempt + team1avgoffreb + team1avgdefreb + team1avgast +
team1avgturnover + team1avgsteal +team1avgblock + team1avgpfoul +
team2avgnumot + team2avgfgmade + team2avgfgattempt + team2avgfgm3 + team2avgfga3 + team2avgftmade +
team2avgftattempt + team2avgoffreb + team2avgdefreb + team2avgast + team2avgturnover +
team2avgsteal + team2avgblock + team2avgpfoul
, family=binomial(link = 'logit'), data = train)
summary(lr_model)
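# With this many correlated team averages, some predictors are likely redundant; a quick optional
# sketch using base R's step() for backward AIC selection (not part of the original analysis):
# reduced_model <- step(lr_model, trace = 0)
# summary(reduced_model)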
# Compute the training-set AUC for this model
train_prediction <- predict(lr_model, type = 'response')
ROCRpred <- prediction(train_prediction, train$team1Victory)
acc <- performance(ROCRpred, measure = "auc")
acc <- acc@y.values[[1]]
acc
#Test it on the test dataset
test_prediction <- predict(lr_model, test, type = 'response')
#Compute same acc measure as above
ROCRpred <- prediction(test_prediction, test$team1Victory)
acc <- performance(ROCRpred, measure = "auc")
acc <- acc@y.values[[1]]
acc
#Compute practical accuracy measure (betting on a win or a loss)
test$guess <- round(test_prediction, digits=0)
test$correct <- ifelse(test$guess == test$team1Victory, 1, 0)
mean(test$correct) # fraction of held-out tourney games called correctly
lr_model <- glm(team1Victory ~ seedDelta + round + scoreDiff + numotDiff + fgmadeDiff + fgattemptDiff +
fgm3Diff + fga3Diff + ftmadeDiff + ftattemptDiff + offrebDiff + defrebDiff +
astDiff + turnoverDiff + stealDiff + blockDiff + pfoulDiff + victoryDiff
, family=binomial(link = 'logit'), data = train)
summary(lr_model)
# Compute the training-set AUC for this model
train_prediction <- predict(lr_model, type = 'response')
ROCRpred <- prediction(train_prediction, train$team1Victory)
acc <- performance(ROCRpred, measure = "auc")
acc <- acc@y.values[[1]]
acc
test_prediction <- predict(lr_model, test, type = 'response')
#Compute same acc measure as above
ROCRpred <- prediction(test_prediction, test$team1Victory)
acc <- performance(ROCRpred, measure = "auc")
acc <- acc@y.values[[1]]
acc
#Compute practical accuracy measure (betting on a win or a loss)
test$guess <- round(test_prediction, digits=0)
test$correct <- ifelse(test$guess == test$team1Victory, 1, 0)
mean(test$correct) # fraction of held-out tourney games called correctly
# This function encodes the NCAA bracket structure, worked out by hand over several hours. Its branches look strange and disjointed, and a full explanation is outside the scope of this kernel, but the comments below give the basic idea
roundFinder <- function(region1, region2, seed1, seed2){
  if(region1 == region2 && seed1 == seed2){return(0)} # The 'First Four' play-in games
  else if(region1 != region2){
    if((region1 == 'W' && region2 == 'X') || (region1 == 'X' && region2 == 'W') ||
       (region1 == 'Y' && region2 == 'Z') || (region1 == 'Z' && region2 == 'Y')){
      return(5) # Different regions but the same side of the bracket (W and X on the left, Y and Z on the right): they can only meet in round 5, the national semifinal
    }
    else return(6) # Different regions and different sides of the bracket: they can only meet in the championship, round 6
  }
  else{ # Now it gets finicky. Within a regional bracket, about half of the possible matchups can be resolved by the sum of the two seeds; the rest have to be listed individually.
    z <- as.character(as.numeric(seed1) + as.numeric(seed2))
    # Put the lower seed first so each pair only has to be listed once
    x <- as.character(min(c(as.numeric(seed1), as.numeric(seed2))))
    y <- as.character(max(c(as.numeric(seed1), as.numeric(seed2))))
    # First, check whether the seed sum alone identifies the round
    w <- switch(z, '17'=1, '9'=2, '25'=2, '2'=4, '3'=4, '4'=4, '5'=3, '7'=4, '11'=4, '13'=3, '15'=4, '19'=4, '21'=3, '23'=4, '27'=4, '29'=3, '30'=4, '31'=4)
    if(is.numeric(w)){
      return(w)
    }
    else{ # If the seed sum was ambiguous, this second switch lists the remaining combinations explicitly and returns the correct round
      return(switch(as.character(x),
                    '1'  = switch(y, '9'=2,  '5'=3,  '13'=3, '7'=4,  '11'=4, '15'=4),
                    '2'  = switch(y, '10'=2, '6'=3,  '14'=3, '4'=4,  '8'=4,  '12'=4, '16'=4),
                    '3'  = switch(y, '11'=2, '7'=3,  '15'=3, '5'=4,  '9'=4,  '13'=4),
                    '4'  = switch(y, '12'=2, '8'=3,  '16'=3, '6'=4,  '10'=4, '14'=4),
                    '5'  = switch(y, '13'=2, '9'=3,  '7'=4,  '11'=4, '15'=4),
                    '6'  = switch(y, '14'=2, '10'=3, '8'=4,  '12'=4, '16'=4),
                    '7'  = switch(y, '15'=2, '11'=3, '9'=4,  '13'=4),
                    '8'  = switch(y, '16'=2, '12'=3, '10'=4, '14'=4),
                    '9'  = switch(y, '13'=3, '11'=4, '15'=4),
                    '10' = switch(y, '14'=3, '12'=4, '16'=4),
                    '11' = switch(y, '15'=3, '13'=4),
                    '12' = switch(y, '16'=3, '14'=4),
                    '13' = 4))
    }
  }
}
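# A few illustrative calls (the return values follow directly from the branches above):
roundFinder('W', 'W', '1', '16') # seeds sum to 17 -> round 1
roundFinder('W', 'X', '3', '7')  # different regions, same side of the bracket -> round 5
roundFinder('W', 'Z', '1', '2')  # different sides of the bracket -> round 6, the championship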
# Now let's test it against the games that actually happened to verify its accuracy
train1$roundCheck <- unlist(mapply(roundFinder,train1$region1,train1$region2,train1$seed1,train1$pureSeed, SIMPLIFY = TRUE), use.names = FALSE)
train1$roundDelta <- train1$round - train1$roundCheck
summary(train1$roundDelta)
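# A sketch of how roundFinder could assign a round to every hypothetical matchup in the submission
# file, assuming the id format "Season_Team1_Team2" used below (the original script only validates
# the function against games that were actually played, so this is left commented out):
# idParts <- do.call(rbind, strsplit(as.character(submission$id), "_"))
# allPairs <- data.frame(Season = as.numeric(idParts[,1]),
#                        team1  = as.numeric(idParts[,2]),
#                        team2  = as.numeric(idParts[,3]))
# allPairs <- merge(allPairs, seeds[,c("Season","Team","pureSeed","region")],
#                   by.x = c("Season","team1"), by.y = c("Season","Team"))
# names(allPairs)[names(allPairs) == "pureSeed"] <- "seed1"
# names(allPairs)[names(allPairs) == "region"]   <- "region1"
# allPairs <- merge(allPairs, seeds[,c("Season","Team","pureSeed","region")],
#                   by.x = c("Season","team2"), by.y = c("Season","Team"))
# allPairs$round <- mapply(roundFinder, allPairs$region1, allPairs$region,
#                          allPairs$seed1, allPairs$pureSeed)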
test$prediction <- test_prediction # predictions from the most recently fitted model (the stat-differential logistic regression)
test$submissionID <- paste(test$Season, test$team1, test$team2, sep = "_")
submission <- merge(submission, test[,c("submissionID","prediction")], by.x = c("id"), by.y = c("submissionID"), all.x = TRUE)
submission$pred <- ifelse(is.na(submission$prediction), submission$pred, submission$prediction) # matchups not in the test set keep the sample file's default prediction
submission$prediction <- NULL
write.csv(submission, file="D:/My Stuff/Kaggle/march ml mania 2017/LogisticRegressionSubmission.csv", row.names=FALSE)