-
Notifications
You must be signed in to change notification settings - Fork 1
/
winning_scores.R
169 lines (145 loc) · 4.69 KB
/
winning_scores.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
### an interesting discussion on Twitter recently
# has been around the number of points you need to win the contest
# and how these are distributed across the televote and jury votes
# so i've decided to look at this
# purely over the 2016-2021 period (it's faffy to go earlier)
# this hasn't worked at all
# but maybe people will be interested to play with it and improve it
# load packages
library(tidyverse)
library(Runuran)
library(fitdistrplus)
# load data
previous_scores <-
read_csv("previous_contest_scores.csv")
# add ranking per year
previous_scores <-
previous_scores %>%
group_by(Year) %>%
mutate(Rank = rank(-Total))
# add winner flag
previous_scores <-
previous_scores %>%
mutate(is_winner =
case_when(Rank == 1 ~ "Winner",
Rank != 1 ~ "Not Winner"))
# add lots of variables
previous_scores <-
previous_scores %>%
group_by(Year) %>%
mutate(points_that_year =
sum(Total)) %>% # total points available that year
mutate(sd_year_total =
sd(Total)) %>% # sd of points that year
mutate(sd_year_televote =
sd(Televote)) %>% # sd of televote
mutate(sd_year_jury =
sd(Jury)) %>% # sd of jury
mutate(winning_score_that_year =
max(Total)) %>%
mutate(mean_score_that_year =
mean(Total))
# illustrate relevance of SD
ggplot(previous_scores) +
aes(x = sd_year_total,
y = Total) +
geom_point(aes(y = mean_score_that_year),
size = 10,
colour = "skyblue") +
geom_point(data = . %>%
filter(is_winner == "Winner"),
position = position_jitter(height = NULL,
width = 1),
colour = "black",
fill = "gold",
size = 5,
shape = 23) +
geom_point(data = . %>%
filter(is_winner == "Not Winner"),
position = position_jitter(height = NULL,
width = 1)) +
theme_minimal()
# compare distributions for last few years
ggplot(previous_scores) +
aes(x = Total,
colour = as.factor(Year)) +
geom_density()
# what's the range of values within SD?
table(previous_scores$sd_year_total,
previous_scores$Year)
# goes from about 124 to about 184
# ok so imagine we're going from 100 to 200
# there are 40 countries, meaning the number of points to be awarded is
(1+2+3+4+5+6+7+8+10+12) * 2 * 40
# 4640
# over 25 countries...
4640/25
# mean score will be 185.6
# max possible score?
12*2*39
# 936
# look at distributions that might fit ok
fit_w <- fitdist(previous_scores$sd_year_total, "weibull")
fit_g <- fitdist(previous_scores$sd_year_total, "gamma")
fit_ln <- fitdist(previous_scores$sd_year_total, "lnorm")
gofstat(list(fit_w, fit_g, fit_ln), fitnames = c("Weilbull", "Gamma", "Lnorm"))
# lognormal is best of these
# simulate based on the lognormal distribution
m <- 185.6
s <- 124
location <- log(m^2 / sqrt(s^2 + m^2))
shape <- sqrt(log(1 + (s^2 / m^2)))
print(paste("location:", location))
print(paste("shape:", shape))
draws3 <- rlnorm(n=25000000, location, shape)
winning_scores_when_its_close <-
draws3 %>%
as_tibble() %>%
mutate(contest_number =
(rep(c(1:1000000), 25))) %>%
group_by(contest_number) %>%
filter(value == max(value))
# do it again, higher variance
m2 <- 185.6
s2 <- 190
location2 <- log(m2^2 / sqrt(s2^2 + m2^2))
shape2 <- sqrt(log(1 + (s2^2 / m2^2)))
print(paste("location:", location2))
print(paste("shape:", shape2))
draws3_2 <- rlnorm(n=25000000, location2, shape2)
winning_scores_when_its_not_close <-
draws3_2 %>%
as_tibble() %>%
mutate(contest_number =
(rep(c(1:1000000), 25))) %>%
group_by(contest_number) %>%
filter(value == max(value))
# eyeball distributions
winning_scores_when_its_not_close %>%
mutate(variety = "Runaway winner (like 2017)") %>%
bind_rows(winning_scores_when_its_close %>%
mutate(variety = "Close contest (like 2018)")) %>%
filter(value < 937) %>%
ggplot() +
aes(x = value,
colour = variety) +
geom_density() +
labs(x = "Total points won by the simulated winner",
y = "",
colour = "") +
theme_minimal() +
theme(legend.position = "bottom")
ggsave("overall_simulation.png")
# get the numbers for when it's close...
winning_scores_when_its_close %>%
ungroup() %>%
filter(value < 937) %>%
summarise(quantile = quantile(value, c(.1, 0.25, 0.5, 0.75, .9)),
q = c(.1, 0.25, 0.5, 0.75, .9))
#... and when it's not
winning_scores_when_its_not_close %>%
ungroup() %>%
filter(value < 937) %>%
summarise(quantile =
quantile(value, c(.1, 0.25, 0.5, 0.75, .9)),
q = c(.1, 0.25, 0.5, 0.75, .9))