-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCriteo2.rmd
398 lines (302 loc) · 15.2 KB
/
Criteo2.rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
---
title: "Recommendations for Criteo case study"
author: "Vladimir Mikheev"
date: "06 07 2021"
output:
html_document:
code_download: yes
fontsize: 8pt
highlight: textmate
number_sections: no
theme: flatly
toc: yes
toc_float:
collapsed: no
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
library(dplyr)
library(data.table)
library(readr)
library(lme4)
library(broom)
library(ggplot2)
library(tidyr)
library(arm)
library(ggpmisc)
library(formattable)
library(parameters)
library(sjPlot)
remove_outliers <- function(x, na.rm = TRUE, ...) {
qnt <- quantile(x, probs=c(.25, .75), na.rm = na.rm, ...)
H <- 1.5 * IQR(x, na.rm = na.rm)
y <- x
y[x < (qnt[1] - H)] <- NA
y[x > (qnt[2] + H)] <- NA
y
}
#Sys.setlocale("LC_ALL", "English")
Sys.setlocale("LC_ALL", "en_GB.UTF-8")
```
```{r, include=FALSE}
#wf <- 'C:/Users/Vladimir/YandexDisk/Достижения/Criteo/ecommerce_dataset.csv'
wf <- "/home/vovan/Downloads/Criteo_case/ecommerce_dataset.csv"
#df <- fread('unzip -cq C:/Users/Vladimir/YandexDisk/Достижения/Criteo/ecommerce_dataset.zip')
df <- fread(wf) %>% mutate(
Datetime =as.POSIXct(as.numeric(as.character (Timestamp)), origin="1970-01-01", tz="GMT"),
`Product Price` = as.numeric(`Product Price`),
`Product quantity` = as.numeric(`Product quantity`),
Date = as.Date(Date, "%d/%m/%y"),
Weekdays = weekdays(Date),
`Device type` = if_else(`Device type` == 'Unknown', "Android app", `Device type`),
`Browser family` = if_else(`Browser family` == 'other', "Android app", `Browser family`)
) %>%
mutate(`Device group` = case_when(
`Device type` %in% c('Android - Smartphone', 'iPhone', 'Mobile - Other') ~ 'Smartphone',
`Device type` %in% c('Android - Tablet', 'iPad') ~ 'Tablet',
`Device type` == 'Android app' ~ 'Android app',
`Device type` == 'Desktop' ~ 'Desktop'
))
df$`Event type` = factor(df$`Event type`, levels=c('Listing', 'Product', 'Basket', 'Sales'))
df$`Existing client` = as.factor(df$`Existing client`)
df$`Weekdays` = factor(df$`Weekdays`, levels = c('Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'))
```
```{r, include=FALSE}
table(df$`Browser family`)
table(df$`User location`)
table(df$`Event type`)
table(df$`Device type`)
table(df$`Environment`)
```
# 1. Timing
## 1.1. Hour
Let's observe at what time sales occur
```{r, echo=FALSE}
df %>% filter(`Event type` %in% c('Sales', 'Basket')) %>%
group_by(Hours, `Event type`) %>%
mutate(Revenue = sum(`Product Price` * `Product quantity`), na.rm = TRUE) %>%
ggplot(aes(x = Hours, y = Revenue )) + geom_line() +
facet_grid(~`Event type`) +
stat_peaks(colour = "red", ignore_threshold = 0.99) +
stat_peaks(color = "red", geom = "rug", sides = "b", ignore_threshold = 0.99)+
ggtitle('Distribution of basket and sale events over a day') + theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
```
The purchasing spans from 9 o'clock to 23 o'clock. The most popular time for sales is 20 o'clock.
```{r, echo=FALSE}
df %>%
group_by(Hours, `Event type`) %>%
mutate(Visits = n()) %>%
ggplot(aes(x = Hours, y = Visits )) + geom_line() +
facet_grid(~`Event type`) +
stat_peaks(colour = "red", ignore_threshold = 0.99) +
stat_peaks(color = "red", geom = "rug", sides = "b", ignore_threshold = 0.99) +
ggtitle('Distribution of events over a day') + theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
```
Product visiting, basketing and purchasing happens mostly at 20 o'clock, while listing is uniformly distrusted across day with maximum at 16 o'clock.
**Recommendation:** the perfect timing for advertisment is between 16 to 20 o'clock
## 1.2. Day of week
```{r, echo=FALSE}
df %>% filter(`Event type` == c('Product', 'Sales')) %>%
group_by(Weekdays, `Event type`) %>%
summarise(Visits = n()) %>%
ggplot(aes(x = Weekdays, y = Visits, fill = `Event type`, label = Visits)) +
geom_bar(stat="identity", position ="dodge") + geom_text( position = position_dodge(width = 0.9), vjust = -0.5) +
ggtitle('Distribution of sale events over an average week') +
labs(y = "Number of sales", x = 'Weekdays', fill = 'Number of sales') + theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
```
Does these differences between days are significant?
```{r, echo=FALSE}
a <- df %>% filter(`Event type` %in% c('Product', 'Sales')) %>% mutate(
week_part = case_when(
Weekdays %in% c('Monday','Tuesday','Wednesday','Thursday') ~ 0,
Weekdays %in% c('Friday','Saturday','Sunday') ~ 1),
`Event type` = if_else(`Event type` == 'Product', 0, 1 )) %>%
dplyr::select(week_part, `Event type`)
b <- df %>% filter(`Event type` == c('Product', 'Sales')) %>% mutate(
week_part = case_when(
Weekdays == 'Thursday' ~ 0,
Weekdays == 'Friday' ~ 1),
`Event type` = if_else(`Event type` == 'Product', 0, 1 )) %>%
dplyr::select(week_part, `Event type`)
parameters(chisq.test(table(a)))
parameters(chisq.test(table(b)))
```
**Recommendation:**
1. The most sailing days are Friday, Saturday and Sunday;
2. However, chi-square demonstrates that there is no significant difference between these days, so we can favor any of these days for advetisement campaign.
# 2. Browser
```{r, echo=FALSE}
df %>% filter(`Event type` == 'Sales', `Browser family` != 'NULL', `Browser family` != 'other') %>%
group_by(`Browser family`) %>%
summarise(Usage = n()) %>% arrange(desc(Usage)) %>%
mutate(`Browser family` = case_when(
Usage < 100 ~ 'small',
Usage > 100 ~ `Browser family`)) %>%
filter(`Browser family` != 'small') %>%
ggplot(aes(y = Usage, x = reorder(`Browser family`, Usage), label = Usage , fill = reorder(`Browser family`, desc(Usage)))) +
geom_col() +
geom_text( position = position_dodge(width = 0.9), vjust = -0.5) +
ggtitle('Distribution of sale events over the browser families') +
labs(y = "Number of sales", x = 'Browser family', fill = 'Browser family') + theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
```
Let's also observe browser usage through different device groups.
```{r, echo=FALSE}
df %>% filter(`Event type` == 'Sales', `Browser family` != 'NULL', `Browser family` != 'other') %>%
group_by(`Browser family`, `Device group`) %>%
summarise(Usage = n()) %>% arrange(desc(Usage)) %>%
mutate(`Browser family` = case_when(
Usage < 90 ~ 'small',
Usage > 90 ~ `Browser family`)) %>%
filter(`Browser family` != 'small') %>%
ggplot(aes(x = Usage, y = reorder(`Browser family`, Usage), label = Usage , fill = reorder(`Browser family`, desc(Usage)))) +
geom_col() +
geom_text( position = position_dodge(width = 0.9)) +
facet_wrap(~`Device group`) +
ggtitle('Distribution of sale events over the browser families') +
labs(x = "Number of sales", y = 'Browser family', fill = 'Browser family') + theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
```
**Recommendation:**
Different browsers render content in their own way. So during advertisement campaign company should adjust to the following browsers as soon as they affect the majority of sales:
- Chrome, Safari and Firefox for desktop;
- Mobile Safari and Chrome Mobile for smartphones;
- Mobile Safari and Chrome for Tablets.
```{r, include=FALSE}
df %>% filter(`Event type` == 'Sales', `Browser family` != 'NULL', `Browser family` != 'other') %>%
group_by(`Browser family`) %>%
summarise(Usage = n()) %>% arrange(desc(Usage)) %>%
mutate(`Browser family` = case_when(
Usage < 100 ~ 'small',
Usage > 100 ~ `Browser family`)) %>%
filter(`Browser family` != 'small') %>% distinct( `Browser family`) -> list_bro
list_bro <- as.list(list_bro$`Browser family`)
```
```{r, echo=FALSE}
# position dodge?
df %>% filter( `Browser family` != 'NULL', `Browser family` != 'other') %>%
group_by(`Browser family`, `Event type`) %>%
summarise(Usage = n()) %>%
filter(`Browser family` %in% list_bro) %>%
ggplot(aes(x = Usage, y = reorder(`Browser family`, Usage), label = Usage , fill = reorder(`Browser family`, Usage))) +
geom_col() +
geom_text( position = position_dodge(width = 0.9)) +
ggtitle('Distribution of browser families over actions') +
labs(x = "Number of sales", fill = 'Browser family') + facet_wrap(~`Event type`)+ theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
```
# 2. User location
```{r, echo=FALSE}
df %>% filter(`User location` != "Unknown") %>% filter(`Event type` == 'Sales') %>%
group_by(`User location`, Date) %>%
mutate(Usage = n()) %>%
ggplot(aes(x = Date, y = Usage, color = `User location`)) + geom_col(show.legend = FALSE) + facet_wrap(~`User location`) +
ggtitle('The geography of sale events') + theme_minimal() +
theme(plot.title = element_text(hjust = 0.5), axis.text.x = element_text(angle = 90), axis.title.x = element_text(hjust = 2), axis.ticks = element_line(size = 1))
```
```{r, echo=FALSE}
a <-df %>% filter(`User location` != "Unknown") %>% filter(`Event type` == 'Sales') %>%
group_by(`User location`, Date) %>%
mutate(Usage = n()) %>% dplyr::select(`Date`, `User location`, Usage)
tab_model(lm(data = a, Usage ~ `User location`))
```
The mean daily number of visit from inland is higher by 57 times than outside.
```{r, echo=FALSE}
df %>% filter(`User location` != "Unknown", `Event type` == 'Sales') %>%
group_by(`User location`) %>%
summarise(Usage = n()) %>%
ggplot(aes(x = `User location`, y = Usage, fill = `User location`, label = Usage)) +
geom_col() +
geom_text(position = position_dodge(width = 0.9), vjust = -0.5) +
theme_minimal() + ggtitle('Annual number of sales actions from inland and outland') +
theme(plot.title = element_text(hjust = 0.5))
```
**Recommendation:** only 10 % of sales occur from outside and mean daily, so outside advertisment should not be a main focus of a campaign.
# 3. Existing client
```{r, echo=FALSE}
df %>% filter(`Event type` == 'Sales') %>%
group_by(`Existing client`) %>%
summarise(Usage = n()) %>%
ggplot(aes(x = `Existing client`, y = Usage, fill = `Existing client`, label = Usage)) +
geom_col(show.legend = FALSE) +
geom_text(position = position_dodge(width = 0.9), vjust = -0.5) +
theme_minimal() + ggtitle('Annual number of sales actions from new and exising clients') +
theme(plot.title = element_text(hjust = 0.5)) + scale_fill_hue(name = "Type of client", labels=c("New", "Existing")) + scale_x_discrete(labels=c("New", "Existing"))
```
```{r, echo=FALSE}
df %>% filter(`Event type` == 'Sales') %>% group_by(`Cross-device user ID`) %>% mutate(Revenue = `Product Price` * `Product quantity`) %>% group_by(`Existing client`) %>% summarise(`Total Revenue` = sum(Revenue), `Mean Revenue` = mean(Revenue),`Median Revenue` = median(Revenue), `Mean Product Quantity` = mean(`Product quantity`)) %>% formattable(pd = FALSE)
```
```{r, echo=FALSE}
df %>% filter(`Event type` == 'Sales') %>% group_by(`Existing client`, Date) %>%
mutate(Revenue = sum(`Product Price` * `Product quantity`)) %>%
ggplot(aes(x = Date, y = Revenue, fill = `Existing client`)) + geom_col(show.legend = FALSE) +
facet_wrap(~`Existing client`, labeller = as_labeller(c(`0` = "New", `1` = "Existing"))) +
ggtitle('Distribution of sales over type of purchasers') +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
```
New clients buy significantly more
# 3. Product category
```{r, echo=FALSE}
df %>% filter(`Event type` == 'Sales') %>% group_by(`Product category`) %>%
summarise(Revenue = sum(`Product Price` * `Product quantity`)) %>%
ggplot(aes(x =`Product category`, y = Revenue, label = Revenue, fill= `Product category`))+
geom_col() +
geom_text(position = position_dodge(width = 0.9), vjust = -0.5) +
theme_minimal() + ggtitle('Annual Revenue for product categories') +
theme(plot.title = element_text(hjust = 0.5))
```
```{r, echo=FALSE}
df %>% filter(`Event type` == 'Sales') %>% group_by(`Product category`) %>%
summarise(Quantity = sum(`Product quantity`)) %>%
ggplot(aes(x =`Product category`, y = Quantity, fill =`Product category`, label = Quantity))+
geom_col() +
geom_text(position = position_dodge(width = 0.9), vjust = -0.5) +
theme_minimal() + ggtitle('Annual quantity of saled products') +
theme(plot.title = element_text(hjust = 0.5))
```
```{r, echo=FALSE}
df %>% filter(`Event type` == 'Sales') %>% group_by(`Product category`, Date) %>%
mutate(Quantity = sum(`Product quantity`)) %>%
ggplot(aes(x = Date, y = Quantity, color = `Product category`))+
geom_col(show.legend = FALSE) + facet_wrap(~`Product category`) +
ggtitle('Annual sales events by product category') +
theme(plot.title = element_text(hjust = 0.5), axis.text.x = element_text(angle = 90), axis.title.x = element_text(hjust = 2), axis.ticks = element_line(size = 1))
```
```{r, echo=FALSE}
df %>% filter(`Event type` == 'Sales') %>% group_by(`Product category`, Date) %>%
mutate(Revenue = sum(`Product Price` * `Product quantity`)) %>%
ggplot(aes(x = Date, y = Revenue, color = `Product category`))+
geom_col(show.legend = FALSE) + facet_wrap(~`Product category`)+
ggtitle('Annual revenue by product category') +
theme(plot.title = element_text(hjust = 0.5), axis.text.x = element_text(angle = 90), axis.title.x = element_text(hjust = 2), axis.ticks = element_line(size = 1)) + scale_y_continuous(labels = function(x) format(x, scientific = FALSE))
```
**Recommendations:**
1. packages and small item were saled in the same quantity, but revenue from packages is highest among all products and by two times higher than from small items
2. Advertising of two leading products, packages and small items, should give the highest revenue.
3. In November-December promotion of packages and medium items is preferable.
4. In April promotion of packages and small items is preferable.
<!-- # 5. Sales funnel -->
<!-- ```{r, echo=FALSE} -->
<!-- df %>% -->
<!-- group_by(`Event type`) %>% -->
<!-- summarise(Visits = n()) -->
<!-- ``` -->
<!-- ```{r, echo=FALSE} -->
<!-- tab <- df %>% -->
<!-- group_by(`Cross-device user ID`) %>% -->
<!-- summarise(`Number of actions` = n_distinct(`Event type`), `Listed?` = if_else('Listing' %in% `Event type`, 1, 0), `Sales?` = if_else('Sales' %in% `Event type`, 1, 0)) %>% filter(`Listed?` != 0) %>% group_by(`Sales?`) %>% summarise(`Number of those who listed` = n()) -->
<!-- tab -->
<!-- (tab$`Number of those who listed`[2] *100) / tab$`Number of those who listed`[1] -->
<!-- tab <- df %>% -->
<!-- group_by(`Cross-device user ID`) %>% -->
<!-- summarise(`Number of actions` = n_distinct(`Event type`), `Product?` = if_else('Product' %in% `Event type`, 1, 0), `Sales?` = if_else('Sales' %in% `Event type`, 1, 0)) %>% filter(`Product?` != 0) %>% group_by(`Sales?`) %>% summarise(`Number of those who visited product page` = n()) -->
<!-- tab -->
<!-- (tab$`Number of those who visited product page`[2] *100) / tab$`Number of those who visited product page`[1] -->
<!-- ``` -->
<!-- That means that -->
<!-- 1. Among whose who listed only 17% purchased a product, while among those who visited product page - 19%. Majority of shoppers do not list and navigate directly to the required product. -->
<!-- 2. Кажется нужен пивалью -->