---
title: "Detecting Vandalism in Wikipedia"
author: "Xabriel J Collazo Mojica"
date: "3/2/2022"
output:
  pdf_document:
    latex_engine: xelatex # workaround for unicode
  html_document: default
bibliography: references.bib
csl: journal-of-the-acm.csl
link-citations: yes
notes: footnotes
---
```{r setup, include=FALSE, cache=TRUE}
knitr::opts_chunk$set(echo = TRUE)
#
# Dependencies
#
if(!require(tidyverse)) install.packages("tidyverse", repos = "http://cran.us.r-project.org")
if(!require(multidplyr)) install.packages("multidplyr", repos = "http://cran.us.r-project.org")
if(!require(caret)) install.packages("caret", repos = "http://cran.us.r-project.org")
if(!require(doParallel)) install.packages("doParallel", repos = "http://cran.us.r-project.org")
if(!require(MLmetrics)) install.packages("MLmetrics", repos = "http://cran.us.r-project.org")
if(!require(yardstick)) install.packages("yardstick", repos = "http://cran.us.r-project.org")
if(!require(tidytext)) install.packages("tidytext", repos = "http://cran.us.r-project.org")
if(!require(nnet)) install.packages("nnet", repos = "http://cran.us.r-project.org")
if(!require(kknn)) install.packages("kknn", repos = "http://cran.us.r-project.org")
if(!require(wordcloud)) install.packages("wordcloud", repos = "http://cran.us.r-project.org")
library(tidyverse)
library(multidplyr)
library(caret)
library(doParallel)
library(MLmetrics)
library(yardstick)
library(tidytext)
library(nnet)
library(kknn)
library(wordcloud)
# load up helper functions
source("functions.R")
# We utilize "PAN Wikipedia Vandalism Corpus 2010": https://webis.de/data/pan-wvc-10
# download corpus if needed. note the zip file is 438 MBs, and it expands to 1.4 GBs.
if (!dir.exists("data/pan-wikipedia-vandalism-corpus-2010")) {
dir.create("data/pan-wikipedia-vandalism-corpus-2010")
corpusFileName <- "data/pan-wikipedia-vandalism-corpus-2010.zip"
corpusURI <-
"https://zenodo.org/record/3341488/files/pan-wikipedia-vandalism-corpus-2010.zip?download=1"
download.file(url = corpusURI,
destfile = corpusFileName,
method = "curl", # use curl since auto mode fails intermittently
timeout = 300) # 5 minutes
unzip(corpusFileName, exdir = "data")
file.remove(corpusFileName)
rm(corpusFileName, corpusURI)
}
```
## Introduction
Wikipedia is a free online encyclopedia that is written and maintained by volunteers. Anyone with internet access can edit Wikipedia. This model has enabled Wikipedia to grow steadily since its inception in 2001 to the current 58 million articles in 325 languages. As of November 2020, Wikipedia is edited 17 million times per month (1.9 edits per second) @Wiki-Wikipedia.
Most edits are bona fide contributions. However, about 7% of them are vandalism. Wikipedia defines vandalism as "any change or edit that manipulates content in a way that deliberately compromises Wikipedia's integrity" @Wiki-Vandalism. Examples of vandalism include adding out-of-context profanity or nonsense, removing content without a reason, and adding plausible but false content.
Wikipedia vandalism is removed with manual and automated approaches. In manual approaches, vandalism is deleted by the same community of contributors. In severe cases, where vandalism occurs repeatedly for a particular entry, administrators may enable edit restrictions. Both of these manual approaches slow down the rate of quality contributions, since time has to be diverted to clean up existing content, or bona fide contributions are temporarily not allowed because of the restrictions.
In automated approaches, software bots have been implemented to detect and correct vandalism. The first iteration of bots used heuristics and regular expressions to detect potential vandalism. The second and current iteration utilizes more sophisticated algorithms. As an example, the "ClueBot NG" bot uses Naive Bayes and Neural Networks to detect vandalism in the English Wikipedia. It catches about 40% of vandalism with its current setting of a maximum 0.1% false positive rate @Wiki-User-ClueBotNG. This represents the current state of practice.
There has been considerable research on improving Wikipedia vandalism detection @Wiki-Corpus-2010, @Potthast-2010, @MolaVelasco-2012, @Laat-2015. One of the most interesting approaches is that of @Potthast-2010, in which a competition was held to encourage detection contributions to the state of the art. We utilize the same dataset as @Potthast-2010 to implement a binary classifier that can be compared to the top submission of their competition.
The dataset is the "PAN Wikipedia Vandalism Corpus 2010". It consists of 32,452 edits on 28,468 English Wikipedia articles, among which 2,391 vandalism edits have been identified @Wiki-Corpus-2010. It reflects the same rate of vandalism as Wikipedia.
The dataset consists of multiple files with metadata about the edits, such as a unique identifier, the author of the edit, and whether the edit is considered vandalism. It also includes the content of the edited article before and after the edit, so we can use existing textual difference algorithms to find the additions and deletions. We discuss the dataset and transformations in more detail in the Methods and Analysis section.
We define the problem as follows: given a set of edits of Wikipedia articles, we want to separate the edits we believe to be vandalism from those we believe not to be. Given that most edits are bona fide, this is an imbalanced binary classification problem. Existing literature suggests that an appropriate method to measure performance is the "Precision-recall curve" (PR Curve) @Potthast-2010, @Saito-2015. This method allows us to visually compare the trade-off between these metrics, and by calculating the area under this curve (PR-AUC), we can summarize the performance as a single number. This number can then be used to choose the best parameters when tuning an individual model, and to compare between models.
We find that by implementing a set of 24 features that rely on the provided dataset without any other external source, we achieve a PR-AUC of 0.6608231 with a Random Forest algorithm trained with 1000 trees and 3 random features at each split point. This result is on par with the best approach discussed in @Potthast-2010 which yields 0.6628893.
## Methods and Analysis
### PAN Wikipedia Vandalism Corpus 2010
The PAN Wikipedia Vandalism Corpus 2010 consists of 32,452 edits on 28,468 English Wikipedia articles, among which 2,391 vandalism edits have been identified @Wiki-Corpus-2010. It reflects the same rate of vandalism as Wikipedia, at 7.38%. There are two files and one directory included in the dataset that are relevant to our work. The first file, `edits.csv`, contains comma-separated fields, with each line containing the following:

Field | Description
--------|--------
editid | A synthetic unique edit id to easily join this file with others.
editor | The username for this edit, or an IP address if anonymous.
oldrevisionid | A Wikipedia unique id for the revision of the article before the edit.
newrevisionid | A Wikipedia unique id for the revision of the article after the edit.
diffurl | A URL pointing to Wikipedia's visual interface for comparing old and new revision.
edittime | Timestamp for the edit in ISO 8601 UTC format.
editcomment | The comment, if any, that the user left when making the edit.
articleid | A Wikipedia unique id of the article that this edit modifies.
articletitle | The title of the edited article.

The second file, `gold-annotations.csv`, contains the following:

Field | Description
--------|--------
editid | References the editid from `edits.csv`.
goldclass | Classification of the edit as vandalism or regular.
annotators | Number of annotators that concur with the classification.
totalannotators | Total number of annotators who reviewed and classified the edit.

Finally, there is a directory called `article-revisions/` which @Wiki-Corpus-2010 describes as: "A list of directories `part1/` through `part65/`. Each part directory contains up to 1000 text files, each of which being the revision of a Wikipedia article. The name of a file is its revision identifier, which is referenced in the `edits.csv` file. The contents of each text file is the plain wikitext of the revision."
### Data cleaning
For the rest of this report we use the R programming language and supporting libraries to transform the dataset @R-Language. A working knowledge of programming and regular expressions is assumed.
The clean-up steps for `edits.csv` and `gold-annotations.csv` are trivial: we need only read and join them by `editid`. We keep the output in an `edits` R data frame object.
Processing `article-revisions/` is more interesting. Our end goal is to calculate what is commonly known as a `diff`: the textual difference between two files. We first define a function `git_diff()` that leverages the built-in diff algorithm from the git version control system[^1]. The key argument to this call is `--word-diff=porcelain`, which produces output in a line-based format easily consumed by scripts @git-porcelain.
[^1]: We considered multiple approaches for calculating the diff, including using portable code from the `diffobj` R package. However, we found no elegant way to transform the diff into a format amenable to our purposes other than with `git diff`. If the reader is attempting to reproduce our code, we recommend the use of a Unix-like system such as Linux or macOS.
```{r git-diff-definition, eval=FALSE, cache=TRUE, purl=FALSE}
git_diff <- Vectorize(function(old, new) {
system2(
command = "git",
args = c(
"diff",
"--no-prefix",
"--no-index",
"--word-diff=porcelain",
"--unified=0",
old,
new
),
stdout = TRUE
)
})
```
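To get a feel for the porcelain format, the illustrative chunk below (not part of the pipeline, and not evaluated) word-diffs two throwaway files. In this format, unchanged word runs are prefixed with a space, deletions with `-`, additions with `+`, and newlines in the input are represented by a `~` on its own line.
```{r git-diff-example, eval=FALSE, purl=FALSE}
# Illustration only: word-diff two temporary files (requires git on the PATH).
old_file <- tempfile(fileext = ".txt")
new_file <- tempfile(fileext = ".txt")
writeLines("The quick brown cat", old_file)
writeLines("The quick brown dog", new_file)
git_diff(old_file, new_file)
# The word-level records near the end of the output look like:
#   " The quick brown"
#   "-cat"
#   "+dog"
#   "~"
```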
With this function defined, we now walk the `article-revisions/` folder collecting metadata about each revision, and then join this data with `edits`. We then mutate the data frame further by calling `git_diff()` on each pair of 'old' and 'new' revisions, and separate the diff output into additions (lines that start with a `+`) and deletions (lines that start with a `-`). Finally, we join the difference calculations into `edits` to consolidate everything into one object.
```{r load-edits, include=FALSE, cache=TRUE}
#
# Load data
# Below we read both edits.csv and gold-annotations.csv and join them.
#
parentPath <- "data/pan-wikipedia-vandalism-corpus-2010/"
edits <- read_csv(
file = paste(parentPath, "edits.csv", sep = ""),
quote = "\"",
col_names = TRUE,
col_types = cols(
editid = col_integer(),
editor = col_character(),
oldrevisionid = col_integer(),
newrevisionid = col_integer(),
diffurl = col_character(),
edittime = col_datetime(format = ""),
editcomment = col_character(),
articleid = col_integer(),
articletitle = col_character()
)
)
annotations <-
read_csv(
file = paste(parentPath, "gold-annotations.csv", sep = ""),
quote = "\"",
col_names = TRUE,
col_types = cols(
editid = col_integer(),
class = col_character(),
annotators = col_integer(),
totalannotators = col_integer()
)
) %>%
# make sure vandalism == 1
mutate(class = factor(class, levels = c("vandalism", "regular")))
edits <- edits %>%
left_join(annotations %>% select(editid, class), by = "editid")
rm(annotations)
```
```{r setup-multidplyr, include=FALSE, cache=TRUE}
# use multidplyr to parallelize dplyr actions
cores <- coalesce(detectCores() - 1, 1)
cluster <- multidplyr::new_cluster(cores)
multidplyr::cluster_copy(cluster, 'git_diff')
multidplyr::cluster_library(cluster, 'tidyverse')
edits <- edits %>% partition(cluster)
```
```{r calculate-diffs, cache=TRUE, warning=FALSE}
parentPath <- "data/pan-wikipedia-vandalism-corpus-2010/"
# recursively find files in the article-revisions/ directory
revisionPaths <-
list.files(
path = paste(parentPath, "article-revisions", sep = ""),
full.names = TRUE,
recursive = TRUE
)
# create a data frame with revision information
revisions <- map_dfr(revisionPaths, function(path) {
list(
revisionid = as.integer(str_remove(basename(path), ".txt")),
revisionpath = path,
revisionsize = file.size(path)
)
})
# calculate diff for each edit
diffs <- edits %>%
select(editid, oldrevisionid, newrevisionid) %>%
left_join(
revisions %>%
select(revisionid, revisionpath, revisionsize) %>%
rename(oldrevisionpath = revisionpath, oldrevisionsize = revisionsize),
by = c("oldrevisionid" = "revisionid"),
copy = TRUE
) %>%
left_join(
revisions %>%
select(revisionid, revisionpath, revisionsize) %>%
rename(newrevisionpath = revisionpath, newrevisionsize = revisionsize),
by = c("newrevisionid" = "revisionid"),
copy = TRUE
) %>%
# diff is a list of chars
mutate(diff = git_diff(oldrevisionpath, newrevisionpath)) %>%
select(editid,
oldrevisionid,
newrevisionid,
oldrevisionsize,
newrevisionsize,
diff) %>%
# additions are diff lines that start with '+'
mutate(additions =
lapply(diff, function(d) {
s = d[str_starts(d, fixed("+")) & !str_starts(d, fixed("+++"))]
str_sub(s, start = 2)
})) %>%
# deletions are diff lines that start with '-'
mutate(deletions =
lapply(diff, function(d) {
s = d[str_starts(d, fixed("-")) & !str_starts(d, fixed("---"))]
str_sub(s, start = 2)
}))
edits <- edits %>%
select(-diffurl) %>%
left_join(diffs %>% select(-oldrevisionid,-newrevisionid),
by = c("editid"))
```
```{r, include=FALSE, cache=TRUE}
rm(revisionPaths, parentPath)
rm(diffs, revisions)
```
### Data exploration
As alluded to previously, the dataset is highly imbalanced, with the `vandalism` class making up 7.38% of the data:
```{r imbalance, cache=TRUE}
edits %>%
collect() %>%
group_by(class) %>%
summarise(n = n()) %>%
ggplot(aes(x=class, y= n)) +
geom_bar(stat="identity") +
ggtitle("Distribution of edits per class")
```
However, the `vandalism` class does show trends that can drive the classification. As an example, the rate of profanity use is much higher in the `vandalism` class:
```{r profanity, cache=TRUE}
# load list of profanities
profanities <- read_lines(file = "data/en/profanities.txt")
# a regex that is aware of wikisyntax
wiki_regex <- "----|:{1,6}|\\*{1,4}|#{1,4}|={1,5}|\\{\\{|\\}\\}|\\[\\[|]]|\\w+"
# setup for parallelism
multidplyr::cluster_copy(cluster, 'bin_search')
multidplyr::cluster_copy(cluster, 'profanities')
# extract words from the diff additions, and check each one of them
# against a profanity word list. Then calculate the profanity rate for each class.
words_by_class <- edits %>%
collect() %>%
select(class, additions) %>%
mutate(
additions_as_string = map_chr(additions, str_c, collapse = "\n"),
word_list = map(str_match_all(additions_as_string, wiki_regex), as.vector),
word_list_lower = map(word_list, str_to_lower),
) %>%
select(class, word_list_lower) %>%
unnest_longer(col = word_list_lower, values_to = "word") %>%
partition(cluster) %>%
mutate(word = replace_na(word, ""),
is_profanity = map_lgl(word, function(w) {
bin_search(profanities, w) > 0
})) %>%
collect() %>%
group_by(class)
words_by_class %>%
summarise(profanity_rate = sum(is_profanity) / n()) %>%
ggplot(aes(x = class, y = profanity_rate)) +
geom_bar(stat="identity") +
ggtitle("Rate of profanity use per class")
```
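The `bin_search()` helper used above comes from `functions.R`, which is sourced in the setup chunk and not reproduced in this report. The sketch below is a minimal stand-in that captures its assumed behavior: a binary search over a sorted character vector (the word lists under `data/` are assumed to be pre-sorted) that returns a positive index on a hit and a negative value otherwise.
```{r bin-search-sketch, eval=FALSE, purl=FALSE}
# Minimal stand-in for bin_search() (an assumption, not the actual functions.R code):
# binary search over a sorted character vector; positive index on a hit, -1 otherwise.
bin_search <- function(sorted_words, word) {
  lo <- 1L
  hi <- length(sorted_words)
  while (lo <= hi) {
    mid <- (lo + hi) %/% 2L
    if (sorted_words[mid] == word) return(mid)
    if (sorted_words[mid] < word) lo <- mid + 1L else hi <- mid - 1L
  }
  -1L
}
```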
Vandals also tend to use a different vocabulary than bona fide editors. In the comparison wordcloud below, we can see that vandals tend to use profanities and contractions, and notably the word 'time', while bona fide edits tend to use wikisyntax and URLs.
```{r wordcloud, echo=TRUE, message=FALSE, warning=FALSE, cache=TRUE, fig.align='center'}
word_counts <- words_by_class %>%
anti_join(stop_words) %>%
filter(!str_detect(word, "^\\d+$")) %>% # no numbers
group_by(class, word) %>%
summarise(n = n()) %>%
arrange(desc(n)) %>%
pivot_wider(names_from = class, values_from = n, values_fill = 0L) %>%
# normalize vandalism words hits since it is 7.38%
mutate(vandalism = as.integer(vandalism * 1/.0738))
word_counts_matrix <- as.matrix(word_counts[,-1])
rownames(word_counts_matrix) <- word_counts$word
comparison.cloud(
word_counts_matrix,
random.order=FALSE,
title.size = 2L,
match.colors = TRUE
)
```
Additionally, most of the vandals are anonymous:
```{r vandals-anonymous, cache=TRUE}
# a regex that matches IP addresses
ip_address_regex <- "\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}"
edits %>%
collect() %>%
select(editor, class) %>%
mutate(is_anonymous = str_detect(editor, ip_address_regex)) %>%
group_by(class) %>%
summarise(anonymity_rate = sum(is_anonymous) / n()) %>%
ggplot(aes(x = class, y = anonymity_rate)) +
geom_bar(stat="identity") +
ggtitle("Rate of anonymity per class")
```
These and other signals can then be used to construct a feature set that tries to separate vandalism from bona fide contributions. Note, however, that not all profanity use is vandalism, and not all anonymous contributors are vandals. Similarly, not all contraction use is illegitimate. Thus our main insight is that, since the signals are noisy, we need to evaluate our solution in a way that penalizes classifying bona fide contributions as vandalism.
### Evaluating performance
There are multiple ways to evaluate the performance of a learning algorithm. Metrics are typically derived from the confusion matrix, that is, from the four possible prediction outcomes: true positive (TP), true negative (TN), false positive (FP), or false negative (FN). Below we discuss our chosen metrics, which allow us to compare our solution against both research classifiers and the current system used by Wikipedia.
#### Metric for comparison against top classifiers from @Potthast-2010
For imbalanced binary classification, a metric like $Accuracy = \frac{TP + TN}{TP + TN + FP +FN}$ can be misleading since we would get a high accuracy even if we always predict the majority class. In our case where the `vandalism` class is 7.38% of the dataset, always predicting the majority class would yield 92.62% accuracy.
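This baseline can be checked directly from the class labels (a quick sanity check, not part of the modeling pipeline):
```{r majority-baseline, eval=FALSE, purl=FALSE}
# Accuracy of a classifier that always predicts "regular" (the majority class).
edits %>%
  collect() %>%
  summarise(baseline_accuracy = mean(class == "regular")) # about 0.926
```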
Additionally, we need a metric that penalizes FP hits, since labeling bona fide contributions as `vandalism` is a particularly bad outcome for a system like Wikipedia, which depends on volunteer contributions. Existing literature suggests that an appropriate metric for this situation is the "Precision-recall curve" (PR Curve). This curve allows us to visually compare the trade-off between precision and recall, and provides visual separation when comparing alternative classifiers. By calculating the area under the curve (PR-AUC), we can summarize the performance as a single number @Potthast-2010, @Saito-2015.
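For reference, $Precision = \frac{TP}{TP + FP}$ is the fraction of edits flagged as vandalism that truly are vandalism, while $Recall = \frac{TP}{TP + FN}$ is the fraction of actual vandalism that gets flagged; the FP penalty we care about appears directly in the precision denominator.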
PR Curves are calculated as follows:

1. Implement a binary classification algorithm.
2. Use the algorithm to predict the probability of the 'positive' class (in our case `vandalism`).
3. Calculate the confusion matrix of the algorithm while varying the classification threshold $\tau$ over $[0,1]$ with respect to the positive class probability.
4. Use the confusion matrix at each threshold $\tau$ to calculate precision and recall, and plot recall on the x axis against precision on the y axis.
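The sketch below makes these steps concrete on made-up toy vectors; in the Results section we instead rely on `yardstick::pr_curve()`, which performs the equivalent sweep over all distinct predicted probabilities.
```{r pr-curve-toy, eval=FALSE, purl=FALSE}
# Toy PR curve: sweep the threshold tau and recompute precision/recall
# from the confusion matrix at each tau. The vectors below are made up.
truth <- factor(c("vandalism", "regular", "vandalism", "regular", "regular"),
                levels = c("vandalism", "regular"))
probs <- c(0.9, 0.6, 0.7, 0.4, 0.1) # predicted P(vandalism)

pr_points <- map_dfr(seq(0, 1, by = 0.05), function(tau) {
  pred <- probs >= tau
  tp <- sum(pred & truth == "vandalism")
  fp <- sum(pred & truth == "regular")
  fn <- sum(!pred & truth == "vandalism")
  tibble(tau       = tau,
         precision = ifelse(tp + fp > 0, tp / (tp + fp), NA_real_),
         recall    = tp / (tp + fn))
})

ggplot(drop_na(pr_points), aes(x = recall, y = precision)) + geom_path()
```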
#### Metric for comparison against ClueBot NG @Wiki-User-ClueBotNG
We also compare our classifier to the current vandalism bot in the English Wikipedia. Only two performance metrics seem to be available: 40% TP-rate at fixed 0.1% FP-rate, and 55% TP-rate at fixed 0.25% FP-rate @Wiki-User-ClueBotNG[^2].
[^2]: After perusing ClueBot NG's Request for approval at https://en.wikipedia.org/wiki/Wikipedia:Bots/Requests_for_approval/ClueBot_NG, it seems like at some point there was a 'trial report', that is, a detailed document with perhaps a full ROC Curve, but this report has been lost as it was kept in a personal server rather than as part of Wikipedia. We asked for help to find this document at https://web.libera.chat/?channel=#wikipedia-en-help, but as of the time of publishing, the report has not been found.
These two data points can be used as reference points in a Receiver Operating Characteristic Curve (ROC Curve). A ROC Curve is constructed similarly to a PR curve, but instead we plot the TP rate against the FP rate. This is the only comparison we can make given the limited data available on ClueBot NG's performance.
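When we plot ROC curves in the Results section, these two operating points can be overlaid as reference marks, for example with a layer along the following lines (a sketch only; the ROC plot itself is built later in this report):
```{r cluebot-reference-points, eval=FALSE, purl=FALSE}
# ClueBot NG's two published operating points, expressed as (FP rate, TP rate).
cluebot_points <- tibble(
  fp_rate = c(0.0010, 0.0025),
  tp_rate = c(0.40, 0.55)
)
# Layer to add on top of the ROC ggplot built in the Results section:
geom_point(data = cluebot_points,
           aes(x = fp_rate, y = tp_rate),
           inherit.aes = FALSE, shape = 4, size = 3)
```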
#### Validation and test data
In this work, the validation set is 50% of the source data. We do this to be comparable to the classifiers discussed in @Potthast-2010, where a 50% validation set was used as well. Given that half of the data is used for validation, we opt to train using 10-fold cross-validation instead of further partitioning the remaining data into a discrete test set.
```{r data-split, include=FALSE, cache=TRUE}
# validation set will be 50% of source data, to be comparable to [Potthast 2010].
# data is imbalanced, but createDataPartition takes care of stratifying splits properly.
set.seed(123, sample.kind = "Rounding")
validation_index <-
createDataPartition(
y = edits %>% pull(class),
times = 1,
p = 0.5,
list = FALSE
)
validation_edits <- edits %>% as_tibble() %>% .[validation_index, ]
train_edits <- edits %>% as_tibble() %>% .[-validation_index, ]
```
### Feature engineering
The following set of features has been implemented from descriptions of the work of @Potthast-2010 and @MolaVelasco-2012. They were extracted from the dataset without the aid of external sources of information. Unless otherwise noted, these features were derived from the additions of the diff. We also considered features based on the deletions, but they either yielded insignificant gains or hurt performance.
Category | Feature | Description
--------|---------|--------
Character-level | `upper_to_lower_ratio` | The ratio of uppercase characters (`[A-Z]`) to lowercase (`[a-z]`).
Character-level | `upper_to_all_ratio` | The ratio of uppercase characters (`[A-Z]`) to all characters (`.`).
Character-level | `digits_ratio` | The ratio of digits (`\\d`) to all characters (`.`).
Character-level | `special_chars_ratio` | The ratio of special characters (`[^A-Za-z0-9]`) to all characters (`.`).
Character-level | `char_diversity` | The character length of all additions raised to the power of (1 / number of distinct characters); illustrated after this table.
Character-level | `compression_ratio` | The ratio of the length of all additions to the length of the compressed version of all additions. We used the `gzip` coder for its speed.
Word-level | `profanity_count` | How many inserted words are considered to be profanities.
Word-level | `pronoun_count` | How many inserted words are pronouns. Ex: He, she, they.
Word-level | `superlative_count` | How many inserted words are superlatives. Ex: biggest, greatest.
Word-level | `contraction_count` | How many inserted words are contractions. Ex: methinks, won't.
Word-level | `wikisyntax_count` | How many inserted words are considered wikisyntax. Ex: "[[", table.
Word-level | `common_vandalism_count` | How many inserted words are among the top 200 words used by vandals, calculated from the training set.
Word-level | `common_regular_count` | How many inserted words are among the top 200 words used by non-vandals, calculated from the training set.
Word-level | `longest_word` | The length of the longest word inserted.
Comment-level | `comment_exists` | Whether the editor included a comment or not.
Comment-level | `comment_length` | The length of the comment if included, otherwise 0.
Comment-level | `comment_is_revert` | Whether the comment content suggests the edit is a revert of a previous edit.
Comment-level | `comment_is_bot` | Whether the comment content suggests the edit was made by a bot.
Comment-level | `comment_has_profanity` | Whether the comment content includes any word considered a profanity.
Size statistic | `size_delta` | The difference between the byte length of the new and old revisions.
Size statistic | `size_ratio` | The ratio between the byte length of the new and old revisions.
Size statistic | `num_additions` | The number of separate additions as derived by the diff algorithm.
Size statistic | `num_deletions` | The number of separate deletions as derived by the diff algorithm.
Editor reputation | `is_anonymous` | Whether the editor is anonymous, i.e., identified only by an IP address rather than a registered Wikipedia account.
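To make two of the less obvious character-level features concrete, the toy example below computes `char_diversity` and `compression_ratio` for a made-up repetitive insertion; the full feature extraction follows in the next chunk.
```{r feature-toy-example, eval=FALSE, purl=FALSE}
addition <- "hahahahahahahahahaha"                        # made-up, 20 characters
n_unique <- length(unique(str_split(addition, "")[[1]]))  # 2 distinct characters
char_diversity <- str_length(addition) ^ (1 / n_unique)   # 20^(1/2), about 4.47
compression_ratio <-
  length(charToRaw(addition)) / length(memCompress(addition, type = "gzip"))
# For longer insertions, repetitive junk compresses far better than ordinary prose,
# so unusually high values of either feature are a useful vandalism signal.
```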
```{r features, include=FALSE, cache=TRUE}
# copy function num_unique_chars into cluster
multidplyr::cluster_copy(cluster, 'num_unique_chars')
# character level features:
character_features <- edits %>%
select(editid, additions) %>%
mutate(additions_as_string = map_chr(additions, str_c, collapse = "\n")) %>%
# Ratio of upper case chars to lower case chars (all chars)
mutate(
upper_to_lower_ratio =
(str_count(additions_as_string, "[A-Z]") + 1) / (str_count(additions_as_string, "[a-z]") + 1)
) %>%
# Ratio of upper case chars to all chars
mutate(
upper_to_all_ratio =
(str_count(additions_as_string, "[A-Z]") + 1) / (str_count(additions_as_string, ".") + 1)
) %>%
# Ratio of digits to all letters.
mutate(
digits_ratio =
(str_count(additions_as_string, "\\d") + 1) / (str_count(additions_as_string, ".") + 1)
) %>%
# Ratio of special chars to all chars.
mutate(
special_chars_ratio =
(str_count(additions_as_string, "[^A-Za-z0-9]") + 1) / (str_count(additions_as_string, ".") + 1)
) %>%
# Length of all inserted lines to the (1 / number of different chars)
mutate(
char_diversity =
str_length(additions_as_string) ^ (1 / num_unique_chars(additions_as_string))
) %>%
# achievable compression ratio of all chars
# gzip chosen for its speed (https://cran.r-project.org/web/packages/brotli/vignettes/benchmarks.html)
  mutate(
    compression_ratio =
      # byte length of the additions (uncompressed) vs. gzip-compressed byte length
      (map_int(additions_as_string, function(s) { length(charToRaw(s)) }) + 1) /
      (map_int(additions_as_string, function(s) { length(memCompress(s, type = "gzip")) }) + 1)
  ) %>%
select(-additions, -additions_as_string) %>%
collect()
# load word dictionaries
profanities <- read_lines(file = "data/en/profanities.txt")
pronouns <- read_lines(file = "data/en/pronouns.txt")
contractions <- read_lines(file = "data/en/contractions.txt")
superlatives <- read_lines(file = "data/en/superlatives.txt")
wikisyntax <- read_lines(file = "data/global/wikisyntax.txt")
# This tokenizer regex is wikisyntax aware.
# matches:
# "----" section break
# indentation markers with ":", "*", "#" or "=".
# link delimiters with "{{", "}}", "[[" or "]]"
# any other regular word ( which would also include other wikisyntax )
wiki_regex <- "----|:{1,6}|\\*{1,4}|#{1,4}|={1,5}|\\{\\{|\\}\\}|\\[\\[|]]|\\w+"
# figure out most common words, but use only train_edits as to not overtrain
common_words <- train_edits %>%
mutate(additions_as_string = map_chr(additions, str_c, collapse = "\n")) %>%
mutate(
word_list = map(str_match_all(additions_as_string, wiki_regex), as.vector),
word_list_lower = map(word_list, str_to_lower),
) %>% select(editid, class, word_list_lower) %>%
unnest_longer(col = word_list_lower, values_to = "word") %>%
anti_join(stop_words) %>%
filter(!word %in% wikisyntax) %>%
filter(!str_detect(word, "^\\d+$")) %>% # no numbers
group_by(class, word) %>%
summarise(n = n()) %>%
arrange(desc(n))
common_vandalism_words <- common_words %>%
filter(class == "vandalism") %>%
slice_head(n = 200)
common_regular_words <- common_words %>%
filter(class == "regular") %>%
slice_head(n = 200)
intersection_words <- common_vandalism_words %>%
inner_join(common_regular_words, by = "word")
common_vandalism_words <- common_vandalism_words %>%
anti_join(intersection_words, by = "word") %>%
arrange(word) %>%
pull(word)
common_regular_words <- common_regular_words %>%
anti_join(intersection_words, by = "word") %>%
arrange(word) %>%
pull(word)
multidplyr::cluster_copy(cluster, 'profanities')
multidplyr::cluster_copy(cluster, 'pronouns')
multidplyr::cluster_copy(cluster, 'contractions')
multidplyr::cluster_copy(cluster, 'superlatives')
multidplyr::cluster_copy(cluster, 'wikisyntax')
multidplyr::cluster_copy(cluster, 'common_vandalism_words')
multidplyr::cluster_copy(cluster, 'common_regular_words')
word_features <- edits %>%
collect() %>% # multidplyr doesn't like unnest_longer(), so let's do this section locally
select(editid, additions) %>%
mutate(
additions_as_string = map_chr(additions, str_c, collapse = "\n"),
word_list = map(str_match_all(additions_as_string, wiki_regex), as.vector),
word_list_lower = map(word_list, str_to_lower),
) %>%
select(editid, word_list_lower) %>%
unnest_longer(col = word_list_lower, values_to = "word") %>%
partition(cluster) %>% # now leverage multidplyr
mutate(
word = replace_na(word, ""),
is_profanity = map_lgl(word, function(w) {
bin_search(profanities, w) > 0
}),
is_pronoun = map_lgl(word, function(w) {
bin_search(pronouns, w) > 0
}),
is_superlative = map_lgl(word, function(w) {
bin_search(superlatives, w) > 0
}),
is_contraction = map_lgl(word, function(w) {
bin_search(contractions, w) > 0
}),
is_wikisyntax = map_lgl(word, function(w) {
bin_search(wikisyntax, w) > 0
}),
is_common_vandalism = map_lgl(word, function(w) {
bin_search(common_vandalism_words, w) > 0
}),
is_common_regular = map_lgl(word, function(w) {
bin_search(common_regular_words, w) > 0
}),
word_length = str_length(word)
) %>%
collect() %>%
# since some word lists may be empty, this group_by may return fewer rows than there are edits,
# so we need to compensate for this later when joining.
group_by(editid) %>%
summarise(
profanity_count = sum(is_profanity),
pronoun_count = sum(is_pronoun),
superlative_count = sum(is_superlative),
contraction_count = sum(is_contraction),
wikisyntax_count = sum(is_wikisyntax),
common_vandalism_count = sum(is_common_vandalism),
common_regular_count = sum(is_common_regular),
longest_word = max(word_length)
)
# edit comment features
comment_features <- edits %>%
select(editid, editcomment) %>%
mutate(
comment_exists = !is.na(editcomment) & editcomment != "null",
comment_length = if_else(comment_exists, str_length(editcomment), 0L),
comment_is_revert =
str_starts(editcomment, fixed("Revert")) | # user revert
str_starts(editcomment, fixed("revert")) | # user revert
str_starts(editcomment, fixed("[[Help:Reverting|Reverted]] ")) | # user revert
str_starts(editcomment, fixed("[[WP:RBK|Reverted]]")) | # bot revert
str_starts(editcomment, fixed("[[WP:UNDO|Undid]]")), # bot undo ( aka late revert )
comment_is_bot = str_starts(editcomment, "\\[\\[WP:"),
# tokenize each comment
word_lists = map(str_match_all(editcomment, "\\w+"), as.vector),
word_lists_lower = map(word_lists, str_to_lower),
# for each token, lowercase, and check if there is a hit on profanity list
profanity_lists = map(word_lists_lower,
function(word_list_lower) {
map_lgl(word_list_lower, function(word) { bin_search(profanities, word) > 0 } )
}),
# OR all word checks. if any TRUE, comment has profanity
comment_has_profanity = map_lgl(profanity_lists, function(list) { reduce(list, `|`, .init = FALSE) })
) %>%
select(-editcomment, -word_lists, -word_lists_lower, -profanity_lists) %>%
collect()
size_features <- edits %>%
select(editid, oldrevisionsize, newrevisionsize, additions, deletions) %>%
mutate(
size_delta = newrevisionsize - oldrevisionsize,
size_ratio = (newrevisionsize + 1) / (oldrevisionsize + 1),
num_additions = map_int(additions, length),
num_deletions = map_int(deletions, length)
) %>%
select(-oldrevisionsize,-newrevisionsize,-additions,-deletions) %>%
collect()
ip_address_regex <- "\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}"
multidplyr::cluster_copy(cluster, 'ip_address_regex')
editor_features <- edits %>%
select(editid, editor, edittime) %>%
mutate(is_anonymous = str_detect(editor, ip_address_regex)) %>%
select(editid, is_anonymous) %>%
collect()
# prepare features as matrix to feed to algos
all_features <-
character_features %>%
left_join(word_features, by = "editid") %>%
# fill in with 0s whatever word_features may be missing
mutate_all(~replace(., is.na(.), 0L)) %>%
left_join(comment_features, by = "editid") %>%
left_join(size_features, by = "editid") %>%
left_join(editor_features, by = "editid") %>%
arrange(editid) %>% # make sure we align rows with class
select(-editid) %>%
as.matrix()
#
# split into train, test, and validation sets
#
golden_class <- edits %>% collect() %>% arrange(editid) %>% pull(class)
validation_features <- all_features[validation_index, ]
validation_class <- golden_class[validation_index]
train_features <- all_features[-validation_index, ]
train_class <- golden_class[-validation_index]
```
```{r rm-multiplyr-cluster, include=FALSE, cache=TRUE}
# we are now done with multidplyr parallelism
rm(cluster)
```
### Machine Learning Algorithm
In this work we considered three different algorithms: K-nearest Neighbors (kNN) @Algo-knn, @Algo-kknn, Neural Networks (NN) @Algo-nnet and Random Forest (RF) @Algo-rf. kNN was an attractive baseline algorithm because of its fast training time. NN was interesting to consider since there is established use in Wikipedia for their ClueBot NG bot @Wiki-User-ClueBotNG. Random forests have also been successfully used for Wikipedia vandalism @Potthast-2010, @MolaVelasco-2012.
```{r ml-setup, include=FALSE, cache=TRUE}
# setup parallel cluster for caret
library(doParallel)
cl <- makePSOCKcluster(cores)
registerDoParallel(cl)
control <- trainControl(
method = "cv",
number = 10,
summaryFunction = prSummary, # this summary function includes PR-AUC
classProbs = TRUE, # this makes AUC calculation work
allowParallel = TRUE
)
```
```{r knn, cache=TRUE, include=FALSE}
#
# kNN
#
set.seed(123, sample.kind = "Rounding")
train_knn <- train(
x = train_features,
y = train_class,
method = "knn",
trControl = control,
tuneGrid = data.frame(k = c(3, 5, 7, 9, 11, 13, 15, 17, 19, 21)),
metric = "AUC", # PR-AUC
maximize = TRUE
)
```
```{r knn-normalized, cache=TRUE, include=FALSE}
#
# kNN, with features normalized to [0,1]
#
set.seed(123, sample.kind = "Rounding")
train_knn_normalized <- train(
x = train_features,
y = train_class,
method = "knn",
trControl = control,
tuneGrid = data.frame(k = c(3, 5, 7, 9, 11, 13, 15, 17, 19, 21)),
metric = "AUC", # PR-AUC
maximize = TRUE,
preProcess = "range" # normalize all features to [0,1]
)
```
```{r knn-normalized-weigthed, cache=TRUE, include=FALSE}
#
# kNN, with features normalized to [0,1], and with weighted neighbors
#
set.seed(123, sample.kind = "Rounding")
train_knn_normalized_weigthed <- train(
x = train_features,
y = train_class,
method = "kknn",
trControl = control,
tuneGrid = expand.grid(
kmax = c(3, 5, 7, 9, 11, 13, 15, 17, 19, 21),
distance = c(1, 2), # try manhattan and euclidean distance
kernel = "optimal"
),
metric = "AUC", # PR-AUC
maximize = TRUE,
preProcess = "range" # normalize all features to [0,1]
)
```
```{r neural-network, message=FALSE, warning=FALSE, cache=TRUE, include=FALSE}
#
# Neural Network
#
set.seed(123, sample.kind = "Rounding")
train_neural <- train(
x = train_features,
y = train_class,
method = "nnet",
trControl = control,
tuneGrid = expand.grid(
size = c(1, 2, 3, 4, 5, 10, 15, 20, 25),
decay = seq(0, 1, 0.1)
),
metric = "AUC", # PR-AUC
maximize = TRUE,
maxit = 300 # max iterations
)
```
```{r random-forest-150, include=FALSE, cache=TRUE}
#
# Random Forest 150
#
set.seed(123, sample.kind = "Rounding")
train_rf_150 <- train(
x = train_features,
y = train_class,
method = "rf",
ntree = 150,
trControl = control,
tuneGrid = data.frame(mtry = c(1, 2, 3, 4, 5, 10, 25, 50, 100)),
metric = "AUC", # PR-AUC
maximize = TRUE,
preProcess = "range" # normalize all features to [0,1]
)
```
```{r random-forest-500, include=FALSE, cache=TRUE}
#
# Random Forest 500
#
set.seed(123, sample.kind = "Rounding")
train_rf_500 <- train(
x = train_features,
y = train_class,
method = "rf",
ntree = 500,
trControl = control,
tuneGrid = data.frame(mtry = c(1, 2, 3, 4, 5, 10, 25, 50, 100)),
metric = "AUC", # PR-AUC
maximize = TRUE,
preProcess = "range" # normalize all features to [0,1]
)
```
```{r random-forest-1000, include=FALSE, cache=TRUE}
#
# Random Forest 1000
#
set.seed(123, sample.kind = "Rounding")
train_rf_1000 <- train(
x = train_features,
y = train_class,
method = "rf",
ntree = 1000,
trControl = control,
tuneGrid = data.frame(mtry = c(1, 2, 3, 4, 5, 10, 25, 50, 100)),
metric = "AUC", # PR-AUC
maximize = TRUE,
preProcess = "range" # normalize all features to [0,1]
)
```
```{r clean-up-caret, include=FALSE, cache=TRUE}
# we are done with caret cluster
stopCluster(cl)
```
## Results
### Training results
#### kNN
We train kNN with $k = 3, 5, 7, \dots, 21$. Normalizing the features to the range $[0,1]$ helps the algorithm significantly, from a best training PR-AUC of `r max(train_knn$results$AUC)` at $k =$ `r train_knn$bestTune$k` without normalization, to `r max(train_knn_normalized$results$AUC)` at $k =$ `r train_knn_normalized$bestTune$k`. However, the greatest improvement is achieved when we normalize and weight closer neighbors more. In a weighted kNN with the same $k$ range, we obtain a best training PR-AUC of `r max(train_knn_normalized_weigthed$results$AUC)` at $k =$ `r train_knn_normalized_weigthed$bestTune$kmax`. The weighted kNN algorithm allows us to specify the distance metric. We considered Manhattan and Euclidean distances, finding that the Manhattan distance achieves marginally better performance.
#### Neural networks
We chose an NN that requires $size$ and $decay$ parameters. The $size$ relates to the number of neurons in the hidden layer of the network. It is recommended to use as many neurons as necessary, but not more, since extra neurons may overfit the network. Similarly, the $decay$ parameter is a form of regularization that penalizes the NN optimization function to help prevent overfitting. We consider $size = 1, 2, 3, 4, 5, 10, 15, 20, 25$ and $decay = 0, 0.1, 0.2, \dots, 1$. We tried normalizing the features to $[0,1]$, but that only hurt performance. We find a best training PR-AUC of `r max(train_neural$results$AUC)` with $size =$ `r train_neural$bestTune$size` and $decay =$ `r train_neural$bestTune$decay`.
#### Random Forest
The RF algorithm can be tuned with two parameters: $ntree$, the number of trees in the forest, and $mtry$, the number of randomly chosen features considered at each split point. We train with $ntree = 150, 500, 1000$ and $mtry = 1, 2, 3, 4, 5, 10, 25, 50, 100$. All features were normalized to $[0,1]$, as this yielded a minor improvement. We find a best training PR-AUC of `r max(train_rf_1000$results$AUC)` with $ntree = 1000$ and $mtry = 3$.
### Validation results
```{r validation-predictions, cache=TRUE, include=FALSE}
pred_knn <-
predict(train_knn, newdata = validation_features, type="prob")
pred_knn_normalized <-
predict(train_knn_normalized, newdata = validation_features, type="prob")
pred_knn_normalized_weigthed <-
predict(train_knn_normalized_weigthed, newdata = validation_features, type="prob")
pred_neural <- predict(train_neural, newdata = validation_features, type="prob")
pred_rf_150 <- predict(train_rf_150, newdata = validation_features, type="prob")
pred_rf_500 <- predict(train_rf_500, newdata = validation_features, type="prob")
pred_rf_1000 <- predict(train_rf_1000, newdata = validation_features, type="prob")
set.seed(123, sample.kind = "Rounding")
pred_random <- sample(x = c(0,1), size = nrow(validation_features), replace = TRUE, prob = c(0.5,0.5))
# dataframe with truth and 'positive' class (vandalism) probability
predictions = tibble(
truth = validation_class,
knn = pred_knn$vandalism,
knn_normalized = pred_knn_normalized$vandalism,
knn_normalized_weigthed = pred_knn_normalized_weigthed$vandalism,
nn = pred_neural$vandalism,
rf_150 = pred_rf_150$vandalism,
rf_500 = pred_rf_500$vandalism,
rf_1000 = pred_rf_1000$vandalism,
random = pred_random
)
```
Below is a table summarizing the PR-AUC and ROC-AUC results on the validation set. As with the training set, the best algorithm is the Random Forest with 1000 trees.
```{r validation-aocs, echo=FALSE, cache=TRUE}
pr_aocs <- bind_rows(
pr_auc(predictions, truth = "truth", knn) %>%
add_column(model = "kNN"),
pr_auc(predictions, truth = "truth", knn_normalized) %>%
add_column(model = "Normalized kNN"),
pr_auc(predictions, truth = "truth", knn_normalized_weigthed) %>%
add_column(model = "Weigthed Normalized kNN"),
pr_auc(predictions, truth = "truth", nn) %>%
add_column(model = "Neural Network"),
pr_auc(predictions, truth = "truth", rf_150) %>%
add_column(model = "Random Forest with 150 trees"),
pr_auc(predictions, truth = "truth", rf_500) %>%
add_column(model = "Random Forest with 500 trees"),
pr_auc(predictions, truth = "truth", rf_1000) %>%
add_column(model = "Random Forest with 1000 trees"),
pr_auc(predictions, truth = "truth", random) %>%
add_column(model = "Random Classifier")
)
roc_aocs <- bind_rows(
roc_auc(predictions, truth = "truth", knn) %>%
add_column(model = "kNN"),
roc_auc(predictions, truth = "truth", knn_normalized) %>%
add_column(model = "Normalized kNN"),
roc_auc(predictions, truth = "truth", knn_normalized_weigthed) %>%
add_column(model = "Weigthed Normalized kNN"),
roc_auc(predictions, truth = "truth", nn) %>%
add_column(model = "Neural Network"),
roc_auc(predictions, truth = "truth", rf_150) %>%
add_column(model = "Random Forest with 150 trees"),
roc_auc(predictions, truth = "truth", rf_500) %>%
add_column(model = "Random Forest with 500 trees"),
roc_auc(predictions, truth = "truth", rf_1000) %>%
add_column(model = "Random Forest with 1000 trees"),
roc_auc(predictions, truth = "truth", random) %>%
add_column(model = "Random Classifier")
)
pr_aocs <- pr_aocs %>%
rename("PR-AUC" = .estimate) %>%
select("PR-AUC", model)
roc_aocs <- roc_aocs %>%
rename("ROC-AUC" = .estimate) %>%
select("ROC-AUC", model)
pr_aocs %>%
left_join(roc_aocs, by = "model") %>%
select(model, "PR-AUC", "ROC-AUC") %>%
arrange("PR-AUC") %>%
knitr::kable(caption = "PR-AUC and ROC-AUC of all tested algorithms")
```
The PR Curves below show that the three Random Forest classifiers are clearly in the lead over the whole space, while the Neural Network classifier is a close fourth. The three kNN classifiers perform the worst.
```{r pr-curves, echo=FALSE, cache=TRUE}
pr_curves <- bind_rows(
pr_curve(predictions, truth = "truth", knn) %>%
add_column(model = "kNN"),
pr_curve(predictions, truth = "truth", knn_normalized) %>%
add_column(model = "Normalized kNN"),
pr_curve(predictions, truth = "truth", knn_normalized_weigthed) %>%
add_column(model = "Weigthed Normalized kNN"),
pr_curve(predictions, truth = "truth", nn) %>%
add_column(model = "Neural Network"),
pr_curve(predictions, truth = "truth", rf_150) %>%
add_column(model = "Random Forest with 150 trees"),
pr_curve(predictions, truth = "truth", rf_500) %>%
add_column(model = "Random Forest with 500 trees"),
pr_curve(predictions, truth = "truth", rf_1000) %>%
add_column(model = "Random Forest with 1000 trees"),
# fix random curve as it should be a constant line approx at the vandalism/regular ratio
pr_curve(predictions, truth = "truth", random) %>%
mutate(precision = 0.073) %>%
add_column(model = "Random Classifier")
)
pr_curves %>%
ggplot(aes(x = recall, y = precision, color = model)) +
geom_line() +
ggtitle("PR Curves of all tested algorithms")
```
In ROC space, we observe the same ranking as in PR space. Note, however, that it is significantly more difficult to visually separate the algorithms, which agrees with @Saito-2015 in that a PR curve is more informative when comparing imbalanced binary classifiers.
```{r roc-curves, echo=FALSE, cache=TRUE}
roc_curves <- bind_rows(
roc_curve(predictions, truth = "truth", knn) %>%
add_column(model = "kNN"),
roc_curve(predictions, truth = "truth", knn_normalized) %>%
add_column(model = "Normalized kNN"),
roc_curve(predictions, truth = "truth", knn_normalized_weigthed) %>%
add_column(model = "Weigthed Normalized kNN"),
roc_curve(predictions, truth = "truth", nn) %>%
add_column(model = "Neural Network"),
roc_curve(predictions, truth = "truth", rf_150) %>%
add_column(model = "Random Forest with 150 trees"),
roc_curve(predictions, truth = "truth", rf_500) %>%
add_column(model = "Random Forest with 500 trees"),
roc_curve(predictions, truth = "truth", rf_1000) %>%
add_column(model = "Random Forest with 1000 trees"),
roc_curve(predictions, truth = "truth", random) %>%
add_column(model = "Random Classifier")
)
roc_curves %>%
ggplot(aes(x = 1 - specificity, y = sensitivity, color = model)) +
geom_line() +
ggtitle("ROC Curves of all tested algorithms") +
ylab("sensitivity (TP rate)") +
xlab("1 - specificity (FP rate)")
```
### Comparison against top classifier from @Potthast-2010[^3]
[^3]: We would like to acknowledge the help from Dr. Martin Potthast in making the classification runs from @Potthast-2010 available at https://drive.google.com/file/d/1Q1fSGZWU3rGopnyCGxKpqHn507rm0CKt/view?usp=sharing.
In this comparison, we can see that our solution performs similarly to the best classifier from @Potthast-2010. In PR space, we perform slightly better at recall values between 0.4 and 0.75, but otherwise their solution has a slight advantage.
```{r potthast-comparison, echo=FALSE, cache=TRUE}
# load in data from best algorithm from Potthast
archivePath <- "data/pan-wikipedia-vandalism-detection-2010/molavelasco10-runs/"
archiveName <- "molavelasco10-run-2010-06-23-1103.txt.zip"
fileName <- "molavelasco10-run-2010-06-23-1103.txt"
unzip(paste(archivePath, archiveName, sep = ""),
exdir = archivePath,
files = c(fileName))
molaVelascoRun <- read_delim(
file = paste(archivePath, fileName, sep = ""),
delim = " ",