commit_survey.Rmd

---
title: "Commits Survey"
output:
  html_document:
    df_print: paged
---

```{r}
# Load the shared helper code for this notebook.
source("implementation.R")
# NOTE(review): initializeEnvironment() is not defined in this file (presumably it
# comes from implementation.R). Later chunks rely on `WORKING_DIR` and `out()`,
# which are presumably set up here for the given artifact directory -- confirm.
initializeEnvironment("./artifact/commit_survey")
```

## Get developers' labels into one combined dataframe

```{r}
library(dplyr)
library("fst")
library("irr")

# Read the given answer files from `path` (one headerless CSV per developer) and
# add a numeric `category` column to each: 0 when column V2 is missing (NA) or
# "0", and 1 for any other value.
#
# path    - directory containing the answer files
# files   - file names inside `path` to read (defaults to everything in `path`)
# verbose - when TRUE, print each file name as it is read
#
# Returns a list with one data frame per file, in the order of `files`.
# NOTE(review): an empty-string V2 (possible when the column is read as text)
# is treated as 1, exactly as before -- confirm this is intended.
readSurveyAnswers <- function(path, files = list.files(path), verbose = FALSE) {
    lapply(files, function(fileName) {
        if (verbose) {
            cat(fileName, "\n")
        }
        answers <- read.csv(file.path(path, fileName), header = FALSE, sep = ",")
        raw <- as.character(answers$V2)
        answers$category <- as.numeric(!(is.na(raw) | raw == "0"))
        answers
    })
}

inputPath <- "./input_data/commit_survey"
# read all files in a directory
answers <- list.files(inputPath)
data <- readSurveyAnswers(inputPath, answers, verbose = TRUE)

# If this is set to TRUE, also pulls data from the extra developers we had
# available for the survey - but these finished later, or did not finish at
# all, so we only used their subset.
if (FALSE) {
    inputPathExtra <- "./input_data/commit_survey_extras"
    answersExtra <- list.files(inputPathExtra)
    dataExtra <- readSurveyAnswers(inputPathExtra, answersExtra)
    # Both objects are lists of data frames, so they must be concatenated with
    # c(); rbind() on two lists would build a list-matrix that the
    # do.call("rbind", ...) below cannot stack.
    data <- c(data, dataExtra)
}

if (length(data) == 0) {
    stop("No survey answer files found in ", inputPath, call. = FALSE)
}

# Keep the per-developer data frames; later chunks iterate over them.
separateDevs <- data
# Stack all developers' answers into a single data frame.
data <- do.call("rbind", data)

# Now aggregate according to the commit (V1).
data$category <- as.numeric(data$category)
data$test <- rep(1, nrow(data))
# Number of votes per commit; we expect three for every commit.
sanity_check <- aggregate(test ~ V1, data, sum)
if (!all(sanity_check$test == 3)) cat("Error -- Not all commits have 3 votes\n")
# Un-aggregated copy of all individual votes.
dd <- data
# Number of "bugfix" votes per commit.
data <- aggregate(category ~ V1, data, sum)
# Align the vote counts with the aggregated rows by commit rather than relying
# on both aggregate() results happening to share the same row order.
votesPerCommit <- sanity_check$test[match(data$V1, sanity_check$V1)]
data$score <- data$category / votesPerCommit
# This will never == 0.5 because we will have 3 votes
data$label <- ifelse(data$score >= 0.5, 1, 0)
# Commits with more than one vote. The previous test,
# sapply(data$category, length) > 1, was always FALSE because `category` is a
# plain numeric vector after aggregation, so this selection was always empty.
significant <- data[votesPerCommit > 1, ]
# Commits on which the developers did not vote unanimously.
discord <- significant[(significant$score != 0) & (significant$score != 1), ]
```


## Compare developers' labels to study's labels

```{r}

# Echo `what` to the console and return its trailing 40 characters.
# NOTE(review): 40 characters presumably corresponds to the commit hash at the
# end of a GitHub commit URL -- confirm against the input data.
stripGHUrl <- function(what) {
    cat(what)
    total <- nchar(what)
    substring(what, first = total - 39, last = total)
}
ourLabelsPath <- "./input_data/petrs_commits"

# Commits the paper labelled as bugfixes: `paper` is 1 and `ours` is taken
# directly from column V3.
buggycommits <- read.csv(
    file.path(ourLabelsPath, "buggy_commits.csv"),
    header = FALSE
)
buggycommits[["V1"]] <- as.character(buggycommits[["V1"]])
#buggycommits$V1 = sapply(buggycommits$V1, stripGHUrl)
buggycommits[["ours"]] <- buggycommits[["V3"]]
buggycommits[["paper"]] <- 1

# Commits the paper labelled as non-bugfixes: `paper` is 0 and `ours` is the
# negation of column V3.
# NOTE(review): V3 therefore looks like "we agree with the paper" -- confirm.
nonbuggycommits <- read.csv(
    file.path(ourLabelsPath, "non_buggy_commits.csv"),
    header = FALSE
)
nonbuggycommits[["V1"]] <- as.character(nonbuggycommits[["V1"]])
#nonbuggycommits$V1 = sapply(nonbuggycommits$V1, stripGHUrl)
nonbuggycommits[["ours"]] <- as.numeric(!nonbuggycommits[["V3"]])
nonbuggycommits[["paper"]] <- 0

# One table holding both groups.
allcommits <- rbind(buggycommits, nonbuggycommits)

# Make sure the columns below are plain character vectors.
data[["V1"]] <- as.character(data[["V1"]])
allcommits[["V2"]] <- as.character(allcommits[["V2"]])
```

```{r}
# Calculate disagreement

# Look up the paper's label (`allcommits$paper`) for every row of the global
# `data`, matching rows by the commit identifier in column V1.
#
# Returns a vector with one label per row of `data`, in `data`'s row order
# (empty when `data` has no rows). If a commit occurs more than once in
# `allcommits`, its first occurrence is used. Stops with an error when a commit
# in `data` has no counterpart in `allcommits`.
getPaperLabels <- function() {
    # match() handles zero rows correctly, unlike a 1:nrow(data) loop, and
    # avoids rescanning `allcommits` for every commit.
    idx <- match(data$V1, allcommits$V1)
    if (anyNA(idx)) {
        stop(
            "Commits missing from allcommits: ",
            paste(data$V1[is.na(idx)], collapse = ", "),
            call. = FALSE
        )
    }
    allcommits$paper[idx]
}

# Look up our own label (`allcommits$ours`) for every row of the global `data`,
# matching rows by the commit identifier in column V1.
#
# Returns a vector with one label per row of `data`, in `data`'s row order
# (empty when `data` has no rows). If a commit occurs more than once in
# `allcommits`, its first occurrence is used. Stops with an error when a commit
# in `data` has no counterpart in `allcommits`.
getOurLabels <- function() {
    # match() handles zero rows correctly, unlike a 1:nrow(data) loop, and
    # avoids rescanning `allcommits` for every commit.
    idx <- match(data$V1, allcommits$V1)
    if (anyNA(idx)) {
        stop(
            "Commits missing from allcommits: ",
            paste(data$V1[is.na(idx)], collapse = ", "),
            call. = FALSE
        )
    }
    allcommits$ours[idx]
}

# Attach both reference labellings to the aggregated survey results.
data[["ours"]] <- getOurLabels()
data[["paper"]] <- getPaperLabels()

# 1 where the survey's majority label equals the reference label, 0 otherwise.
data[["agreePaper"]] <- as.numeric(data[["label"]] == data[["paper"]])
data[["agreeOurs"]] <- as.numeric(data[["label"]] == data[["ours"]])

#data$disagreeOurs = getDisagreementLabels(data, allcommits)
#data$disagreePaper = getDisagreementLabelsPaper(data, allcommits)

# Store the combined table under the working directory.
write.csv(
    data,
    file = paste0(WORKING_DIR, "/Data/commit_survey_results.csv")
)
```

The `data` format is as follows:

- `V1` - link to the commit on github
- `category` - number of participants who labelled the commit as a bugfix (the specific bug category entered by the participants was, in the end, not used)
- `score` - fraction of the participants who have labelled the commit as a bugfix
- `label` - the overall verdict of the participants (majority)
- `ours` - did we think the commit is a bugfix?
- `paper` - did the original paper report the commit as a bugfix? 
- `agreePaper` - did the survey agree with the original paper labelling on the commit?
- `agreeOurs` - did the survey agree with our assessment of the commit?

```{r}
# Paper says bugfix but the survey majority says no (false positives), and
# the reverse (false negatives).
falsePositives <- nrow(data[with(data, paper == 1 & label == 0), ])
falseNegatives <- nrow(data[with(data, paper == 0 & label == 1), ])

# Report both as percentages (one decimal) of the paper's buggy / non-buggy
# commit lists.
# NOTE(review): the denominators count rows of the label files, not the
# surveyed commits -- confirm the two sets coincide.
out(
    "commitsFalsePositives",
    paste0(round(falsePositives / nrow(buggycommits) * 100, 1), "\\%")
)
out(
    "commitsFalseNegatives",
    paste0(round(falseNegatives / nrow(nonbuggycommits) * 100, 1), "\\%")
)

#cat(paste("Percentage of false positives (Paper vs Devs): ", falsePositives / nrow(buggycommits), "\n"))
#cat(paste("Percentage of false negatives: (Paper vs Devs)", falseNegatives / nrow(nonbuggycommits), "\n\n"))
```

```{r}
# False positives on which every participant voted "not a bugfix" (score 0),
# reported as a percentage of all false positives.
unanimous_falsePositives <- nrow(data[with(data, paper == 1 & score == 0), ])
out(
    "commitsUnanimousFalsePositives",
    paste0(round(unanimous_falsePositives / falsePositives * 100, 1), "\\%")
)
#cat("Percentage of false positives upon which developers unanimously agreed: ", unanimous_falsePositives / falsePositives)
```

```{r}
# Disagreement between the paper's labels and our own labels.
PfalsePositives <- nrow(data[with(data, paper == 1 & ours == 0), ])
PfalseNegatives <- nrow(data[with(data, paper == 0 & ours == 1), ])

# Printed as fractions of the paper's buggy / non-buggy commit lists.
cat(paste(
    "Percentage of false positives (Paper vs Us): ",
    PfalsePositives / nrow(buggycommits),
    "\n"
))
cat(paste(
    "Percentage of false negatives (Paper vs Us): ",
    PfalseNegatives / nrow(nonbuggycommits),
    "\n\n"
))
```

```{r}
# Disagreement between the survey's majority label and our own labels.
PDfalsePositives <- nrow(data[with(data, label == 1 & ours == 0), ])
PDfalseNegatives <- nrow(data[with(data, label == 0 & ours == 1), ])

# Printed as fractions of the commits the survey labelled 1 / 0 respectively.
cat(paste(
    "Percentage of false positives (Devs vs Us): ",
    PDfalsePositives / nrow(data[data$label == 1, ]),
    "\n"
))
cat(paste(
    "Percentage of false negatives (Devs vs Us): ",
    PDfalseNegatives / nrow(data[data$label == 0, ]),
    "\n"
))
```

Let's look at how certain people are:

```{r}
# Count the commits at each possible score (0, 1, 2 or 3 of 3 votes for "bug").
local({
    captions <- c(
        "3 votes not a bug:                 ",
        "Likely not a bug (1 vote for bug): ",
        "Likely a bug (2 votes for bug):    ",
        "3 votes for a bug:                 "
    )
    levels <- c(0, 1/3, 2/3, 1)
    for (j in seq_along(levels)) {
        cat(paste(captions[j], length(data$score[data$score == levels[j]]), "\n"))
    }
})
```
This is cool, but let's do this for buggy and non-buggy commits as determined by the paper:

```{r}
# Split the survey results by the paper's label.
buggy <- data[data$paper == 1, ]
nonbuggy <- data[data$paper == 0, ]

# For each group, count the commits at each possible score.
local({
    captions <- c(
        "3 votes not a bug:                 ",
        "Likely not a bug (1 vote for bug): ",
        "Likely a bug (2 votes for bug):    ",
        "3 votes for a bug:                 "
    )
    levels <- c(0, 1/3, 2/3, 1)
    headers <- c("Paper labelled as buggy\n", "Paper labelled as non-buggy\n")
    groups <- list(buggy, nonbuggy)
    for (g in seq_along(groups)) {
        cat(headers[g])
        scores <- groups[[g]]$score
        for (j in seq_along(levels)) {
            cat(paste(captions[j], length(scores[scores == levels[j]]), "\n"))
        }
    }
})
```

And now the same breakdown, using our own labels instead of the paper's:

```{r}
# Split the survey results by our own label.
buggyp <- data[data$ours == 1, ]
nonbuggyp <- data[data$ours == 0, ]

# For each group, count the commits at each possible score.
local({
    captions <- c(
        "3 votes not a bug:                 ",
        "Likely not a bug (1 vote for bug): ",
        "Likely a bug (2 votes for bug):    ",
        "3 votes for a bug:                 "
    )
    levels <- c(0, 1/3, 2/3, 1)
    headers <- c("Peta labelled as buggy\n", "Peta labelled as non-buggy\n")
    groups <- list(buggyp, nonbuggyp)
    for (g in seq_along(groups)) {
        cat(headers[g])
        scores <- groups[[g]]$score
        for (j in seq_along(levels)) {
            cat(paste(captions[j], length(scores[scores == levels[j]]), "\n"))
        }
    }
})
```

Now look at the developers and calculate their ratio of bugs found:

```{r}
# One line per developer: the share of their answers marked as a bug.
for (dev in separateDevs) {
    bugVotes <- sum(as.numeric(dev$category))
    cat(bugVotes / length(dev$category), "\n")
}
```
Ok, what we can do is compute, for each developer, how often they are in opposition to the rest:

```{r}
# Profile how developer number `index` (position in `separateDevs`) voted on
# commits with a split 2-vs-1 outcome. Reads the globals `data` and
# `separateDevs`.
#
# Returns four fractions of the developer's answers, in this order:
#   1. voted bug while the score is 1/3      (lone "bug" vote)
#   2. voted non-bug while the score is 2/3  (lone "non-bug" vote)
#   3. voted bug while the score is 2/3      (with the majority)
#   4. voted non-bug while the score is 1/3  (with the majority)
# Commits with a score of 0 or 1 are not counted in any of the four.
devOps <- function(index) {
    consensus <- setNames(data$score, data$V1)
    dev <- separateDevs[[index]]
    votes <- setNames(dev$category, dev$V1)
    tally <- c(bugX = 0, nonBugX = 0, resBug = 0, resNonBug = 0)
    for (commit in dev$V1) {
        share <- consensus[[commit]]
        votedNonBug <- votes[[commit]] == 0
        if (votedNonBug) {
            bucket <- if (share == 2/3) "nonBugX" else if (share == 1/3) "resNonBug" else NA
        } else {
            bucket <- if (share == 1/3) "bugX" else if (share == 2/3) "resBug" else NA
        }
        if (!is.na(bucket)) {
            tally[[bucket]] <- tally[[bucket]] + 1
        }
    }
    unname(tally) / length(dev$V1)
}
# Print the opposition profile of every developer in the survey. Iterating
# over `separateDevs` itself (rather than a fixed 1:10) keeps this correct for
# any number of developers.
for (i in seq_along(separateDevs)) {
    print(devOps(i))
}
```

# Calculating Cohen's Kappa

```{r}
# the number of people in the survey
numDevs <- length(separateDevs)
# One two-column character matrix per developer: the commit url and that
# developer's 0/1 category in column "x<n>".
perDevRatings <- lapply(seq_len(numDevs), function(n) {
    x <- cbind(as.character(separateDevs[[n]]$V1), separateDevs[[n]]$category)
    colnames(x) <- c("url", paste0("x", n))
    x
})
# Full outer join of all developers on the commit url: one row per commit and
# one column per developer, NA where the developer did not see the commit.
# Reduce() returns NULL for zero developers and the single matrix for one
# developer, so no special-casing of the first element is needed.
ratings <- Reduce(
    function(acc, x) merge(acc, x, by = c("url"), all = TRUE),
    perDevRatings
)
# create matrix of only the ratings w/o the commit urls
#ratings_matrix = as.matrix(subset(ratings, select = - c(url)))
```

Now that we have the matrix, we calculate Cohen's Kappa to determine the inter-rater agreement between the raters. Because Cohen's Kappa is only defined for 2 raters, we use Light's Kappa, which is an average over all pairwise Cohen's Kappas. The fact that different commits were reviewed by different reviewers further complicates this. We therefore create a submatrix for each pair of developers, remove any rows containing NA, calculate the kappas, and then report the results and their distribution:

```{r}
# Pairwise Cohen's kappa (irr::kappa2) for every pair of developers who rated
# at least one commit in common.
if (numDevs < 2) {
    stop("Pairwise kappa needs at least two developers", call. = FALSE)
}
# Typed, empty accumulators. Starting from c() (NULL) and filling with
# `[[<-` only yields an atomic vector on some R versions; a list result would
# break median()/min()/max() below.
kvalue <- numeric(0)   # kappa estimate of the pair
kpvalue <- numeric(0)  # p-value reported by kappa2 for the pair
ksize <- integer(0)    # number of commits both developers rated
kfirst <- integer(0)   # index of the first developer of the pair
ksecond <- integer(0)  # index of the second developer of the pair
n <- 1
for (first in seq_len(numDevs - 1)) {
    for (second in (first + 1):numDevs) {
        # Column 1 of `ratings` is the url, so developer d is in column d + 1.
        tmp <- cbind(ratings[colnames(ratings)[[first + 1]]], ratings[colnames(ratings)[[second + 1]]])
        # Keep only the commits rated by both developers.
        tmp <- na.omit(tmp)
        if (nrow(tmp) != 0) {
            k <- kappa2(tmp)
            kvalue[n] <- k$value
            kpvalue[n] <- k$p.value
            ksize[n] <- nrow(tmp)
            kfirst[n] <- first
            ksecond[n] <- second
            n <- n + 1
        }
    }
}
summary(kvalue)
summary(kpvalue)
summary(ksize)
# Report the summary statistics via out() (defined outside this file).
out("commitsKappaMedian", round(median(kvalue), 3))
out("commitsKappaMin", round(min(kvalue), 3))
out("commitsKappaMax", round(max(kvalue), 3))
out("commitsKappaPValMedian", round(median(kpvalue), 3))
out("commitsKappaPValThird", round(quantile(kpvalue, 0.75), 3))
```

Ok, all of our kappas are positive. Most of the p-values are also very small. 

```{r}
# Distribution of the pairwise kappa values.
boxplot(kvalue)
# Distribution of the corresponding p-values.
boxplot(kpvalue)
# All p-values in ascending order (auto-printed as a top-level expression).
sort(kpvalue)

```