-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_anonymization.R
70 lines (61 loc) · 2.27 KB
/
data_anonymization.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#############################################################
# #
# This script automatically assigns pseudonyms for speakers #
# everywhere their names show up in a data frame, including #
# in utterances. #
# #
# -Joshua McNeill (joshua dot mcneill at uga dot edu) #
# #
#############################################################
# Load packages
library(randomNames)
# Load the raw tweet and user tables from the data directory
tweets <- read.csv(file = "data/tweets.csv")
usersAll <- read.csv(file = "data/usersAll.csv")
## Functions
# Take a data frame and an anonymization key data frame and attempt to
# pseudonymize every occurrence of each real name.
#
# Arguments:
#   df  - data frame to process. Every cell is searched, so names embedded
#         in longer text (e.g. utterances) are replaced as well.
#   key - data frame whose first column holds the real names and whose
#         second column holds the matching pseudonyms.
#
# Returns:
#   - If every name could be replaced: the processed data frame.
#   - Otherwise: a list with "newDf" (the processed data frame, in which the
#     problematic names are left untouched) and "makeNewPseudo" (the real
#     names whose pseudonym already occurs as a cell value and therefore
#     need a different pseudonym).
#
# Notes:
#   - The data passes through as.matrix()/gsub(), so cell values generally
#     come back as character rather than in their original column types.
#   - Replacement is literal (fixed = TRUE) and substring-based: a real name
#     that is contained in another string is replaced there too.
pseudonymize <- function(df, key) {
  # A matrix lets one gsub() call cover every cell of the data at once
  cells <- as.matrix(df)
  # Names that could not be replaced. This is deliberately a plain local
  # variable: testing for it with exists() would also find an unrelated
  # `makeNewPseudo` object in the calling/global environment.
  makeNewPseudo <- NULL
  iteration <- 1 # For progress indicator
  for (name in key[, 1]) {
    # Show progress
    print(paste(iteration, "people renamed | renaming", name))
    iteration <- iteration + 1
    # Pseudonym assigned to this name in the key
    pseudonym <- key[key[, 1] == name, 2]
    if (is.element(pseudonym, cells)) {
      # The pseudonym already appears in the data, so using it would make
      # two different people indistinguishable; flag the name instead
      makeNewPseudo <- c(makeNewPseudo, name)
    } else {
      cells <- gsub(name, pseudonym, cells, fixed = TRUE)
    }
  }
  dfProcessed <- as.data.frame(cells)
  if (is.null(makeNewPseudo)) {
    print("All names successfully anonymized.")
    return(dfProcessed)
  }
  print("Your check $makeNewPseudo for people to rename.")
  list("newDf" = dfProcessed, "makeNewPseudo" = makeNewPseudo)
}
## Anonymize
# Remove tweet URLs (first column). Negative indexing with drop = FALSE keeps
# a data frame even for narrow inputs, where 2:ncol() would count backwards
# or the result would collapse to a vector.
tweets <- tweets[, -1, drop = FALSE]
# Remove redundant ID column (second column) for users list
usersAll <- usersAll[, -2, drop = FALSE]
# Create key: one randomly generated "First_Last" pseudonym per unique user Id
fakeNames <- randomNames(length(unique(usersAll$Id)),
                         name.order = "first.last", name.sep = "_")
# NOTE(review): randomNames() does not appear to guarantee distinct names;
# confirm against the package documentation. Duplicates would map several
# users onto one pseudonym, so at least make that visible.
if (anyDuplicated(fakeNames) > 0) {
  warning("Duplicate pseudonyms generated; some users will share a name.",
          call. = FALSE)
}
userKey <- data.frame("realNames" = unique(usersAll$Id),
                      "fakeNames" = fakeNames)
# Apply pseudonym function
tweetsAnon <- pseudonymize(tweets, userKey)
usersAllAnon <- pseudonymize(usersAll, userKey)
# pseudonymize() returns a list (newDf + makeNewPseudo) instead of a data
# frame when some names could not be replaced. Writing that out would either
# fail or store data that still contains real names, so stop before any file
# is written; tweetsAnon / usersAllAnon stay available for inspection.
unresolved <- unique(c(
  if (!is.data.frame(tweetsAnon)) tweetsAnon$makeNewPseudo,
  if (!is.data.frame(usersAllAnon)) usersAllAnon$makeNewPseudo
))
if (length(unresolved) > 0) {
  stop("Pseudonym collision, nothing was written. New pseudonyms needed for: ",
       paste(unresolved, collapse = ", "), call. = FALSE)
}
# Record key and results
write.csv(userKey, "data/userKey.csv")
write.csv(tweetsAnon, "data/tweetsAnon.csv")
write.csv(usersAllAnon, "data/usersAllAnon.csv")
# Clean up only what this script created, rather than wiping the whole
# workspace with rm(list = ls())
rm(tweets, usersAll, fakeNames, userKey, tweetsAnon, usersAllAnon,
   unresolved, pseudonymize)