#this script looks for new By the Book (BTB) interviews and adds them to the dataset
#simplified to stop manually inputting gender info - now just asks whether each name is an author, then gets gender data from Wikipedia (with a final manual input step)
#beginning altered to automate pulling data from the NYTimes site
#General outline:
# visit the BTB page at NYTimes.com, get names and links
# check against the current list to find new interviewees
# for each new interviewee:
#   scrape the interview page
#   look for potential author names
#   loop through names and flag whether each is a real name; flag text errors
#   loop through flagged errors & fix
# wind up with a list of author names and interviewees
# hit GoodReads for data on new interviewees and new authors, add to GRdata
# error-check for erroneous hits - find cases where the input name and GR name are too far apart, fix manually
# hit Wikipedia for residual gender and birthdate data
# error-check for data still missing
# merge GRdata with the author/interviewee listing to create the new btb analysis dataset
#load libraries ####
library("xml2")
library("tidyverse")
library("lubridate")
library("stringdist")
library("stringr")
library("rvest")
library("WikipediR")
library("tidytext")
#create functions ####
nF<-function(x) {ifelse(is.na(x), F, x)} #NA -> FALSE
countN<-function(x) {as.integer(sum(!is.na(x)))} #count of non-NA values
rmean<-function(x){mean(x,na.rm=T)} #NA-safe mean
rmeanr<-function(x){round(mean(x,na.rm=T),2)} #NA-safe mean, rounded to 2 places
rmedian<-function(x){median(x,na.rm=T)} #NA-safe median
rsum<-function(x) {sum(x,na.rm=T)} #NA-safe sum
nz<-function(x){ifelse(is.na(x),0,x)} #NA -> 0; wikigetdata below calls nz() but no definition existed - this one matches that usage
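#quick self-checks for the helpers above (added as a sketch; pure functions, safe to run):
stopifnot(identical(nF(c(TRUE,NA)),c(TRUE,FALSE)))
stopifnot(countN(c(1,NA,3))==2L)
stopifnot(rmeanr(c(1,2,NA))==1.5)
stopifnot(nz(NA)==0)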
#authdetails - for a given author name:
#search GoodReads for the author & grab the author id
#archive the search page
#grab the author page
#archive the author page
#extract gender, birthdate, hometown and return as a one-row data frame
#!!requires object grkey - a GoodReads API key!!
authdetails<-function(authname){
#search for author's name to get goodreads ID
searchgr<-read_xml(paste0("https://www.goodreads.com/search/index.xml?key=",grkey,"&q=",str_replace_all(string = authname,pattern = " ",replacement = "%20"),"&search[field]=author"))
ids<-data_frame(id=as.numeric(xml_text(xml_find_all(searchgr,"//author//id"))),name=xml_text(xml_find_all(searchgr,"//author//name")))
#if the search returns nothing, nrow(ids) is 0: set authID and modalauthname to NA; else pick the id matching the name entered, or the most frequently appearing one; if the latter, issue a warning
if(nrow(ids)==0){
authID<-NA
modalauthname<-NA} else{
modalauthname<-aggregate(id~name,ids,countN)[which.max(aggregate(id~name,ids,countN)[,2]),1]
if(length(unique(ids$id[which(ids$name==authname)]))==0){
warning(paste0(authname," no exact match; ",modalauthname," used"))
authID<-unique(ids$id[which(ids$name==modalauthname)])
} else{
authID<-unique(ids$id[which(ids$name==authname)])
}
}
#pause so as not to query the API too rapidly
Sys.sleep(1.5)
#get author page
if(!is.na(authID)){
authgr<-read_xml(paste0("https://www.goodreads.com/author/show/",authID,"?format=xml&key=",grkey))
authgend<-xml_text(xml_find_all(authgr,"//author//gender"))
authbirth<-xml_text(xml_find_all(authgr,"//author//born_at"))
authtown<-xml_text(xml_find_all(authgr,"//author//hometown"))
Sys.sleep(1.5)
} else {
authgr<-NA
authgend<-NA
authbirth<-NA
authtown<-NA
Sys.sleep(1.5)
}
#archive both pages - stored as character; to reuse them, run read_xml on the stored string
assign(x = paste0("searchgr",str_replace_all(string = authname,pattern = " ",replacement = "_")),value = as.character(searchgr),envir = .GlobalEnv)
assign(x = paste0("authgr",str_replace_all(string = authname,pattern = " ",replacement = "_")),value = as.character(authgr),envir = .GlobalEnv)
#return data frame
data_frame(name=authname,id=authID, gender=authgend, birthdate=authbirth, town=authtown, GRname=ifelse(length(unique(ids$id[which(ids$name==authname)]))==0,modalauthname,as.character(authname)))
}
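#usage sketch for authdetails (commented out: hits the live GoodReads API and
#assumes grkey holds a valid key; the name and key below are purely illustrative):
#grkey<-"YOUR_GOODREADS_KEY" #hypothetical placeholder
#authdetails("Ursula K. Le Guin")
##expected shape: one-row data frame with name, id, gender, birthdate, town, GRname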
#function wikigetdata - get birthdate and gender info from wikipedia for a vector of names (assumes GR names); reads GRdata from the global environment to look up ids
wikigetdata<-function(namevector){
wikidata<-data.frame(GRID=numeric(),GRname=character(),wikibirthdate=as.Date(x = integer(0), origin = "1970-01-01"),wikigender=character(),wikimsg=character())
for(i in 1:length(namevector)){
name<-namevector[i]
GRID<-NA
birthdate<-NA
gender<-NA
msg<-NA
table<-NA #reset so the archive step below doesn't capture base::table when no bio table is found
try({
wikifull<- page_content(language = "en",project = "wikipedia",page_name = name)
})
if(!exists("wikifull")){
msg<-"no wiki page"} else{
if(wikifull$parse$text$`*` %>% read_html() %>% xml_text() %>% str_detect("This disambiguation page")){
msg<-"disambiguation"
} else{
tables<- wikifull$parse$text$`*` %>% read_html() %>% xml2::xml_find_all(".//table")
if(length(tables)==0){
msg<-"no tables"
} else {
if(is.na(wikibiotablenum(wikifull))) {
msg<-"no bio table"
}
else {
table<-tables[[wikibiotablenum(wikifull)]] %>% html_table(fill = T)
table<-table[,1:2]
names(table)<-c("item","data")
birthdate<-table %>% filter(item=="Born") %>% select(data) %>% str_extract(pattern = "\\d+-\\d+-\\d+") %>% parse_date_time("ymd") %>% as.Date()
}
}
}
assign(x = paste0("wikifull",str_replace_all(string = name,pattern = " ",replacement = "_")),value = wikifull,envir = .GlobalEnv)
assign(x = paste0("wikitable",str_replace_all(string = name,pattern = " ",replacement = "_")),value = table,envir = .GlobalEnv)
temptext<-data.frame(line=1, text=wikifull$parse$text$`*` %>% read_html %>% xml_text(),stringsAsFactors = F)
temptokens<-temptext %>% unnest_tokens(output = word,input = text)
temptokens$row<-as.numeric(rownames(temptokens))
shehercount<-temptokens %>% filter(word %in% c("she","She","her","Her")) %>% count() %>% pull()
hehimcount<-temptokens %>% filter(word %in% c("he","He","him","Him")) %>% count() %>% pull()
if(nz(shehercount)==nz(hehimcount)){
gender<-NA
} else if(nz(shehercount)>nz(hehimcount)){
gender<-("f")
} else {
gender<-("m")
}
}
tempdetails<-data.frame(GRID=GRdata$id[match(name,GRdata$GRname)],GRname=name,wikibirthdate=birthdate,wikigender=gender,wikimsg=msg,stringsAsFactors = F)
wikidata<-rbind(wikidata,tempdetails)
rm(wikifull)
}
return(wikidata)
}
#find the bio table within a wikipedia page - used in above function
wikibiotablenum<- function(pagecontent){
try(findborn<-unlist(lapply(X = (pagecontent$parse$text$`*` %>% read_html() %>% xml2::xml_find_all(".//table") %>% html_table(fill = T)),FUN = function(x){max(grepl("Born",x[[1]]))})))
if(exists("findborn")){
if(sum(findborn)==0){
biotablenum<-NA
} else {
biotablenum<-which.max(findborn)
}
return(biotablenum)
} else {
return(NA)
}
}
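#usage sketch for the wiki helpers (commented out: queries live Wikipedia, and
#wikigetdata also needs GRdata in scope; the name is illustrative only):
#pg<-page_content(language = "en",project = "wikipedia",page_name = "Toni Morrison")
#wikibiotablenum(pg) #index of the infobox whose first column has "Born", or NA
#wikigetdata(c("Toni Morrison")) #one row: GRID, GRname, wikibirthdate, wikigender, wikimsg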
#look at main btb page, get new interviews and links####
btbcolumn<-read_html("https://www.nytimes.com/column/by-the-book")
headlines <- btbcolumn %>% html_nodes(css = ".headline") %>% html_text(trim = T)
dates <- btbcolumn %>% html_nodes(css = ".dateline") %>% html_text(trim = T)
dates <-as.Date(dates , format = "%b. %d, %Y")
links <- btbcolumn %>% html_nodes(css = ".story-link") %>% html_attr(name = "href")
btbnames<-str_replace(headlines,": By the Book","")
isnew<-!btbnames %in% interviewees$Subject
btbnewcols<-data.frame(Date=dates,Link=links, Subject=btbnames,isnew)
btbnewcols<-unique(btbnewcols[(btbnewcols$isnew),])
#order by date so adding older interviews first
btbnewcols<-btbnewcols[order(btbnewcols$Date),]
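#quick sanity check on the scrape (safe to run):
#head(btbnewcols) #expect columns Date, Link, Subject, isnew; oldest new interview first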
# read in text files & parse ####
#for each interviewee:
#read in the file
#use regex to find potential author names
#loop through author names and record: is a name (y), needs attention (/), not a name (enter)
#store issues for a further look
#wind up with a list of author names and interviewees
##build the intervieweesnew data frame under the name the later code already expects. Gender is added via a manual prompt loop because it used to come from the Excel file. The other fields aren't used but are leftovers - could clean this up at some point.
intervieweesnew<-data.frame(btbnewcols[,c("Date","Subject")],Subject.gender = NA, Count.Authors = NA,Count.Female.Authors= NA, Count.Male.Authors= NA, printoronline = "online",Notes = NA)
print("New interviewees: type f or m")
for(i in 1:nrow(intervieweesnew)){
intervieweesnew[i,3]<-readline(prompt = paste0(intervieweesnew[i,2]," "))
}
#back to the original flow, except this version hits the link directly and saves the text to a file
authorsnew<-data.frame(name=character(),isauth=character(),interviewee=character())
for(i in 1:length(intervieweesnew$Subject)){
#read in file & save
interviewee<-as.character(intervieweesnew$Subject[i])
btblines<-read_html(as.character(btbnewcols$Link[i]))
btblines<-btblines %>% html_nodes(css = ".story-body-text") %>% html_text()
writeLines(text = btblines, con = paste0(btbnewcols$Subject[i],".txt"))
#use regex to find potential author names
#var to store regex results looking for multiple initial cap words in a row
multi<-character()
#var to store regex results looking for single initial cap words that weren't in multi
single<-character()
#first look for multi - store only if something is found in the line
print(paste0("New interviewee: ",interviewee))
print("type y if is an author name; type slash if unsure or error in string")
for(x in 1:length(btblines)){
tempmulti<-str_extract_all(string = btblines[x],pattern = "((?<![“])([:upper:]{1}(\\. )?)+[:lower:]+(?=([ \\’\\'-][:upper:]{1}(\\. )?)+)(?:[\\s\\’\\'-][:upper:]{1}(\\. )?[[:upper:]{1}([:lower:]\\'+)-]+)+)")[[1]]
tempmulti<-str_replace(string = tempmulti,pattern = "\\.$",replacement = "")
multi<-c(multi,tempmulti)
}
#now look for single but only store if they weren't in multi
for(x in 1:length(btblines)){
tempsingle<-str_extract_all(string = btblines[x],pattern = "((?<![“])[A-Z][A-Z#(\\w+)]+)")[[1]]
tempsingle<-str_replace(string = tempsingle,pattern = "\\.$",replacement = "")
single<-c(single,tempsingle[unlist(lapply(X = tempsingle,FUN = function(x){max(grepl(pattern = x,x = multi))}))==0])
}
#var to store input data: is this an author name?
isauth = character()
#loop through multi and register authorness
for(x in 1:length(multi)){
tempmulti<-readline(prompt = paste0(multi[x]," "))
isauth<-c(isauth, tempmulti)
}
#bring together names and author flags
multiframe<-data.frame(name=multi,isauth,stringsAsFactors = F)
multiframe<-multiframe[which(multiframe$isauth!=""),]
#for single just view and tag any that are real
singleisauth = character()
print(single)
singleindices<-readline(prompt = "Which items are authors? enter index numbers separated by spaces ")
singleindices<-as.integer(unlist(str_split(string = singleindices," ")))
print("Enter y if author name ok, / if needs editing")
for(x in 1:length(singleindices)){
tempsingle<-readline(prompt = paste0(single[singleindices[x]]," "))
singleisauth<-c(singleisauth, tempsingle)
}
singleframe<-data.frame(name=single[singleindices],isauth=singleisauth,stringsAsFactors = F)
authorsnew<-rbind(authorsnew, rbind(cbind(multiframe,interviewee=interviewee),cbind(singleframe,interviewee=interviewee)))
}
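#sketch of the two passes above on a made-up line (commented out; illustrative only):
#str_extract_all("I admire DeLillo.","((?<![“])[A-Z][A-Z#(\\w+)]+)")[[1]] #-> "DeLillo"
#the multi pattern runs first and pulls multi-word capitalized runs (e.g. "Ursula K. Le Guin");
#the single pattern then catches lone capitalized tokens the multi pass missed, as with "DeLillo" here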
#fix flagged
#find rownums of flagged entries
fixnums<-grep("/",authorsnew$isauth)
#loop through flagged entries and prompt for the corrected name - no quotes
print("for each author enter correct name, no quotes")
for(i in seq_along(fixnums)){ #seq_along avoids running a 1:0 loop when nothing was flagged
tempfix<-readline(prompt = paste0(authorsnew$name[fixnums[i]], " "))
authorsnew$name[fixnums[i]]<-tempfix
authorsnew$isauth[fixnums[i]]<-"y"
}
#delete any blank lines
authorsnew<-authorsnew[authorsnew$name!=""&authorsnew$isauth!="",]
#get rid of dupes
authorsnew<-unique(authorsnew)
#hit goodreads for data re new interviewees and new authors, add to GRdata ####
#record row count of GRdata so can look only at new entries later. If GRdata doesn't exist yet (ie, starting from scratch), create it
if(exists("GRdata")){
GRdatarowcountold<-nrow(GRdata)} else{
GRdata<-data_frame(name=character(),id=numeric(), gender=factor(levels = c("","female","male")), birthdate=as.Date(x = integer(0), origin = "1970-01-01"), town=character(),is.interviewee=logical(),is.author=logical(),input.gender=factor(levels=c("f","m")),gender.use=factor(levels=c("f","m")), GRname=character(),stringdistance=numeric(),matchOK=logical())
}
#loop through interviewees to get GR data
#gender.use: use the manually input gender when the GR gender is blank, missing, or agrees with it; otherwise use the GR gender
for (i in 1:length(intervieweesnew$Subject)){
if(!intervieweesnew$Subject[i] %in% GRdata$name){
tempdetails<-authdetails(intervieweesnew$Subject[i])
tempdetails$birthdate<-as.Date(tempdetails$birthdate,format = "%Y/%m/%d")
GRdata<-rbind(GRdata,cbind(tempdetails,is.interviewee=T,is.author=NA,input.gender=intervieweesnew$Subject.gender[i],gender.use=ifelse(test = is.na(tempdetails$gender)|tempdetails$gender==""|substr(tempdetails$gender,1,1)==intervieweesnew$Subject.gender[i],yes = as.character(intervieweesnew$Subject.gender[i]),no = substr(tempdetails$gender,1,1)),stringdistance=stringdist(a = tempdetails$name,b = tempdetails$GRname),matchOK=ifelse(test = stringdist(a = tempdetails$name,b = tempdetails$GRname)<3,yes = T,no = NA)))
} else {
GRdata$is.interviewee[which(GRdata$name==intervieweesnew$Subject[i])]<-T
}
}
#loop through authors to get GR data
authorstemp<-unique(authorsnew$name[which(!authorsnew$name %in% GRdata$name[which(GRdata$is.author)])])
for (i in 1:length(authorstemp)){
if(!authorstemp[i] %in% GRdata$name){
tempdetails<-authdetails(authorstemp[i])
tempdetails$birthdate<-as.Date(tempdetails$birthdate,format = "%Y/%m/%d")
GRdata<-rbind(GRdata,cbind(tempdetails,is.interviewee=NA,is.author=T,input.gender=NA,gender.use=NA,stringdistance=stringdist(a = tempdetails$name,b = tempdetails$GRname),matchOK=ifelse(test = stringdist(a = tempdetails$name,b = tempdetails$GRname)<3,yes = T,no = NA)))
} else{
GRdata$is.author[which(GRdata$name==authorstemp[i])]<-T
}
}
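#quick look at the rows just added (safe to run):
#GRdata[(GRdatarowcountold+1):nrow(GRdata),]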
# quality control: find the sketchy GR matches (do this before btb merge) ####
mismatches<-GRdata %>% filter(row_number()>GRdatarowcountold & stringdistance>0) %>% data.frame()
#if not too many, just eyeball and designate appropriately - uncomment and fill in one T/NA per mismatched row (values below are placeholders):
#GRdata[which(GRdata$name %in% mismatches$name),"matchOK"]<-c(T,NA,T)
#manually fix the ones where the search just went wonky####
fixname<-"" #name to fix - fill in manually
newlookup<-"" #new search string to use for the lookup - fill in manually
GRdata[which(GRdata$name==fixname),c("id","gender","birthdate","town","GRname")]<-authdetails(newlookup)[,2:6]
GRdata$matchOK[which(GRdata$name==fixname)]<-T
#get wikipedia gender####
wikinew<-wikigetdata(namevector = GRdata$GRname[(GRdatarowcountold+1):length(GRdata$GRname)])
#add missing genders
#find rownums of flagged entries
fixnums<-which(is.na(wikinew$wikigender))
#loop through flagged entries and prompt for gender: m, f, or blank
print("for each author enter m, f, or nothing")
for(i in seq_along(fixnums)){ #seq_along avoids running a 1:0 loop when nothing was flagged
tempfix<-readline(prompt = paste0(wikinew$GRname[fixnums[i]]," "))
wikinew$wikigender[fixnums[i]]<-tempfix
}
#add wiki gender and birthdate data to GRdata
#for birthdate, fill only rows where GR data lacks one
GRdata$birthdate[which((GRdata$id %in% wikinew$GRID)&is.na(GRdata$birthdate))]<-wikinew$wikibirthdate[match(GRdata$id[which((GRdata$id %in% wikinew$GRID)&is.na(GRdata$birthdate))],wikinew$GRID)]
#for gender, the wiki data takes the place of input gender, then evaluate
GRdata$input.gender[which(GRdata$id %in% wikinew$GRID)]<-wikinew$wikigender[match(GRdata$id[which(GRdata$id %in% wikinew$GRID)],wikinew$GRID)]
GRdata$gender.use[which(GRdata$id %in% wikinew$GRID)]<-case_when(GRdata$gender[which(GRdata$id %in% wikinew$GRID)]=="" ~ as.character(GRdata$input.gender[which(GRdata$id %in% wikinew$GRID)]), T ~ substr(GRdata$gender[which(GRdata$id %in% wikinew$GRID)],1,1))
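#toy illustration of the precedence rule above (commented out; dplyr::case_when):
#case_when(c("","male")=="" ~ c("f","f"), T ~ substr(c("","male"),1,1)) #-> "f" "m"
#i.e. a blank GR gender falls back to the wiki/input gender; otherwise GR wins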
#add new interviewee and author names to crosswalk ####
interviewees<-rbind(interviewees,cbind(intervieweesnew,GRID=GRdata$id[match(intervieweesnew$Subject,GRdata$name)]))
authorsnew$gender<-NA
authors<-rbind(authors,cbind(authorsnew[,c(1,3:4)], GRID=GRdata$id[match(authorsnew$name,GRdata$name)]))
#create dataset of authors & interviewees merging by GR ID (omits anyone who wasn't a GR hit) ####
#could prob redo this with dplyr
btb<-merge(authors[,c(3,4)],GRdata[which(GRdata$matchOK==T&!is.na(GRdata$gender.use)),c(2,4:5,9,10)],by.x = "GRID",by.y = "id")
names(btb)[c(1,3:6)]<-paste0("author.",names(btb)[c(1,3:6)])
names(btb)[2]<-"interviewee.name"
btb<-merge(btb,GRdata[,c(1:2,4:5,9,10)],by.x = "interviewee.name",by.y = "name",all.x = T,all.y = F)
names(btb)[7:11]<-paste0("interviewee.",names(btb)[7:11])
btb<-merge(btb,interviewees[,c("GRID","Date")],by.x="interviewee.id",by.y="GRID")
#dedupe by GRID
btb<-unique(btb)
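#quick inspection of the merged dataset (safe to run):
#head(btb) #author.* and interviewee.* columns plus the interview Date, one row per author mention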
#save btb so as to load in rmd file. ####
#save GRdata, authgr, searchgr, and wiki objects for safekeeping - savenum increments with each update
if(exists("savenum")){
savenum<-savenum+1} else {
savenum<-1
}
save(btb,file = "btb.Rdata")
save(btb,file = paste0("btb",as.character(savenum),".Rdata"))
save(GRdata,file = "GRdata.Rdata")
save(GRdata,file= paste0("GRdata",as.character(savenum),".Rdata"))
save(authors,file = "authors.Rdata")
save(authors,file= paste0("authors",as.character(savenum),".Rdata"))
save(interviewees,file = "interviewees.Rdata")
save(interviewees,file= paste0("interviewees",as.character(savenum),".Rdata"))
save(wikinew,file= paste0("wikinew",as.character(savenum),".Rdata"))
save(list = apropos("authgr"),file = paste0("authgr",as.character(savenum),".Rdata"))
save(list = apropos("searchgr"),file = paste0("searchgr",as.character(savenum),".Rdata"))
save(list = apropos("wikifull"),file = paste0("wikifull",as.character(savenum),".Rdata"))
save(list = apropos("wikitable"),file = paste0("wikitable",as.character(savenum),".Rdata"))
rm(list = apropos("authgr"))
rm(list = apropos("searchgr"))
rm(list = apropos("wikifull"))
rm(list = apropos("wikitable"))