-
Notifications
You must be signed in to change notification settings - Fork 1
/
authors_lightning.Rmd
182 lines (130 loc) · 8.87 KB
/
authors_lightning.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
---
title: 'Who Reads Whom: Text Mining Literary Interviews'
author: "Sarah Rankin"
date: "6/14/2018"
output:
ioslides_presentation: default
beamer_presentation: default
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = F,message = F,warning = F,cache = T)
```
```{r load_packages}
library("lubridate")
library("tidyverse")
library("knitr")
library("gridExtra")
```
```{r load_data}
load(file = "/Users/srhrnkn/Documents/authors/btb.Rdata")
```
```{r}
nF<-function(x) {ifelse(is.na(x), F,x)}
countN<-function(x) {as.integer(sum(!is.na(x)))}
rmean<-function(x){mean(x,na.rm=T)}
rmeanr<-function(x){round(mean(x,na.rm=T),2)}
rmedian<-function(x){median(x,na.rm=T)}
rsum<-function(x) {sum(x,na.rm=T)}
```
```{r create_objects}
#summarize by interviewee: counts by gender, percents, bday, age
byint<-btb %>% filter(year(Date)==2017) %>% group_by(interviewee.GRname,interviewee.id, interviewee.gender.use,interviewee.birthdate,Date) %>% summarise(countF=rsum(author.gender.use=="f"),countM=rsum(author.gender.use=="m"),countauth=countN(author.gender.use=="f")) %>% mutate(percF=countF/countauth,age=as.period(interval(interviewee.birthdate,Sys.Date()))$year) %>% mutate(varf=(1-percF)/countauth) %>% arrange(percF)
#summarize by author: mentiongs, bday, age
byauth<-btb %>% filter(year(Date)==2017) %>% group_by(author.GRname, author.gender.use,author.birthdate) %>% summarise(countFint=rsum(interviewee.gender.use=="f"),countMint=rsum(interviewee.gender.use=="m"),countment=countN(interviewee.gender.use)) %>% mutate(percF=countFint/countment,age=as.period(interval(author.birthdate,Sys.Date()))$year) %>% arrange(desc(countment))
#authors with more than three mentions by date of interview
highfreqauthbydate<-btb %>% filter(year(Date)==2017,author.GRname %in% byauth$author.GRname[byauth$countment>3]) %>% select(author.GRname, interviewee.GRname, Date, interviewee.gender.use) %>% left_join(byauth[,c("author.GRname","countment")]) %>% arrange(desc(countment),author.GRname,Date) %>% mutate(author=factor(author.GRname,levels = rev(unique(author.GRname))))
#colors for plots
plotcolors<-data.frame(abbr=c("f","m"),adj=c("female","male"),noun=c("women","men"),color=c("#a04646","#4661a0"),stringsAsFactors = F)
oranges<-colorRampPalette(c("grey","orange"))
sixoranges<-oranges(6)
```
----
<div class="centered">
```{r, out.width = "400px"}
knitr::include_graphics("nytbrcover.jpg")
knitr::include_graphics("btbex.jpg")
```
</div>
----
```{r }
btb %>% filter(year(Date)==2017) %>% group_by(Interviewee_gender=interviewee.gender.use,interviewee.name) %>% summarize(count=n(),Mentions_perc_female=sum(author.gender.use=="f")/n(),perc_m=sum(author.gender.use=="m")/n(),f=sum(author.gender.use=="f")) %>% ggplot(aes(x=Interviewee_gender,y=Mentions_perc_female)) +
geom_boxplot() + geom_abline(slope = 0,intercept = .5,color="darkred") +
geom_label(aes(x=2,y=.5,label="50%"),color="darkred") +
stat_summary(fun.y=mean, geom="point", shape=20, size=5, color="darkred", fill="darkred") +
scale_x_discrete(name=element_blank(),labels=paste0(c("Women","Men"),"\n(",btb %>% filter(year(Date)==2017) %>% count(interviewee.gender.use,interviewee.GRname) %>% count(interviewee.gender.use) %>% pull(nn),")"),position="top") + labs(title="2017 By the Book Interviews:\nPercent women mentioned by gender of interviewee") +
scale_y_continuous(name=element_blank(),breaks=c(0,.20,.40,.6),labels=c("0%","20%","40%","60%")) +
theme(axis.text.x = element_text(size=10),axis.title.y = element_text(angle=0),aspect.ratio = .75)
```
---
(@) Get text of interview: </br>
**xml2** package (read.xml), **rvest**
(@) Pull author names from text:</br>
Regular expressions (!!!) - **stringr**
``` { size="tiny"}
str_extract_all(string = btblines[x],pattern = "((?<![“])
([:upper:]{1}(\\. )?)+[:lower:]+(?=([ \\’\\'-][:upper:]{1}
(\\. )?)+)(?:[\\s\\’\\'-][:upper:]{1}(\\. )?[[:upper:]{1}
([:lower:]\\'+)-]+)+)")
```
(@) Connect to goodreads & wikipedia APIs for gender & birthdate:</br>
**xml2**, **WikipediR**, **tidytext**
(@) Analyze: **dplyr**, **ggplot**
---
<div class="centered">
```{r}
ggplot(byint,aes(x=interviewee.birthdate,y = percF,color=interviewee.gender.use,size=countauth)) +geom_point() +scale_color_manual(values=plotcolors$color, name="interviewees",labels=c("Women","Men")) + scale_size(name="authors\nmentioned") + ylab("Female authors mentioned (percent)") + xlab("Interviewee birthdate") +scale_y_continuous(labels = function(x){as.character(100*x)},limits = c(0,1)) + labs(title="Percent women mentioned by birthdate and gender of interviewee") + theme(axis.title.x = element_text(size=15))
```
</div>
---
<div class="centered">
```{r}
ggplot(byint,aes(x=interviewee.birthdate,y = percF,color=interviewee.gender.use,size=countauth)) +geom_point() + geom_smooth(method = "lm",se = F,show.legend = F) + scale_color_manual(values=plotcolors$color, name="interviewees",labels=c("Women","Men")) + scale_size(name="authors\nmentioned") + ylab("Female authors mentioned (percent)") + xlab("Interviewee birthdate") +scale_y_continuous(labels = function(x){as.character(100*x)},limits = c(0,1)) + labs(title="Percent women mentioned by birthdate and gender of interviewee") + theme(axis.title.x = element_text(size=15))
```
</div>
---
```{r}
#make df
heatmapint <- btb %>% filter(author.GRname %in% byauth$author.GRname[byauth$countment>0]) %>% select(author.GRname, interviewee.GRname, Date, interviewee.gender.use) %>% left_join(byauth[,c("author.GRname","countment")]) %>% arrange(desc(countment),author.GRname,Date) %>% mutate(author=factor(author.GRname,levels = rev(unique(author.GRname))))
heatmapint <- full_join(x = heatmapint[,c("author","interviewee.GRname")],y = heatmapint[,c("author","interviewee.GRname")],by = "author" )
#rearrange columns and delete entries where x and y are same
heatmapint$Weight<-1
#heatmapint$Weight[which(heatmapint$interviewee.GRname.y==heatmapint$interviewee.GRname.x)]<-NA
#set upper half to NA (getting rid of this, it's nice to be able to see full rows for everyone)
#heatmapint$Weight[match(heatmapint$interviewee.x, heatmaplevels)<match(heatmapint$interviewee.y, heatmaplevels)]<-NA
#data for plot - sum weight by interview mentions
heatmapintplot<-heatmapint %>% group_by(interviewee.GRname.x,interviewee.GRname.y) %>% summarize(Weight=sum(Weight))
#create levels to sort in order of most appearances (pull turns this back into a character vector)
heatmapintlevels<-heatmapintplot %>% group_by(interviewee.GRname.x) %>% summarize(n=max(Weight)) %>% arrange(desc(n)) %>% select(interviewee.GRname.x) %>% transmute(levels=as.character(interviewee.GRname.x)) %>% pull()
heatmapintplot$interviewee.x<-factor(heatmapintplot$interviewee.GRname.x,levels = heatmapintlevels)
heatmapintplot$interviewee.y<-factor(heatmapintplot$interviewee.GRname.y,levels = heatmapintlevels)
#pull out total authors mentioned as separate character var to plot on diagonal
heatmapintplot$authtotal<-as.character(heatmapintplot$Weight)
heatmapintplot$authtotal[which(heatmapintplot$interviewee.y!=heatmapintplot$interviewee.x)]<-NA
heatmapintplot$Weight[which(heatmapintplot$interviewee.y==heatmapintplot$interviewee.x)]<-NA
```
```{r}
#make df
heatmapauth <- btb %>% filter(author.GRname %in% byauth$author.GRname[byauth$countment>3],year(Date)==2017) %>% select(author.GRname, interviewee.GRname, Date, interviewee.gender.use) %>% left_join(byauth[,c("author.GRname","countment")]) %>% arrange(desc(countment),author.GRname,Date) %>% mutate(author=factor(author.GRname,levels = rev(unique(author.GRname))))
heatmapauth <- full_join(x = heatmapauth[,c("author","interviewee.GRname")],y = heatmapauth[,c("author","interviewee.GRname")],by = "interviewee.GRname" )
#rearrange columns and delete entries where x and y are same
heatmapauth$Weight<-1
#heatmapauth$Weight[which(heatmapauth$author.y==heatmapauth$author.x)]<-NA
heatmaplevels<-heatmapauth %>% group_by(author.x) %>% tally() %>% arrange(desc(n)) %>% select(author.x) %>% transmute(levels=as.character(author.x)) %>% pull()
heatmapauth$author.x<-factor(heatmapauth$author.x,levels = heatmaplevels)
heatmapauth$author.y<-factor(heatmapauth$author.y,levels = heatmaplevels)
#create levels to sort in order of most appearances (pull turns this back into a character vector)
heatmapauthplot<-heatmapauth %>% group_by(author.x,author.y) %>% summarise(Weight=sum(Weight))
heatmapauthplot$mentiontotal<-as.character(heatmapauthplot$Weight)
heatmapauthplot$mentiontotal[which(heatmapauthplot$author.y!=heatmapauthplot$author.x)]<-NA
heatmapauthplot$Weight[which(heatmapauthplot$author.y==heatmapauthplot$author.x)]<-NA
```
Top Authors Mentioned Together
George Saunders & Lin-Manuel Miranda <br>
Ann Patchett & Zadie Smith <br>
Colson Whitehead & James Baldwin<br>
Colson Whitehead & Zadie Smith<br>
David Sedaris & George Saunders<br>
Elena Ferrante & George Eliot<br>
Ernest Hemingway & F. Scott Fitzgerald <br>
James Baldwin & Ta-Nehisi Coates <br>
</div>