-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcountry_charts.Rmd
97 lines (58 loc) · 2.46 KB
/
country_charts.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
---
title: "Country information"
author: "Stephanie Rivera"
date: "5/12/2018"
output: html_document
---
```{r setup, include=FALSE}
# Echo the source of every chunk in the knitted output unless a chunk
# overrides it with its own `echo=` option.
knitr::opts_chunk$set(echo = TRUE)
# Plotting (ggplot(), aes(), geom_bar(), ...).
library("ggplot2")
# Data manipulation (the %>% pipe and filter()).
library("dplyr")
# NOTE(review): tidyverse already attaches ggplot2 and dplyr; no other
# tidyverse function is visibly used in this document.
library(tidyverse)
# Base-graphics world map (map()).
library(maps)
# NOTE(review): no geosphere function is visibly used in this document.
library(geosphere)
# Polygon centroids (gCentroid()).
library(rgeos)
# Country polygons (getMap()).
library(rworldmap)
# setDT() for converting a data frame to a data.table.
library("data.table")
```
```{r cars}
# Load the per-author country table. read.csv() already defaults to a header
# row and comma separators. Later chunks use its `Name` (country) and
# `article` columns.
country_data <- read.csv("emails.country.csv")
```
## Plots to show distribution of authors throughout different countries
The bar chart below shows the number of article contributors per country, restricted to countries with more than 50 contributors.
```{r pressure, echo=FALSE}
# Tag every row with a constant 1 so that summing the tag per country gives
# the number of rows for that country.
country_data$Serial <- rep_len(1, nrow(country_data))

# Disabled experiments for normalising country names:
#country_data[country_data == "Russian Federation"] <- "Russia"
#ifelse(country_data$Name == "Russian Federation", "Russia", NA)

# Aggregate by country to get a count per country.
count_country <- aggregate(
  country_data$Serial,
  by = list(Name = country_data$Name),
  FUN = sum
)
# Disabled: sort countries by descending count.
#count_country <- count_country[order(-count_country$x),]
names(count_country) <- c("Country", "count")

# Keep only countries with more than 50 contributors.
count_country <- dplyr::filter(count_country, count > 50)

# Horizontal bar chart, one bar per country, ordered by count and coloured
# on a green (low) to red (high) gradient.
a <- ggplot(
  count_country,
  aes(x = reorder(Country, count), y = count, fill = count)
) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_fill_gradient(low = "green", high = "red") +
  labs(
    title = "Country counts for article contributors",
    x = "Countries (count > 50)",
    y = "Count"
  ) +
  coord_flip()
a
# Original author's note (not recomputed here): 80 unique countries.
```
```{r}
# Draw a blank, light-grey world basemap with no margins.
par(mar = c(0, 0, 0, 0))
map(
  "world",
  col = "#f2f2f2", fill = TRUE, bg = "white", lwd = 0.05,
  mar = rep(0, 4), border = 0, ylim = c(-80, 80)
)

# Compute one centroid per country polygon of the rworldmap world map.
wmap <- getMap(resolution = "high")
centroids <- gCentroid(wmap, byid = TRUE)
country_locations <- as.data.frame(centroids, row.names = NULL)

# Build the lookup table `df` (country name + centroid) from the centroids.
# The original called setDT() on an undefined `df`, which resolves to the
# stats::df function and fails. The polygon ids carried as row names become
# the first column.
df <- as.data.table(country_locations, keep.rownames = TRUE)
# The coordinate columns are x (longitude) then y (latitude), so the labels
# must be "long", "lat" in that order; the original had them swapped.
setnames(df, c("Name", "long", "lat"))
```
`df` holds the location (centroid coordinates) of each country; `country_data` holds the article information.
```{r}
# Join each author row to its country's location. This is a left join on
# purpose: a full outer join (all = TRUE) would also add every map country
# that has no authors, with `article` set to NA. duplicated() treats those NA
# ids as duplicates of each other, so they would be kept below as if they
# were a multi-author article.
article_locations <- merge(
  x = country_data, y = df, by = "Name", all.x = TRUE
)

# Drop articles that occur on a single row (articles with only 1 author,
# assuming one row per author -- TODO confirm) and rows without an article id.
change <- subset(
  article_locations,
  !is.na(article) &
    (duplicated(article) | duplicated(article, fromLast = TRUE))
)

# TODO: remove articles where every author is from the same country: group
# by article and drop the article when all of its country names are equal.
# For now the check is done by hand for a single article.
article_id <- 21
store <- change$Name[change$article == article_id]
store <- store[!is.na(store)]
store

# FALSE means that different countries collaborated on the article :)
length(unique(store)) == 1
```