-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathscrapeingCountry_vaccinations_20170226.R
103 lines (79 loc) · 2.97 KB
/
scrapeingCountry_vaccinations_20170226.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# Scrape the list of countries from the TravelHealthPro home page.
# This is the URL of the country index page.
url <- c("http://travelhealthpro.org.uk/countries")
# readLines() is a base R function; given a URL it returns the page source
# as a character vector with one element per line.
data <- readLines(url)
print(paste(url, "has just been scraped"))
# Identify which scraped lines contain a link to a country page.
# fixed = TRUE matches the text literally, so the dots in the host name are
# not treated as regex wildcards.
country_numbers <- grep("travelhealthpro.org.uk/country/", data, fixed = TRUE)
# 317 long... sounds like a good length
# Reduce a line of HTML to its visible text.
# Tag stripping is based on
# http://stackoverflow.com/questions/17227294/removing-html-tags-from-a-string-in-r
#
# htmlString: character vector of HTML fragments.
# Returns a character vector of the same length with every <...> tag and
# every tab character removed.
extractCountry <- function(htmlString) {
  # the lazy ".*?" stops at the first ">" so each tag is removed on its own
  without_tags <- gsub("<.*?>", "", htmlString)
  gsub("\t", "", without_tags)
}
# Spot-check the helper on the first line that links to a country page
# (a fixed line number would break as soon as the page layout shifts).
extractCountry(data[country_numbers[1]])
# Pull the link target out of a line of the form
#   <tab><li><a href="URL">Name</a></li>
#
# htmlString: character vector of HTML lines.
# Returns a character vector of the same length: what is left of each line
# after the patterns below have been deleted, in order.
extractCountryUrl <- function(htmlString) {
  # order matters: the opening markup goes first, then the closing tags,
  # then the quotes, and finally everything from the first ">" onwards
  junk_patterns <- c("\\t<li><a href=", "</a></li>", "\"", ">.*")
  for (pattern in junk_patterns) {
    htmlString <- gsub(pattern, "", htmlString)
  }
  htmlString
}
# Test the function on the first line that links to a country page
# (a fixed line number would break as soon as the page layout shifts).
extractCountryUrl(data[country_numbers[1]])
# Build the parallel vectors of country names and country-page URLs.
# Both helpers are built on gsub(), which is vectorised, so all matched lines
# are processed in one call. This avoids growing the vectors inside a loop
# and the 1:length() trap, which iterates over c(1, 0) when nothing matched.
country_lines <- data[country_numbers]
country_list <- extractCountry(country_lines)
country_urls <- extractCountryUrl(country_lines)
# I want a list of countries
# and a list of URLs
i <- 1
# Entries 1 to 276 look fine but after that there is some rubbish,
# so truncate both at 276. head() keeps at most 276 elements and, unlike
# x[1:276], does not pad with NA when fewer than 276 lines were matched.
countrylist <- head(country_list, 276)
countryUrls <- head(country_urls, 276)
# with this list of countries, apply the extraction of vaccinations...
# I am attempting to scrape vaccination recommendations
# from the Travel Health Pro website for a particular country.
# The example here is the Travel Health Pro page about Afghanistan.
# This is the page:
# http://travelhealthpro.org.uk/country/1/afghanistan#Vaccine_recommendations
# install.packages("rvest")
library("rvest")
library(curl)
# Download one country page and return the text of its accordion paragraphs.
#
# x: URL of the country page.
# Returns a character vector with one element per <p> found inside
# ".accordion-item" nodes nested in ".accordion" nodes, with newline and
# space characters removed.
extractVacs <- function(x) {
  # the request goes through curl with an explicit user agent set on the
  # handle; the curl package is needed for this
  chrome_handle <- new_handle("useragent" = "Chrome")
  page <- read_html(curl(x, handle = chrome_handle))
  print(paste(x, "has just been scraped"))

  # walk down the page structure one selector at a time
  accordions <- html_nodes(page, ".accordion")
  accordion_items <- html_nodes(accordions, ".accordion-item")
  paragraphs <- html_nodes(accordion_items, "p")
  paragraph_text <- html_text(paragraphs, trim = FALSE)

  # use gsub to remove the line break symbol and then the spaces
  without_breaks <- gsub("\n", "", paragraph_text)
  gsub(" ", "", without_breaks)
}
# Test the code on a single country page first.
extractVacs(countryUrls[2])
# works
# Scrape every country page. Each request is wrapped in tryCatch() so that one
# failed download is reported as a warning and recorded as NA instead of
# aborting the whole run and losing the pages already fetched.
output <- lapply(countryUrls, function(country_url) {
  tryCatch(
    extractVacs(country_url),
    error = function(e) {
      warning(
        "Failed to scrape ", country_url, ": ", conditionMessage(e),
        call. = FALSE
      )
      NA_character_
    }
  )
})
# returns a list with one element per country URL holding its vaccinations
# and looks good.
# setwd("/Users/paulbrennan/Dropbox/R for Biochemists/webScraping2017_CardiffUsersGroup")
# save(output, file = "vaccinationList_20170226")
# saved.
# save(country_list, file = "countryList_20170226")
# next step - turn these two objects into something to do a visualisation.