-
Notifications
You must be signed in to change notification settings - Fork 0
/
google_scholar.py
267 lines (233 loc) · 11.5 KB
/
google_scholar.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
import re
import time
import urllib
import urllib.parse
from typing import List

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, ElementNotVisibleException, InvalidElementStateException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.ui import WebDriverWait

from professor import Professor
from web_util import wait, tree_from_string, css_select, Selector
# Earliest publication year to request from Google Scholar search; it is
# interpolated as the `as_ylo` query parameter when building the search URL.
STARTING_YEAR = '2007'
def none_to_empty(obj):
    """Return *obj* when it is truthy, otherwise the empty string."""
    return obj or ''
def empty_to_none(str):
    """Return *str* unchanged, or None when its length is zero."""
    if len(str) == 0:
        return None
    return str
def parse_int(str):
    """Convert *str* to an int; a falsy value (None, '') counts as 0."""
    if not str:
        return 0
    return int(str)
def get_from_array(arr, idx):
    """Return arr[idx] with empty values mapped to None.

    An index at or beyond the end of *arr* also yields None instead of
    raising IndexError.
    """
    return empty_to_none(arr[idx]) if idx < len(arr) else None
def get_year(citation: str):
    """Extract a publication year (as a string) from a citation.

    A four-digit year wrapped in parentheses wins; failing that, the last
    year-like number in the text is used. Only values from 1900 to 2030 are
    accepted. Returns None when nothing qualifies.
    """
    def first_plausible(candidates):
        # first candidate that falls inside the accepted range of years
        return next((c for c in candidates if 1900 <= int(c) <= 2030), None)

    # a year inside parens takes priority
    in_parens = first_plausible(re.findall(r"\(([12][0-9]{3})\)", citation))
    if in_parens is not None:
        return in_parens

    # otherwise scan the year-like numbers from the end of the citation backwards
    # TODO: maybe treat "Forthcoming" as the current year
    return first_plausible(reversed(re.findall(r"[12][0-9]{3}", citation)))
class Paper:
    """A single publication record together with its citation counts.

    Attributes:
        title, authors, venue, year: bibliographic fields. They are joined
            directly as text by __str__ and pretty_citation, so they must be
            strings.
        scholar_citations: Google Scholar citation count.
        wos_citations: Web of Science citation count, or None if unknown.
        id: Google Scholar citation identifier, or None if unknown.
    """

    def __init__(self, title, authors, venue, year, scholar_citations, wos_citations=None, id=None):
        self.title = title
        self.authors = authors
        self.venue = venue
        self.year = year
        self.scholar_citations = scholar_citations
        self.wos_citations = wos_citations
        self.id = id

    def __str__(self):
        """Serialize to one tab-separated line (the format from_string reads).

        Field order: authors, title, venue, year, scholar citations,
        WoS citations, id. Falsy WoS counts and ids are written as ''.
        """
        fields = [
            self.authors,
            self.title,
            self.venue,
            self.year,
            str(self.scholar_citations),
            str(none_to_empty(self.wos_citations)),
            none_to_empty(self.id),
        ]
        return '\t'.join(fields)

    def __repr__(self):
        return str(self)

    def pretty_citation(self):
        """Return 'authors. title. venue. year' for display."""
        return '. '.join([self.authors, self.title, self.venue, self.year])

    @classmethod
    def from_string(cls, string):
        """Parse a tab-separated line as produced by __str__.

        Newlines are removed before splitting. The first four fields
        (authors, title, venue, year) are required; missing or empty citation
        counts become 0 and a missing or empty id becomes None.
        """
        s = string.replace('\n', '').split('\t')
        # build through cls (not Paper) so subclasses get instances of themselves
        return cls(authors=s[0],
                   title=s[1],
                   venue=s[2],
                   year=s[3],
                   scholar_citations=parse_int(get_from_array(s, 4)),
                   wos_citations=parse_int(get_from_array(s, 5)),
                   id=get_from_array(s, 6))
class GoogleScholar:
    """Scrape Google Scholar through a Selenium-driven Firefox browser.

    Intended to be used as a context manager so that the browser is shut down
    when the block exits:

        with GoogleScholar() as scholar:
            papers = scholar.scrape_profile(url)
    """

    def __init__(self, executable_path=None):
        """Launch Firefox.

        executable_path: optional path to the geckodriver binary; when falsy,
            Selenium's own default is used.
        """
        if executable_path:
            self.selenium_driver = webdriver.Firefox(executable_path=executable_path)
        else:
            self.selenium_driver = webdriver.Firefox()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # always shut the browser down, whether or not the block raised
        self.selenium_driver.quit()

    def wait_for_captchas(self):
        """Sleep if a captcha or error page is shown, otherwise return immediately.

        Three situations are handled in turn: the Javascript CAPTCHA embedded
        in a results page, the redirect to a google.com/sorry form CAPTCHA,
        and an 'Error'/'Sorry' page. After either of the last two has cleared,
        the whole check is run again from the top.
        """
        # detect the javascript Captcha that is embedded in the search results page
        try:
            self.selenium_driver.find_element_by_css_selector('div#gs_captcha_ccl')
        except NoSuchElementException:
            pass
        else:
            print("WARNING: got a Javascript CAPTCHA")
            # wait until CAPTCHA is gone
            WebDriverWait(self.selenium_driver, 99999).until(
                expected_conditions.invisibility_of_element_located((By.ID, "gs_captcha_ccl")))

        # detect the IP-based Captcha that redirects you to ipv4.google.com/sorry/index and is more old-school looking
        already_failed = False
        while 'google.com/sorry' in self.selenium_driver.current_url:
            if not already_failed:
                print("WARNING: got a form CAPTCHA")
                already_failed = True
            time.sleep(1)
        if already_failed:
            self.wait_for_captchas()

        # detect 403 error that refers to Terms of Service
        already_failed = False
        while True:
            # Read the title from the driver itself: WebElement.text only returns
            # rendered text, so asking the <title> element for .text gives '' and
            # the error page would never be recognised.
            title = self.selenium_driver.title or ''
            if 'Error' not in title and 'Sorry' not in title:
                break
            if not already_failed:
                print("WARNING: got an error page")
                already_failed = True
            self.selenium_driver.delete_all_cookies()
            # navigate to a scholar results page to force scholar to show us a Captcha (rather than a simple 403)
            url = self.selenium_driver.current_url
            self.get_page('https://scholar.google.com/scholar?hl=en&q=author%3A%22Isaac+Newton%22')  # random page
            self.get_page(url)
            time.sleep(1)
        if already_failed:
            self.wait_for_captchas()

    def get_page(self, url):
        """Throttle with wait(), load *url*, then block until any captcha/error page clears."""
        wait()
        self.selenium_driver.get(url)
        self.wait_for_captchas()

    def find_google_scholar_page(self, prof: Professor):
        """Return the URL of the first user profile linked from an author search, or None.

        The search query is built from prof.simple_name() and prof.school.
        """
        # get search results page
        self.get_page('https://scholar.google.com/scholar?q=author%%3A"%s"+%s' %
                      (urllib.parse.quote(prof.simple_name()), prof.school))
        # look for a matching user profile
        try:
            anchor = self.selenium_driver.find_element_by_css_selector('h4.gs_rt2 a')
        except NoSuchElementException:
            return None
        return anchor.get_attribute('href')

    # eg., see https://scholar.google.com/citations?user=VGoSakQAAAAJ&hl=en&oi=ao
    def scrape_profile(self, author_url) -> List[Paper]:
        """Return the papers listed on the profile page at *author_url*.

        The full list is expanded first by clicking "show more" until the
        button is no longer enabled. Rows whose citation count is struck out
        (duplicates) are skipped.
        """
        self.get_page(author_url)

        # click "show more" button until it disappears
        while True:
            try:
                # the lookup raises NoSuchElementException once no enabled button is left
                button = self.selenium_driver.find_element_by_css_selector('button#gsc_bpf_more:enabled')
                button.click()
                self.wait_for_captchas()
                wait()
            except (NoSuchElementException, ElementNotVisibleException, InvalidElementStateException):
                break

        # parse a snapshot of the page source rather than querying the live browser
        tree = tree_from_string(self.selenium_driver.page_source)

        # scrape the list of papers
        papers = []
        for row in css_select(tree, 'tr.gsc_a_tr'):
            # look for strikeout (cross-out) over citation count, indicating that it's a dupe
            if Selector('td.gsc_a_c a.gsc_a_acm')(row):
                continue
            title = Selector('td.gsc_a_t a')(row)
            # the two gray lines under the title hold the authors and the venue
            authors_and_venue = css_select(row, 'div.gs_gray')
            author = authors_and_venue[0].text
            venue = authors_and_venue[1].text
            year = Selector('td.gsc_a_y')(row)
            citation_count = Selector('td.gsc_a_c a.gsc_a_ac')(row)
            papers.append(Paper(title, author, venue, year, citation_count))
        return papers

    def scrape_search_results(self, prof: Professor) -> List[Paper]:
        """Return papers found by an author search for *prof*.

        In this case, we are saving all articles, even if we are not sure that
        they match the author. We only search from STARTING_YEAR onwards, only
        look at the first 100 pages of results (1000 articles), and only keep
        papers that have at least one citation in Google Scholar (to save us
        some time). Working papers (no italicised venue), books (no quoted
        title) and citations without a recognisable year are left out.
        """
        papers = []
        # for each page of results
        for start in range(0, 1000, 10):
            # get search results page
            self.get_page(
                'https://scholar.google.com/scholar?start=%d&as_ylo=%s&q=author%%3A"%s"+%s' %
                (start, STARTING_YEAR, urllib.parse.quote(prof.simple_name()), prof.school))

            # We get the GS and WoS citation counts from the search results page
            # We get the full citation information by virtually clicking the "cite" link for each article
            tree = tree_from_string(self.selenium_driver.page_source)
            result_row_info = []
            for row in css_select(tree, 'div.gs_r div.gs_ri'):
                scholar_citations = None
                wos_citations = None
                citation_id = None
                for link in css_select(row, 'div.gs_fl a'):
                    if 'Cited by' in link.text:
                        scholar_citations = link.text.split(' ')[-1]
                    elif 'Web of Science:' in link.text:
                        wos_citations = link.text.split(': ')[-1]
                    elif 'Related articles' in link.text:
                        # the id is the second colon-separated piece of the link target
                        href_parts = (link.get('href') or '').split(":")
                        if len(href_parts) > 1:
                            citation_id = href_parts[1]
                # ignore papers with no citations (skip just this row, not the rest of the page)
                if not scholar_citations:
                    continue
                # without an id there is no citation page to fetch for this row
                if not citation_id:
                    continue
                result_row_info.append({'scholar_citations': scholar_citations,
                                        'wos_citations': wos_citations,
                                        'citation_id': citation_id})

            # stop when we've gone past the end of results
            if not result_row_info:
                break

            # fetch each citation and pick out the Chicago format because it has full firstnames
            # and includes all the author names (or at least more of them before using "et al.")
            # eg., https://scholar.google.com/scholar?q=info:J2Uvx00ui50J:scholar.google.com/&output=cite&scirp=1&hl=en
            for r in result_row_info:
                self.get_page('https://scholar.google.com/scholar?q=info:%s:scholar.google.com/'
                              '&output=cite&scirp=1&hl=en' % r['citation_id'])

                # the third row in the table contains the Chicago-style citation
                cells = self.selenium_driver.find_elements_by_css_selector('td')
                if len(cells) < 3:
                    # not the citation table we expected; skip instead of crashing
                    continue
                chicago_cell = cells[2]
                citation = chicago_cell.text

                year = get_year(citation)
                if not year:
                    continue

                # look for the first period that is not part of a middle initial;
                # otherwise, just take the first period as in:
                # Al-Najjar, Nabil I. "A bayesian framework for precautionary policies." (2013).
                match = re.search(r"\w{2}\. ", citation) or re.search(r"\. ", citation)
                if not match:
                    # no sentence break at all, so the author list cannot be delimited
                    continue
                authors = citation[:match.end()]

                # venue is in italics
                try:
                    venue = chicago_cell.find_element_by_css_selector('i').text
                except NoSuchElementException:
                    # this is probably a working paper
                    continue

                titles = re.findall(r"\"(.*)\"", citation)  # article titles are inside quotes
                if not titles:
                    # this is a book, which we don't record
                    continue

                papers.append(Paper(title=titles[0], authors=authors, venue=venue, year=year,
                                    scholar_citations=r['scholar_citations'],
                                    wos_citations=r['wos_citations'], id=r['citation_id']))
        return papers
if __name__ == '__main__':
    # for some reason, running this in the IDE requires me to set the geckodriver path
    geckodriver_path = '/usr/local/bin/geckodriver'
    with GoogleScholar(geckodriver_path) as scholar:
        # smoke test 1: scrape a known profile page
        profile_papers = scholar.scrape_profile('https://scholar.google.com/citations?user=a1ngrCIAAAAJ&hl=en')
        print(profile_papers)
        # smoke test 2: scrape the author search results for a known professor
        professor = Professor(school='Northwestern', name='Nabil Al-Najjar')
        search_papers = scholar.scrape_search_results(professor)
        print(search_papers)