-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.py
205 lines (175 loc) · 6.67 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
#!/usr/bin/python
"""
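Main script: scrape professor data from several schools, download their CVs
and Google Scholar paper lists, and store the results in a spreadsheet.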
"""
import os
import pprint
import random
import subprocess
import time

import csv_professor_sheet
from google_scholar import GoogleScholar, get_year
from professor import *
from professor_scraper import save_paper_list
from web_util import get_bytes
from school.kellogg import scrape_kellogg
from school.harvard import scrape_harvard
from school.uchicago import scrape_uchicago
from school.mit import scrape_mit
from school.stanford import scrape_stanford
from school.upenn import scrape_upenn
from school.berkeley import scrape_berkeley
from school.dartmouth import scrape_dartmouth
from school.yale import scrape_yale
from school.columbia import scrape_columbia
from csv_professor_sheet import SpreadSheet
# Directory (relative to the working directory) that holds the downloaded
# CV PDFs and their text conversions.
CV_PATH = 'output/CVs'

# Module-level pretty-printer using a 4-space indent.
pp = pprint.PrettyPrinter(indent=4)
def download_cv(prof):
    """Download a professor's CV into ``CV_PATH`` as ``<slug>.pdf``.

    Does nothing when ``prof.cv_url`` is None.  Otherwise waits one second,
    creates ``CV_PATH`` if needed, fetches the URL with ``get_bytes`` and
    writes the bytes to disk.

    :param prof: object exposing a ``cv_url`` attribute and a ``slug()`` method
    :return: None
    """
    if prof.cv_url is None:
        return
    # Pause before every download (presumably to be polite to the remote
    # server -- confirm).
    time.sleep(1)
    slug = prof.slug()
    print("downloading CV for " + slug)
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(CV_PATH, exist_ok=True)
    b = get_bytes(prof.cv_url)
    with open(os.path.join(CV_PATH, slug + ".pdf"), 'wb') as f:
        f.write(b)
def convert_CVs_to_text():
    """Convert every ``.pdf`` file in ``CV_PATH`` to a sibling ``.txt`` file.

    Each PDF is passed to the ``pdf2txt.py`` command-line tool and the tool's
    standard output is written to ``<slug>.txt`` in the same directory.

    :return: None
    :raises subprocess.CalledProcessError: if ``pdf2txt.py`` exits non-zero
    """
    suffix = ".pdf"
    for file_name in os.listdir(CV_PATH):
        if not file_name.endswith(suffix):
            continue
        # Strip only the trailing extension; str.replace would also remove
        # any ".pdf" occurring in the middle of the name.
        slug = file_name[:-len(suffix)]
        print('reading ' + file_name)
        # The commandline version of pdf2txt actually works better than the Python slate package.
        # Slate drops a lot of whitespace and newlines.
        cv = subprocess.check_output(['pdf2txt.py', os.path.join(CV_PATH, file_name)])
        # check_output returns bytes, so the output file must be opened in
        # binary mode (text mode raises TypeError on Python 3).
        with open(os.path.join(CV_PATH, slug + '.txt'), 'wb') as f:
            f.write(cv)
def load_CVs():
    """Read every ``.txt`` file found in ``CV_PATH``.

    :return: a dictionary mapping name slugs to CV strings
    """
    cv_by_slug = {}
    text_files = [name for name in os.listdir(CV_PATH) if name.endswith(".txt")]
    for name in text_files:
        with open(CV_PATH + '/' + name, 'r') as handle:
            contents = handle.read()
        # The slug is the file name with the ".txt" text removed.
        cv_by_slug[name.replace('.txt', '')] = contents
    return cv_by_slug
def show_editorial_service(all_CVs):
    """Print the lines of each CV that mention "editor".

    For every entry a blank line and the entry's name are printed, followed
    by each lower-cased CV line containing the substring ``"editor"``.

    :param all_CVs: dictionary mapping a name to that CV's text
    :return: None
    """
    # dict.iteritems() was removed in Python 3; items() is the equivalent.
    for name, cv in all_CVs.items():
        print()
        print(name)
        for line in cv.lower().splitlines():
            if "editor" in line:
                print(line)
def get_missing_google_scholar_pages(spreadsheet, school=None):
    """Look up a Google Scholar URL for every professor that lacks one.

    Professors are read from ``spreadsheet`` and visited in random order.
    Each one whose ``google_scholar_url`` is None gets the result of
    ``find_google_scholar_page`` and is saved back to the spreadsheet.

    :param spreadsheet: object providing ``read_profs()`` and ``save_prof(prof)``
    :param school: when not None, only professors whose ``school`` equals
        this value are processed
    :return: None
    """
    candidates = spreadsheet.read_profs()
    random.shuffle(candidates)
    with GoogleScholar() as scholar:
        for prof in candidates:
            other_school = school is not None and prof.school != school
            if other_school or prof.google_scholar_url is not None:
                continue
            print(prof.name)
            prof.google_scholar_url = scholar.find_google_scholar_page(prof)
            spreadsheet.save_prof(prof)
def download_scholar_profiles(spreadsheet, school=None):
    """Scrape and save the Scholar profile of each professor that has one.

    For every professor with a truthy ``google_scholar_url`` the profile is
    scraped and handed to ``save_paper_list`` under the ``'scholar_profile'``
    label.

    :param spreadsheet: object providing ``read_profs()``
    :param school: when truthy, only professors whose ``school`` equals this
        value are processed
    :return: None
    """
    everyone = spreadsheet.read_profs()
    with GoogleScholar() as scholar:
        for prof in everyone:
            if school and prof.school != school:
                continue
            if not prof.google_scholar_url:
                continue
            print(prof.slug())
            papers = scholar.scrape_profile(prof.google_scholar_url)
            save_paper_list('scholar_profile', prof, papers)
def download_scholar_search_results(spreadsheet, school=None):
    """Scrape and save Scholar search results for professors without a profile.

    Every selected professor's slug is printed; for those whose
    ``google_scholar_url`` is falsy, the search results are scraped and
    handed to ``save_paper_list`` under the ``'scholar_search'`` label.

    :param spreadsheet: object providing ``read_profs()``
    :param school: when truthy, only professors whose ``school`` equals this
        value are processed
    :return: None
    """
    everyone = spreadsheet.read_profs()
    with GoogleScholar() as scholar:
        for prof in everyone:
            if school and prof.school != school:
                continue
            print(prof.slug())
            if prof.google_scholar_url:
                continue
            found = scholar.scrape_search_results(prof)
            save_paper_list('scholar_search', prof, found)
def _preview_cv(cv_url):
    """Fetch ``cv_url`` with curl and pipe it into the macOS Preview app."""
    # !!!: this works on Mac only
    # The pipeline is built from argument lists rather than a shell string so
    # that a scraped URL containing shell metacharacters cannot run commands.
    curl = subprocess.Popen(['curl', cv_url], stdout=subprocess.PIPE)
    try:
        subprocess.check_call(['open', '-f', '-a', 'Preview'], stdin=curl.stdout)
    finally:
        curl.stdout.close()
        curl.wait()


def ask_for_graduation_years(spreadsheet, profs):
    """Interactively ask for the graduation school and year of professors.

    For each professor whose ``graduation_year`` is None, the CV (if a URL is
    known) is shown, then the school and year are read from standard input.
    When a year is entered, it is stored together with the school and the
    professor is saved; an empty or non-numeric year leaves the professor
    unchanged.

    :param spreadsheet: object providing ``save_prof(prof)``
    :param profs: iterable of professor objects
    :return: None
    :raises subprocess.CalledProcessError: if the ``open`` command fails
    """
    for p in profs:
        if p.graduation_year is not None:
            continue
        # show the CV (skipped when there is no URL to fetch)
        if p.cv_url:
            _preview_cv(p.cv_url)
        # ask for the graduation year
        print(p.name)
        school = input("Graduation School? ")
        year_str = input("Graduation Year? ").strip()
        if not year_str:
            continue
        try:
            year = int(year_str)
        except ValueError:
            # A typo should not abort the whole interactive session.
            print("not a valid year: %r (skipping)" % year_str)
            continue
        p.graduation_year = year
        p.graduation_school = school
        spreadsheet.save_prof(p)
def load_mturk_results(csv_filename):
    """Record majority-vote graduation years from a results CSV.

    Each row contributes one answer: the slug is the base name (text before
    the first dot) of the last ``/``-separated segment of column 3, and the
    year is ``get_year`` of column 6, falling back to column 5.  A year is
    recorded for a slug only when more than half of that slug's answers
    agree.  Professors whose stored ``graduation_year`` differs from the
    recorded year are updated and saved, one second apart.

    :param csv_filename: path of the CSV file to read
    :return: None
    """
    observed_answers = {}
    with open(csv_filename, 'r') as csvfile:
        # NOTE(review): the arguments match the standard library's csv.reader;
        # confirm that csv_professor_sheet really provides ``reader``.
        for row in csv_professor_sheet.reader(csvfile, delimiter=',', quotechar='"'):
            if len(row) < 7:
                # Blank or truncated row: columns 3, 5 and 6 are required.
                continue
            name_slug = row[3].split('/')[-1].split('.')[0]
            phd_year = get_year(row[6]) or get_year(row[5])
            if not phd_year:
                continue
            observed_answers.setdefault(name_slug, []).append(phd_year)

    # majority vote
    year_to_record = {}
    for slug, answers in observed_answers.items():
        # if an answer is in the majority, record it
        for a in answers:
            if answers.count(a) > len(answers) * 0.5:
                year_to_record[slug] = a
                # A strict majority is unique, so stop at the first hit.
                break

    # save results
    ss = SpreadSheet()
    for p in ss.read_profs():
        slug = p.slug()
        if slug not in year_to_record or year_to_record[slug] == p.graduation_year:
            continue
        p.graduation_year = year_to_record[slug]
        time.sleep(1)
        print("%s: saving graduation year %s" % (slug, p.graduation_year))
        # Save through the sheet the professors were read from; the global
        # ``spreadsheet`` only exists when this file runs as a script.
        ss.save_prof(p)
def scrape_all_schools():
    """Run every school scraper in turn and concatenate their results.

    :return: a single list with the items returned by each scraper, in the
        order the scrapers are listed below
    """
    scrapers = (
        scrape_kellogg,
        scrape_harvard,
        scrape_uchicago,
        scrape_mit,
        scrape_stanford,
        scrape_upenn,
        scrape_berkeley,
        scrape_dartmouth,
        scrape_yale,
        scrape_columbia,
    )
    collected = []
    for scrape in scrapers:
        collected.extend(scrape())
    return collected
def rescrape(spreadsheet, school_scraper):
    """Re-run one school's scraper and merge in the data already stored.

    Every freshly scraped professor is merged (``new.merge(old)``) with each
    stored professor that has the same slug; the fresh list is then passed
    to ``spreadsheet.update_profs``.

    :param spreadsheet: object providing ``read_profs()`` and ``update_profs(profs)``
    :param school_scraper: zero-argument callable returning the scraped professors
    :return: None
    """
    stored = spreadsheet.read_profs()
    fresh = school_scraper()
    # Lazily pair each fresh professor with every stored one sharing its slug.
    same_slug_pairs = (
        (new, old)
        for new in fresh
        for old in stored
        if old.slug() == new.slug()
    )
    for new, old in same_slug_pairs:
        print("merging new data for " + new.slug())
        new.merge(old)
    spreadsheet.update_profs(fresh)
# Script entry point: optionally re-scrape everything, then report how many
# professors the spreadsheet holds.
if __name__ == '__main__':
    spreadsheet = SpreadSheet()
    # Hard-coded switch: the reload branch below is skipped unless this line
    # is edited to True.
    do_reload = False
    # NOTE(review): the original indentation was lost; the reload branch is
    # assumed to run through download_scholar_search_results -- confirm.
    if do_reload:
        profs = scrape_all_schools()
        get_missing_google_scholar_pages(spreadsheet)
        for p in profs:
            # look for CV and Scholar links on any personal website
            p.parse_personal_website()
            download_cv(p)
        convert_CVs_to_text()
        spreadsheet.append_profs(profs)
        get_missing_google_scholar_pages(spreadsheet)
        download_scholar_profiles(spreadsheet)
        download_scholar_search_results(spreadsheet)
    profs = spreadsheet.read_profs()
    # Loaded here but not used again in the code visible in this file.
    all_CVs = load_CVs()
    print("Total of %d professors found" % len(profs))