-
Notifications
You must be signed in to change notification settings - Fork 2
/
demo_scraper.py
93 lines (75 loc) · 3.76 KB
/
demo_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
###########################################################################
# (C) Vrije Universiteit, Amsterdam (the Netherlands) #
# #
# This file is part of AmCAT - The Amsterdam Content Analysis Toolkit #
# #
# AmCAT is free software: you can redistribute it and/or modify it under #
# the terms of the GNU Lesser General Public License as published by the #
# Free Software Foundation, either version 3 of the License, or (at your #
# option) any later version. #
# #
# AmCAT is distributed in the hope that it will be useful, but WITHOUT #
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or #
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public #
# License for more details. #
# #
# You should have received a copy of the GNU Lesser General Public #
# License along with AmCAT. If not, see <http://www.gnu.org/licenses/>. #
###########################################################################
"""
Demo scraper for AmCAT
Scrapes all State of the Union addresses and uploads them into AmCAT.
Requirements:
- requests (can be installed using pip)
- lxml (can be installed using pip)
- amcatclient (this repository)
An English locale is needed to parse the dates. If the locale is not found,
you can install it, e.g. on Ubuntu: sudo locale-gen en_US.utf8
"""
# Import modules to get and parse HTML pages
import requests
from lxml import html
# The dates on the scraped pages are written in English (e.g. "December 3, 2002"),
# so the month name has to be parsed under an English locale. Set the locale
# explicitly before any date is parsed; you can skip this step on an
# English-language OS.
# NOTE(review): the "en_US.utf8" locale must be installed on the system
# (see the module docstring for how to generate it on Ubuntu).
import datetime
import locale
locale.setlocale(locale.LC_ALL, "en_US.utf8")
# Import amcatclient
from amcatclient import AmcatAPI
# Connect to AmCAT.
# Note: if you create a .amcatauth file in your home dir, there is no
# need to specify username and password.
conn = AmcatAPI("http://amcat.vu.nl", "<username>", "<password>")

# Create a new articleset to add the articles to.
# You can also just set 'setid' to add to an existing set.
PROJECT_ID = 1
# Index page listing all State of the Union addresses; also recorded as the
# provenance of the new set, so it is defined only once.
INDEX_URL = "http://www.presidency.ucsb.edu/sou.php"
aset = conn.create_set(project=PROJECT_ID, name="State of the Union",
                       provenance="Scraped from " + INDEX_URL)
setid = aset["id"]

# Get the main page and iterate over all links in a 'doclist'.
page = requests.get(INDEX_URL)
# Fail early with a clear HTTP error instead of parsing an error page.
page.raise_for_status()
tree = html.fromstring(page.text)
# Resolve relative hrefs against the index URL so every link can be fetched.
tree.make_links_absolute(INDEX_URL)

for a in tree.cssselect("td.doclist a"):
    # Skip empty links and the 'jump to menu' link.
    label = a.text_content().strip()
    if not label or label == "jump to menu":
        continue
    # Skip anchors that do not point anywhere (no href attribute).
    href = a.get("href")
    if not href:
        continue

    # Get the child page - this is a single state of the union.
    # Separate names are used so the index page/tree are not overwritten
    # while we are still iterating over the links found in it.
    doc_page = requests.get(href)
    doc_page.raise_for_status()
    doc_tree = html.fromstring(doc_page.text)

    # Get the date and parse it; surrounding whitespace would break strptime.
    date_text = doc_tree.cssselect(".docdate")[0].text_content().strip()
    date = datetime.datetime.strptime(date_text, "%B %d, %Y")

    # Get the title, which starts with <president>:
    title = doc_tree.cssselect("title")[0].text_content().strip()
    president = title.split(":")[0].strip()

    # Get all paragraphs in the displaytext and join together.
    ps = doc_tree.cssselect(".displaytext p")
    text = "\n\n".join(p.text_content() for p in ps)

    # Build the article dictionary and add it to AmCAT.
    art = {
        "headline": president,
        "byline": title,
        "medium": "Speeches",
        "text": text,
        "date": date.isoformat(),
    }
    # Upload to the same project the set was created in (PROJECT_ID).
    articles = conn.create_articles(project=PROJECT_ID, articleset=setid,
                                    json_data=[art])