-
Notifications
You must be signed in to change notification settings - Fork 5
/
cnn-2.py
112 lines (98 loc) · 3.27 KB
/
cnn-2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import sys
import importlib
import scrapelib
from bs4 import BeautifulSoup
from dateutil import parser
import re
import csv
from datetime import date, timedelta
# CSV column schema for the rows produced by this scraper; passed to
# csv.DictWriter as fieldnames below.
columns = [
    'url', 'channel.name', 'program.name', 'uid', 'duration',
    'year', 'month', 'date', 'time', 'timezone',
    'path', 'wordcount', 'subhead', 'text',
]
def extract_transcript(html):
    """Parse one transcripts.cnn.com transcript page into a CSV row dict.

    Args:
        html: raw HTML of a transcript page.

    Returns:
        dict keyed by a subset of ``columns`` (channel.name, program.name,
        year/month/date/time, timezone, subhead, text, wordcount).  On pages
        missing the expected markup a partial (possibly empty) dict is
        returned — best-effort, mirroring the original behavior; DictWriter
        fills absent fields with its restval.
    """
    soup = BeautifulSoup(html, "html.parser")
    cnnTransStoryHead = soup.find('p', {"class": "cnnTransStoryHead"})
    cnnTransSubHead = soup.find('p', {"class": "cnnTransSubHead"})

    aired = None      # datetime parsed from the "Aired ..." byline
    tz = None         # trailing timezone abbreviation, e.g. "ET"
    paragraphs = []   # transcript body paragraphs, in page order

    for p in soup.find_all('p', {"class": "cnnBodyText"}):
        text = p.text
        m = re.match(
            r"Aired (.*\s+\d{1,2},\s+\d{4}\s+-\s+\d{2}:\d{2}).*([A-Z]{2,3})$", text)
        if m:
            raw_date = m.group(1)
            tz = m.group(2)
            try:
                aired = parser.parse(raw_date)
            except Exception as e:
                print(e)
                # Fall back to the date portion before the "- HH:MM" suffix.
                aired = parser.parse(raw_date.split('-')[0])
            print(aired, tz)
        elif text.startswith("THIS IS A RUSH TRANSCRIPT."):
            pass  # boilerplate disclaimer, not transcript content
        else:
            paragraphs.append(text)

    data = {}
    try:
        # BUG FIX: the original overwrote `content` each iteration, so only
        # the LAST paragraph survived; join every body paragraph instead.
        content = "\n".join(paragraphs)
        data['channel.name'] = 'WWW'
        data['program.name'] = cnnTransStoryHead.text
        data['year'] = aired.year
        data['month'] = aired.month
        data['date'] = aired.day
        data['time'] = "%02d:%02d" % (aired.hour, aired.minute)
        data['timezone'] = tz
        data['subhead'] = cnnTransSubHead.text
        data['text'] = content
        # Populate the previously-unused 'wordcount' column.
        data['wordcount'] = len(content.split())
    except Exception as e:
        # A missing headline/byline aborts the remaining assignments; the
        # partially-filled dict is still returned (best-effort).
        print(e)
    return data
if __name__ == "__main__":
    # Append rows to cnn.csv.  newline="" is required by the csv module;
    # BUG FIX: write the header only when starting a fresh/empty file, so
    # re-running the scraper no longer injects duplicate header rows.
    with open("cnn.csv", "a", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=columns, dialect='excel')
        if f.tell() == 0:
            writer.writeheader()

        # Throttled scraper: at most 60 HTTP requests per minute.
        s = scrapelib.Scraper(requests_per_minute=60)
        startdate = date(2014, 6, 18)

        # Walk the daily index pages forward one day at a time, indefinitely.
        while True:
            print(startdate.year, startdate.month, startdate.day)
            # Daily index, e.g. /TRANSCRIPTS/2014.06.18.html
            res = s.get('http://transcripts.cnn.com/TRANSCRIPTS/%04d.%02d.%02d.html' %
                        (startdate.year, startdate.month, startdate.day))
            soup = BeautifulSoup(res.text, "html.parser")
            print(soup.title)
            for a in soup.find_all("div", {"class": "cnnTransDate"}):
                print(a)
                # Transcript links for this date live in the sibling div
                # immediately after the date header.
                soup2 = a.find_next_siblings('div')[0]
                for link in soup2.find_all("a"):
                    url = 'http://transcripts.cnn.com' + link['href']
                    print(url)
                    try:
                        res2 = s.get(url)
                        data = extract_transcript(res2.text)
                        data['url'] = url
                        writer.writerow(data)
                    except Exception as e:
                        # Best-effort: log and move on to the next link.
                        # BUG FIX: the original printed res2.response, which
                        # does not exist (and res2 may be unbound if the GET
                        # itself failed), raising a second exception here.
                        print(e)
                        print(url)
                    print("Inner Break")
            print("Outer Break")
            startdate += timedelta(days=1)