-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathgoodreadsQuotes.py
60 lines (51 loc) · 1.62 KB
/
goodreadsQuotes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import argparse
import json
import re
import unicodedata
import urllib.parse
import urllib.request
from sys import argv
from typing import Optional

import bs4
from bs4 import BeautifulSoup
BASE_URL = "https://www.goodreads.com"
OPEN_QUOTE = '“'
CLOSE_QUOTE = '”'
# Compiled once for the whole run.  The page number may be followed by '&'
# or may end the URL, so no trailing delimiter is required.
NEXT_PAGE_RE = re.compile(r'page=([0-9]+)')


def build_search_path(term: str) -> str:
    """Return the site-relative URL of the first result page for *term*."""
    # Percent-encode spaces, '&', non-ASCII, ... so urlopen accepts the URL.
    # '+' and '%' are left alone so that terms a caller already encoded by
    # hand (e.g. "oscar+wilde") produce exactly the same request as before.
    return "/quotes/search?q=" + urllib.parse.quote(term, safe="+%") + "&"


def extract_quote_text(raw: str) -> Optional[str]:
    """Return the text between the first opening curly quote and its close.

    Both quotation marks are excluded from the result.  Returns None when
    either mark is missing, so one malformed entry cannot abort the run.
    """
    start = raw.find(OPEN_QUOTE)
    if start == -1:
        return None
    end = raw.find(CLOSE_QUOTE, start + 1)
    if end == -1:
        return None
    # The slice end is exclusive: stopping at `end` already drops the closing
    # mark (stopping at `end - 1` would also eat the quote's last character).
    return raw[start + 1:end]


def next_page_number(href: str) -> Optional[int]:
    """Return the number in the first ``page=N`` of *href*, or None."""
    match = NEXT_PAGE_RE.search(href)
    return int(match.group(1)) if match else None


def has_class(tag, fragment: str) -> bool:
    """Return True when *fragment* occurs in the tag's ``class`` attribute."""
    # Substring test on the stringified attribute; a tag without a class
    # stringifies to 'None' and therefore never matches.
    return fragment in str(tag.get('class'))


def collect_quotes(soup, quotes: dict) -> None:
    """Add every quote found on one parsed result page to *quotes*.

    Each ``div`` whose class contains 'quoteText' contributes one entry,
    keyed by the text of the div's first link.  Divs without a link or
    without a pair of curly quotation marks are skipped.
    """
    for div in soup('div'):
        if not has_class(div, 'quoteText'):
            continue
        # Exact type check on purpose: bs4's Comment/CData types subclass
        # NavigableString, and their text must not leak into the quote.
        raw = ''.join(
            str(node) for node in div.contents
            if type(node) is bs4.element.NavigableString
        )
        text = extract_quote_text(raw)
        link = div.a
        if text is None or link is None:
            continue
        # NOTE(review): presumably the link is the author or book; quotes
        # sharing the same link text overwrite each other - confirm intent.
        quotes[link.text] = text


def find_next_href(soup) -> Optional[str]:
    """Return the href of the first anchor classed 'next_page', or None."""
    for anchor in soup('a'):
        if has_class(anchor, 'next_page'):
            href = anchor.get('href')
            return None if href is None else str(href)
    return None


def scrape_quotes(search_term: str, num_pages: int) -> dict:
    """Fetch up to *num_pages* result pages and return the collected quotes.

    Stops early when a page has no usable "next page" link or when that
    link does not point past the current page.
    """
    quotes = {}
    path = build_search_path(search_term)
    page = 1
    for _ in range(num_pages):
        # The context manager closes the HTTP response once it is read.
        with urllib.request.urlopen(urllib.parse.urljoin(BASE_URL, path)) as response:
            html = response.read()
        soup = BeautifulSoup(html, 'html.parser')
        print('reading page' + str(page))
        collect_quotes(soup, quotes)

        next_href = find_next_href(soup)
        if next_href is None:
            # Last page: without this the same URL would be fetched again.
            break
        following = next_page_number(next_href)
        if following is None or following <= page:
            break
        path, page = next_href, following
    return quotes


def main() -> None:
    """Parse the command line, scrape the quotes and dump them as JSON."""
    parser = argparse.ArgumentParser()
    parser.add_argument("term", help="search term")
    parser.add_argument("pages", type=int, help="number of pages to be searched")
    args = parser.parse_args()

    quotes = scrape_quotes(args.term, args.pages)

    # Written only after scraping succeeded, and closed deterministically,
    # so a failed run no longer leaves an empty or half-written file behind.
    file_name = args.term + '_goodread_quotes_' + '.json'
    with open(file_name, 'w') as target:
        target.write(json.dumps(quotes))


if __name__ == "__main__":
    main()