# forked from radiolarian/AO3Scraper
# Retrieve fic ids from an AO3 search
# Ids are returned in the order the search lists them
# and saved to a csv for later use, e.g. to retrieve fic text
# This script is written for Python 2.7 (raw_input, print statement)
from bs4 import BeautifulSoup
import time
import requests
import csv
import sys
import datetime
# module-level state, filled in by get_user_params()
url = ""
num_requested_fic = 0
num_recorded_fic = 0
csv_name = ""
#
# Ask the user for:
# the url of a works listing page,
# e.g.
# https://archiveofourown.org/works?utf8=%E2%9C%93&work_search%5Bsort_column%5D=word_count&work_search%5Bother_tag_names%5D=&work_search%5Bquery%5D=&work_search%5Blanguage_id%5D=&work_search%5Bcomplete%5D=0&commit=Sort+and+Filter&tag_id=Harry+Potter+-+J*d*+K*d*+Rowling
# https://archiveofourown.org/tags/Harry%20Potter%20-%20J*d*%20K*d*%20Rowling/works?commit=Sort+and+Filter&page=2&utf8=%E2%9C%93&work_search%5Bcomplete%5D=0&work_search%5Blanguage_id%5D=&work_search%5Bother_tag_names%5D=&work_search%5Bquery%5D=&work_search%5Bsort_column%5D=word_count
# how many fics they want
# what to call the output csv
#
def get_user_params():
global url
global csv_name
global num_requested_fic
# user input the url
while (url == ""):
url = raw_input("What URL should we scrape? ")
# how many fic?
nqf = ""
while (nqf == ""):
        nqf = raw_input("How many fic do you want? (for all, enter 'a') ")
if nqf == "a":
num_requested_fic = -1
else:
num_requested_fic = int(nqf)
while (csv_name == ""):
csv_name = raw_input("What should we call the output csv? ")
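#
# example session (hypothetical values):
#   What URL should we scrape? https://archiveofourown.org/tags/Example/works
#   How many fic do you want? (for all, enter 'a') 100
#   What should we call the output csv? example_ids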
#
# navigate to a works listed page,
# then extract all work ids
#
def get_ids():
    # fetch the current works page; a failed or empty response simply yields no ids
    req = requests.get(url)
    soup = BeautifulSoup(req.text, "lxml")
# some responsiveness in the "UI"
sys.stdout.write('.')
sys.stdout.flush()
works = soup.find_all(class_="work blurb group")
ids = []
    for tag in works:
        # blurb ids look like "work_123456"; strip the "work_" prefix
        t = tag.get('id')
        t = t[5:]
        ids.append(t)
return ids
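#
# for reference, a work blurb in the search results looks roughly like
# (abridged, hypothetical id):
#   <li id="work_123456" class="work blurb group ...">
# so get_ids() would return "123456" for it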
#
# update the url to move to the next page
# note that if you go too far, ao3 won't error,
# but there will be no works listed
#
def update_url_to_next_page():
global url
key = "page="
start = url.find(key)
# there is already a page indicator in the url
    if (start != -1):
# find where in the url the page indicator starts and ends
page_start_index = start + len(key)
page_end_index = url.find("&", page_start_index)
# if it's in the middle of the url
        if (page_end_index != -1):
page = int(url[page_start_index:page_end_index]) + 1
url = url[:page_start_index] + str(page) + url[page_end_index:]
# if it's at the end of the url
else:
page = int(url[page_start_index:]) + 1
url = url[:page_start_index] + str(page)
# there is no page indicator, so we are on page 1
else:
# there are other modifiers
        if (url.find("?") != -1):
url = url + "&page=2"
        # there are no modifiers yet
else:
url = url + "?page=2"
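#
# for example (hypothetical urls):
#   ".../works?page=2&complete=0" -> ".../works?page=3&complete=0"
#   ".../works?complete=0"        -> ".../works?complete=0&page=2"
#   ".../works"                   -> ".../works?page=2"
# note: this is a plain substring search for "page=", so it assumes
# no other query parameter name ends in "page"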
#
# after every page, write the gathered ids
# to the csv, so a crash doesn't lose everything.
# include the url where it was found,
# so an interrupted search can be restarted
#
def write_ids_to_csv(ids):
global num_recorded_fic
    # open in binary mode, as the python 2 csv module recommends
    with open(csv_name + ".csv", 'ab') as csvfile:
        wr = csv.writer(csvfile, delimiter=',')
        for work_id in ids:
            if (not_finished()):
                wr.writerow([work_id, url])
                num_recorded_fic = num_recorded_fic + 1
else:
break
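#
# each csv row is "work_id,url", e.g. (hypothetical values):
#   123456,https://archiveofourown.org/tags/Example/works?page=3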
#
# if you want everything, you're not done
# otherwise compare recorded against requested.
# recorded doesn't update until it's actually written to the csv
#
def not_finished():
if (num_requested_fic == -1):
return True
    return num_recorded_fic < num_requested_fic
#
# include a text file with the starting url,
# and the number of requested fics
#
def make_readme():
with open(csv_name + "_readme.txt", "w") as text_file:
        text_file.write("url: " + url + "\n" +
                        "num_requested_fic: " + str(num_requested_fic) + "\n" +
                        "retrieved on: " + str(datetime.datetime.now()))
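#
# the readme will look something like (hypothetical values):
#   url: https://archiveofourown.org/tags/Example/works
#   num_requested_fic: 100
#   retrieved on: 2020-01-01 12:00:00.000000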
def main():
get_user_params()
make_readme()
while(not_finished()):
# 5 second delay between requests as per AO3's terms of service
time.sleep(5)
ids = get_ids()
# if the current page is empty, you've run out of fic
        if (len(ids) == 0):
break
write_ids_to_csv(ids)
update_url_to_next_page()
print "That's all, folks"
main()
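#
# usage: python2 ao3_work_ids.py
# dependencies: requests, beautifulsoup4, lxml
#   e.g. pip install requests beautifulsoup4 lxml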