-
Notifications
You must be signed in to change notification settings - Fork 0
/
main2.py
131 lines (110 loc) · 4 KB
/
main2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import requests
import multiprocessing
from bs4 import BeautifulSoup
from xml.etree import ElementTree
from xml.dom import minidom
from xml.etree.ElementTree import Element, SubElement
def prettify(elem, indent=" "):
    """
    Return a pretty-printed XML string for the Element.

    :param elem: the ``xml.etree.ElementTree.Element`` to serialise
    :param indent: string used for one level of nesting; defaults to a
        single space, which was previously hard-coded
    :return: the indented XML document as a ``str``, including the XML
        declaration line emitted by minidom
    """
    # Serialise to UTF-8 bytes first, then let minidom re-parse the document
    # and write it back out with line breaks and indentation.
    rough_string = ElementTree.tostring(elem, 'utf-8')
    reparsed = minidom.parseString(rough_string)
    return reparsed.toprettyxml(indent=indent)
def get_person_info(person_id, timeout=10):
    """
    For now, gets the birth date of the person.

    :param person_id: site-relative path of the person's IMDB page (the
        ``href`` of a cast link); it is appended to ``https://www.imdb.com``
    :param timeout: seconds to wait for the HTTP response before
        ``requests`` gives up and raises
    :return: the ``datetime`` attribute of the ``<time>`` tag inside the
        ``name-born-info`` div (the birthday, documented as yyyy-mm-dd), or
        None when the div, the tag or the attribute is missing
    """
    url = "https://www.imdb.com" + person_id
    # Without a timeout a single stalled connection blocks the whole scrape.
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.content, 'html.parser')

    born_info = soup.find("div", id="name-born-info")
    if born_info is None:
        return None

    time_tag = born_info.find("time")
    if time_tag is None:
        return None

    # .get() yields None for a <time> tag without a datetime attribute;
    # indexing .attrs directly would raise a KeyError that was never caught.
    return time_tag.attrs.get("datetime")
def main():
    """
    Scrape one IMDB list page and export its contents to ``export2.xml``.

    For every list item the title, movie ID, year and genres are read, plus
    the cast links (the first link is treated as the director, the rest as
    actors). The resulting ``<imdb>`` document holds one ``<movie>`` per
    item with ``director-ref`` / ``actor-ref`` children, and one
    ``<director>`` / ``<actor>`` element per distinct person ID carrying the
    name and date of birth.

    :return: None; the result is written to ``export2.xml`` in the current
        working directory
    """
    # define url and make request
    url = 'https://www.imdb.com/list/ls093785287/'
    # Timeout so a stalled connection cannot hang the script indefinitely.
    page = requests.get(url, timeout=10)

    # scrape the page and get all the separate movie divs
    soup = BeautifulSoup(page.content, 'html.parser')
    results = soup.find_all(class_='lister-item mode-detail')

    # set up xml
    imdb = Element('imdb')
    # Sets give O(1) "already exported?" checks for person IDs.
    directors_added = set()
    actors_added = set()

    for i, result in enumerate(results):
        print("iteration %s" % i)

        # PARSE HTML
        # get the HTML for the displayed attributes on page
        header = result.find("h3", class_="lister-item-header")
        # there are multiple p's with that class
        p_elements = result.find_all("p", class_="text-muted text-small")
        subheader = p_elements[0]
        cast = p_elements[1]

        # get title, movie ID and year from header
        title_elem = header.find("a")
        title = title_elem.text
        movie_id = title_elem.attrs['href']  # movie ID from the href
        year = header.find("span", class_="lister-item-year").text
        try:
            # get only the year number in brackets, sometimes there are other vals
            year = year[year.index("(") + 1:year.index(")")]
        except ValueError:
            print(year)  # sometimes "year" is just an empty string...

        # get genres from subheader
        genres = subheader.find("span", class_="genre").text
        # removes trailing whitespace and line break, split by commas
        genres_array = genres.strip().replace("\n", "").split(", ")

        # get director and stars from the cast under description
        cast_elements = cast.find_all("a")
        director_link = cast_elements[0]  # director always first in cast
        director_id = director_link.attrs['href']

        # PUT DATA IN XML
        # default movie data
        xml_movie = SubElement(imdb, "movie", {"id": movie_id})
        SubElement(xml_movie, "year").text = year
        SubElement(xml_movie, "title").text = title

        # genre data
        xml_genres = SubElement(xml_movie, "genres")
        for genre in genres_array:
            SubElement(xml_genres, "genre").text = genre

        # director data
        SubElement(xml_movie, "director-ref", {"id": director_id})
        if director_id not in directors_added:
            directors_added.add(director_id)
            xml_director = SubElement(imdb, "director", {"id": director_id})
            SubElement(xml_director, "name").text = director_link.text
            # The date of birth costs one HTTP request, so it is fetched only
            # for people that have not been exported yet.
            SubElement(xml_director, "dob").text = get_person_info(director_id)

        # actor data
        # index for array: everything excl. first element (director)
        for actor in cast_elements[1:]:
            actor_id = actor.attrs['href']
            SubElement(xml_movie, "actor-ref", {"id": actor_id})
            if actor_id not in actors_added:
                actors_added.add(actor_id)
                xml_actor = SubElement(imdb, "actor", {"id": actor_id})
                SubElement(xml_actor, "name").text = actor.text
                # Same as for directors: one lookup per distinct person.
                SubElement(xml_actor, "dob").text = get_person_info(actor_id)

    # write data to XML
    # The prolog written by prettify() declares no encoding, which XML
    # readers interpret as UTF-8, so the file is written as UTF-8 explicitly.
    # The context manager guarantees the file is flushed and closed.
    with open("export2.xml", "w", encoding="utf-8") as file:
        file.write(prettify(imdb))
# Start the scrape only when this file is executed as a script, so that
# importing the module does not trigger any network requests.
if __name__ == "__main__":
    main()