"""
This class is responsible of scrapping a website and extracting all
the differents links from it.
"""
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
import requests


class WebScrapper:
    def __init__(self):
        """
        Initializes the scraper state:
        - nbr_iterations = number of times to crawl the website (default value: 1)
        - nodes, edges = links stored as the nodes and edges of the graph
        - broken_links = links whose request failed
        - show_broken_links = whether to check for broken links (default: False)
        """
        self.nbr_iterations = 1
        # variable to store the links as nodes for the graph
        self.nodes = []
        # variable to store the links as edges for the graph
        self.edges = []
        # variable to store the broken links
        self.broken_links = []
        # variable for knowing whether to check for broken links (default=False)
        self.show_broken_links = False

    def set_nbr_iterations(self, n):
        """
        Sets the number of iterations.
        """
        self.nbr_iterations = n

    def getNodes(self):
        """
        Returns the nodes.
        """
        return self.nodes

    def getEdges(self):
        """
        Returns the edges.
        """
        return self.edges

    def setBrokenLink(self, state):
        """
        Param: state = True/False
        - True: check every link and keep broken ones out of the graph
          (they are recorded in broken_links instead)
        - False: add links to the graph without checking them
        """
        self.show_broken_links = state

    def is_valid(self, url):
        """
        input:  - url = URL/link of a website
        return: - True if the URL contains a protocol and a domain
        """
        parsed = urlparse(url)
        return bool(parsed.scheme) and bool(parsed.netloc)
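
    # Illustrative examples, assuming these inputs (not from the original file):
    #   is_valid('https://example.com/page') -> True  (scheme and netloc present)
    #   is_valid('/about')                   -> False (relative link: no scheme or domain)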

    def scrapper(self, url):
        """
        This function scrapes a webpage, extracts the links (href) and
        stores them in the variables nodes and edges.
        input: - url = URL/link of a website
        """
        # domain name of the 'url' without the protocol
        domain_name = urlparse(url).netloc
        # html content of the webpage
        soup = BeautifulSoup(requests.get(url).content, 'html.parser')
        for a_tag in soup.find_all('a'):
            # href that is in the <a> tag
            href = a_tag.get('href')
            # skips empty or missing hrefs
            if not href:
                continue
            # joins the href to the url
            href = urljoin(url, href)
            # parses the href
            parsed_href = urlparse(href)
            # cleans the href: keeps scheme, domain and path, drops query and fragment
            href = parsed_href.scheme + '://' + parsed_href.netloc + parsed_href.path
            if self.is_valid(href) and domain_name in href:
                href = href.replace(' ', '').lower()
                if self.show_broken_links:
                    # checks that the href (url) is not broken: a Response is
                    # truthy only for non-error statuses, and a request that
                    # fails outright (e.g. a dead domain) raises an exception
                    try:
                        reachable = bool(requests.get(href))
                    except requests.RequestException:
                        reachable = False
                    if reachable:
                        self.edges.append((url, href))
                        if href not in self.nodes:
                            self.nodes.append(href)
                    elif href not in self.broken_links:
                        self.broken_links.append(href)
                else:
                    self.edges.append((url, href))
                    if href not in self.nodes:
                        self.nodes.append(href)
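
    # Example of the href cleaning above, assuming these values (illustration only):
    #   urljoin('https://example.com/a/', 'b?q=1#top') gives 'https://example.com/a/b?q=1#top',
    #   and keeping only scheme + netloc + path then gives 'https://example.com/a/b'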

    def crawl(self, url):
        """
        This function crawls a website and scrapes all the links from it.
        input: - url = URL/link of a website
        """
        # variable to keep track of the current number of iterations
        curr_iteration = 0
        # adds the url to the list nodes
        self.nodes.append(url)
        # scrapper() appends new links to self.nodes while this loop runs, so
        # newly discovered links are visited too; the loop stops after
        # nbr_iterations links have been scraped
        for link in self.nodes:
            curr_iteration += 1
            if curr_iteration > self.nbr_iterations:
                break
            self.scrapper(link)

    def writeNodesEdgesToAFile(self):
        """
        This function writes the nodes to data/nodes.txt, the edges to
        data/edges.txt and the broken links to data/broken-links.txt.
        """
        with open('data/nodes.txt', 'w') as nodes_file, \
             open('data/edges.txt', 'w') as edges_file, \
             open('data/broken-links.txt', 'w') as brokenLinks_file:
            count = 0
            for node in self.nodes:
                count += 1
                nodes_file.write('[' + str(count) + '] -> ' + node + '\n')
            nodes_file.write('===================================\n')
            nodes_file.write('Total number of nodes: ' + str(count))
            count = 0
            for edge in self.edges:
                count += 1
                edges_file.write('[' + str(count) + '] -> (' + edge[0] + ', ' + edge[1] + ')\n')
            edges_file.write('===================================\n')
            edges_file.write('Total number of edges: ' + str(count))
            count = 0
            for bl in self.broken_links:
                count += 1
                brokenLinks_file.write('[' + str(count) + '] -> ' + bl + '\n')
            brokenLinks_file.write('===================================\n')
            brokenLinks_file.write('Total number of broken links: ' + str(count))
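

# Minimal usage sketch, assuming a 'data/' directory exists next to this
# script; 'https://example.com' is a placeholder URL.
if __name__ == '__main__':
    scrapper = WebScrapper()
    scrapper.set_nbr_iterations(2)   # scrape the start page plus one discovered link
    scrapper.setBrokenLink(True)     # check every link and record the broken ones
    scrapper.crawl('https://example.com')  # placeholder URL
    print('nodes:', len(scrapper.getNodes()))
    print('edges:', len(scrapper.getEdges()))
    scrapper.writeNodesEdgesToAFile()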