crawler.py
# the crawler itself only needs the requests library
import requests
# import the indexing helpers (clean_web_page, remove_special_characters, indexing)
from index import *


# fetch the HTML of a page
def get_html(link):
    # request headers that mimic a regular browser
    headers = {
        "accept": "application/json, text/javascript, */*; q=0.01",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "en-US,en;q=0.9,fa-IR;q=0.8,fa;q=0.7,de;q=0.6",
        "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
        "dnt": "1",
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) snap Chromium/74.0.3729.169 Chrome/74.0.3729.169 Safari/537.36",
    }
    try:
        # fetch the page and return its body as text
        page = requests.get(link, headers=headers)
        return page.text
    except requests.exceptions.RequestException as ex:
        print("connection error:", ex)
        # return a near-empty string so callers can keep going
        return " "


# return the union of two lists, with duplicates removed
def union(lst1, lst2):
    final_list = list(set(lst1) | set(lst2))
    return final_list
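
# example (illustrative): union([1, 2], [2, 3]) yields a list containing
# 1, 2 and 3; the order is arbitrary because sets are unordered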


# append new links to the to_crawl list
def append_list(to_crawl, links):
    for link in links:
        to_crawl.append(link)


# return the next absolute link found in the passed page
def get_next_link(page):
    # find the position of the next 'a href="http' pattern in the page
    start_link = page.find('a href="http')
    # if no link is found, return None and 0
    if start_link == -1:
        return None, 0
    # position of the opening double quote
    start_quote = page.find('"', start_link)
    # position of the closing double quote
    end_quote = page.find('"', start_quote + 1)
    # extract the URL, which sits between the opening and closing quotes
    url = page[start_quote + 1:end_quote]
    # also return the closing-quote position so the caller knows
    # where to resume scanning next time
    return url, end_quote
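
# example (illustrative): for the fragment
#   '<p><a href="http://example.com/page">link</a></p>'
# get_next_link returns 'http://example.com/page' together with the index
# of the URL's closing double quote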


# collect all the links found in a page
def get_all_links(link):
    # fetch the page
    page = get_html(link)
    # strip all the HTML tags from the page
    clean_page = clean_web_page(page)
    # remove the special characters
    words = remove_special_characters(clean_page)
    # index the content of the page
    indexing(link, words)
    # list of links collected from this page
    links = []
    # scan the page until no more links are found
    while True:
        # get the next link
        url, end_pos = get_next_link(page)
        # if a link was found
        if url:
            # record it only if it is new
            if url not in links:
                links.append(url)
            # continue scanning from the rest of the page
            page = page[end_pos:]
        # if no link was found, stop scanning
        else:
            break
    # return all the links found in the page
    return links
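
# note: this manual scan only picks up absolute links (those starting with
# "http"); relative links are skipped. An HTML parser such as BeautifulSoup
# would be more robust, but plain string searching keeps the crawler
# dependency-free beyond requests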


# the crawl frontier and the list of already-crawled links
to_crawl = []
crawled = []


# crawl the web starting from a seed link
def crawl_web(seed):
    global to_crawl
    global crawled
    # add the given link to the to_crawl list
    to_crawl.append(seed)
    # keep going while there are links to crawl, capped at 500 pages
    while len(to_crawl) != 0 and len(crawled) < 500:
        # pop a link from the to_crawl list
        link = to_crawl.pop()
        # only crawl links that were not crawled before
        if link not in crawled:
            # crawl the link and merge the newly found links into the
            # frontier; union() removes any duplicates
            links = union(to_crawl, get_all_links(link))
            # add the link to the crawled list
            crawled.append(link)
            # the union already contains the current frontier, so replace
            # it rather than appending (appending would duplicate entries)
            to_crawl = links
    # return the crawled and to_crawl lists
    return crawled, to_crawl
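
# note: to_crawl.pop() takes from the end of the list, so the crawl is
# depth-first; popping from the front (to_crawl.pop(0)) would make it
# breadth-first instead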


# start crawling
def crawl():
    # the seed links
    seeds = ['https://en.wikipedia.org/wiki/Ai']
    # crawl every link in the seeds list
    for seed in seeds:
        crawled, to_crawl = crawl_web(seed)
        # print the result for this seed
        print("crawled:", len(crawled))
        print("to crawl:", len(to_crawl))