# Import the necessary libraries
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from posting_list import SlinkedList, Node

nltk.download('punkt')
nltk.download('stopwords')

Stopwords = set(stopwords.words('english'))
linked_list_data = {}  # word -> posting list (singly linked list of page numbers)
dictionary = set()     # vocabulary of every indexed word
pages_with_index = {}  # page number -> link of the processed page
titles = []            # titles of the processed pages
to_crawl = []          # links waiting to be crawled
crawled = []           # links already crawled
idx = 1                # running page number
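# NOTE: posting_list is not included in this file. Judging only by how it is
# used below (SlinkedList().head, Node(idx), .nextval), the module is assumed
# to look roughly like this minimal sketch; the real one may differ:
#
#     class Node:
#         def __init__(self, dataval=None):
#             self.dataval = dataval  # page number stored in this posting
#             self.nextval = None     # link to the next posting
#
#     class SlinkedList:
#         def __init__(self):
#             self.head = None        # first posting of the word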
# remove HTML tags using regular expressions
def remove_html_tags(page):
    # remove escaped tags such as &lt;div&gt;
    page = re.sub(r'&lt;.*?&gt;', ' ', page)
    # replace non-breaking spaces (&nbsp; or the literal character) with plain spaces
    page = re.sub(r'&nbsp;|\xa0', ' ', page)
    # remove well-formed opening and closing tags, attributes included
    pattern = re.compile(r'</?\w+\s*[^>]*?/?>', re.DOTALL | re.MULTILINE | re.IGNORECASE | re.UNICODE)
    page = re.sub(pattern, ' ', page)
    # catch-all pass for anything left between angle brackets
    page = re.sub(r'<.*?>', ' ', page)
    return page
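# Example (hypothetical input): tags and escaped tags become spaces, e.g.
#     remove_html_tags('<p>Hi &lt;b&gt;there&lt;/b&gt;</p>')
# returns roughly ' Hi  there  '.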
# clean a web page of script, style, meta, comment and doctype blocks
def clean_web_page(page):
    flags = re.IGNORECASE | re.MULTILINE | re.DOTALL
    # remove <SCRIPT> ... </script> and variations (match any char zero or more times)
    page = re.sub(r'<[ ]*script.*?/[ ]*script[ ]*>', '', page, flags=flags)
    # remove <STYLE> ... </style> and variations
    page = re.sub(r'<[ ]*style.*?/[ ]*style[ ]*>', '', page, flags=flags)
    # remove <META ...> tags and variations
    page = re.sub(r'<[ ]*meta.*?>', '', page, flags=flags)
    # remove HTML comments <!-- ... --> and variations
    page = re.sub(r'<[ ]*!--.*?--[ ]*>', '', page, flags=flags)
    # remove the <!DOCTYPE ...> declaration and variations
    page = re.sub(r'<[ ]*![ ]*DOCTYPE.*?>', '', page, flags=flags)
    # remove the remaining HTML tags
    page = remove_html_tags(page)
    return page
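# Example (hypothetical input): doctype, style blocks and tags are stripped, e.g.
#     clean_web_page('<!DOCTYPE html><style>p {}</style><p>Hello&nbsp;world</p>')
# returns roughly ' Hello world '.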
# remove special characters and digits, then keep unique non-stopword words
def remove_special_characters(text):
    # drop every character that is not a letter, a digit or whitespace
    # (\s matches any whitespace character, i.e. the set [ \t\n\r\f\v])
    text_returned = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # drop every decimal digit (\d is equivalent to the set [0-9])
    text_returned = re.sub(r'\d', '', text_returned)
    # break the page content into lowercase words
    words = word_tokenize(text_returned.lower())
    # collect each word once, skipping stopwords and words of one or two letters
    clean_words = set()
    for word in words:
        if word not in Stopwords and len(word) > 2:
            clean_words.add(word)
    # return the set of unique words
    return clean_words
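# Example (hypothetical input): punctuation, digits and stopwords are dropped, e.g.
#     remove_special_characters('The 3 quick, brown foxes!')
# returns {'quick', 'brown', 'foxes'}.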
# index the words of a page under the current page number
def indexing(link, clean_words):
    global linked_list_data
    global dictionary
    global pages_with_index
    global idx
    for word in clean_words:
        if word not in dictionary:
            # new word: add it to the dictionary and start its posting list
            dictionary.add(word)
            linked_list_data[word] = SlinkedList()
            linked_list_data[word].head = Node(idx)
        else:
            # known word: append the page number at the end of its posting list
            linked_list = linked_list_data[word].head
            while linked_list.nextval is not None:
                linked_list = linked_list.nextval
            linked_list.nextval = Node(idx)
    # store the processed page link under its page number
    pages_with_index[idx] = link
    # idx numbers the pages: idx => link => word1 => word2 ...
    idx = idx + 1
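# ---------------------------------------------------------------------------
# Minimal usage sketch, not part of the original pipeline: index one
# hypothetical in-memory page, then walk the posting list of one word.
# The URL is made up, and Node.dataval is assumed from the sketch above;
# the crawler loop that would normally feed in real pages is not shown.
if __name__ == '__main__':
    sample_html = ('<html><body><h1>Search engines</h1>'
                   '<p>An inverted index maps words to pages.</p></body></html>')
    words = remove_special_characters(clean_web_page(sample_html))
    indexing('http://example.com/sample', words)
    print(pages_with_index)  # {1: 'http://example.com/sample'}
    # walk the posting list of the word 'index'
    node = linked_list_data['index'].head
    while node is not None:
        print(node.dataval)  # page numbers whose text contains 'index'
        node = node.nextval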