-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathIEEEXplore_indexing.py
76 lines (66 loc) · 2.18 KB
/
IEEEXplore_indexing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
"""
Author: Pavith Bambaravanage
URL: https://github.com/Pavith19
"""
import sys
import re
# Lazy match from "<" to the nearest ">"; DOTALL lets a tag that is broken
# across several lines still be recognised as a single tag.
_TAG_PATTERN = re.compile(r"<.*?>", re.DOTALL)


def remove_html_tags(text):
    """Remove html tags from a string.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` in which every ``<...>`` span has been deleted.
        Text outside of tags is returned untouched.
    """
    return _TAG_PATTERN.sub("", text)
def rem_sm(lis, min_length=4):
    """Drop the items of ``lis`` that are too short to be indexed.

    Args:
        lis: Iterable of sized items; the script passes a list of words.
        min_length: Smallest length an item must have to be kept.
            Defaults to 4, i.e. items of three characters or fewer are removed.

    Returns:
        A new list with the surviving items in their original order.
        ``lis`` itself is not modified.
    """
    return [item for item in lis if len(item) >= min_length]
# Weight of one occurrence of an index term in each document section.
_SECTION_WEIGHTS = (("title", 5), ("abstract", 3), ("body", 1))

# Punctuation that is deleted (not replaced by a space) before splitting.
_PUNCTUATION = str.maketrans("", "", "!?,.")

# After this many terms are printed, only ties with the last score follow.
_TOP_TERMS = 3


def _extract_words(document, tag, stop_words):
    """Return the indexable words inside the first ``<tag>`` element.

    Args:
        document: The whole document with newlines already replaced by
            spaces (the pattern below does not match across line breaks).
        tag: Section name without angle brackets, e.g. ``"title"``.
        stop_words: Set of words to exclude from the result.

    Returns:
        Lower-cased words of the section with nested tags and the
        punctuation ``! ? , .`` removed, short words dropped by ``rem_sm``
        and stop words filtered out. Empty when the section is missing.
    """
    found = re.search(rf"<{tag}>(.*?)</{tag}>", document)
    if found is None:
        # A missing section contributes no words instead of an IndexError.
        return []
    text = remove_html_tags(found.group(1)).translate(_PUNCTUATION)
    # Short words are dropped before lower-casing, then stop words go.
    words = [word.lower() for word in rem_sm(text.split())]
    return [word for word in words if word not in stop_words]


def main():
    """Read the indexing task from stdin and print the best index terms.

    Input: first line holds the stop words, second line the index terms
    (both ``;``-separated); everything after that is the document.

    Output: one ``term: score`` line per selected term. Terms are ordered
    by descending score, then alphabetically; the first three are printed,
    followed by any further terms tied with the last printed score.
    """
    stop_words = set(input().split(';'))
    index_terms = input().split(';')
    document = sys.stdin.read().replace("\n", " ")

    sections = {
        tag: _extract_words(document, tag, stop_words)
        for tag, _ in _SECTION_WEIGHTS
    }
    total_words = sum(len(words) for words in sections.values())

    # calculate scores
    scores = {}
    for term in index_terms:
        weighted_hits = sum(
            weight * sections[tag].count(term)
            for tag, weight in _SECTION_WEIGHTS
        )
        # With no words left every count is zero, so the density is zero;
        # this avoids a ZeroDivisionError on an empty document.
        scores[term] = weighted_hits * 100 / total_words if total_words else 0.0

    ranked = sorted(scores.items(), key=lambda item: (-item[1], item[0]))

    printed = 0
    last_score = None
    for term, score in ranked:
        if printed >= _TOP_TERMS and score != last_score:
            break
        print(term + ":", score)
        last_score = score
        printed += 1


if __name__ == "__main__":
    main()