indexRecommender.py
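"""Build an on-disk index over book descriptions.

Reads descriptions.tsv (one book per row; the code below expects column 0 to
hold the title, column 1 the book id, and column 2 the description tokens
joined by '-') and writes three files under BooksIndex/:

    voc_id.tsv              term -> numeric term id
    inverted_index.tsv      term id -> ids of books containing that term
    bag_of_words_index.tsv  book id -> term ids of its description
"""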
import datetime
import os
from collections import OrderedDict

from nltk import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

ps = PorterStemmer()
start_time = datetime.datetime.now()
# Extend NLTK's English stop-word list with punctuation tokens that carry no meaning.
stop_words = stopwords.words('english') + [',', '.', ';', '/', '+', '-', "'s", '//', '£', '...', ':']
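# The NLTK English stop-word corpus must be available locally; if it is not,
# run nltk.download('stopwords') once before executing this script.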
# Work relative to the directory that contains this script, so the data and
# output paths resolve the same way regardless of where it is invoked from.
abspath = os.path.abspath(__file__)
dname = os.path.dirname(abspath)
os.chdir(dname)
initial_dir = os.getcwd()

# Create an index directory to store the inverted index and the vocabulary-to-id file.
try:
    os.mkdir('BooksIndex')
except FileExistsError:
    pass
descriptions_path = initial_dir + '/descriptions.tsv'

# First pass: collect the vocabulary and the set of book ids.
file = open(descriptions_path, 'rb')
vocabulary = set()
booksIndex = set()
cnt = 0
for line in file:
    descri = line.decode('utf-8').split('\t')
    # Stemmed, stop-word-filtered title tokens (computed but not used in this pass).
    title = [ps.stem(word) for word in wordpunct_tokenize(descri[0].lower()) if word not in stop_words]
    try:
        content = descri[2].split('-')
        content.pop(-1)  # drop the trailing fragment after the final '-'
    except IndexError:
        # Skip rows that have no description column.
        continue
    bag_of_words = set(content)
    booksIndex.add(descri[1])
    for token in bag_of_words:
        vocabulary.add(token)
    cnt += 1
file.close()
voc_id_dict = {}

# Save the vocabulary-to-id mapping to a .tsv file.
words_ID = open(initial_dir + '/BooksIndex/voc_id.tsv', 'wb')
for idx, item in enumerate(vocabulary):
    words_ID.write(str(item).encode('utf-8') + b'\t' + str(idx).encode('utf-8') + b'\n')
    voc_id_dict[item] = idx
words_ID.close()
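# Note: set iteration order is not stable across Python processes, so the term
# ids in voc_id.tsv can differ between runs; any consumer of the index files
# should rely on voc_id.tsv rather than regenerating the mapping.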
# Second pass: build the inverted index in a dictionary, and for each book a
# bag-of-words list of the form {doc_1: [word_id_1, word_id_2, ...], ...}.
# The bag-of-words index is useful for computing cosine similarities during
# the search process.
file = open(descriptions_path, 'rb')
data_index = OrderedDict({k: [] for k in voc_id_dict.values()})
bag_of_word_ids = []
cnt = 0
for line in file:
    descri = line.decode('utf-8').split('\t')
    bookIndex = descri[1]
    title = [ps.stem(word) for word in wordpunct_tokenize(descri[0].lower()) if word not in stop_words]
    try:
        content = descri[2].split('-')
        content.pop(-1)
    except IndexError:
        continue
    bag_of_words = set(content)
    bag_of_word_ids = [voc_id_dict[word] for word in bag_of_words]
    for tokenIndex in bag_of_word_ids:
        data_index[tokenIndex].append(str(bookIndex))
    cnt += 1
file.close()
# Store the inverted index in a .tsv file: one term id per row, followed by
# the ids of the books whose description contains that term.
inverted_index = open(initial_dir + '/BooksIndex/inverted_index.tsv', 'wb')
for term_id, postings in data_index.items():
    inverted_index.write(str(term_id).encode('utf-8') + b'\t')
    for doc in postings[:-1]:
        inverted_index.write(str(doc).encode('utf-8') + b'\t')
    try:
        inverted_index.write(str(postings[-1]).encode('utf-8') + b'\n')
    except IndexError:
        # A term with an empty posting list still gets its own terminated row.
        inverted_index.write(b'\n')
inverted_index.close()
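# A minimal sketch (hypothetical helper, not part of the original script) of
# how a search component could read the file written above back into memory:
def load_inverted_index(path):
    """Return {term_id: [book_id, ...]} parsed from inverted_index.tsv."""
    index = {}
    with open(path, 'rb') as f:
        for row in f:
            fields = row.decode('utf-8').rstrip('\n').split('\t')
            # Filter out the empty field left by terms with no postings.
            index[int(fields[0])] = [doc for doc in fields[1:] if doc]
    return index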
# Third pass: store the bag-of-words index (book id -> its term ids) in a .tsv
# file. The file handle is named bag_of_words_file to avoid shadowing the
# bag_of_words token set used above.
bag_of_words_file = open(initial_dir + '/BooksIndex/bag_of_words_index.tsv', 'wb')
file = open(descriptions_path, 'rb')
cnt = 0
for line in file:
    descri = line.decode('utf-8').split('\t')
    bookIndex = descri[1]
    title = [ps.stem(word) for word in wordpunct_tokenize(descri[0].lower()) if word not in stop_words]
    try:
        content = descri[2].split('-')
        content.pop(-1)
    except IndexError:
        continue
    content = set(content)
    bag_of_word_ids = [voc_id_dict[word] for word in content]
    bag_of_words_file.write(str(bookIndex).encode('utf-8') + b'\t')
    for word_id in bag_of_word_ids[:-1]:
        bag_of_words_file.write(str(word_id).encode('utf-8') + b'\t')
    try:
        bag_of_words_file.write(str(bag_of_word_ids[-1]).encode('utf-8') + b'\n')
    except IndexError:
        # A book with no indexed tokens still gets its own terminated row.
        bag_of_words_file.write(b'\n')
    cnt += 1
file.close()
bag_of_words_file.close()
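# A minimal sketch (hypothetical helper, not part of the original script) of
# the cosine similarity the bag-of-words index is meant to support: with
# binary term vectors, cosine(a, b) reduces to |a ∩ b| / sqrt(|a| * |b|).
import math

def binary_cosine(term_ids_a, term_ids_b):
    """Cosine similarity between two documents given as iterables of term ids."""
    a, b = set(term_ids_a), set(term_ids_b)
    if not a or not b:
        return 0.0
    return len(a & b) / math.sqrt(len(a) * len(b))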
print("Execution time:", datetime.datetime.now() - start_time)