-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathsearchRecommender.py
135 lines (102 loc) · 4.2 KB
/
searchRecommender.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import os
from nltk.corpus import stopwords
from nltk import wordpunct_tokenize
from nltk.stem import PorterStemmer
from collections import OrderedDict,Counter
import datetime
import re
import operator
import numpy as np
abspath = os.path.abspath(__file__)
dname = os.path.dirname(abspath)
os.chdir(dname)
initial_dir=os.getcwd()
ps=PorterStemmer()
stop_words=stopwords.words('english')+[',','.',';','/','+','-',"'s"]
#set paths
voc_path=initial_dir+'/BooksIndex/voc_id.tsv'
index_path=initial_dir+'/BooksIndex/inverted_index.tsv'
doc_path=initial_dir+'/descriptions.tsv'
bag_of_word_path=initial_dir+'/BooksIndex/bag_of_words_index.tsv'
words_ID_file=open(voc_path,'rb')
words_ID={}
for line in words_ID_file:
items=line.decode('utf-8').split('\t')
words_ID[items[0]]=(re.sub('\\n','',items[1]))
words_ID_file.close()
def search(queries,threshold=0.5):
bag_of_words=set([ps.stem(word.lower()) for word in wordpunct_tokenize(queries) if word not in stop_words])
words_ids=[words_ID[word] for word in bag_of_words if word in words_ID.keys()]
query_bag_of_word=words_ids.copy()
length_query=len(words_ids)
if length_query==0:
return([])
inverted_index=open(index_path,'rb')
res=[]
#we want to find all documents containing at least one of the query words
for line in inverted_index:
word_id=line.decode('utf-8').split('\t')[0]
if word_id in words_ids:
res.append([re.sub('\\n','',item) for item in line.decode('utf-8').split('\t')[1:]])
words_ids.remove(word_id)
else:
continue
inverted_index.close()
word_cnt={}
thresholded_results=[]
for items in res:
for item in set(items):
try:
word_cnt[item]+=1
except:
word_cnt[item]=1
#each document must contain at least threshold*100% of the query word
for item in word_cnt.items():
if item[1]>=length_query*threshold:
thresholded_results.append (item[0])
# we want to compute cosine similarity measures between the query words and the results
bag_of_word_index=open(bag_of_word_path,'rb')
bag_of_word_dic={}
length_doc=0
#here we express each book description which passed the threshold criterion by its bag of word in a dict {recipe_id:[bag of words]}
for line in bag_of_word_index:
doc_id=line.decode('utf-8').split('\t')[0]
length_doc+=1
if doc_id in thresholded_results:
bag_of_word_dic[doc_id] =[re.sub('\\n','',item) for item in line.decode('utf-8').split('\t')[1:]]
thresholded_results.remove(doc_id)
else:
continue
bag_of_word_index.close()
cos_similarity={}
set_query=set(query_bag_of_word)
#here we compute the cosine similarity measure
for item in bag_of_word_dic.items():
cos_similarity[item[0]]=(len(set_query.intersection(item[1])))/np.sqrt((len(item[1])*len(query_bag_of_word)))
sorted_results=sorted(cos_similarity.items(), key=operator.itemgetter(1),reverse=True)
#print(sorted_results)
#results are sorted, highest similiraty measures first
sorted_results =[tup[0] for tup in sorted_results]
return sorted_results
#print(search(" modern-europ-forc-idea-bloodiest-myth-six-battl-normandi-campaign-declassifi-diari-day-plan-western-written-clearli-militari-strategi-intens-went-detail-took-dare-war-document-two-invas-person-map-happen-eventu-wrong-cross ",threshold=.2))
def findContentbyIsbn(isbnList,threshold=0.3 ):
isbnList=isbnList.split(" ")
file=open(doc_path,'rb')
cnt=0
key_words=""
for line in file:
cnt=cnt+1
isbn=line.decode('utf-8').split("\t")[1]
if isbn in isbnList:
key_words+=line.decode('utf-8').split("\t")[2]
file.close()
result= search(key_words,threshold=threshold)
for isbn_ in isbnList:
try:
result.remove(isbn_)
except:
pass
return result
def findContentByKeywords(keywords,threshold=0.3 ):
result= search(keywords,threshold=threshold)
return result