-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSearch.py
77 lines (63 loc) · 2.72 KB
/
Search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import pickle
from UncompressedPostings import UncompressedPostings
from CompressedPostings import CompressedPostings
from BTrees.OOBTree import OOBTree
import Stemmer
class Search:
    """Query an on-disk inverted index, with single-wildcard support.

    Index terms are loaded into an ``OOBTree`` keyed by the term string and
    into a second tree keyed by the reversed term string, so that both
    ``prefix*`` and ``*suffix`` queries can be answered with a key-range scan.
    """

    # Highest Unicode code point. Appended to a prefix it gives an upper bound
    # that sorts after any realistic term starting with that prefix.
    _MAX_CHAR = '\U0010ffff'

    def __init__(self, output_path):
        """Remember the index directory and create the (empty) term trees.

        Args:
            output_path: Directory containing ``term.dict``,
                ``document.dict``, ``ii.dict`` and the postings file
                (``index.cii`` or ``index.ii``).
        """
        self._output = output_path
        # Kept for backward compatibility; range scans no longer depend on it.
        self._alphabet = 'abcdefghijklmnñopqrstuvwxyz'
        self._btree = OOBTree()
        self._reverse_btree = OOBTree()

    def _load_pickle(self, name):
        """Unpickle ``<output_path>/<name>`` and close the file afterwards."""
        # NOTE(security): unpickling can execute arbitrary code. Only point
        # this class at index directories that were produced by trusted code.
        with open(self._output + '/' + name, 'rb') as handle:
            return pickle.load(handle)

    @classmethod
    def _prefix_scan(cls, tree, prefix):
        """Return the values of the keys in ``tree`` that start with ``prefix``.

        The scan is the closed key range ``[prefix, prefix + U+10FFFF]``, so
        the bare prefix itself and continuations with any character (``ñ``,
        accented letters, digits, ...) are included.
        """
        return list(tree.values(min=prefix, max=prefix + cls._MAX_CHAR))

    def search_in_ii(self, terms):
        """Look up each query term and map it to the documents it occurs in.

        Args:
            terms: Iterable of query strings. Each one is a plain word, a
                ``prefix*`` pattern or a ``*suffix`` pattern (see
                ``wildcard_search``).

        Returns:
            A dict mapping every query string to a list with one entry per
            retrieved posting (the ``document.dict`` key for that posting's
            id; duplicates are possible when several terms match), or to
            ``None`` when the query matched no indexed term.
        """
        term_dict = self._load_pickle('term.dict')
        document_dict = self._load_pickle('document.dict')
        ii_dict = self._load_pickle('ii.dict')

        # Invert the dictionaries so that ids can be resolved back to names.
        id_to_document = {doc_id: name for name, doc_id in document_dict.items()}
        id_to_term = {term_id: term for term, term_id in term_dict.items()}

        # Insert every term into the tree, then build the reversed-key tree.
        for term_id, metadata in ii_dict.items():
            self._btree.insert(id_to_term.get(term_id), metadata)
        self.reverse_btree()

        # The raw dictionaries are no longer needed; let them be collected.
        del ii_dict, term_dict, document_dict

        # Prefer the compressed postings file and fall back to the plain one.
        try:
            postings = CompressedPostings(self._output + '/' + 'index.cii')
        except FileNotFoundError:
            postings = UncompressedPostings(self._output + '/' + 'index.ii')

        results = {}
        try:
            for term in terms:
                matches = self.wildcard_search(term)
                if not matches:
                    results[term] = None
                    continue
                doc_ids = []
                for metadata in matches:
                    # metadata[0] and metadata[2] are what
                    # retrieve_postings_list expects; presumably a file
                    # position and a length -- confirm in the postings classes.
                    doc_ids.extend(
                        postings.retrieve_postings_list(metadata[0], metadata[2])
                    )
                results[term] = [id_to_document.get(doc_id) for doc_id in doc_ids]
        finally:
            # Release the postings file even if a lookup fails midway.
            postings.close_postings_file()
        return results

    def reverse_btree(self):
        """Fill the reversed-key tree with every entry of the forward tree."""
        for term, metadata in self._btree.items():
            self._reverse_btree.insert(term[::-1], metadata)

    def wildcard_search(self, word):
        """Return the index metadata of every term matching ``word``.

        Supported query shapes:

        * no ``*``: the word is stemmed (Spanish) and looked up exactly;
        * trailing ``*`` (``pre*``): every term starting with ``pre``;
        * leading ``*`` (``*suf``): every term ending with ``suf``.

        Only that one leading or trailing asterisk is interpreted. A query
        whose asterisks are all in the interior (``a*b``) is unsupported.

        Args:
            word: The query string.

        Returns:
            A list of metadata entries (empty when a wildcard matches
            nothing), or ``None`` when a plain word is not indexed or the
            wildcard shape is unsupported.
        """
        if '*' not in word:
            stemmed_word = Stemmer.Stemmer('spanish').stemWord(word)
            try:
                return [self._btree[stemmed_word]]
            except KeyError:
                return None
        if word[-1] == '*':
            return self._prefix_scan(self._btree, word[:-1])
        if word[0] == '*':
            # A suffix query is a prefix query on the reversed keys.
            return self._prefix_scan(self._reverse_btree, word[1:][::-1])
        return None