#!/usr/bin/python
import unicodedata
import csv

from tqdm import tqdm
"""
Stores the clean_wiki-entities_kb_doc.txt file as a whoosh inverted index file with entity_name, fieldname(relation), content
as the schema. The documents are the doc contents for each entity.
First stopwords are removed from the question, then it is converted into query objects by the queryparser,
then the query objects are used to return the top 20(limit) matching documents from the inverted index.
From those matching documents the respective entities are returned.
"""
from whoosh import qparser
from whoosh.fields import Schema, TEXT
from whoosh.qparser import QueryParser
"""
The job of a query parser is to convert a query string submitted by a user
into query objects (objects from the whoosh.query module).
For example, the user query:
'rendering shading'
might be parsed into query objects like this:
And([Term("content", u"rendering"), Term("content", u"shading")])
"""
from whoosh.filedb.filestore import RamStorage


def read_file_as_dict(input_path):
    d = {}
    with open(input_path) as input_file:
        reader = csv.DictReader(input_file, delimiter='\t', fieldnames=['col1', 'col2'])
        for row in reader:
            d[row['col1']] = int(row['col2'])
    return d
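
# read_file_as_dict expects one tab-separated "<token>\t<integer>" pair per
# line; for the stopwords file that is presumably "<stopword>\t<count>", e.g.
# (illustrative values only):
#
#   the 142
#   of  97
#
# Only membership in the resulting dict is used below; the counts themselves
# are never consulted when filtering stopwords.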


class SearchIndex(object):
    def __init__(self, doc_path, stopwords=None):
        st = RamStorage()
        st.create()
        # Field names follow the module docstring and what get_candidate_docs
        # queries: entity_name, fieldname (relation) and content.
        schema = Schema(entity_name=TEXT(stored=True), fieldname=TEXT(stored=True), content=TEXT(stored=True))
        self.ix = st.create_index(schema)
        writer = self.ix.writer()
        self.remove_stopwords_while_indexing = False
        if stopwords:
            self.remove_stopwords_while_indexing = True
            self.stopwords_dict = read_file_as_dict(stopwords)
        with open(doc_path, 'r') as graph_file:
            reader = csv.DictReader(graph_file, delimiter="\t", fieldnames=['e1_relation', 'e2'])
            for row in tqdm(reader):
                entity_relation, e2 = row['e1_relation'], row['e2']
                tokens = entity_relation.split()
                # tokens[0] appears to be skipped (presumably an id column in the
                # KB file); tokens[1] is the head entity, tokens[2] the relation.
                e1 = tokens[1]
                relation = tokens[2]
                writer.add_document(entity_name=e1, fieldname=relation, content=e2)
        writer.commit()

    def remove_stopwords_from_text(self, content):
        words = content.split()
        words_clean = []
        for word in words:
            if self.remove_stopwords_while_indexing and word not in self.stopwords_dict:
                words_clean.append(word)
        # Fall back to the original text if everything was filtered out.
        return " ".join(words_clean) if len(words_clean) > 0 else content

    def get_candidate_docs(self, question, limit=20):
        docs = set()
        question = self.remove_stopwords_from_text(question)
        with self.ix.searcher() as searcher:
            # OrGroup: rank documents that match any of the question terms.
            query = QueryParser("content", self.ix.schema, group=qparser.OrGroup).parse(question)
            results = searcher.search(query, limit=limit)
            for result in results:
                docs.add(result['entity_name'])
        # Normalise entity names to plain ASCII text (drop accents and other marks).
        docs = [unicodedata.normalize('NFKD', doc).encode('ascii', 'ignore').decode('ascii') for doc in docs]
        return docs
if __name__=="__main__":
searcher = SearchIndex("../data/extendedkb1.txt","../data/stopwords.txt")
#print searcher.get_candidate_docs("ginger rogers and")
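    # Example (a sketch; the paths above are the repository's sample data and the
    # query is only illustrative): lower `limit` for a smaller candidate set.
    # print(searcher.get_candidate_docs("ginger rogers and", limit=5))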