Skip to content

Commit

Permalink
speed improvements for phrase boosts
Browse files (browse the repository at this point in the history)
  • Loading branch information
capjamesg committed Nov 19, 2024
1 parent 0c44325 commit 4f96c1a
Showing 1 changed file with 16 additions and 18 deletions.
34 changes: 16 additions & 18 deletions jamesql/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -919,30 +919,28 @@ def search(
doc["_score"] += term_score

for field in fields:
word_pos = defaultdict(list)
gsi_index = self.gsis.get(field)["gsi"]

for i, term in enumerate(doc[field].lower().split(" ")):
word_pos[term].append(i)
word_pos = {word: gsi_index.get(word, {}).get("documents", {}).get("uuid", {}).get(doc["uuid"], []) for word in term_queries}

for term in term_queries:
# give a boost if all terms appear consecutively, in the same order as the query
# so a doc with "all too well" would do better than "all well too"

if (
all([w in word_pos for w in term_queries])
and len(term_queries) > 1
):
first_word_pos = set(word_pos[term_queries[0]])
for i, term in enumerate(term_queries):
positions = set([x - i for x in word_pos[term]])
first_word_pos &= positions

if first_word_pos and field != "title_lower":
doc["_score"] += (
len(first_word_pos) + 1
) # * len(set(word_pos[term_queries[0]]))
elif first_word_pos and field == "title_lower":
doc["_score"] *= 2 + len(first_word_pos)
if len(term_queries) < 2:
continue

first_word_pos = set(word_pos[term_queries[0]])
for i, term in enumerate(term_queries):
positions = set([x - i for x in word_pos[term]])
first_word_pos &= positions

if first_word_pos and field != "title_lower":
doc["_score"] += (
len(first_word_pos) + 1
) # * len(set(word_pos[term_queries[0]]))
elif first_word_pos and field == "title_lower":
doc["_score"] *= 2 + len(first_word_pos)

# sort by doc score
results = sorted(results, key=lambda x: x.get("_score", 0), reverse=True)
Expand Down

0 comments on commit 4f96c1a

Please sign in to comment.