Skip to content

Commit

Permalink
Change hybrid rerankCount to limit + offset (#927)
Browse files Browse the repository at this point in the history
  • Loading branch information
vicilliar authored Aug 6, 2024
1 parent 879e8d1 commit 857b33b
Show file tree
Hide file tree
Showing 3 changed files with 195 additions and 36 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -554,7 +554,7 @@ def _to_vespa_hybrid_query(self, marqo_query: MarqoHybridQuery) -> Dict[str, Any
'searchChain': 'marqo',
'yql': 'PLACEHOLDER. WILL NOT BE USED IN HYBRID SEARCH.',
'ranking': common.RANK_PROFILE_HYBRID_CUSTOM_SEARCHER,
'ranking.rerankCount': marqo_query.limit, # limits the number of results going to phase 2
'ranking.rerankCount': marqo_query.limit + marqo_query.offset, # limits the number of results going to phase 2

'model_restrict': self._marqo_index.schema_name,
'hits': marqo_query.limit,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,7 @@ def _to_vespa_hybrid_query(self, marqo_query: MarqoHybridQuery) -> Dict[str, Any
'searchChain': 'marqo',
'yql': 'PLACEHOLDER. WILL NOT BE USED IN HYBRID SEARCH.',
'ranking': unstructured_common.RANK_PROFILE_HYBRID_CUSTOM_SEARCHER,
'ranking.rerankCount': marqo_query.limit, # limits the number of results going to phase 2
'ranking.rerankCount': marqo_query.limit + marqo_query.offset, # limits the number of results going to phase 2

'model_restrict': self._marqo_index.schema_name,
'hits': marqo_query.limit,
Expand Down
227 changes: 193 additions & 34 deletions tests/tensor_search/test_pagination.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import math
import os
import random
import string
import unittest
from unittest import mock

Expand All @@ -10,6 +11,7 @@
from marqo.api.exceptions import (
IllegalRequestedDocCount
)
from marqo.core.models.marqo_index import *
from marqo.core.models.marqo_index import FieldType, FieldFeature, IndexType
from marqo.core.models.marqo_index_request import FieldRequest
from marqo.tensor_search import tensor_search, utils
Expand All @@ -33,9 +35,23 @@ def setUpClass(cls) -> None:
name='desc',
type=FieldType.Text,
features=[FieldFeature.LexicalSearch]
)
),
FieldRequest(name="text_field_1", type=FieldType.Text,
features=[FieldFeature.LexicalSearch, FieldFeature.Filter]),
FieldRequest(name="text_field_2", type=FieldType.Text,
features=[FieldFeature.LexicalSearch, FieldFeature.Filter]),
FieldRequest(name="text_field_3", type=FieldType.Text,
features=[FieldFeature.LexicalSearch, FieldFeature.Filter]),
FieldRequest(name="add_field_1", type=FieldType.Float,
features=[FieldFeature.ScoreModifier]),
FieldRequest(name="add_field_2", type=FieldType.Float,
features=[FieldFeature.ScoreModifier]),
FieldRequest(name="mult_field_1", type=FieldType.Float,
features=[FieldFeature.ScoreModifier]),
FieldRequest(name="mult_field_2", type=FieldType.Float,
features=[FieldFeature.ScoreModifier])
],
tensor_fields=['title']
tensor_fields=['title', 'text_field_1', 'text_field_2', 'text_field_3']
)
index_request_unstructured = cls.unstructured_marqo_index_request()

Expand All @@ -57,17 +73,51 @@ def setUp(self) -> None:
def tearDown(self):
self.device_patcher.stop()

def generate_random_string(self, length):
    """Return a random string of exactly `length` lowercase ASCII letters."""
    return ''.join(random.choices(string.ascii_lowercase, k=length))

def generate_unique_strings(self, num_strings):
    """
    Generate `num_strings` distinct random strings, varying both length and
    term frequency so that documents built from them get different bm25
    scores (used to confirm pagination ordering is stable).

    Returns a list of `num_strings` unique strings.
    """
    unique_strings = set()
    while len(unique_strings) < num_strings:
        # Vary content and length: random length between 1 and 5
        rand_str = self.generate_random_string(random.randint(1, 5))

        # Vary term frequency by sometimes repeating the word.
        # BUGFIX: the previous `rand_str.split()` check was dead code —
        # generated strings contain no spaces, so split() always returned
        # a single word and the repetition branch never ran.
        if random.random() > 0.5:
            rand_str += " " + rand_str

        unique_strings.add(rand_str)

    return list(unique_strings)

def test_pagination_single_field(self):
num_docs = 400
batch_size = 100

for index in [self.index_structured, self.index_unstructured]:
for _ in range(0, num_docs, batch_size):
docs = []
for i in range(batch_size):
title = "my title"
for j in range(i):
title += " ".join(self.generate_unique_strings(j))
doc = {"_id": str(i),
"title": title,
'desc': 'my description'}
docs.append(doc)

r = tensor_search.add_documents(
config=self.config,
add_docs_params=AddDocsParams(index_name=index.name,
docs=[{"title": 'my title', 'desc': 'my title'} for i in
range(batch_size)],
# Add docs with increasing title word count, so each will have unique tensor and lexical scores
docs=docs,
device="cpu",
tensor_fields=['title'] if index.type == IndexType.Unstructured
else None
Expand All @@ -83,7 +133,8 @@ def test_pagination_single_field(self):
text='my title',
result_count=400)

for page_size in [5, 10, 100, 200]:
# TODO: Re-add page size 5, 10 when KNN inconsistency bug is fixed
for page_size in [100, 200]:
with self.subTest(f'Index: {index.type}, Search method: {search_method}, Page size: {page_size}'):
paginated_search_results = {"hits": []}

Expand All @@ -101,19 +152,29 @@ def test_pagination_single_field(self):

# Compare paginated to full results (length only for now)
self.assertEqual(len(full_search_results["hits"]), len(paginated_search_results["hits"]))
for i in range(len(full_search_results["hits"])):
self.assertEqual(full_search_results["hits"][i]["_id"], paginated_search_results["hits"][i]["_id"])
self.assertEqual(full_search_results["hits"][i]["_score"], paginated_search_results["hits"][i]["_score"])

def test_pagination_hybrid(self):
num_docs = 400
batch_size = 100

# TODO: Add unstructured index when supported
for index in [self.index_structured, self.index_unstructured]:
for _ in range(0, num_docs, batch_size):
docs = []
for i in range(batch_size):
title = "my title"
for j in range(i):
title += " ".join(self.generate_unique_strings(j))
doc = {"_id": str(i),
"title": title,
'desc': 'my description'}
docs.append(doc)
r = tensor_search.add_documents(
config=self.config,
add_docs_params=AddDocsParams(index_name=index.name,
docs=[{"title": 'my title', 'desc': 'my title'} for i in
range(batch_size)],
docs=docs,
device="cpu",
tensor_fields=['title'] if index.type == IndexType.Unstructured
else None
Expand All @@ -128,35 +189,133 @@ def test_pagination_hybrid(self):
]

for retrieval_method, ranking_method in test_cases:
full_search_results = tensor_search.search(
search_method="HYBRID",
hybrid_parameters=HybridParameters(retrievalMethod=retrieval_method,
rankingMethod=ranking_method),
config=self.config,
index_name=index.name,
text='my title',
result_count=400)
with self.subTest(retrieval_method=retrieval_method, ranking_method=ranking_method):
full_search_results = tensor_search.search(
search_method="HYBRID",
hybrid_parameters=HybridParameters(retrievalMethod=retrieval_method,
rankingMethod=ranking_method),
config=self.config,
index_name=index.name,
text='my title',
result_count=num_docs)

for page_size in [5, 10, 100, 200]:
with self.subTest(f'Index: {index.type}, Page size: {page_size}'):
paginated_search_results = {"hits": []}
# TODO: Re-add page size 5, 10 when KNN inconsistency bug is fixed
for page_size in [100, 200]:
with self.subTest(f'Index: {index.type}, Page size: {page_size}'):
paginated_search_results = {"hits": []}

for page_num in range(math.ceil(num_docs / page_size)):
lim = page_size
off = page_num * page_size
page_res = tensor_search.search(
search_method="HYBRID",
hybrid_parameters=HybridParameters(retrievalMethod=retrieval_method,
rankingMethod=ranking_method),
config=self.config,
index_name=index.name,
text='my title',
result_count=lim, offset=off)
for page_num in range(math.ceil(num_docs / page_size)):
lim = page_size
off = page_num * page_size
page_res = tensor_search.search(
search_method="HYBRID",
hybrid_parameters=HybridParameters(retrievalMethod=retrieval_method,
rankingMethod=ranking_method),
config=self.config,
index_name=index.name,
text='my title',
result_count=lim, offset=off)

paginated_search_results["hits"].extend(page_res["hits"])
paginated_search_results["hits"].extend(page_res["hits"])

# Compare paginated to full results (length only for now)
self.assertEqual(len(full_search_results["hits"]), len(paginated_search_results["hits"]))
# Compare paginated to full results (length only for now)
self.assertEqual(len(full_search_results["hits"]), len(paginated_search_results["hits"]))
# Scores need to match, except for disjunction/rrf (where scores are determined by rank)
if (retrieval_method, ranking_method) != ("disjunction", "rrf"):
for i in range(len(full_search_results["hits"])):
self.assertEqual(full_search_results["hits"][i]["_score"],
paginated_search_results["hits"][i]["_score"])

@unittest.skip
def test_pagination_hybrid_lexical_tensor_with_modifiers(self):
"""
Show that pagination is consistent when using hybrid search with lexical retrieval, tensor ranking,
with lexical score modifiers eliminating some results.
"""
for index in [self.index_structured, self.index_unstructured]:
with self.subTest(index=type(index)):
# Add documents
add_docs_res = tensor_search.add_documents(
config=self.config,
add_docs_params=AddDocsParams(
index_name=index.name,
docs=[
{"_id": "doc4", "text_field_1": "HELLO WORLD",
"mult_field_1": 0.5, "add_field_1": 20}, # OUT (negative)
{"_id": "doc5", "text_field_1": "HELLO WORLD", "mult_field_1": 1.0}, # OUT (negative)
{"_id": "doc6", "text_field_1": "HELLO WORLD"}, # Top result
{"_id": "doc7", "text_field_1": "HELLO WORLD", "add_field_1": 1.0}, # Top result
{"_id": "doc8", "text_field_1": "HELLO WORLD", "mult_field_1": 2.0}, # OUT (negative)
{"_id": "doc9", "text_field_1": "HELLO WORLD", "mult_field_1": 3.0}, # OUT (negative)
{"_id": "doc10", "text_field_1": "HELLO WORLD", "mult_field_2": 3.0}, # Top result
],
tensor_fields=["text_field_1"] if isinstance(index, UnstructuredMarqoIndex) \
else None
)
)

full_res = tensor_search.search(
config=self.config,
index_name=index.name,
text="HELLO WORLD",
search_method="HYBRID",
hybrid_parameters=HybridParameters(
retrievalMethod=RetrievalMethod.Lexical,
rankingMethod=RankingMethod.Tensor,
scoreModifiersLexical={
"multiply_score_by": [
{"field_name": "mult_field_1", "weight": -10},
# Will bring down doc8 and doc9. Keep doc6, doc7, doc10
]
},
scoreModifiersTensor={
"multiply_score_by": [
{"field_name": "mult_field_1", "weight": 10},
{"field_name": "mult_field_2", "weight": -10}
],
"add_to_score": [
{"field_name": "add_field_1", "weight": 5}
]
},
verbose=True
),
result_count=3
)

# Paginate (3 pages of 1 result each)
paginated_res = []
for i in range(3):
paginated_res.extend(tensor_search.search(
config=self.config,
index_name=index.name,
text="HELLO WORLD",
search_method="HYBRID",
hybrid_parameters=HybridParameters(
retrievalMethod=RetrievalMethod.Lexical,
rankingMethod=RankingMethod.Tensor,
scoreModifiersLexical={
"multiply_score_by": [
{"field_name": "mult_field_1", "weight": -10},
# Will bring down doc8 and doc9. Keep doc6, doc7, doc10
]
},
scoreModifiersTensor={
"multiply_score_by": [
{"field_name": "mult_field_1", "weight": 10},
{"field_name": "mult_field_2", "weight": -10}
],
"add_to_score": [
{"field_name": "add_field_1", "weight": 5}
]
},
verbose=True
),
result_count=1,
offset=i
)["hits"])

# Compare full results to paginated results
self.assertEqual(full_res["hits"], paginated_res)

def test_pagination_high_limit_offset(self):
"""
Expand Down Expand Up @@ -309,7 +468,7 @@ def test_pagination_multi_field(self):
# Compare paginated to full results (length only for now)
assert len(full_search_results["hits"]) == len(paginated_search_results["hits"])

# TODO: re-add this assert when KNN incosistency bug is fixed
# TODO: re-add this assert when KNN inconsistency bug is fixed
# assert full_search_results["hits"] == paginated_search_results["hits"]

@unittest.skip
Expand Down

0 comments on commit 857b33b

Please sign in to comment.