Skip to content

Commit

Permalink
Hybrid search lexical/tensor score modifiers fix (#922)
Browse files Browse the repository at this point in the history
Co-authored-by: yihanzhao <[email protected]>
  • Loading branch information
vicilliar and papa99do authored Aug 2, 2024
1 parent 924055a commit 879e8d1
Show file tree
Hide file tree
Showing 9 changed files with 211 additions and 52 deletions.
1 change: 1 addition & 0 deletions scripts/vespa_local/services.xml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
<document-api/>
<search/>
<nodes>
<jvm options="-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5005" />
<node hostalias="node1"/>
</nodes>
</container>
Expand Down
2 changes: 1 addition & 1 deletion scripts/vespa_local/vespa_local.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def start(args):
os.system("docker run --detach "
"--name vespa "
"--hostname vespa-container "
"--publish 8080:8080 --publish 19071:19071 --publish 2181:2181 "
"--publish 8080:8080 --publish 19071:19071 --publish 2181:2181 --publish 127.0.0.1:5005:5005 "
"vespaengine/vespa:8.367.14")


Expand Down
6 changes: 4 additions & 2 deletions src/marqo/core/models/hybrid_parameters.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,11 @@ def validate_properties(cls, values):

# score_modifiers_lexical can only be defined for Lexical, RRF, NormalizeLinear
if values.get('scoreModifiersLexical') is not None:
if values.get('rankingMethod') not in [RankingMethod.Lexical, RankingMethod.RRF]:
if not (values.get('rankingMethod') in [RankingMethod.Lexical, RankingMethod.RRF] or
values.get('retrievalMethod') == RetrievalMethod.Lexical):
raise ValueError(
"'scoreModifiersLexical' can only be defined for 'lexical', 'rrf' ranking methods") # TODO: re-add normalize_linear
"'scoreModifiersLexical' can only be defined for 'lexical', 'rrf' ranking methods or "
"'lexical' retrieval method.") # TODO: re-add normalize_linear

# score_modifiers_tensor can only be defined for Tensor, RRF, NormalizeLinear
if values.get('scoreModifiersTensor') is not None:
Expand Down
16 changes: 13 additions & 3 deletions src/marqo/core/structured_vespa_index/structured_vespa_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -533,6 +533,16 @@ def _to_vespa_hybrid_query(self, marqo_query: MarqoHybridQuery) -> Dict[str, Any
}
})

"""
# TODO: implement this if no longer using custom searcher for lexical/tensor and tensor/lexical
query_inputs.update({
f: 1 for f in fields_to_search_lexical
})
query_inputs.update({
f: 1 for f in fields_to_search_tensor
})
"""

# Extract score modifiers
hybrid_score_modifiers = self._get_hybrid_score_modifiers(marqo_query)
if hybrid_score_modifiers[constants.MARQO_SEARCH_METHOD_LEXICAL]:
Expand All @@ -544,7 +554,8 @@ def _to_vespa_hybrid_query(self, marqo_query: MarqoHybridQuery) -> Dict[str, Any
'searchChain': 'marqo',
'yql': 'PLACEHOLDER. WILL NOT BE USED IN HYBRID SEARCH.',
'ranking': common.RANK_PROFILE_HYBRID_CUSTOM_SEARCHER,

'ranking.rerankCount': marqo_query.limit, # limits the number of results going to phase 2

'model_restrict': self._marqo_index.schema_name,
'hits': marqo_query.limit,
'offset': marqo_query.offset,
Expand All @@ -562,10 +573,9 @@ def _to_vespa_hybrid_query(self, marqo_query: MarqoHybridQuery) -> Dict[str, Any

'marqo__hybrid.retrievalMethod': marqo_query.hybrid_parameters.retrievalMethod,
'marqo__hybrid.rankingMethod': marqo_query.hybrid_parameters.rankingMethod,
'marqo__hybrid.tensorScoreModifiersPresent': True if hybrid_score_modifiers[constants.MARQO_SEARCH_METHOD_TENSOR] else False,
'marqo__hybrid.lexicalScoreModifiersPresent': True if hybrid_score_modifiers[constants.MARQO_SEARCH_METHOD_LEXICAL] else False,
'marqo__hybrid.verbose': marqo_query.hybrid_parameters.verbose
}

query = {k: v for k, v in query.items() if v is not None}

if marqo_query.hybrid_parameters.rankingMethod in {RankingMethod.RRF}: # TODO: Add NormalizeLinear
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -347,6 +347,7 @@ def _to_vespa_hybrid_query(self, marqo_query: MarqoHybridQuery) -> Dict[str, Any
'searchChain': 'marqo',
'yql': 'PLACEHOLDER. WILL NOT BE USED IN HYBRID SEARCH.',
'ranking': unstructured_common.RANK_PROFILE_HYBRID_CUSTOM_SEARCHER,
'ranking.rerankCount': marqo_query.limit, # limits the number of results going to phase 2

'model_restrict': self._marqo_index.schema_name,
'hits': marqo_query.limit,
Expand All @@ -365,10 +366,6 @@ def _to_vespa_hybrid_query(self, marqo_query: MarqoHybridQuery) -> Dict[str, Any

'marqo__hybrid.retrievalMethod': marqo_query.hybrid_parameters.retrievalMethod,
'marqo__hybrid.rankingMethod': marqo_query.hybrid_parameters.rankingMethod,
'marqo__hybrid.tensorScoreModifiersPresent': True if hybrid_score_modifiers[
constants.MARQO_SEARCH_METHOD_TENSOR] else False,
'marqo__hybrid.lexicalScoreModifiersPresent': True if hybrid_score_modifiers[
constants.MARQO_SEARCH_METHOD_LEXICAL] else False,
'marqo__hybrid.verbose': marqo_query.hybrid_parameters.verbose
}
query = {k: v for k, v in query.items() if v is not None}
Expand Down
77 changes: 75 additions & 2 deletions tests/tensor_search/integ_tests/test_hybrid_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,8 +223,6 @@ def run():
self.assertIn("PLACEHOLDER. WILL NOT BE USED IN HYBRID SEARCH.", vespa_query_kwargs["yql"])
self.assertEqual(vespa_query_kwargs["marqo__hybrid.retrievalMethod"], RetrievalMethod.Disjunction)
self.assertEqual(vespa_query_kwargs["marqo__hybrid.rankingMethod"], RankingMethod.RRF)
self.assertEqual(vespa_query_kwargs["marqo__hybrid.tensorScoreModifiersPresent"], True)
self.assertEqual(vespa_query_kwargs["marqo__hybrid.lexicalScoreModifiersPresent"], True)
self.assertEqual(vespa_query_kwargs["marqo__hybrid.alpha"], 0.6)
self.assertEqual(vespa_query_kwargs["marqo__hybrid.rrf_k"], 61)

Expand Down Expand Up @@ -834,6 +832,72 @@ def test_hybrid_search_score_modifiers(self):
self.assertAlmostEqual(hybrid_res["hits"][-1]["_tensor_score"], base_tensor_score * -10 * 3)


def test_hybrid_search_lexical_tensor_with_lexical_score_modifiers_succeeds(self):
"""
Tests that if we do hybrid search with lexical retrieval and tensor ranking, we can use both lexical and tensor
score modifiers.
The lexical score modifiers should affect the actual result set, while the tensor score modifiers should
affect the order and score.
"""

for index in [self.structured_text_index_score_modifiers, self.unstructured_default_text_index]:
with self.subTest(index=type(index)):
# Add documents
tensor_search.add_documents(
config=self.config,
add_docs_params=AddDocsParams(
index_name=index.name,
docs=[
{"_id": "doc4", "text_field_1": "HELLO WORLD",
"mult_field_1": 0.5, "add_field_1": 20}, # OUT (negative)
{"_id": "doc5", "text_field_1": "HELLO WORLD", "mult_field_1": 1.0}, # OUT (negative)
{"_id": "doc6", "text_field_1": "HELLO WORLD"}, # Top result
{"_id": "doc7", "text_field_1": "HELLO WORLD", "add_field_1": 1.0}, # Top result
{"_id": "doc8", "text_field_1": "HELLO WORLD", "mult_field_1": 2.0}, # OUT (negative)
{"_id": "doc9", "text_field_1": "HELLO WORLD", "mult_field_1": 3.0}, # OUT (negative)
{"_id": "doc10", "text_field_1": "HELLO WORLD", "mult_field_2": 3.0}, # Top result
],
tensor_fields=["text_field_1"] if isinstance(index, UnstructuredMarqoIndex) \
else None
)
)

hybrid_res = tensor_search.search(
config=self.config,
index_name=index.name,
text="HELLO WORLD",
search_method="HYBRID",
hybrid_parameters=HybridParameters(
retrievalMethod=RetrievalMethod.Lexical,
rankingMethod=RankingMethod.Tensor,
scoreModifiersLexical={
"multiply_score_by": [
{"field_name": "mult_field_1", "weight": -10}, # Will bring down doc8 and doc9. Keep doc6, doc7, doc10
]
},
scoreModifiersTensor={
"multiply_score_by": [
{"field_name": "mult_field_1", "weight": 10},
{"field_name": "mult_field_2", "weight": -10}
],
"add_to_score": [
{"field_name": "add_field_1", "weight": 5}
]
},
verbose=True
),
result_count=3
)
self.assertIn("hits", hybrid_res)
self.assertEqual(hybrid_res["hits"][0]["_id"], "doc7") # (score + 5*1)
self.assertEqual(hybrid_res["hits"][0]["_score"], 6.0)
self.assertEqual(hybrid_res["hits"][1]["_id"], "doc6") # (score)
self.assertEqual(hybrid_res["hits"][1]["_score"], 1.0)
self.assertEqual(hybrid_res["hits"][2]["_id"], "doc10") # (score*-10*3)
self.assertEqual(hybrid_res["hits"][2]["_score"], -30.0)


def test_hybrid_search_same_retrieval_and_ranking_matches_original_method(self):
"""
Tests that hybrid search with:
Expand Down Expand Up @@ -1325,6 +1389,15 @@ def test_hybrid_search_invalid_parameters_fails(self):
]
},
}, "can only be defined for 'lexical',"),
({ # tensor/lexical can only have lexical score modifiers
"retrievalMethod": "tensor",
"rankingMethod": "lexical",
"scoreModifiersTensor": {
"multiply_score_by": [
{"field_name": "mult_field_1", "weight": 1.0}
]
},
}, "can only be defined for 'tensor',"),
({
"retrievalMethod": "lexical",
"rankingMethod": "lexical",
Expand Down
2 changes: 1 addition & 1 deletion tests/tensor_search/test_pagination.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def test_pagination_hybrid(self):
batch_size = 100

# TODO: Add unstructured index when supported
for index in [self.index_structured]:
for index in [self.index_structured, self.index_unstructured]:
for _ in range(0, num_docs, batch_size):
r = tensor_search.add_documents(
config=self.config,
Expand Down
30 changes: 21 additions & 9 deletions vespa/src/main/java/ai/marqo/search/HybridSearcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -273,8 +273,7 @@ HitGroup rrf(
* @param query
* @param verbose
*/
void addFieldToRankFeatures(
Cell cell, Query query, String retrievalMethod, String rankingMethod, boolean verbose) {
void addFieldToRankFeatures(Cell cell, Query query, boolean verbose) {
TensorAddress cellKey = cell.getKey();
String queryInputString;
int dimensions = cellKey.size();
Expand Down Expand Up @@ -329,16 +328,29 @@ Query createSubQuery(
Query queryNew = query.clone();
queryNew.properties().set("yql", yqlNew);

// Set fields to rank (extract using RANKING method)
// Set fields to rank
// Extract using RETRIEVAL method (first-phase)
String featureNameFieldsToRank =
addQueryWrapper(QUERY_INPUT_FIELDS_TO_RANK + "_" + rankingMethod);
logIfVerbose("Using fields to rank from " + featureNameFieldsToRank, verbose);
addQueryWrapper(QUERY_INPUT_FIELDS_TO_RANK + "_" + retrievalMethod);
logIfVerbose(
"Extracting using fields to rank from RETRIEVAL method: " + featureNameFieldsToRank,
verbose);
Tensor fieldsToRank = extractTensorRankFeature(query, featureNameFieldsToRank);
Iterator<Cell> cells = fieldsToRank.cellIterator();
cells.forEachRemaining(
(cell) ->
addFieldToRankFeatures(
cell, queryNew, retrievalMethod, rankingMethod, verbose));
cells.forEachRemaining((cell) -> addFieldToRankFeatures(cell, queryNew, verbose));

// Extract using RANKING method (second-phase)
if (!(retrievalMethod.equals(rankingMethod))) {
featureNameFieldsToRank =
addQueryWrapper(QUERY_INPUT_FIELDS_TO_RANK + "_" + rankingMethod);
logIfVerbose(
"Extracting using fields to rank from RANKING method: "
+ featureNameFieldsToRank,
verbose);
fieldsToRank = extractTensorRankFeature(query, featureNameFieldsToRank);
cells = fieldsToRank.cellIterator();
cells.forEachRemaining((cell) -> addFieldToRankFeatures(cell, queryNew, verbose));
}

// Set rank profile (using RANKING method)
queryNew.getRanking().setProfile(rankProfileNew);
Expand Down
Loading

0 comments on commit 879e8d1

Please sign in to comment.