MB-69641: Add support for retrieval of multi-vector documents (#2258)

CascadingRadium · Copilot · abhinavdangeti · web-flow · commit 346764b88979 · 2025-12-15T12:05:10.000+05:30
- KNN search can now be performed on documents that contain a field with multiple vectors (e.g., vectors in JSON object arrays or arrays of vectors). For such documents, we select only the best-scoring vector and discard the rest, assigning the document the similarity score of that best vector.
- Added unit tests to verify this behavior.
- Fixed a bug where using a filter query that marked every document as ineligible in a multi-vector query could corrupt the internal state of the KNN collector.
- Fixed a bug where a KNN query was incorrectly marked as non-fieldable because it did not adhere to the `FieldableQuery` interface.
- Requires: blevesearch/zapx#348

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Abhinav Dangeti <abhinav@couchbase.com>
diff --git a/go.mod b/go.mod
@@ -26,7 +26,7 @@ require (
 	github.com/blevesearch/zapx/v13 v13.4.2
 	github.com/blevesearch/zapx/v14 v14.4.2
 	github.com/blevesearch/zapx/v15 v15.4.2
-	github.com/blevesearch/zapx/v16 v16.2.7
+	github.com/blevesearch/zapx/v16 v16.2.8
 	github.com/couchbase/moss v0.2.0
 	github.com/spf13/cobra v1.8.1
 	go.etcd.io/bbolt v1.4.0
diff --git a/go.sum b/go.sum
@@ -44,8 +44,8 @@ github.com/blevesearch/zapx/v14 v14.4.2 h1:2SGHakVKd+TrtEqpfeq8X+So5PShQ5nW6GNxT
 github.com/blevesearch/zapx/v14 v14.4.2/go.mod h1:rz0XNb/OZSMjNorufDGSpFpjoFKhXmppH9Hi7a877D8=
 github.com/blevesearch/zapx/v15 v15.4.2 h1:sWxpDE0QQOTjyxYbAVjt3+0ieu8NCE0fDRaFxEsp31k=
 github.com/blevesearch/zapx/v15 v15.4.2/go.mod h1:1pssev/59FsuWcgSnTa0OeEpOzmhtmr/0/11H0Z8+Nw=
-github.com/blevesearch/zapx/v16 v16.2.7 h1:xcgFRa7f/tQXOwApVq7JWgPYSlzyUMmkuYa54tMDuR0=
-github.com/blevesearch/zapx/v16 v16.2.7/go.mod h1:murSoCJPCk25MqURrcJaBQ1RekuqSCSfMjXH4rHyA14=
+github.com/blevesearch/zapx/v16 v16.2.8 h1:SlnzF0YGtSlrsOE3oE7EgEX6BIepGpeqxs1IjMbHLQI=
+github.com/blevesearch/zapx/v16 v16.2.8/go.mod h1:murSoCJPCk25MqURrcJaBQ1RekuqSCSfMjXH4rHyA14=
 github.com/couchbase/ghistogram v0.1.0 h1:b95QcQTCzjTUocDXp/uMgSNQi8oj1tGwnJ4bODWZnps=
 github.com/couchbase/ghistogram v0.1.0/go.mod h1:s1Jhy76zqfEecpNWJfWUiKZookAFaiGOEoyzgHt9i7k=
 github.com/couchbase/moss v0.2.0 h1:VCYrMzFwEryyhRSeI+/b3tRBSeTpi/8gn5Kf6dxqn+o=
diff --git a/search/facet/facet_builder_terms_test.go b/search/facet/facet_builder_terms_test.go
@@ -201,11 +201,11 @@ func TestTermsFacetPrefixAndRegex(t *testing.T) {
 	terms := []string{
 		"env:prod",
 		"env:staging",
-		"env:dev",      // has prefix but doesn't match regex
-		"env:test",     // has prefix but doesn't match regex
-		"type:server",  // no prefix
-		"env:prod",     // duplicate
-		"env:staging",  // duplicate
+		"env:dev",     // has prefix but doesn't match regex
+		"env:test",    // has prefix but doesn't match regex
+		"type:server", // no prefix
+		"env:prod",    // duplicate
+		"env:staging", // duplicate
 	}
 
 	for _, term := range terms {
diff --git a/search/query/knn.go b/search/query/knn.go
@@ -53,7 +53,7 @@ func (q *KNNQuery) SetK(k int64) {
 	q.K = k
 }
 
-func (q *KNNQuery) SetFieldVal(field string) {
+func (q *KNNQuery) SetField(field string) {
 	q.VectorField = field
 }
 
diff --git a/search_knn.go b/search_knn.go
@@ -288,10 +288,15 @@ func createKNNQuery(req *SearchRequest, knnFilterResults map[int]index.EligibleD
 			// If it's a filtered kNN but has no eligible filter hits, then
 			// do not run the kNN query.
 			if selector, exists := knnFilterResults[i]; exists && selector == nil {
+				// Instead of running it, add a match_none query to the subQueries
+				// (and a corresponding k of 0 to kArray) in place of this kNN query.
+				// This will ensure that the score breakdown is set to 0 for this kNN query.
+				subQueries = append(subQueries, NewMatchNoneQuery())
+				kArray = append(kArray, 0)
 				continue
 			}
 			knnQuery := query.NewKNNQuery(knn.Vector)
-			knnQuery.SetFieldVal(knn.Field)
+			knnQuery.SetField(knn.Field)
 			knnQuery.SetK(knn.K)
 			knnQuery.SetBoost(knn.Boost.Value())
 			knnQuery.SetParams(knn.Params)
diff --git a/search_knn_test.go b/search_knn_test.go
@@ -1281,23 +1281,29 @@ func TestKNNScoreBoosting(t *testing.T) {
 	searchRequest.AddKNN("vector", queryVec, 3, 1.0)
 	searchRequest.Fields = []string{"content", "vector"}
 
-	hits, _ := index.Search(searchRequest)
+	hits, err := index.Search(searchRequest)
+	if err != nil {
+		t.Fatal(err)
+	}
 	hitsMap := make(map[string]float64, 0)
 	for _, hit := range hits.Hits {
 		hitsMap[hit.ID] = (hit.Score)
 	}
 
-	searchRequest2 := NewSearchRequest(NewMatchNoneQuery())
+	searchRequest = NewSearchRequest(NewMatchNoneQuery())
 	searchRequest.AddKNN("vector", queryVec, 3, 10.0)
 	searchRequest.Fields = []string{"content", "vector"}
 
-	hits2, _ := index.Search(searchRequest2)
+	hits, err = index.Search(searchRequest)
+	if err != nil {
+		t.Fatal(err)
+	}
 	hitsMap2 := make(map[string]float64, 0)
-	for _, hit := range hits2.Hits {
+	for _, hit := range hits.Hits {
 		hitsMap2[hit.ID] = (hit.Score)
 	}
 
-	for _, hit := range hits2.Hits {
+	for _, hit := range hits.Hits {
 		if hitsMap[hit.ID] != hitsMap2[hit.ID]/10 {
 			t.Errorf("boosting not working: %v %v \n", hitsMap[hit.ID], hitsMap2[hit.ID])
 		}
@@ -1645,6 +1651,210 @@ func TestNestedVectors(t *testing.T) {
 	}
 }
 
+// -----------------------------------------------------------------------------
+// TestMultiVector tests KNN search when several vectors of the same document
+// match a query. When a document has multiple vectors (via [[]] array of vectors
+// or [{}] array of objects with vectors), the KNN searcher must pick the best
+// scoring vector match for that document. This test covers these scenarios:
+// - Single vector field (baseline)
+// - [[]] style: array of vectors (same doc appears multiple times)
+// - [{}] style: array of objects with vector field (chunks pattern)
+func TestMultiVector(t *testing.T) {
+	tmpIndexPath := createTmpIndexPath(t)
+	defer cleanupTmpIndexPath(t, tmpIndexPath)
+
+	// JSON documents covering merger scenarios:
+	// - Single vector (baseline)
+	// - [[]] style: array of vectors (same doc appears multiple times)
+	// - [{}] style: array of objects with vector field (chunks pattern)
+	docs := map[string]string{
+		// Single vector - baseline
+		"doc1": `{
+			"vec": [10, 10, 10],
+			"vecB": [100, 100, 100]
+		}`,
+		// [[]] style - array of 2 vectors
+		"doc2": `{
+			"vec": [[0, 0, 0], [500, 500, 500]],
+			"vecB": [[900, 900, 900], [950, 950, 950], [975, 975, 975], [990, 990, 990]]
+		}`,
+		// [[]] style - array of 3 vectors
+		"doc3": `{ 
+			"vec": [[50, 50, 50], [200, 200, 200], [400, 400, 400]],
+			"vecB": [[800, 800, 800], [850, 850, 850]]
+		}`,
+		// Single vector - baseline
+		"doc4": `{
+			"vec": [1000, 1000, 1000],
+			"vecB": [1, 1, 1]
+		}`,
+		// [{}] style - array of objects with vector field (chunks pattern)
+		"doc5": `{
+			"chunks": [
+				{"vec": [10, 10, 10], "text": "chunk1"},
+				{"vec": [20, 20, 20], "text": "chunk2"},
+				{"vec": [30, 30, 30], "text": "chunk3"},
+				{"vec": [40, 40, 40], "text": "chunk4"}
+			]
+		}`,
+		"doc6": `{
+			"chunks": [
+				{"vec": [[10, 10, 10],[20, 20, 20]], "text": "chunk1"},
+				{"vec": [[30, 30, 30],[40, 40, 40]], "text": "chunk2"}
+			]
+		}`,
+	}
+
+	// Parse JSON documents
+	dataset := make(map[string]map[string]interface{})
+	for docID, jsonStr := range docs {
+		var doc map[string]interface{}
+		if err := json.Unmarshal([]byte(jsonStr), &doc); err != nil {
+			t.Fatalf("failed to unmarshal %s: %v", docID, err)
+		}
+		dataset[docID] = doc
+	}
+
+	// Index mapping
+	indexMapping := NewIndexMapping()
+
+	vecMapping := mapping.NewVectorFieldMapping()
+	vecMapping.Dims = 3
+	vecMapping.Similarity = index.InnerProduct
+	indexMapping.DefaultMapping.AddFieldMappingsAt("vec", vecMapping)
+	indexMapping.DefaultMapping.AddFieldMappingsAt("vecB", vecMapping)
+
+	// Nested chunks mapping for [{}] style
+	chunksMapping := mapping.NewDocumentMapping()
+	chunksMapping.AddFieldMappingsAt("vec", vecMapping)
+	indexMapping.DefaultMapping.AddSubDocumentMapping("chunks", chunksMapping)
+
+	// Create and populate index
+	idx, err := New(tmpIndexPath, indexMapping)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer func() {
+		if err := idx.Close(); err != nil {
+			t.Fatal(err)
+		}
+	}()
+
+	batch := idx.NewBatch()
+	for docID, doc := range dataset {
+		if err := batch.Index(docID, doc); err != nil {
+			t.Fatal(err)
+		}
+	}
+	if err := idx.Batch(batch); err != nil {
+		t.Fatal(err)
+	}
+
+	// Test: Single KNN query - basic functionality
+	t.Run("VecFieldSingle", func(t *testing.T) {
+		searchReq := NewSearchRequest(query.NewMatchNoneQuery())
+		searchReq.AddKNN("vec", []float32{1, 1, 1}, 20, 1.0)
+		res, err := idx.Search(searchReq)
+		if err != nil {
+			t.Fatal(err)
+		}
+		// Inner product: score = sum(query_i * doc_i)
+		// doc1 vec=[10,10,10]: 1*10*3 = 30
+		// doc2 vec best is [500,500,500]: 1*500*3 = 1500
+		// doc3 vec best is [400,400,400]: 1*400*3 = 1200
+		// doc4 vec=[1000,1000,1000]: 1*1000*3 = 3000
+		expectedResult := []struct {
+			docID         string
+			expectedScore float64
+		}{
+			{docID: "doc4", expectedScore: 3000},
+			{docID: "doc2", expectedScore: 1500},
+			{docID: "doc3", expectedScore: 1200},
+			{docID: "doc1", expectedScore: 30},
+		}
+
+		if len(res.Hits) != len(expectedResult) {
+			t.Fatalf("expected %d hits, got %d", len(expectedResult), len(res.Hits))
+		}
+
+		for i, expected := range expectedResult {
+			if res.Hits[i].ID != expected.docID {
+				t.Fatalf("at rank %d, expected docID %s, got %s", i+1, expected.docID, res.Hits[i].ID)
+			}
+			if res.Hits[i].Score != expected.expectedScore {
+				t.Fatalf("at rank %d, expected score %v, got %v", i+1, expected.expectedScore, res.Hits[i].Score)
+			}
+		}
+	})
+
+	// Test: Single KNN query on vecB field
+	t.Run("VecBFieldSingle", func(t *testing.T) {
+		searchReq := NewSearchRequest(query.NewMatchNoneQuery())
+		searchReq.AddKNN("vecB", []float32{1000, 1000, 1000}, 20, 1.0)
+		res, err := idx.Search(searchReq)
+		if err != nil {
+			t.Fatal(err)
+		}
+		// Inner product: score = sum(query_i * doc_i) for each dimension
+		// doc1: vecB=[100,100,100] -> 1000*100*3 = 300,000
+		// doc2: vecB best is [990,990,990] -> 1000*990*3 = 2,970,000
+		// doc3: vecB best is [850,850,850] -> 1000*850*3 = 2,550,000
+		// doc4: vecB=[1,1,1] -> 1000*1*3 = 3,000
+		expectedResult := []struct {
+			docID         string
+			expectedScore float64
+		}{
+			{docID: "doc2", expectedScore: 2970000},
+			{docID: "doc3", expectedScore: 2550000},
+			{docID: "doc1", expectedScore: 300000},
+			{docID: "doc4", expectedScore: 3000},
+		}
+
+		if len(res.Hits) != len(expectedResult) {
+			t.Fatalf("expected %d hits, got %d", len(expectedResult), len(res.Hits))
+		}
+
+		for i, expected := range expectedResult {
+			if res.Hits[i].ID != expected.docID {
+				t.Fatalf("at rank %d, expected docID %s, got %s", i+1, expected.docID, res.Hits[i].ID)
+			}
+			if res.Hits[i].Score != expected.expectedScore {
+				t.Fatalf("at rank %d, expected score %v, got %v", i+1, expected.expectedScore, res.Hits[i].Score)
+			}
+		}
+	})
+
+	// Test: Single KNN query on nested chunks.vec field
+	t.Run("ChunksVecFieldSingle", func(t *testing.T) {
+		searchReq := NewSearchRequest(query.NewMatchNoneQuery())
+		searchReq.AddKNN("chunks.vec", []float32{1, 1, 1}, 20, 1.0)
+		searchReq.SortBy([]string{"_score", "docID"})
+		res, err := idx.Search(searchReq)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		// Only doc5 and doc6 have chunks.vec
+		// doc5 chunks: [10,10,10], [20,20,20], [30,30,30], [40,40,40]
+		//   Best score: 1*40*3 = 120
+		// doc6 chunks: [[10,10,10],[20,20,20]], [[30,30,30],[40,40,40]]
+		//   Best score: 1*40*3 = 120
+		if len(res.Hits) != 2 {
+			t.Fatalf("expected 2 hits, got %d", len(res.Hits))
+		}
+
+		// Both should have score 120
+		for _, hit := range res.Hits {
+			if hit.ID != "doc5" && hit.ID != "doc6" {
+				t.Fatalf("unexpected docID %s, expected doc5 or doc6", hit.ID)
+			}
+			if hit.Score != 120 {
+				t.Fatalf("for %s, expected score 120, got %v", hit.ID, hit.Score)
+			}
+		}
+	})
+}
+
 func TestNumVecsStat(t *testing.T) {
 
 	dataset, _, err := readDatasetAndQueries(testInputCompressedFile)

Original file line number	Diff line number	Diff line change
`@@ -53,7 +53,7 @@ func (q *KNNQuery) SetK(k int64) {`
`53`	`53`	`q.K = k`
`54`	`54`	`}`
`55`	`55`
`56`		`-func (q *KNNQuery) SetFieldVal(field string) {`
	`56`	`+func (q *KNNQuery) SetField(field string) {`
`57`	`57`	`q.VectorField = field`
`58`	`58`	`}`
`59`	`59`