Skip to content

Commit 346764b

Browse files
CascadingRadium, Copilot, abhinavdangeti
authored
MB-69641: Add support for retrieval of multi-vector documents (#2258)
- KNN search can now be performed on documents that contain a field with multiple vectors (e.g., vectors in JSON object arrays or arrays of vectors). For such documents, we select only the best-scoring vector and discard the rest, assigning the document the similarity score of that best vector.
- Added unit tests to verify this behavior.
- Fixed a bug where using a filter query that marked every document as ineligible in a multi-vector query could corrupt the internal state of the KNN collector.
- Fixed a bug where a KNN query was incorrectly marked as non-fieldable due to not adhering to the `FieldableQuery` interface.
- Requires: blevesearch/zapx#348

---------

Co-authored-by: Copilot <[email protected]>
Co-authored-by: Abhinav Dangeti <[email protected]>
1 parent 5b4af86 commit 346764b

File tree

6 files changed

+230
-15
lines changed

6 files changed

+230
-15
lines changed

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ require (
2626
github.com/blevesearch/zapx/v13 v13.4.2
2727
github.com/blevesearch/zapx/v14 v14.4.2
2828
github.com/blevesearch/zapx/v15 v15.4.2
29-
github.com/blevesearch/zapx/v16 v16.2.7
29+
github.com/blevesearch/zapx/v16 v16.2.8
3030
github.com/couchbase/moss v0.2.0
3131
github.com/spf13/cobra v1.8.1
3232
go.etcd.io/bbolt v1.4.0

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,8 @@ github.com/blevesearch/zapx/v14 v14.4.2 h1:2SGHakVKd+TrtEqpfeq8X+So5PShQ5nW6GNxT
4444
github.com/blevesearch/zapx/v14 v14.4.2/go.mod h1:rz0XNb/OZSMjNorufDGSpFpjoFKhXmppH9Hi7a877D8=
4545
github.com/blevesearch/zapx/v15 v15.4.2 h1:sWxpDE0QQOTjyxYbAVjt3+0ieu8NCE0fDRaFxEsp31k=
4646
github.com/blevesearch/zapx/v15 v15.4.2/go.mod h1:1pssev/59FsuWcgSnTa0OeEpOzmhtmr/0/11H0Z8+Nw=
47-
github.com/blevesearch/zapx/v16 v16.2.7 h1:xcgFRa7f/tQXOwApVq7JWgPYSlzyUMmkuYa54tMDuR0=
48-
github.com/blevesearch/zapx/v16 v16.2.7/go.mod h1:murSoCJPCk25MqURrcJaBQ1RekuqSCSfMjXH4rHyA14=
47+
github.com/blevesearch/zapx/v16 v16.2.8 h1:SlnzF0YGtSlrsOE3oE7EgEX6BIepGpeqxs1IjMbHLQI=
48+
github.com/blevesearch/zapx/v16 v16.2.8/go.mod h1:murSoCJPCk25MqURrcJaBQ1RekuqSCSfMjXH4rHyA14=
4949
github.com/couchbase/ghistogram v0.1.0 h1:b95QcQTCzjTUocDXp/uMgSNQi8oj1tGwnJ4bODWZnps=
5050
github.com/couchbase/ghistogram v0.1.0/go.mod h1:s1Jhy76zqfEecpNWJfWUiKZookAFaiGOEoyzgHt9i7k=
5151
github.com/couchbase/moss v0.2.0 h1:VCYrMzFwEryyhRSeI+/b3tRBSeTpi/8gn5Kf6dxqn+o=

search/facet/facet_builder_terms_test.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -201,11 +201,11 @@ func TestTermsFacetPrefixAndRegex(t *testing.T) {
201201
terms := []string{
202202
"env:prod",
203203
"env:staging",
204-
"env:dev", // has prefix but doesn't match regex
205-
"env:test", // has prefix but doesn't match regex
206-
"type:server", // no prefix
207-
"env:prod", // duplicate
208-
"env:staging", // duplicate
204+
"env:dev", // has prefix but doesn't match regex
205+
"env:test", // has prefix but doesn't match regex
206+
"type:server", // no prefix
207+
"env:prod", // duplicate
208+
"env:staging", // duplicate
209209
}
210210

211211
for _, term := range terms {

search/query/knn.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ func (q *KNNQuery) SetK(k int64) {
5353
q.K = k
5454
}
5555

56-
func (q *KNNQuery) SetFieldVal(field string) {
56+
func (q *KNNQuery) SetField(field string) {
5757
q.VectorField = field
5858
}
5959

search_knn.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -288,10 +288,15 @@ func createKNNQuery(req *SearchRequest, knnFilterResults map[int]index.EligibleD
288288
// If it's a filtered kNN but has no eligible filter hits, then
289289
// do not run the kNN query.
290290
if selector, exists := knnFilterResults[i]; exists && selector == nil {
291+
// if the kNN query is filtered and has no eligible filter hits, then
292+
// do not run the kNN query, so we add a match_none query to the subQueries.
293+
// this will ensure that the score breakdown is set to 0 for this kNN query.
294+
subQueries = append(subQueries, NewMatchNoneQuery())
295+
kArray = append(kArray, 0)
291296
continue
292297
}
293298
knnQuery := query.NewKNNQuery(knn.Vector)
294-
knnQuery.SetFieldVal(knn.Field)
299+
knnQuery.SetField(knn.Field)
295300
knnQuery.SetK(knn.K)
296301
knnQuery.SetBoost(knn.Boost.Value())
297302
knnQuery.SetParams(knn.Params)

search_knn_test.go

Lines changed: 215 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1281,23 +1281,29 @@ func TestKNNScoreBoosting(t *testing.T) {
12811281
searchRequest.AddKNN("vector", queryVec, 3, 1.0)
12821282
searchRequest.Fields = []string{"content", "vector"}
12831283

1284-
hits, _ := index.Search(searchRequest)
1284+
hits, err := index.Search(searchRequest)
1285+
if err != nil {
1286+
t.Fatal(err)
1287+
}
12851288
hitsMap := make(map[string]float64, 0)
12861289
for _, hit := range hits.Hits {
12871290
hitsMap[hit.ID] = (hit.Score)
12881291
}
12891292

1290-
searchRequest2 := NewSearchRequest(NewMatchNoneQuery())
1293+
searchRequest = NewSearchRequest(NewMatchNoneQuery())
12911294
searchRequest.AddKNN("vector", queryVec, 3, 10.0)
12921295
searchRequest.Fields = []string{"content", "vector"}
12931296

1294-
hits2, _ := index.Search(searchRequest2)
1297+
hits, err = index.Search(searchRequest)
1298+
if err != nil {
1299+
t.Fatal(err)
1300+
}
12951301
hitsMap2 := make(map[string]float64, 0)
1296-
for _, hit := range hits2.Hits {
1302+
for _, hit := range hits.Hits {
12971303
hitsMap2[hit.ID] = (hit.Score)
12981304
}
12991305

1300-
for _, hit := range hits2.Hits {
1306+
for _, hit := range hits.Hits {
13011307
if hitsMap[hit.ID] != hitsMap2[hit.ID]/10 {
13021308
t.Errorf("boosting not working: %v %v \n", hitsMap[hit.ID], hitsMap2[hit.ID])
13031309
}
@@ -1645,6 +1651,210 @@ func TestNestedVectors(t *testing.T) {
16451651
}
16461652
}
16471653

1654+
// -----------------------------------------------------------------------------
1655+
// TestMultiVector tests the KNN functionality which handles duplicate
1656+
// vectors being matched within the same document. When a document has multiple vectors
1657+
// (via [[]] array of vectors or [{}] array of objects with vectors), the KNN
1658+
// searcher must pick the best scoring vector match for that document. This test covers these scenarios:
1659+
// - Single vector field (baseline)
1660+
// - [[]] style: array of vectors (same doc appears multiple times)
1661+
// - [{}] style: array of objects with vector field (chunks pattern)
1662+
func TestMultiVector(t *testing.T) {
1663+
tmpIndexPath := createTmpIndexPath(t)
1664+
defer cleanupTmpIndexPath(t, tmpIndexPath)
1665+
1666+
// JSON documents covering the multi-vector scenarios:
1667+
// - Single vector (baseline)
1668+
// - [[]] style: array of vectors (same doc appears multiple times)
1669+
// - [{}] style: array of objects with vector field (chunks pattern)
1670+
docs := map[string]string{
1671+
// Single vector - baseline
1672+
"doc1": `{
1673+
"vec": [10, 10, 10],
1674+
"vecB": [100, 100, 100]
1675+
}`,
1676+
// [[]] style - array of 2 vectors
1677+
"doc2": `{
1678+
"vec": [[0, 0, 0], [500, 500, 500]],
1679+
"vecB": [[900, 900, 900], [950, 950, 950], [975, 975, 975], [990, 990, 990]]
1680+
}`,
1681+
// [[]] style - array of 3 vectors
1682+
"doc3": `{
1683+
"vec": [[50, 50, 50], [200, 200, 200], [400, 400, 400]],
1684+
"vecB": [[800, 800, 800], [850, 850, 850]]
1685+
}`,
1686+
// Single vector - baseline
1687+
"doc4": `{
1688+
"vec": [1000, 1000, 1000],
1689+
"vecB": [1, 1, 1]
1690+
}`,
1691+
// [{}] style - array of objects with vector field (chunks pattern)
1692+
"doc5": `{
1693+
"chunks": [
1694+
{"vec": [10, 10, 10], "text": "chunk1"},
1695+
{"vec": [20, 20, 20], "text": "chunk2"},
1696+
{"vec": [30, 30, 30], "text": "chunk3"},
1697+
{"vec": [40, 40, 40], "text": "chunk4"}
1698+
]
1699+
}`,
1700+
"doc6": `{
1701+
"chunks": [
1702+
{"vec": [[10, 10, 10],[20, 20, 20]], "text": "chunk1"},
1703+
{"vec": [[30, 30, 30],[40, 40, 40]], "text": "chunk2"}
1704+
]
1705+
}`,
1706+
}
1707+
1708+
// Parse JSON documents
1709+
dataset := make(map[string]map[string]interface{})
1710+
for docID, jsonStr := range docs {
1711+
var doc map[string]interface{}
1712+
if err := json.Unmarshal([]byte(jsonStr), &doc); err != nil {
1713+
t.Fatalf("failed to unmarshal %s: %v", docID, err)
1714+
}
1715+
dataset[docID] = doc
1716+
}
1717+
1718+
// Index mapping
1719+
indexMapping := NewIndexMapping()
1720+
1721+
vecMapping := mapping.NewVectorFieldMapping()
1722+
vecMapping.Dims = 3
1723+
vecMapping.Similarity = index.InnerProduct
1724+
indexMapping.DefaultMapping.AddFieldMappingsAt("vec", vecMapping)
1725+
indexMapping.DefaultMapping.AddFieldMappingsAt("vecB", vecMapping)
1726+
1727+
// Nested chunks mapping for [{}] style
1728+
chunksMapping := mapping.NewDocumentMapping()
1729+
chunksMapping.AddFieldMappingsAt("vec", vecMapping)
1730+
indexMapping.DefaultMapping.AddSubDocumentMapping("chunks", chunksMapping)
1731+
1732+
// Create and populate index
1733+
idx, err := New(tmpIndexPath, indexMapping)
1734+
if err != nil {
1735+
t.Fatal(err)
1736+
}
1737+
defer func() {
1738+
if err := idx.Close(); err != nil {
1739+
t.Fatal(err)
1740+
}
1741+
}()
1742+
1743+
batch := idx.NewBatch()
1744+
for docID, doc := range dataset {
1745+
if err := batch.Index(docID, doc); err != nil {
1746+
t.Fatal(err)
1747+
}
1748+
}
1749+
if err := idx.Batch(batch); err != nil {
1750+
t.Fatal(err)
1751+
}
1752+
1753+
// Test: Single KNN query - basic functionality
1754+
t.Run("VecFieldSingle", func(t *testing.T) {
1755+
searchReq := NewSearchRequest(query.NewMatchNoneQuery())
1756+
searchReq.AddKNN("vec", []float32{1, 1, 1}, 20, 1.0)
1757+
res, err := idx.Search(searchReq)
1758+
if err != nil {
1759+
t.Fatal(err)
1760+
}
1761+
// Inner product: score = sum(query_i * doc_i)
1762+
// doc1 vec=[10,10,10]: 1*10*3 = 30
1763+
// doc2 vec best is [500,500,500]: 1*500*3 = 1500
1764+
// doc3 vec best is [400,400,400]: 1*400*3 = 1200
1765+
// doc4 vec=[1000,1000,1000]: 1*1000*3 = 3000
1766+
expectedResult := []struct {
1767+
docID string
1768+
expectedScore float64
1769+
}{
1770+
{docID: "doc4", expectedScore: 3000},
1771+
{docID: "doc2", expectedScore: 1500},
1772+
{docID: "doc3", expectedScore: 1200},
1773+
{docID: "doc1", expectedScore: 30},
1774+
}
1775+
1776+
if len(res.Hits) != len(expectedResult) {
1777+
t.Fatalf("expected %d hits, got %d", len(expectedResult), len(res.Hits))
1778+
}
1779+
1780+
for i, expected := range expectedResult {
1781+
if res.Hits[i].ID != expected.docID {
1782+
t.Fatalf("at rank %d, expected docID %s, got %s", i+1, expected.docID, res.Hits[i].ID)
1783+
}
1784+
if res.Hits[i].Score != expected.expectedScore {
1785+
t.Fatalf("at rank %d, expected score %v, got %v", i+1, expected.expectedScore, res.Hits[i].Score)
1786+
}
1787+
}
1788+
})
1789+
1790+
// Test: Single KNN query on vecB field
1791+
t.Run("VecBFieldSingle", func(t *testing.T) {
1792+
searchReq := NewSearchRequest(query.NewMatchNoneQuery())
1793+
searchReq.AddKNN("vecB", []float32{1000, 1000, 1000}, 20, 1.0)
1794+
res, err := idx.Search(searchReq)
1795+
if err != nil {
1796+
t.Fatal(err)
1797+
}
1798+
// Inner product: score = sum(query_i * doc_i) for each dimension
1799+
// doc1: vecB=[100,100,100] -> 1000*100*3 = 300,000
1800+
// doc2: vecB best is [990,990,990] -> 1000*990*3 = 2,970,000
1801+
// doc3: vecB best is [850,850,850] -> 1000*850*3 = 2,550,000
1802+
// doc4: vecB=[1,1,1] -> 1000*1*3 = 3,000
1803+
expectedResult := []struct {
1804+
docID string
1805+
expectedScore float64
1806+
}{
1807+
{docID: "doc2", expectedScore: 2970000},
1808+
{docID: "doc3", expectedScore: 2550000},
1809+
{docID: "doc1", expectedScore: 300000},
1810+
{docID: "doc4", expectedScore: 3000},
1811+
}
1812+
1813+
if len(res.Hits) != len(expectedResult) {
1814+
t.Fatalf("expected %d hits, got %d", len(expectedResult), len(res.Hits))
1815+
}
1816+
1817+
for i, expected := range expectedResult {
1818+
if res.Hits[i].ID != expected.docID {
1819+
t.Fatalf("at rank %d, expected docID %s, got %s", i+1, expected.docID, res.Hits[i].ID)
1820+
}
1821+
if res.Hits[i].Score != expected.expectedScore {
1822+
t.Fatalf("at rank %d, expected score %v, got %v", i+1, expected.expectedScore, res.Hits[i].Score)
1823+
}
1824+
}
1825+
})
1826+
1827+
// Test: Single KNN query on nested chunks.vec field
1828+
t.Run("ChunksVecFieldSingle", func(t *testing.T) {
1829+
searchReq := NewSearchRequest(query.NewMatchNoneQuery())
1830+
searchReq.AddKNN("chunks.vec", []float32{1, 1, 1}, 20, 1.0)
1831+
searchReq.SortBy([]string{"_score", "docID"})
1832+
res, err := idx.Search(searchReq)
1833+
if err != nil {
1834+
t.Fatal(err)
1835+
}
1836+
1837+
// Only doc5 and doc6 have chunks.vec
1838+
// doc5 chunks: [10,10,10], [20,20,20], [30,30,30], [40,40,40]
1839+
// Best score: 1*40*3 = 120
1840+
// doc6 chunks: [[10,10,10],[20,20,20]], [[30,30,30],[40,40,40]]
1841+
// Best score: 1*40*3 = 120
1842+
if len(res.Hits) != 2 {
1843+
t.Fatalf("expected 2 hits, got %d", len(res.Hits))
1844+
}
1845+
1846+
// Both should have score 120
1847+
for _, hit := range res.Hits {
1848+
if hit.ID != "doc5" && hit.ID != "doc6" {
1849+
t.Fatalf("unexpected docID %s, expected doc5 or doc6", hit.ID)
1850+
}
1851+
if hit.Score != 120 {
1852+
t.Fatalf("for %s, expected score 120, got %v", hit.ID, hit.Score)
1853+
}
1854+
}
1855+
})
1856+
}
1857+
16481858
func TestNumVecsStat(t *testing.T) {
16491859

16501860
dataset, _, err := readDatasetAndQueries(testInputCompressedFile)

0 commit comments

Comments
 (0)