Skip to content

Commit

Permalink
Fix 9.12.0 backcompat break (Lucene 9.12.0 cannot read 9.11.x indices…
Browse files Browse the repository at this point in the history
… written with quantized HNSW, `Lucene99HnswScalarQuantizedVectorsFormat`) (#13874)

* carefully regenerate the int8_hnsw bwc indices so that they do in fact use Lucene99ScalarQuantizedVectorsFormat ... when running TestInt8HnswBackwardsCompatibility it now fails (as expected) on 9.11.0 and 9.11.1 bwc indices, but not on 9.10.0

* rename int8 -> int7 bwc tests since we are actually testing 7 bit quantization

* actually fix the bwc bug: only allow compress=true when bits is 7 or 8 in HNSW scalar quantization

* tidy

* Revert "rename int8 -> int7 bwc tests since we are actually testing 7 bit quantization"

This reverts commit eeb3f8a.

* Reapply "rename int8 -> int7 bwc tests since we are actually testing 7 bit quantization"

This reverts commit 3487c42.

* #13880: add test to verify the int7 quantized indices are in fact using quantized vectors not float32

* bump 9.12.x version to 9.12.1 and add bwc indices for 9.12.0

* remove duplicate 9.12.0 Version constant

* revert changes to index.9.12.0-cfs.zip, index.9.12.0-nocfs.zip, sorted.9.12.0.zip

* remove unused bwc index

Closes #13867
Closes #13880
  • Loading branch information
mikemccand committed Oct 9, 2024
1 parent e6bb5e2 commit eadc07c
Show file tree
Hide file tree
Showing 12 changed files with 48 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -106,8 +106,8 @@ public abstract class BackwardsCompatibilityTestBase extends LuceneTestCase {
* This is a base constructor for parameterized BWC tests. The constructor arguments are provided
* by {@link com.carrotsearch.randomizedtesting.RandomizedRunner} during test execution. A {@link
* com.carrotsearch.randomizedtesting.annotations.ParametersFactory} specified in a subclass
* provides a list lists of arguments for the tests and RandomizedRunner will execute the test for
* each of the argument list.
* provides a list of argument lists for the tests and RandomizedRunner will execute the test once
* for each argument list.
*
* @param version the version this test should run for
* @param indexPattern an index pattern in order to open an index of see {@link
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ public class TestGenerateBwcIndices extends LuceneTestCase {
// To generate backcompat indexes with the current default codec, run the following gradle
// command:
// gradlew test -Ptests.bwcdir=/path/to/store/indexes -Ptests.codec=default
// -Ptests.useSecurityManager=false --tests TestGenerateBwcIndices
// -Ptests.useSecurityManager=false --tests TestGenerateBwcIndices --max-workers=1
//
// Also add testmethod with one of the index creation methods below, for example:
// -Ptestmethod=testCreateCFS
Expand Down Expand Up @@ -82,14 +82,14 @@ public void testCreateSortedIndex() throws IOException {
sortedTest.createBWCIndex();
}

public void testCreateInt8HNSWIndices() throws IOException {
TestInt8HnswBackwardsCompatibility int8HnswBackwardsCompatibility =
new TestInt8HnswBackwardsCompatibility(
public void testCreateInt7HNSWIndices() throws IOException {
TestInt7HnswBackwardsCompatibility int7HnswBackwardsCompatibility =
new TestInt7HnswBackwardsCompatibility(
Version.LATEST,
createPattern(
TestInt8HnswBackwardsCompatibility.INDEX_NAME,
TestInt8HnswBackwardsCompatibility.SUFFIX));
int8HnswBackwardsCompatibility.createBWCIndex();
TestInt7HnswBackwardsCompatibility.INDEX_NAME,
TestInt7HnswBackwardsCompatibility.SUFFIX));
int7HnswBackwardsCompatibility.createBWCIndex();
}

private boolean isInitialMajorVersionRelease() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,41 +23,46 @@
import org.apache.lucene.backward_codecs.lucene99.Lucene99Codec;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.KnnVectorsReader;
import org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader;
import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.KnnFloatVectorField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.CodecReader;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NoMergePolicy;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.analysis.MockAnalyzer;
import org.apache.lucene.tests.util.TestUtil;
import org.apache.lucene.util.Version;
import org.apache.lucene.util.quantization.QuantizedByteVectorValues;

public class TestInt8HnswBackwardsCompatibility extends BackwardsCompatibilityTestBase {
public class TestInt7HnswBackwardsCompatibility extends BackwardsCompatibilityTestBase {

static final String INDEX_NAME = "int8_hnsw";
static final String INDEX_NAME = "int7_hnsw";
static final String SUFFIX = "";
private static final Version FIRST_INT8_HNSW_VERSION = Version.LUCENE_9_10_0;
private static final Version FIRST_INT7_HNSW_VERSION = Version.LUCENE_9_10_0;
private static final String KNN_VECTOR_FIELD = "knn_field";
private static final int DOC_COUNT = 30;
private static final FieldType KNN_VECTOR_FIELD_TYPE =
KnnFloatVectorField.createFieldType(3, VectorSimilarityFunction.COSINE);
private static final float[] KNN_VECTOR = {0.2f, -0.1f, 0.1f};

public TestInt8HnswBackwardsCompatibility(Version version, String pattern) {
public TestInt7HnswBackwardsCompatibility(Version version, String pattern) {
super(version, pattern);
}

/** Provides all sorted versions to the test-framework */
@ParametersFactory(argumentFormatting = "Lucene-Version:%1$s; Pattern: %2$s")
public static Iterable<Object[]> testVersionsFactory() throws IllegalAccessException {
return allVersion(INDEX_NAME, SUFFIX);
Expand All @@ -76,15 +81,15 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) {

@Override
protected boolean supportsVersion(Version version) {
return version.onOrAfter(FIRST_INT8_HNSW_VERSION);
return version.onOrAfter(FIRST_INT7_HNSW_VERSION);
}

@Override
void verifyUsesDefaultCodec(Directory dir, String name) throws IOException {
// Intentionally a no-op: this test does not use the default codec, so there is nothing to
// verify for the given directory.
}

public void testInt8HnswIndexAndSearch() throws Exception {
public void testInt7HnswIndexAndSearch() throws Exception {
IndexWriterConfig indexWriterConfig =
newIndexWriterConfig(new MockAnalyzer(random()))
.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
Expand All @@ -108,7 +113,6 @@ public void testInt8HnswIndexAndSearch() throws Exception {
assertKNNSearch(searcher, KNN_VECTOR, 10, 10, "0");
}
}
// This will confirm the docs are really sorted
TestUtil.checkIndex(directory);
}

Expand All @@ -117,7 +121,7 @@ protected void createIndex(Directory dir) throws IOException {
IndexWriterConfig conf =
new IndexWriterConfig(new MockAnalyzer(random()))
.setMaxBufferedDocs(10)
.setCodec(TestUtil.getDefaultCodec())
.setCodec(getCodec())
.setMergePolicy(NoMergePolicy.INSTANCE);
try (IndexWriter writer = new IndexWriter(dir, conf)) {
for (int i = 0; i < DOC_COUNT; i++) {
Expand Down Expand Up @@ -147,4 +151,29 @@ public void testReadOldIndices() throws Exception {
assertKNNSearch(searcher, KNN_VECTOR, 10, 10, "0");
}
}

/**
 * Regression test for #13880: makes sure the BWC index really contains quantized HNSW vectors and
 * not float32 ones.
 *
 * <p>For every leaf of the index this checks that the segment's vector reader is a {@link
 * PerFieldKnnVectorsFormat.FieldsReader}, that the reader it resolves for {@code
 * KNN_VECTOR_FIELD} is a {@link Lucene99HnswVectorsReader}, and that this reader returns non-null
 * quantized vector values for the field.
 *
 * @throws Exception if the index cannot be opened or read
 */
public void testIndexIsReallyQuantized() throws Exception {
  try (DirectoryReader reader = DirectoryReader.open(directory)) {
    for (LeafReaderContext leafContext : reader.leaves()) {
      KnnVectorsReader knnVectorsReader = ((CodecReader) leafContext.reader()).getVectorReader();
      assertTrue(
          "expected PerFieldKnnVectorsFormat.FieldsReader but got: " + knnVectorsReader,
          knnVectorsReader instanceof PerFieldKnnVectorsFormat.FieldsReader);

      KnnVectorsReader forField =
          ((PerFieldKnnVectorsFormat.FieldsReader) knnVectorsReader)
              .getFieldReader(KNN_VECTOR_FIELD);

      // Report the actual reader on failure, like the neighbouring assertions do; otherwise a
      // failure here gives no hint about which format the index was really written with.
      assertTrue(
          "expected Lucene99HnswVectorsReader for field "
              + KNN_VECTOR_FIELD
              + " but got: "
              + forField,
          forField instanceof Lucene99HnswVectorsReader);

      QuantizedByteVectorValues quantized =
          ((Lucene99HnswVectorsReader) forField).getQuantizedVectorValues(KNN_VECTOR_FIELD);

      assertNotNull(
          "KnnVectorsReader should have quantized interface for field " + KNN_VECTOR_FIELD,
          quantized);
    }
  }
}
}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ static void validateFieldEntry(FieldInfo info, FieldEntry fieldEntry) {
}

final long quantizedVectorBytes;
if (fieldEntry.compress) {
if (fieldEntry.bits <= 4 && fieldEntry.compress) {
// two dimensions -> one byte
quantizedVectorBytes = ((dimension + 1) >> 1) + Float.BYTES;
} else {
Expand Down

0 comments on commit eadc07c

Please sign in to comment.