Skip to content

Commit a49d1e0

Browse files
lvca and claude committed
perf: improved lsm vector serialization
page0 has now been completely removed; all metadata is stored in the schema JSON. Co-Authored-By: Claude <[email protected]>
1 parent c29176d commit a49d1e0

File tree

4 files changed

+38
-134
lines changed

4 files changed

+38
-134
lines changed

engine/src/main/java/com/arcadedb/index/vector/LSMVectorIndex.java

Lines changed: 24 additions & 101 deletions
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,7 @@
4242
import com.arcadedb.schema.Schema;
4343
import com.arcadedb.schema.Type;
4444
import com.arcadedb.serializer.BinaryComparator;
45-
import com.arcadedb.serializer.json.JSONArray;
4645
import com.arcadedb.serializer.json.JSONObject;
47-
import com.arcadedb.utility.FileUtils;
4846
import com.arcadedb.utility.LockManager;
4947
import io.github.jbellis.jvector.graph.GraphIndexBuilder;
5048
import io.github.jbellis.jvector.graph.GraphSearcher;
@@ -248,7 +246,7 @@ protected LSMVectorIndex(final LSMVectorIndexBuilder builder) throws IOException
248246
this.associatedBucketId = -1; // Will be set via setMetadata()
249247

250248
// Initialize compaction fields
251-
this.currentMutablePages = new AtomicInteger(1); // Start with page 0
249+
this.currentMutablePages = new AtomicInteger(0); // No page0 - start with 0 pages
252250
this.minPagesToScheduleACompaction = builder.getDatabase().getConfiguration()
253251
.getValueAsInteger(com.arcadedb.GlobalConfiguration.INDEX_COMPACTION_MIN_PAGES_SCHEDULE);
254252
this.compactedSubIndex = null;
@@ -258,6 +256,9 @@ protected LSMVectorIndex(final LSMVectorIndexBuilder builder) throws IOException
258256
ComponentFile.MODE.READ_WRITE, DEF_PAGE_SIZE);
259257
this.component.setMainIndex(this);
260258

259+
// Metadata is stored only in schema JSON (via toJSON()), not in pages
260+
// No page0 initialization needed - all pages contain only vector data
261+
261262
initializeGraphIndex();
262263
}
263264

@@ -285,58 +286,6 @@ protected LSMVectorIndex(final DatabaseInternal database, final String name, fin
285286
.getValueAsInteger(com.arcadedb.GlobalConfiguration.INDEX_COMPACTION_MIN_PAGES_SCHEDULE);
286287
this.compactedSubIndex = null;
287288

288-
// Load configuration from metadata file or use sensible defaults
289-
// Metadata will be applied from schema later via applyMetadataFromSchema() if available
290-
JSONObject json = null;
291-
292-
// Try to read from metadata file if it exists (backward compatibility with non-replicated indexes)
293-
String originalFilePath = filePath.replaceAll("\\.[0-9]+\\.[0-9]+\\.v[0-9]+\\." + FILE_EXT + "$", "");
294-
originalFilePath = originalFilePath.replaceAll("\\." + FILE_EXT + "$", "");
295-
final String metadataPath = originalFilePath + ".metadata.json";
296-
final File metadataFile = new File(metadataPath);
297-
298-
if (metadataFile.exists()) {
299-
try {
300-
final String fileContent = FileUtils.readFileAsString(metadataFile);
301-
json = new JSONObject(fileContent);
302-
LogManager.instance().log(this, Level.FINE, "Loaded vector index metadata from file: %s", metadataPath);
303-
} catch (final Exception e) {
304-
LogManager.instance()
305-
.log(this, Level.WARNING, "Failed to read metadata file %s, using defaults: %s", e, metadataPath, e.getMessage());
306-
}
307-
}
308-
309-
// Use sensible defaults if metadata file doesn't exist
310-
// This is normal during schema replication when metadata is embedded in the schema and
311-
// will be applied after construction via applyMetadataFromSchema()
312-
if (json == null) {
313-
LogManager.instance().log(this, Level.FINE,
314-
"Metadata file not found for index %s. Using defaults (will be overridden by schema if available).", name);
315-
json = new JSONObject();
316-
json.put("dimensions", 10); // Default dimensions
317-
json.put("similarityFunction", "COSINE");
318-
json.put("maxConnections", 16);
319-
json.put("beamWidth", 100);
320-
json.put("idPropertyName", "id");
321-
json.put("properties", new JSONArray());
322-
}
323-
324-
// indexName already set in constructor
325-
this.typeName = json.getString("typeName", "");
326-
this.dimensions = json.getInt("dimensions");
327-
this.similarityFunction = VectorSimilarityFunction.valueOf(json.getString("similarityFunction", "COSINE"));
328-
this.maxConnections = json.getInt("maxConnections", 16);
329-
this.beamWidth = json.getInt("beamWidth", 100);
330-
this.idPropertyName = json.getString("idPropertyName", "id");
331-
332-
// Load property names
333-
this.propertyNames = new ArrayList<>();
334-
if (json.has("properties")) {
335-
final var jsonArray = json.getJSONArray("properties");
336-
for (int i = 0; i < jsonArray.length(); i++)
337-
propertyNames.add(jsonArray.getString(i));
338-
}
339-
340289
// Load vectors from pages - only if this is an existing index file
341290
// During replication on replicas, the file may not exist yet and will be created/replicated later
342291
try {
@@ -437,42 +386,18 @@ private void loadVectorsFromPages() {
437386
com.arcadedb.log.LogManager.instance().log(this, java.util.logging.Level.WARNING,
438387
"DEBUG: loadVectorsFromPages STARTED: index=%s, totalPages=%d", indexName, getTotalPages());
439388
try {
440-
// Read header from page 0
441-
final BasePage page0 = getDatabase().getTransaction().getPage(new PageId(getDatabase(), getFileId(), 0), getPageSize());
442-
final ByteBuffer buffer0 = page0.getContent();
443-
buffer0.position(0);
389+
// NOTE: All metadata (dimensions, similarityFunction, maxConnections, beamWidth) comes from schema JSON
390+
// via applyMetadataFromSchema(). Pages contain only vector data, no metadata.
444391

445-
final int storedNextId = buffer0.getInt();
446-
com.arcadedb.log.LogManager.instance().log(this, java.util.logging.Level.WARNING,
447-
"DEBUG: loadVectorsFromPages - page0 storedNextId=%d, index=%s", storedNextId, indexName);
448-
449-
if (storedNextId == 0) {
450-
LogManager.instance().log(this, Level.FINE, "No vectors to load - empty index: " + indexName);
451-
return;
452-
}
453-
454-
// Read and validate metadata
455-
final int storedDimensions = buffer0.getInt();
456-
com.arcadedb.log.LogManager.instance().log(this, java.util.logging.Level.WARNING,
457-
"DEBUG: loadVectorsFromPages - storedDimensions=%d, expectedDimensions=%d, index=%s",
458-
storedDimensions, dimensions, indexName);
459-
460-
if (storedDimensions != dimensions) {
461-
throw new IndexException("Dimension mismatch: expected " + dimensions + " but found " + storedDimensions);
462-
}
463-
464-
// Skip similarity, maxConnections, beamWidth - already set from constructor
465-
buffer0.getInt();
466-
buffer0.getInt();
467-
buffer0.getInt();
468-
469-
nextId.set(storedNextId);
470-
471-
// Read all data pages (1 onwards) in LSM style
392+
// Read all data pages (starting from page 0) in LSM style
472393
final int totalPages = getTotalPages();
473394
int entriesRead = 0;
395+
int maxVectorId = -1; // Track max ID to compute nextId
474396

475-
for (int pageNum = 1; pageNum < totalPages; pageNum++) {
397+
com.arcadedb.log.LogManager.instance().log(this, java.util.logging.Level.WARNING,
398+
"DEBUG: loadVectorsFromPages STARTED: index=%s, totalPages=%d", indexName, totalPages);
399+
400+
for (int pageNum = 0; pageNum < totalPages; pageNum++) {
476401
final BasePage currentPage = getDatabase().getTransaction().getPage(
477402
new PageId(getDatabase(), getFileId(), pageNum), getPageSize());
478403
final ByteBuffer pageBuffer = currentPage.getContent();
@@ -508,6 +433,10 @@ private void loadVectorsFromPages() {
508433

509434
final boolean deleted = pageBuffer.get() == 1;
510435

436+
// Track max vector ID to compute nextId
437+
if (id > maxVectorId)
438+
maxVectorId = id;
439+
511440
// Add/update in registry (LSM style: later entries override earlier ones)
512441
final VectorEntry entry = new VectorEntry(id, rid, vector);
513442
entry.deleted = deleted;
@@ -516,9 +445,12 @@ private void loadVectorsFromPages() {
516445
}
517446
}
518447

448+
// Compute nextId from the maximum vector ID found + 1
449+
nextId.set(maxVectorId + 1);
450+
519451
LogManager.instance().log(this, Level.INFO,
520452
"Loaded " + vectorRegistry.size() + " unique vectors (" + entriesRead + " total entries) from " +
521-
(totalPages - 1) + " pages for index: " + indexName);
453+
totalPages + " pages for index: " + indexName + ", nextId=" + nextId.get());
522454

523455
// Rebuild the graph index with loaded non-deleted vectors
524456
if (!vectorRegistry.isEmpty()) {
@@ -543,27 +475,18 @@ private void persistVectorsDeltaIncremental(final List<Integer> changedVectorIds
543475
"DEBUG: persistVectorsDeltaIncremental called: index=%s, changedVectorIds=%d, totalPages=%d",
544476
indexName, changedVectorIds.size(), getTotalPages());
545477

546-
// Update metadata in page 0
547-
final BasePage page0 = getDatabase().getTransaction().getPageToModify(
548-
new PageId(getDatabase(), getFileId(), 0), getPageSize(), false);
549-
final ByteBuffer buffer0 = page0.getContent();
550-
buffer0.position(0);
551-
buffer0.putInt(nextId.get()); // Update next ID
552-
buffer0.putInt(dimensions);
553-
buffer0.putInt(similarityFunction.ordinal());
554-
buffer0.putInt(maxConnections);
555-
buffer0.putInt(beamWidth);
478+
// NO page0 writes needed! Metadata is stored in schema JSON, nextId is computed from max vector ID during load
556479

557480
if (changedVectorIds.isEmpty())
558481
return;
559482

560483
// Calculate entry size: id(4) + position(8) + bucketId(4) + vector(dimensions*4) + deleted(1)
561484
final int entrySize = 4 + 8 + 4 + (dimensions * 4) + 1;
562485

563-
// Get or create the last mutable page
486+
// Get or create the last mutable page (pages start from 0 now - no page0 metadata)
564487
int lastPageNum = getTotalPages() - 1;
565-
if (lastPageNum < 1) {
566-
lastPageNum = 1;
488+
if (lastPageNum < 0) {
489+
lastPageNum = 0;
567490
createNewVectorDataPage(lastPageNum);
568491
}
569492

engine/src/main/java/com/arcadedb/index/vector/LSMVectorIndexComponent.java

Lines changed: 2 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -73,9 +73,9 @@ protected LSMVectorIndexComponent(final DatabaseInternal database, final String
7373
final ComponentFile.MODE mode, final int pageSize) throws IOException {
7474
super(database, name, filePath, FILE_EXT, mode, pageSize, CURRENT_VERSION);
7575

76-
// Initialize page 0 for new indexes (like LSMTreeIndexMutable does)
76+
// No page0 initialization needed - all pages contain only vector data
77+
// Metadata is stored in schema JSON only
7778
database.checkTransactionIsActive(database.isAutoTransaction());
78-
initializePage0();
7979
}
8080

8181
/**
@@ -157,26 +157,4 @@ public PaginatedComponent getComponent() {
157157
public String getFilePath() {
158158
return filePath;
159159
}
160-
161-
/**
162-
* Initialize page 0 with metadata header.
163-
* Page 0 contains: nextId(4) + dimensions(4) + similarityFunction(4) + maxConnections(4) + beamWidth(4)
164-
* This is called when creating a new index to set up the metadata page.
165-
*/
166-
private void initializePage0() throws IOException {
167-
final PageId pageId = new PageId(database, getFileId(), 0);
168-
final MutablePage page0 = database.isTransactionActive() ?
169-
database.getTransaction().addPage(pageId, getPageSize()) :
170-
new MutablePage(pageId, getPageSize());
171-
172-
final ByteBuffer buffer = page0.getContent();
173-
buffer.position(0);
174-
175-
// Initialize metadata with defaults (will be overwritten by LSMVectorIndex)
176-
buffer.putInt(0); // nextId = 0
177-
buffer.putInt(0); // dimensions (placeholder)
178-
buffer.putInt(0); // similarityFunction (placeholder)
179-
buffer.putInt(0); // maxConnections (placeholder)
180-
buffer.putInt(0); // beamWidth (placeholder)
181-
}
182160
}

server/src/main/java/com/arcadedb/server/ha/message/TxForwardRequest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ public void fromStream(final ArcadeDBServer server, final Binary stream) {
7979

8080
@Override
8181
public HACommand execute(final HAServer server, final String remoteServerName, final long messageNumber) {
82-
final DatabaseInternal db = (DatabaseInternal) server.getServer().getDatabase(databaseName);
82+
final DatabaseInternal db = server.getServer().getDatabase(databaseName);
8383
if (!db.isOpen())
8484
throw new ReplicationException("Database '" + databaseName + "' is closed");
8585

server/src/test/java/com/arcadedb/server/ha/IndexCompactionReplicationIT.java

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -124,15 +124,17 @@ public void lsmVectorCompactionReplication() throws Exception {
124124
final VertexType v = database.getSchema().buildVertexType().withName("Embedding").withTotalBuckets(1).create();
125125
v.createProperty("vector", float[].class);
126126

127-
final String indexName = "Embedding[vector]";
128127
// USE BUILDER FOR VECTOR INDEXES WITH DIMENSION = 10
129-
final com.arcadedb.index.TypeIndex vectorIndex =
130-
database.getSchema().buildTypeIndex("Embedding", new String[]{"vector"})
131-
.withType(Schema.INDEX_TYPE.LSM_VECTOR)
132-
.withDimensions(10)
133-
.create();
128+
com.arcadedb.schema.TypeIndexBuilder builder =
129+
database.getSchema().buildTypeIndex("Embedding", new String[]{"vector"});
130+
builder = builder.withType(Schema.INDEX_TYPE.LSM_VECTOR);
134131

135-
LogManager.instance().log(this, Level.FINE, "Vector index created: %s", indexName);
132+
// Cast to LSMVectorIndexBuilder for vector-specific configuration
133+
((com.arcadedb.schema.LSMVectorIndexBuilder) builder).withDimensions(10);
134+
135+
final com.arcadedb.index.TypeIndex vectorIndex = builder.create();
136+
137+
LogManager.instance().log(this, Level.FINE, "Vector index created: %s", vectorIndex.getName());
136138
Assertions.assertNotNull(vectorIndex, "Vector index should be created successfully");
137139

138140
LogManager.instance().log(this, Level.FINE, "Inserting %d records into vector index...", TOTAL_RECORDS);
@@ -162,13 +164,14 @@ public void lsmVectorCompactionReplication() throws Exception {
162164
Thread.sleep(2000);
163165

164166
// VERIFY THAT VECTOR INDEX DEFINITION IS REPLICATED TO ALL SERVERS
167+
final String actualIndexName = vectorIndex.getName();
165168
testEachServer((serverIndex) -> {
166169
LogManager.instance().log(this, Level.FINE, "Verifying vector index definition on server %d...", serverIndex);
167170

168171
final Database serverDb = getServerDatabase(serverIndex, getDatabaseName());
169172

170173
// Check if the index exists in schema
171-
final com.arcadedb.index.Index serverVectorIndex = serverDb.getSchema().getIndexByName(indexName);
174+
final com.arcadedb.index.Index serverVectorIndex = serverDb.getSchema().getIndexByName(actualIndexName);
172175
if (serverVectorIndex == null) {
173176
// Index not found, check the type's indexes
174177
final com.arcadedb.schema.DocumentType embeddingType = serverDb.getSchema().getType("Embedding");

0 commit comments

Comments
 (0)