Skip to content

Commit 8fe04d8

Browse files
hx235rkhachatryan
authored andcommitted
Fix bug of newer ingested data assigned with an older seqno (#12257)
Summary: **Context:** We found an edge case where newer ingested data is assigned with an older seqno. This causes older data of that key to be returned for read. Consider the following lsm shape: ![image](https://github.com/facebook/rocksdb/assets/83968999/973fd160-5065-49cd-8b7b-b6ab4badae23) Then ingest a file to L5 containing new data of key_overlap. Because of [this](https://l.facebook.com/l.php?u=https%3A%2F%2Fgithub.com%2Ffacebook%2Frocksdb%2Fblob%2F5a26f392ca640818da0b8590be6119699e852b07%2Fdb%2Fexternal_sst_file_ingestion_job.cc%3Ffbclid%3DIwAR10clXxpUSrt6sYg12sUMeHfShS7XigFrsJHvZoUDroQpbj_Sb3dG_JZFc%23L951-L956&h=AT0m56P7O0ZML7jk1sdjgnZZyGPMXg9HkKvBEb8mE9ZM3fpJjPrArAMsaHWZQPt9Ki-Pn7lv7x-RT9NEd_202Y6D2juIVHOIt3EjCZptDKBLRBMG49F8iBUSM9ypiKe8XCfM-FNW2Hl4KbVq2e3nZRbMvUM), the file is assigned with seqno 2, older than the old data's seqno 4. After just another compaction, we will drop the new_v for key_overlap because of the seqno and cause older data to be returned. ![image](https://github.com/facebook/rocksdb/assets/83968999/a3ef95e4-e7ae-4c30-8d03-955cd4b5ed42) **Summary:** This PR removes the incorrect seqno assignment Pull Request resolved: facebook/rocksdb#12257 Test Plan: - New unit test failed before the fix but passes after - python3 tools/db_crashtest.py --compaction_style=1 --ingest_external_file_one_in=10 --preclude_last_level_data_seconds=36000 --compact_files_one_in=10 --enable_blob_files=0 blackbox` - Rehearsal stress test Reviewed By: cbi42 Differential Revision: D52926092 Pulled By: hx235 fbshipit-source-id: 9e4dade0f6cc44e548db8fca27ccbc81a621cd6f (cherry picked from commit 1b2b16b38ef760252d61b123e7e39c26306cd1c7)
1 parent 9a10201 commit 8fe04d8

File tree

3 files changed

+77
-20
lines changed

3 files changed

+77
-20
lines changed

db/external_sst_file_basic_test.cc

+76
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
#include "db/version_edit.h"
1010
#include "port/port.h"
1111
#include "port/stack_trace.h"
12+
#include "rocksdb/advanced_options.h"
13+
#include "rocksdb/options.h"
1214
#include "rocksdb/sst_file_writer.h"
1315
#include "test_util/testharness.h"
1416
#include "test_util/testutil.h"
@@ -1292,6 +1294,80 @@ TEST_F(ExternalSSTFileBasicTest, VerifyChecksumReadahead) {
12921294
Destroy(options);
12931295
}
12941296

1297+
TEST_F(ExternalSSTFileBasicTest, ReadOldValueOfIngestedKeyBug) {
1298+
Options options = CurrentOptions();
1299+
options.compaction_style = kCompactionStyleUniversal;
1300+
options.disable_auto_compactions = true;
1301+
options.num_levels = 3;
1302+
options.preserve_internal_time_seconds = 36000;
1303+
DestroyAndReopen(options);
1304+
1305+
// To create the following LSM tree to trigger the bug:
1306+
// L0
1307+
// L1 with seqno [1, 2]
1308+
// L2 with seqno [3, 4]
1309+
1310+
// To create L1 shape
1311+
ASSERT_OK(
1312+
db_->Put(WriteOptions(), db_->DefaultColumnFamily(), "k1", "seqno1"));
1313+
ASSERT_OK(db_->Flush(FlushOptions()));
1314+
ASSERT_OK(
1315+
db_->Put(WriteOptions(), db_->DefaultColumnFamily(), "k1", "seqno2"));
1316+
ASSERT_OK(db_->Flush(FlushOptions()));
1317+
ColumnFamilyMetaData meta_1;
1318+
db_->GetColumnFamilyMetaData(&meta_1);
1319+
auto& files_1 = meta_1.levels[0].files;
1320+
ASSERT_EQ(files_1.size(), 2);
1321+
std::string file1 = files_1[0].db_path + files_1[0].name;
1322+
std::string file2 = files_1[1].db_path + files_1[1].name;
1323+
ASSERT_OK(db_->CompactFiles(CompactionOptions(), {file1, file2}, 1));
1324+
// To confirm L1 shape
1325+
ColumnFamilyMetaData meta_2;
1326+
db_->GetColumnFamilyMetaData(&meta_2);
1327+
ASSERT_EQ(meta_2.levels[0].files.size(), 0);
1328+
ASSERT_EQ(meta_2.levels[1].files.size(), 1);
1329+
// Seqno starts from non-zero due to seqno reservation for
1330+
// preserve_internal_time_seconds greater than 0;
1331+
ASSERT_EQ(meta_2.levels[1].files[0].largest_seqno, 102);
1332+
ASSERT_EQ(meta_2.levels[2].files.size(), 0);
1333+
// To create L2 shape
1334+
ASSERT_OK(db_->Put(WriteOptions(), db_->DefaultColumnFamily(), "k2overlap",
1335+
"old_value"));
1336+
ASSERT_OK(db_->Flush(FlushOptions()));
1337+
ASSERT_OK(db_->Put(WriteOptions(), db_->DefaultColumnFamily(), "k2overlap",
1338+
"old_value"));
1339+
ASSERT_OK(db_->Flush(FlushOptions()));
1340+
ColumnFamilyMetaData meta_3;
1341+
db_->GetColumnFamilyMetaData(&meta_3);
1342+
auto& files_3 = meta_3.levels[0].files;
1343+
std::string file3 = files_3[0].db_path + files_3[0].name;
1344+
std::string file4 = files_3[1].db_path + files_3[1].name;
1345+
ASSERT_OK(db_->CompactFiles(CompactionOptions(), {file3, file4}, 2));
1346+
// To confirm L2 shape
1347+
ColumnFamilyMetaData meta_4;
1348+
db_->GetColumnFamilyMetaData(&meta_4);
1349+
ASSERT_EQ(meta_4.levels[0].files.size(), 0);
1350+
ASSERT_EQ(meta_4.levels[1].files.size(), 1);
1351+
ASSERT_EQ(meta_4.levels[2].files.size(), 1);
1352+
ASSERT_EQ(meta_4.levels[2].files[0].largest_seqno, 104);
1353+
1354+
// Ingest a file with new value of the key "k2overlap"
1355+
SstFileWriter sst_file_writer(EnvOptions(), options);
1356+
std::string f = sst_files_dir_ + "f.sst";
1357+
ASSERT_OK(sst_file_writer.Open(f));
1358+
ASSERT_OK(sst_file_writer.Put("k2overlap", "new_value"));
1359+
ExternalSstFileInfo f_info;
1360+
ASSERT_OK(sst_file_writer.Finish(&f_info));
1361+
ASSERT_OK(db_->IngestExternalFile({f}, IngestExternalFileOptions()));
1362+
1363+
// To verify new value of the key "k2overlap" is correctly returned
1364+
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
1365+
std::string value;
1366+
ASSERT_OK(db_->Get(ReadOptions(), "k2overlap", &value));
1367+
// Before the fix, the value would be "old_value" and assertion failed
1368+
ASSERT_EQ(value, "new_value");
1369+
}
1370+
12951371
TEST_F(ExternalSSTFileBasicTest, IngestRangeDeletionTombstoneWithGlobalSeqno) {
12961372
for (int i = 5; i < 25; i++) {
12971373
ASSERT_OK(db_->Put(WriteOptions(), db_->DefaultColumnFamily(), Key(i),

db/external_sst_file_ingestion_job.cc

-20
Original file line numberDiff line numberDiff line change
@@ -937,26 +937,6 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
937937
overlap_with_db = true;
938938
break;
939939
}
940-
941-
if (compaction_style == kCompactionStyleUniversal && lvl != 0) {
942-
const std::vector<FileMetaData*>& level_files =
943-
vstorage->LevelFiles(lvl);
944-
const SequenceNumber level_largest_seqno =
945-
(*std::max_element(level_files.begin(), level_files.end(),
946-
[](FileMetaData* f1, FileMetaData* f2) {
947-
return f1->fd.largest_seqno <
948-
f2->fd.largest_seqno;
949-
}))
950-
->fd.largest_seqno;
951-
// should only assign seqno to current level's largest seqno when
952-
// the file fits
953-
if (level_largest_seqno != 0 &&
954-
IngestedFileFitInLevel(file_to_ingest, lvl)) {
955-
*assigned_seqno = level_largest_seqno;
956-
} else {
957-
continue;
958-
}
959-
}
960940
} else if (compaction_style == kCompactionStyleUniversal) {
961941
continue;
962942
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix a bug where older data of an ingested key can be returned for read when universal compaction is used

0 commit comments

Comments
 (0)