From 0aebf1c6527ea47feaedfbc46ee98f688e7661a4 Mon Sep 17 00:00:00 2001 From: nqd Date: Wed, 1 Oct 2025 21:52:03 +0800 Subject: [PATCH 01/65] Add buffer pool as a backend for page manager --- .github/workflows/pr.yml | 6 +- Cargo.toml | 8 + Makefile | 7 + cli/Cargo.lock | 722 ++++++++++++++++++------- src/database.rs | 40 +- src/page.rs | 7 +- src/page/manager.rs | 7 + src/page/manager/buffer_pool.rs | 890 +++++++++++++++++++++++++++++++ src/page/manager/cache_evict.rs | 51 ++ src/page/manager/mmap.rs | 19 +- src/page/manager/options.rs | 20 +- src/page/page.rs | 61 ++- src/page/slotted_page.rs | 40 +- src/storage/debug.rs | 19 +- src/storage/engine.rs | 4 +- tests/ethereum_execution_spec.rs | 4 +- 16 files changed, 1651 insertions(+), 254 deletions(-) create mode 100644 src/page/manager/buffer_pool.rs create mode 100644 src/page/manager/cache_evict.rs diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 83bb38b5..83dfa838 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -68,6 +68,7 @@ jobs: matrix: os: [ubuntu-latest, macos-latest] profile: [dev, release] + backend: [mmap_backend, buffer_pool_backend] runs-on: ${{ matrix.os }} steps: @@ -79,13 +80,14 @@ jobs: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Run unit tests run: | - make unit-tests cargo_flags='--verbose --profile=${{ matrix.profile }}' + make unit-tests cargo_flags='--verbose --profile=${{ matrix.profile }} --no-default-features --features ${{ matrix.backend }}' integration-tests: strategy: matrix: os: [ubuntu-latest, macos-latest] profile: [dev, release] + backend: [mmap_backend, buffer_pool_backend] runs-on: ${{ matrix.os }} steps: @@ -97,7 +99,7 @@ jobs: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Run integration tests run: - make integration-tests cargo_flags=--profile=${{ matrix.profile }} test_flags=--nocapture + make integration-tests cargo_flags='--profile=${{ matrix.profile }} --no-default-features --features ${{ matrix.backend }}' test_flags=--nocapture check-cli-lockfile: runs-on: ubuntu-latest diff --git a/Cargo.toml b/Cargo.toml index ed0f05ca..a53c581d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,6 +19,14 @@ parking_lot = { version = "0.12.4", features = ["send_guard"] } fxhash = "0.2.1" static_assertions = "1.1.0" rayon = "1.10.0" +evict = "0.3.1" +dashmap = "6.1.0" +libc = "0.2.174" + +[features] +default = ["mmap_backend"] +buffer_pool_backend = [] +mmap_backend = [] [dev-dependencies] criterion = "0.6.0" diff --git a/Makefile b/Makefile index 0b701840..c801d119 100644 --- a/Makefile +++ b/Makefile @@ -36,6 +36,13 @@ unit-tests: integration-tests: tests/fixtures @cargo test --test '*' $(cargo_flags) -- $(test_flags) +.PHONY: all-tests +all-tests: + @echo "Running tests with mmap backend" + @cargo test --no-default-features --features mmap_backend $(cargo_flags) -- $(test_flags) + echo "Running tests with buffer pool backend" + @cargo test --no-default-features --features buffer_pool_backend $(cargo_flags) -- $(test_flags) + tests/fixtures: tests/fixtures_stable.tar.gz @tar -xzf $< -C $(@D) @rm -rf tests/fixtures_stable.tar.gz diff --git a/cli/Cargo.lock b/cli/Cargo.lock index f6a71082..c8ff6ef6 100644 --- a/cli/Cargo.lock +++ b/cli/Cargo.lock @@ -4,14 +4,14 @@ version = 4 [[package]] name = "ahash" -version = "0.8.11" +version = "0.8.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" +checksum = 
"5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" dependencies = [ "cfg-if", "once_cell", "version_check", - "zerocopy 0.7.35", + "zerocopy", ] [[package]] @@ -25,20 +25,19 @@ dependencies = [ [[package]] name = "alloy-primitives" -version = "1.2.1" +version = "1.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6177ed26655d4e84e00b65cb494d4e0b8830e7cae7ef5d63087d445a2600fb55" +checksum = "bc9485c56de23438127a731a6b4c87803d49faf1a7068dcd1d8768aca3a9edb9" dependencies = [ "alloy-rlp", "arbitrary", "bytes", "cfg-if", "const-hex", - "derive_arbitrary", "derive_more", "foldhash", - "getrandom 0.3.2", - "hashbrown", + "getrandom 0.3.3", + "hashbrown 0.15.5", "indexmap", "itoa", "k256", @@ -46,7 +45,7 @@ dependencies = [ "paste", "proptest", "proptest-derive 0.5.1", - "rand 0.9.1", + "rand 0.9.2", "ruint", "rustc-hash", "serde", @@ -73,7 +72,7 @@ checksum = "64b728d511962dda67c1bc7ea7c03736ec275ed2cf4c35d9585298ac9ccf3b73" dependencies = [ "proc-macro2", "quote", - "syn 2.0.100", + "syn 2.0.106", ] [[package]] @@ -96,11 +95,20 @@ dependencies = [ "tracing", ] +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + [[package]] name = "anstream" -version = "0.6.18" +version = "0.6.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b" +checksum = "3ae563653d1938f79b1ab1b5e668c87c76a9930414574a6583a7b7e11a8e6192" dependencies = [ "anstyle", "anstyle-parse", @@ -113,44 +121,44 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.10" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" +checksum = "862ed96ca487e809f1c8e5a8447f6ee2cf102f846893800b20cebdf541fc6bbd" [[package]] name = "anstyle-parse" -version = "0.2.6" +version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9" +checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" dependencies = [ "utf8parse", ] [[package]] name = "anstyle-query" -version = "1.1.2" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" +checksum = "9e231f6134f61b71076a3eab506c379d4f36122f2af15a9ff04415ea4c3339e2" dependencies = [ - "windows-sys", + "windows-sys 0.60.2", ] [[package]] name = "anstyle-wincon" -version = "3.0.7" +version = "3.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca3534e77181a9cc07539ad51f2141fe32f6c3ffd4df76db8ad92346b003ae4e" +checksum = "3e0633414522a32ffaac8ac6cc8f748e090c5717661fddeea04219e2344f5f2a" dependencies = [ "anstyle", - "once_cell", - "windows-sys", + "once_cell_polyfill", + "windows-sys 0.60.2", ] [[package]] name = "arbitrary" -version = "1.4.1" +version = "1.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dde20b3d026af13f561bdd0f15edf01fc734f0dafcedbaf42bba506a9517f223" +checksum = "c3d036a3c4ab069c7b410a2ce876bd74808d2d0888a82667669f8e783a898bf1" dependencies = [ "derive_arbitrary", ] @@ -293,14 +301,14 @@ checksum = 
"ffdcb70bdbc4d478427380519163274ac86e52916e10f0a8889adf0f96d3fee7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.100", + "syn 2.0.106", ] [[package]] name = "autocfg" -version = "1.4.0" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" [[package]] name = "base16ct" @@ -310,9 +318,9 @@ checksum = "4c7f02d4ea65f2c1853089ffd8d2787bdbc63de2f0d29dedbcf8ccdfa0ccd4cf" [[package]] name = "base64ct" -version = "1.7.3" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89e25b6adfb930f02d1981565a6e5d9c547ac15a96606256d3b59040e5cd4ca3" +checksum = "55248b47b0caf0546f7988906588779981c43bb1bc9d0c44087278f80cdb44ba" [[package]] name = "bit-set" @@ -331,9 +339,9 @@ checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" [[package]] name = "bitflags" -version = "2.9.0" +version = "2.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c8214115b7bf84099f1309324e63141d4c5d7cc26862f97a0a857dbefe165bd" +checksum = "2261d10cca569e4643e526d8dc2e62e433cc8aba21ab764233731f8d369bf394" [[package]] name = "bitvec" @@ -356,6 +364,12 @@ dependencies = [ "generic-array", ] +[[package]] +name = "bumpalo" +version = "3.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" + [[package]] name = "byte-slice-cast" version = "1.2.3" @@ -376,24 +390,38 @@ checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" [[package]] name = "cc" -version = "1.2.19" +version = "1.2.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e3a13707ac958681c13b39b458c073d0d9bc8a22cb1b2f4c8e55eb72c13f362" +checksum = "5252b3d2648e5eedbc1a6f501e3c795e07025c1e93bbf8bbdd6eef7f447a6d54" dependencies = [ + "find-msvc-tools", "shlex", ] [[package]] name = "cfg-if" -version = "1.0.0" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fd1289c04a9ea8cb22300a459a72a385d7c73d3259e2ed7dcb2af674838cfa9" + +[[package]] +name = "chrono" +version = "0.4.42" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +checksum = "145052bdd345b87320e369255277e3fb5152762ad123a901ef5c262dd38fe8d2" +dependencies = [ + "iana-time-zone", + "js-sys", + "num-traits", + "wasm-bindgen", + "windows-link 0.2.0", +] [[package]] name = "clap" -version = "4.5.37" +version = "4.5.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eccb054f56cbd38340b380d4a8e69ef1f02f1af43db2f0cc817a4774d80ae071" +checksum = "7eac00902d9d136acd712710d71823fb8ac8004ca445a89e73a41d45aa712931" dependencies = [ "clap_builder", "clap_derive", @@ -401,9 +429,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.37" +version = "4.5.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "efd9466fac8543255d3b1fcad4762c5e116ffe808c8a3043d4263cd4fd4862a2" +checksum = "2ad9bbf750e73b5884fb8a211a9424a1906c1e156724260fdae972f31d70e1d6" dependencies = [ "anstream", "anstyle", @@ -413,21 +441,21 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.32" +version = "4.5.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"09176aae279615badda0765c0c0b3f6ed53f4709118af73cf4655d85d1530cd7" +checksum = "bbfd7eae0b0f1a6e63d4b13c9c478de77c2eb546fba158ad50b4203dc24b9f9c" dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.100", + "syn 2.0.106", ] [[package]] name = "clap_lex" -version = "0.7.4" +version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" +checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675" [[package]] name = "cli" @@ -442,15 +470,15 @@ dependencies = [ [[package]] name = "colorchoice" -version = "1.0.3" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" +checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" [[package]] name = "const-hex" -version = "1.14.0" +version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b0485bab839b018a8f1723fc5391819fea5f8f0f32288ef8a735fd096b6160c" +checksum = "dccd746bf9b1038c0507b7cec21eb2b11222db96a2902c96e8c185d6d20fb9c4" dependencies = [ "cfg-if", "cpufeatures", @@ -485,6 +513,12 @@ dependencies = [ "unicode-xid", ] +[[package]] +name = "core-foundation-sys" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" + [[package]] name = "cpufeatures" version = "0.2.17" @@ -521,9 +555,9 @@ checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" [[package]] name = "crunchy" -version = "0.2.3" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43da5946c66ffcc7745f48db692ffbb10a83bfe0afd96235c5c2a4fb23994929" +checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" [[package]] name = "crypto-bigint" @@ -547,6 +581,20 @@ dependencies = [ "typenum", ] +[[package]] +name = "dashmap" +version = "6.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "hashbrown 0.14.5", + "lock_api", + "once_cell", + "parking_lot_core", +] + [[package]] name = "der" version = "0.7.10" @@ -570,13 +618,13 @@ dependencies = [ [[package]] name = "derive_arbitrary" -version = "1.4.1" +version = "1.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30542c1ad912e0e3d22a1935c290e12e8a29d704a420177a31faad4a601a0800" +checksum = "1e567bd82dcff979e4b03460c307b3cdc9e96fde3d73bed1496d2bc75d9dd62a" dependencies = [ "proc-macro2", "quote", - "syn 2.0.100", + "syn 2.0.106", ] [[package]] @@ -596,7 +644,7 @@ checksum = "bda628edc44c4bb645fbe0f758797143e4e07926f7ebf4e9bdfbd3d2ce621df3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.100", + "syn 2.0.106", "unicode-xid", ] @@ -668,12 +716,25 @@ checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" [[package]] name = "errno" -version = "0.3.11" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "976dd42dc7e85965fe702eb8164f21f450704bdde31faefd6471dba214cb594e" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.61.0", +] + +[[package]] +name = "evict" +version = "0.3.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "3cc2975dc9f2891a5f88bd1525fd8422669c9e532cc2d48ff5721f2d4fd93df6" +dependencies = [ + "chrono", + "hlc-gen", + "parking_lot", + "priority-queue", + "thiserror", ] [[package]] @@ -714,6 +775,12 @@ dependencies = [ "subtle", ] +[[package]] +name = "find-msvc-tools" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fd99930f64d146689264c637b5af2f0233a933bef0d8570e2526bf9e083192d" + [[package]] name = "fixed-hash" version = "0.8.0" @@ -772,19 +839,19 @@ checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592" dependencies = [ "cfg-if", "libc", - "wasi 0.11.0+wasi-snapshot-preview1", + "wasi 0.11.1+wasi-snapshot-preview1", ] [[package]] name = "getrandom" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73fea8450eea4bac3940448fb7ae50d91f034f941199fcd9d909a5a07aa455f0" +checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4" dependencies = [ "cfg-if", "libc", "r-efi", - "wasi 0.14.2+wasi-0.2.4", + "wasi 0.14.5+wasi-0.2.4", ] [[package]] @@ -800,9 +867,15 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.15.2" +version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" + +[[package]] +name = "hashbrown" +version = "0.15.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" dependencies = [ "foldhash", ] @@ -819,6 +892,17 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" +[[package]] +name = "hlc-gen" +version = "1.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b68dcd92c73e6e26c23309179046a1645190bb7737b22e19ca3afb1092f83daa" +dependencies = [ + "chrono", + "parking_lot", + "thiserror", +] + [[package]] name = "hmac" version = "0.12.1" @@ -828,6 +912,30 @@ dependencies = [ "digest 0.10.7", ] +[[package]] +name = "iana-time-zone" +version = "0.1.63" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0c919e5debc312ad217002b8048a17b7d83f80703865bbfcfebb0458b0b27d8" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "log", + "wasm-bindgen", + "windows-core", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + [[package]] name = "impl-codec" version = "0.6.0" @@ -845,18 +953,18 @@ checksum = "a0eb5a3343abf848c0984fe4604b2b105da9539376e24fc0a3b0007411ae4fd9" dependencies = [ "proc-macro2", "quote", - "syn 2.0.100", + "syn 2.0.106", ] [[package]] name = "indexmap" -version = "2.9.0" +version = "2.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e" +checksum = "206a8042aec68fa4a62e8d3f7aa4ceb508177d9324faf261e1959e495b7a1921" dependencies = [ "arbitrary", "equivalent", - "hashbrown", + "hashbrown 0.15.5", ] [[package]] @@ -880,6 +988,16 @@ version = "1.0.15" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" +[[package]] +name = "js-sys" +version = "0.3.78" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c0b063578492ceec17683ef2f8c5e89121fbd0b172cbc280635ab7567db2738" +dependencies = [ + "once_cell", + "wasm-bindgen", +] + [[package]] name = "k256" version = "0.13.4" @@ -920,21 +1038,21 @@ checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" [[package]] name = "libc" -version = "0.2.172" +version = "0.2.175" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" +checksum = "6a82ae493e598baaea5209805c49bbf2ea7de956d50d7da0da1164f9c6d28543" [[package]] name = "libm" -version = "0.2.13" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c9627da5196e5d8ed0b0495e61e518847578da83483c37288316d9b2e03a7f72" +checksum = "f9fbbcab51052fe104eb5e5d351cf728d30a5be1fe14d9be8a3b097481fb97de" [[package]] name = "linux-raw-sys" -version = "0.9.4" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12" +checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039" [[package]] name = "lock_api" @@ -946,17 +1064,23 @@ dependencies = [ "scopeguard", ] +[[package]] +name = "log" +version = "0.4.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432" + [[package]] name = "memchr" -version = "2.7.4" +version = "2.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" +checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" [[package]] name = "memmap2" -version = "0.9.5" +version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd3f7eed9d3848f8b98834af67102b720745c4ec028fcd0aa0239277e7de374f" +checksum = "843a98750cd611cc2965a8213b53b43e715f13c37a9e096c6408e69990961db7" dependencies = [ "libc", ] @@ -980,7 +1104,7 @@ dependencies = [ "proc-macro2", "quote", "regex", - "syn 2.0.100", + "syn 2.0.106", ] [[package]] @@ -1032,11 +1156,17 @@ version = "1.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" +[[package]] +name = "once_cell_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" + [[package]] name = "parity-scale-codec" -version = "3.7.4" +version = "3.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c9fde3d0718baf5bc92f577d652001da0f8d54cd03a7974e118d04fc888dc23d" +checksum = "799781ae679d79a948e13d4824a40970bfa500058d245760dd857301059810fa" dependencies = [ "arrayvec", "bitvec", @@ -1050,14 +1180,14 @@ dependencies = [ [[package]] name = "parity-scale-codec-derive" -version = "3.7.4" +version = "3.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "581c837bb6b9541ce7faa9377c20616e4fb7650f6b0f68bc93c827ee504fb7b3" +checksum = "34b4653168b563151153c9e4c08ebed57fb8262bebfa79711552fa983c623e7a" dependencies = [ "proc-macro-crate", "proc-macro2", 
"quote", - "syn 2.0.100", + "syn 2.0.106", ] [[package]] @@ -1080,7 +1210,7 @@ dependencies = [ "libc", "redox_syscall", "smallvec", - "windows-targets", + "windows-targets 0.52.6", ] [[package]] @@ -1091,9 +1221,9 @@ checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" [[package]] name = "pest" -version = "2.8.0" +version = "2.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "198db74531d58c70a361c42201efde7e2591e976d518caf7662a47dc5720e7b6" +checksum = "21e0a3a33733faeaf8651dfee72dd0f388f0c8e5ad496a3478fa5a922f49cfa8" dependencies = [ "memchr", "thiserror", @@ -1118,9 +1248,9 @@ dependencies = [ [[package]] name = "portable-atomic" -version = "1.11.0" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "350e9b48cbc6b0e028b0473b114454c6316e57336ee184ceab6e53f72c178b3e" +checksum = "f84267b20a16ea918e43c6a88433c2d54fa145c92a811b5b047ccbe153674483" [[package]] name = "ppv-lite86" @@ -1128,7 +1258,7 @@ version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" dependencies = [ - "zerocopy 0.8.24", + "zerocopy", ] [[package]] @@ -1142,6 +1272,17 @@ dependencies = [ "uint", ] +[[package]] +name = "priority-queue" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5676d703dda103cbb035b653a9f11448c0a7216c7926bd35fcb5865475d0c970" +dependencies = [ + "autocfg", + "equivalent", + "indexmap", +] + [[package]] name = "proc-macro-crate" version = "3.3.0" @@ -1153,9 +1294,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.95" +version = "1.0.101" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" +checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de" dependencies = [ "unicode-ident", ] @@ -1171,7 +1312,7 @@ dependencies = [ "bitflags", "lazy_static", "num-traits", - "rand 0.9.1", + "rand 0.9.2", "rand_chacha 0.9.0", "rand_xorshift", "regex-syntax", @@ -1188,7 +1329,7 @@ checksum = "4ee1c9ac207483d5e7db4940700de86a9aae46ef90c48b57f99fe7edb8345e49" dependencies = [ "proc-macro2", "quote", - "syn 2.0.100", + "syn 2.0.106", ] [[package]] @@ -1199,7 +1340,7 @@ checksum = "095a99f75c69734802359b682be8daaf8980296731f6470434ea2c652af1dd30" dependencies = [ "proc-macro2", "quote", - "syn 2.0.100", + "syn 2.0.106", ] [[package]] @@ -1219,9 +1360,9 @@ dependencies = [ [[package]] name = "r-efi" -version = "5.2.0" +version = "5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" [[package]] name = "radium" @@ -1242,9 +1383,9 @@ dependencies = [ [[package]] name = "rand" -version = "0.9.1" +version = "0.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97" +checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" dependencies = [ "rand_chacha 0.9.0", "rand_core 0.9.3", @@ -1285,7 +1426,7 @@ version = "0.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" dependencies = [ - "getrandom 0.3.2", + "getrandom 0.3.3", ] [[package]] @@ -1299,9 +1440,9 @@ 
dependencies = [ [[package]] name = "rayon" -version = "1.10.0" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" +checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f" dependencies = [ "either", "rayon-core", @@ -1309,9 +1450,9 @@ dependencies = [ [[package]] name = "rayon-core" -version = "1.12.1" +version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" dependencies = [ "crossbeam-deque", "crossbeam-utils", @@ -1319,18 +1460,18 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.5.11" +version = "0.5.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2f103c6d277498fbceb16e84d317e2a400f160f46904d5f5410848c829511a3" +checksum = "5407465600fb0548f1442edf71dd20683c6ed326200ace4b1ef0763521bb3b77" dependencies = [ "bitflags", ] [[package]] name = "regex" -version = "1.11.1" +version = "1.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" +checksum = "23d7fd106d8c02486a8d64e778353d1cffe08ce79ac2e82f540c86d0facf6912" dependencies = [ "aho-corasick", "memchr", @@ -1340,9 +1481,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.9" +version = "0.4.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" +checksum = "6b9458fa0bfeeac22b5ca447c63aaf45f28439a709ccd244698632f9aa6394d6" dependencies = [ "aho-corasick", "memchr", @@ -1351,9 +1492,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.8.5" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" +checksum = "caf4aa5b0f434c91fe5c7f1ecb6a5ece2130b02ad2a590589dda5146df959001" [[package]] name = "rfc6979" @@ -1377,9 +1518,9 @@ dependencies = [ [[package]] name = "ruint" -version = "1.14.0" +version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78a46eb779843b2c4f21fac5773e25d6d5b7c8f0922876c91541790d2ca27eef" +checksum = "9ecb38f82477f20c5c3d62ef52d7c4e536e38ea9b73fb570a20c5cae0e14bcf6" dependencies = [ "alloy-rlp", "arbitrary", @@ -1395,7 +1536,7 @@ dependencies = [ "primitive-types", "proptest", "rand 0.8.5", - "rand 0.9.1", + "rand 0.9.2", "rlp", "ruint-macro", "serde", @@ -1444,22 +1585,22 @@ dependencies = [ [[package]] name = "rustix" -version = "1.0.5" +version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d97817398dd4bb2e6da002002db259209759911da105da92bec29ccb12cf58bf" +checksum = "cd15f8a2c5551a84d56efdc1cd049089e409ac19a3072d5037a17fd70719ff3e" dependencies = [ "bitflags", "errno", "libc", "linux-raw-sys", - "windows-sys", + "windows-sys 0.61.0", ] [[package]] name = "rustversion" -version = "1.0.20" +version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eded382c5f5f786b989652c49544c4877d9f015cc22e145a5ea8ea66c2921cd2" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" [[package]] name = "rusty-fork" @@ -1487,7 +1628,7 @@ checksum = 
"22f968c5ea23d555e670b449c1c5e7b2fc399fdaec1d304a17cd48e288abc107" dependencies = [ "proc-macro2", "quote", - "syn 2.0.100", + "syn 2.0.106", ] [[package]] @@ -1545,14 +1686,14 @@ checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" dependencies = [ "proc-macro2", "quote", - "syn 2.0.100", + "syn 2.0.106", ] [[package]] name = "sha2" -version = "0.10.8" +version = "0.10.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8" +checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" dependencies = [ "cfg-if", "cpufeatures", @@ -1597,9 +1738,9 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.15.0" +version = "1.15.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8917285742e9f3e1683f0a9c4e6b57960b7314d0b08d30d1ecd426713ee2eee9" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" dependencies = [ "arbitrary", ] @@ -1645,9 +1786,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.100" +version = "2.0.106" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b09a44accad81e1ba1cd74a32461ba89dee89095ba17b32f5d03683b1b1fc2a0" +checksum = "ede7c438028d4436d71104916910f5bb611972c5cfd7f89b8300a8186e6fada6" dependencies = [ "proc-macro2", "quote", @@ -1662,35 +1803,35 @@ checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" [[package]] name = "tempfile" -version = "3.19.1" +version = "3.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7437ac7763b9b123ccf33c338a5cc1bac6f69b45a136c19bdd8a65e3916435bf" +checksum = "84fa4d11fadde498443cca10fd3ac23c951f0dc59e080e9f4b93d4df4e4eea53" dependencies = [ "fastrand", - "getrandom 0.3.2", + "getrandom 0.3.3", "once_cell", "rustix", - "windows-sys", + "windows-sys 0.61.0", ] [[package]] name = "thiserror" -version = "2.0.12" +version = "2.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "567b8a2dae586314f7be2a752ec7474332959c6460e02bde30d702a66d488708" +checksum = "3467d614147380f2e4e374161426ff399c91084acd2363eaf549172b3d5e60c0" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "2.0.12" +version = "2.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d" +checksum = "6c5e1be1c48b9172ee610da68fd9cd2770e7a4056cb3fc98710ee6906f0c7960" dependencies = [ "proc-macro2", "quote", - "syn 2.0.100", + "syn 2.0.106", ] [[package]] @@ -1704,15 +1845,15 @@ dependencies = [ [[package]] name = "toml_datetime" -version = "0.6.8" +version = "0.6.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0dd7358ecb8fc2f8d014bf86f6f638ce72ba252a2c3a2572f2a795f1d23efb41" +checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c" [[package]] name = "toml_edit" -version = "0.22.24" +version = "0.22.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17b4795ff5edd201c7cd6dca065ae59972ce77d1b80fa0a84d94950ece7d1474" +checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a" dependencies = [ "indexmap", "toml_datetime", @@ -1731,9 +1872,9 @@ dependencies = [ [[package]] name = "tracing-core" -version = "0.1.33" +version = "0.1.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c" +checksum = "b9d12581f227e93f094d3af2ae690a574abb8a2b9b7a96e7cfe9647b2b617678" dependencies = [ "once_cell", ] @@ -1746,7 +1887,10 @@ dependencies = [ "alloy-rlp", "alloy-trie", "arrayvec", + "dashmap", + "evict", "fxhash", + "libc", "memmap2", "metrics", "metrics-derive", @@ -1756,7 +1900,7 @@ dependencies = [ "rayon", "sealed", "static_assertions", - "zerocopy 0.8.24", + "zerocopy", ] [[package]] @@ -1791,9 +1935,9 @@ checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94" [[package]] name = "unicode-ident" -version = "1.0.18" +version = "1.0.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" +checksum = "f63a545481291138910575129486daeaf8ac54aee4387fe7906919f7830c7d9d" [[package]] name = "unicode-xid" @@ -1830,26 +1974,168 @@ dependencies = [ [[package]] name = "wasi" -version = "0.11.0+wasi-snapshot-preview1" +version = "0.11.1+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" +checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" [[package]] name = "wasi" -version = "0.14.2+wasi-0.2.4" +version = "0.14.5+wasi-0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3" +checksum = "a4494f6290a82f5fe584817a676a34b9d6763e8d9d18204009fb31dceca98fd4" dependencies = [ - "wit-bindgen-rt", + "wasip2", +] + +[[package]] +name = "wasip2" +version = "1.0.0+wasi-0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03fa2761397e5bd52002cd7e73110c71af2109aca4e521a9f40473fe685b0a24" +dependencies = [ + "wit-bindgen", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.101" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e14915cadd45b529bb8d1f343c4ed0ac1de926144b746e2710f9cd05df6603b" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.101" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e28d1ba982ca7923fd01448d5c30c6864d0a14109560296a162f80f305fb93bb" +dependencies = [ + "bumpalo", + "log", + "proc-macro2", + "quote", + "syn 2.0.106", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.101" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c3d463ae3eff775b0c45df9da45d68837702ac35af998361e2c84e7c5ec1b0d" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.101" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7bb4ce89b08211f923caf51d527662b75bdc9c9c7aab40f86dcb9fb85ac552aa" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.101" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f143854a3b13752c6950862c906306adb27c7e839f7414cec8fea35beab624c1" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "windows-core" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c0fdd3ddb90610c7638aa2b3a3ab2904fb9e5cdbecc643ddb3647212781c4ae3" +dependencies = [ + "windows-implement", + "windows-interface", + "windows-link 0.1.3", + "windows-result", + "windows-strings", +] + +[[package]] +name = "windows-implement" +version = "0.60.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a47fddd13af08290e67f4acabf4b459f647552718f683a7b415d290ac744a836" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "windows-interface" +version = "0.59.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd9211b69f8dcdfa817bfd14bf1c97c9188afa36f4750130fcdf3f400eca9fa8" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "windows-link" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e6ad25900d524eaabdbbb96d20b4311e1e7ae1699af4fb28c17ae66c80d798a" + +[[package]] +name = "windows-link" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45e46c0661abb7180e7b9c281db115305d49ca1709ab8242adf09666d2173c65" + +[[package]] +name = "windows-result" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56f42bd332cc6c8eac5af113fc0c1fd6a8fd2aa08a0119358686e5160d0586c6" +dependencies = [ + "windows-link 0.1.3", +] + +[[package]] +name = "windows-strings" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56e6c93f3a0c3b36176cb1327a4958a0353d5d166c2a35cb268ace15e91d3b57" +dependencies = [ + "windows-link 0.1.3", ] [[package]] name = "windows-sys" -version = "0.59.0" +version = "0.60.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" dependencies = [ - "windows-targets", + "windows-targets 0.53.3", +] + +[[package]] +name = "windows-sys" +version = "0.61.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e201184e40b2ede64bc2ea34968b28e33622acdbbf37104f0e4a33f7abe657aa" +dependencies = [ + "windows-link 0.2.0", ] [[package]] @@ -1858,14 +2144,31 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_gnullvm", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm 0.52.6", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", +] + +[[package]] +name = "windows-targets" +version = "0.53.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d5fe6031c4041849d7c496a8ded650796e7b6ecc19df1a431c1a363342e5dc91" +dependencies = [ + "windows-link 0.1.3", + "windows_aarch64_gnullvm 0.53.0", + "windows_aarch64_msvc 0.53.0", + "windows_i686_gnu 0.53.0", + "windows_i686_gnullvm 0.53.0", + "windows_i686_msvc 0.53.0", + "windows_x86_64_gnu 0.53.0", + "windows_x86_64_gnullvm 0.53.0", + "windows_x86_64_msvc 0.53.0", ] [[package]] @@ -1874,65 +2177,110 @@ version = "0.52.6" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764" + [[package]] name = "windows_aarch64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" +[[package]] +name = "windows_aarch64_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c" + [[package]] name = "windows_i686_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" +[[package]] +name = "windows_i686_gnu" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3" + [[package]] name = "windows_i686_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" +[[package]] +name = "windows_i686_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11" + [[package]] name = "windows_i686_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" +[[package]] +name = "windows_i686_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d" + [[package]] name = "windows_x86_64_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" +[[package]] +name = "windows_x86_64_gnu" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba" + [[package]] name = "windows_x86_64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57" + [[package]] name = "windows_x86_64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "windows_x86_64_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" + [[package]] name = "winnow" -version = "0.7.7" +version = "0.7.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6cb8234a863ea0e8cd7284fcdd4f145233eb00fee02bbdd9861aec44e6477bc5" +checksum = "21a0236b59786fed61e2a80582dd500fe61f18b5dca67a4a067d0bc9039339cf" dependencies = [ "memchr", 
]

[[package]]
-name = "wit-bindgen-rt"
-version = "0.39.0"
+name = "wit-bindgen"
+version = "0.45.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1"
-dependencies = [
- "bitflags",
-]
+checksum = "5c573471f125075647d03df72e026074b7203790d41351cd6edc96f46bcccd36"

[[package]]
name = "wyz"
@@ -1945,42 +2293,22 @@ dependencies = [
 "tap",
]

[[package]]
name = "zerocopy"
-version = "0.7.35"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
-dependencies = [
- "zerocopy-derive 0.7.35",
-]
-
-[[package]]
-name = "zerocopy"
-version = "0.8.24"
+version = "0.8.27"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2586fea28e186957ef732a5f8b3be2da217d65c5969d4b1e17f973ebbe876879"
+checksum = "0894878a5fa3edfd6da3f88c4805f4c8558e2b996227a3d864f47fe11e38282c"
 dependencies = [
- "zerocopy-derive 0.8.24",
-]
-
-[[package]]
-name = "zerocopy-derive"
-version = "0.7.35"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn 2.0.100",
+ "zerocopy-derive",
 ]

[[package]]
name = "zerocopy-derive"
-version = "0.8.24"
+version = "0.8.27"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a996a8f63c5c4448cd959ac1bab0aaa3306ccfd060472f85943ee0750f0169be"
+checksum = "88d2b8d9c68ad2b9e4340d7832716a4d21a22a1154777ad56ea55c51a9cf3831"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.100",
+ "syn 2.0.106",
 ]

[[package]]
@@ -2000,5 +2328,5 @@ checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.100",
+ "syn 2.0.106",
 ]
diff --git a/src/database.rs b/src/database.rs
index f5a11bf0..77a074c1 100644
--- a/src/database.rs
+++ b/src/database.rs
@@ -3,7 +3,7 @@ use crate::{
     executor::threadpool,
     meta::{MetadataManager, OpenMetadataError},
     metrics::DatabaseMetrics,
-    page::{PageError, PageId, PageManager},
+    page::{Page, PageError, PageId, PageManager, PageManagerOptions},
     storage::engine::{self, StorageEngine},
     transaction::{Transaction, TransactionError, TransactionManager, RO, RW},
 };
@@ -26,7 +26,7 @@ pub struct Database {
 }
 
 #[must_use]
-#[derive(Default, Debug)]
+#[derive(Debug)]
 pub struct DatabaseOptions {
     create: bool,
     create_new: bool,
@@ -34,6 +34,7 @@ pub struct DatabaseOptions {
     meta_path: Option<PathBuf>,
     max_pages: u32,
     num_threads: Option<NonZero<usize>>,
+    num_frames: u32, // used by the buffer pool PageManager implementation
 }
 
 #[derive(Debug)]
@@ -50,6 +51,27 @@ pub enum OpenError {
     IO(io::Error),
 }
 
+impl Default for DatabaseOptions {
+    fn default() -> Self {
+        let num_frames = if cfg!(not(test)) {
+            PageManagerOptions::DEFAULT_NUM_FRAMES
+        } else {
+            // Use a smaller buffer pool for tests to reduce memory usage
+            1024
+        };
+
+        Self {
+            create: false,
+            create_new: false,
+            wipe: false,
+            meta_path: None,
+            max_pages: Page::MAX_COUNT,
+            num_frames,
+            num_threads: None,
+        }
+    }
+}
+
 impl DatabaseOptions {
     /// Sets the option to create a new database, or open it if it already exists.
     ///
@@ -101,6 +123,15 @@ impl DatabaseOptions {
         self
     }
 
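+    // Illustrative use of these options with the buffer pool backend (a sketch; the
+    // `create` setter is the one documented above, and the path is a placeholder):
+    //
+    //     let db = DatabaseOptions::default()
+    //         .create(true)
+    //         .num_frames(2048) // see the setter below; only meaningful for the buffer pool
+    //         .open("triedb")?;
+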
+    /// Sets the number of frames for the PageManager.
+    ///
+    /// The default is [`PageManagerOptions::DEFAULT_NUM_FRAMES`]; it is only used by the
+    /// buffer pool backend.
+    pub fn num_frames(&mut self, num_frames: u32) -> &mut Self {
+        self.num_frames = num_frames;
+        self
+    }
+
     /// Opens the database file at the given path.
     pub fn open(&self, db_path: impl AsRef<Path>) -> Result<Database, OpenError> {
         let db_path = db_path.as_ref();
@@ -156,6 +187,7 @@ impl Database {
             .create_new(opts.create_new)
             .wipe(opts.wipe)
             .page_count(page_count)
+            .num_frames(opts.num_frames)
             .open(db_path)
             .map_err(OpenError::PageError)?;
@@ -485,7 +517,7 @@
         assert_eq!(
             db.storage_engine
                 .page_manager
-                .get(1, *page_id)
+                .get(*page_id)
                 .unwrap_or_else(|err| panic!("page {page_id} not found: {err:?}"))
                 .snapshot_id(),
             1
@@ -514,7 +546,7 @@
             let page = db
                 .storage_engine
                 .page_manager
-                .get(1, *page_id)
+                .get(*page_id)
                 .unwrap_or_else(|err| panic!("page {page_id} not found: {err:?}"));
             if old_page_ids.contains(page_id) {
                 assert_eq!(page.snapshot_id(), 1);
diff --git a/src/page.rs b/src/page.rs
index f40f5d05..0c72c6cb 100644
--- a/src/page.rs
+++ b/src/page.rs
@@ -11,7 +11,12 @@ mod page;
 mod slotted_page;
 mod state;
 
-pub use manager::{mmap::PageManager, options::PageManagerOptions, PageError};
+#[cfg(feature = "buffer_pool_backend")]
+pub use manager::buffer_pool::PageManager;
+#[cfg(feature = "mmap_backend")]
+pub use manager::mmap::PageManager;
+
+pub use manager::{options::PageManagerOptions, PageError};
 pub use page::{Page, PageMut};
 pub use slotted_page::{SlottedPage, SlottedPageMut, CELL_POINTER_SIZE};
 
diff --git a/src/page/manager.rs b/src/page/manager.rs
index 21996001..c7e15d3e 100644
--- a/src/page/manager.rs
+++ b/src/page/manager.rs
@@ -1,5 +1,10 @@
 use crate::page::PageId;
 
+#[cfg(feature = "buffer_pool_backend")]
+pub(super) mod buffer_pool;
+#[cfg(feature = "buffer_pool_backend")]
+pub(super) mod cache_evict;
+#[cfg(feature = "mmap_backend")]
 pub(super) mod mmap;
 pub(super) mod options;
 
@@ -18,5 +23,7 @@ pub enum PageError {
     IO(std::io::Error),
     InvalidValue,
     InvalidPageContents(PageId),
+    OutOfMemory,
+    EvictionPolicy,
     // TODO: add more errors here for other cases.
 }
diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs
new file mode 100644
index 00000000..1e1738f2
--- /dev/null
+++ b/src/page/manager/buffer_pool.rs
@@ -0,0 +1,890 @@
+use std::{
+    ffi::CString,
+    fs::File,
+    io::{self, IoSlice, Seek, SeekFrom, Write},
+    os::{fd::FromRawFd, unix::fs::FileExt},
+    path::Path,
+    sync::atomic::{AtomicU32, AtomicU64, Ordering},
+};
+
+use dashmap::{DashMap, DashSet};
+use parking_lot::RwLock;
+
+use crate::{
+    page::{
+        manager::cache_evict::CacheEvict,
+        state::{PageState, RawPageState},
+        Page, PageError, PageId, PageManagerOptions, PageMut,
+    },
+    snapshot::SnapshotId,
+};
+
+#[derive(Debug, Clone)]
+struct Frame {
+    ptr: *mut [u8; Page::SIZE],
+}
+
+// SAFETY: Frame contains a pointer to heap-allocated memory that we own exclusively.
+// The memory is allocated via Box and we manage its lifetime, so it's safe to send
+// between threads.
+unsafe impl Send for Frame {}
+unsafe impl Sync for Frame {}
+
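+// Ownership note: each frame's buffer is created in `from_file_with_options` via
+// `Box::into_raw` and must be released exactly once via `Box::from_raw` (see the `Drop`
+// impl below). A minimal sketch of the pairing this relies on:
+//
+//     let ptr: *mut [u8; Page::SIZE] = Box::into_raw(Box::new([0u8; Page::SIZE]));
+//     // ... the pool hands out `ptr` for page reads and writes while it is alive ...
+//     unsafe { drop(Box::from_raw(ptr)) }; // must not run twice for the same pointer
+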
+#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
+pub(crate) struct FrameId(u32);
+
+#[derive(Debug)]
+pub struct PageManager {
+    num_frames: u32,
+    page_count: AtomicU32,
+    file: RwLock<File>,
+    file_len: AtomicU64,
+    frames: Vec<Frame>, /* frames that hold pages' data, indexed by frame id, with a fixed
+                         * length of num_frames */
+    page_table: DashMap<PageId, FrameId>, /* mapping from page id to the buffer pool frame
+                                           * currently holding that page */
+    original_free_frame_idx: AtomicU32,
+    lru_replacer: CacheEvict, /* the replacer to find unpinned/candidate pages for eviction */
+    loading_page: DashSet<PageId>, /* set of pages that are being loaded from disk */
+}
+
+impl PageManager {
+    pub fn options() -> PageManagerOptions {
+        PageManagerOptions::new()
+    }
+
+    pub fn open(path: impl AsRef<Path>) -> Result<Self, PageError> {
+        let opts = PageManagerOptions::new();
+        Self::open_with_options(&opts, path)
+    }
+
+    pub fn open_with_options(
+        opts: &PageManagerOptions,
+        path: impl AsRef<Path>,
+    ) -> Result<Self, PageError> {
+        let path_cstr = CString::new(path.as_ref().to_string_lossy().as_bytes())
+            .map_err(|_| PageError::InvalidValue)?;
+        // Use O_DIRECT on Linux for better performance; it is not available on macOS
+        #[cfg(target_os = "linux")]
+        let flags = libc::O_RDWR | libc::O_CREAT | libc::O_DIRECT;
+        #[cfg(not(target_os = "linux"))]
+        let flags = libc::O_RDWR | libc::O_CREAT;
+
+        let fd = unsafe { libc::open(path_cstr.as_ptr(), flags, 0o644) };
+        if fd == -1 {
+            return Err(PageError::IO(io::Error::last_os_error()));
+        }
+        let file = unsafe { File::from_raw_fd(fd) };
+
+        Self::from_file_with_options(opts, file)
+    }
+
+    pub(super) fn from_file_with_options(
+        opts: &PageManagerOptions,
+        file: File,
+    ) -> Result<Self, PageError> {
+        let num_frames = opts.num_frames;
+        let page_count = AtomicU32::new(opts.page_count);
+        let file_len = AtomicU64::new(file.metadata().map_err(PageError::IO)?.len());
+        let page_table = DashMap::with_capacity(num_frames as usize);
+        let mut frames = Vec::with_capacity(num_frames as usize);
+        for _ in 0..num_frames {
+            let boxed_array = Box::new([0; Page::SIZE]);
+            let ptr = Box::into_raw(boxed_array);
+            frames.push(Frame { ptr });
+        }
+        let lru_replacer = CacheEvict::new(num_frames as usize);
+
+        Ok(PageManager {
+            num_frames,
+            page_count,
+            file: RwLock::new(file),
+            file_len,
+            frames,
+            page_table,
+            original_free_frame_idx: AtomicU32::new(0),
+            lru_replacer,
+            loading_page: DashSet::with_capacity(num_frames as usize),
+        })
+    }
+
+    #[cfg(test)]
+    pub fn open_temp_file() -> Result<Self, PageError> {
+        Self::options().open_temp_file()
+    }
+
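+    // NOTE: O_DIRECT on Linux generally requires the user buffer, the file offset, and the
+    // transfer length to be aligned to the device's logical block size (commonly 512 bytes
+    // or 4 KiB). `Box::new([0; Page::SIZE])` does not guarantee such alignment for the
+    // frames allocated above. If direct I/O ever fails with EINVAL, the frames could be
+    // allocated with an explicit alignment instead, for example (a sketch; assumes
+    // Page::SIZE is a multiple of the chosen alignment):
+    //
+    //     use std::alloc::{alloc_zeroed, Layout};
+    //     let layout = Layout::from_size_align(Page::SIZE, 4096).unwrap();
+    //     let ptr = unsafe { alloc_zeroed(layout) } as *mut [u8; Page::SIZE];
+    //     // ...freed with `std::alloc::dealloc(ptr.cast(), layout)` instead of Box::from_raw.
+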
+    /// Retrieves a page from the buffer pool.
+    pub fn get(&self, page_id: PageId) -> Result<Page<'_>, PageError> {
+        if page_id > self.page_count.load(Ordering::Relaxed) {
+            return Err(PageError::PageNotFound(page_id));
+        }
+        loop {
+            // Check if page is already in the cache
+            if let Some(frame_id) = self.page_table.get(&page_id) {
+                let frame = &self.frames[frame_id.0 as usize];
+                self.lru_replacer.touch(page_id).map_err(|_| PageError::EvictionPolicy)?;
+                return unsafe { Page::from_ptr(page_id, frame.ptr, self) }
+            }
+
+            // Otherwise, need to load the page from disk
+            if self.loading_page.insert(page_id) {
+                // This thread is the first to load this page
+                let Some(frame_id) = self.get_free_frame() else {
+                    self.loading_page.remove(&page_id);
+                    return Err(PageError::OutOfMemory);
+                };
+                let buf: *mut [u8; Page::SIZE] = self.frames[frame_id.0 as usize].ptr;
+                // Make sure a failed read does not leave the page stuck in `loading_page`,
+                // which would make every later access to it spin forever.
+                let read_result = unsafe {
+                    self.file.read().read_exact_at(&mut *buf, page_id.as_offset() as u64)
+                };
+                if let Err(err) = read_result {
+                    self.loading_page.remove(&page_id);
+                    return Err(PageError::IO(err));
+                }
+                self.page_table.insert(page_id, frame_id);
+                self.lru_replacer.pin_read(page_id).map_err(|_| PageError::EvictionPolicy)?;
+                self.loading_page.remove(&page_id);
+                return unsafe { Page::from_ptr(page_id, buf, self) }
+            }
+            // Another thread is already loading this page, spin/yield and retry
+            std::thread::yield_now();
+        }
+    }
+
+    /// Retrieves a mutable page from the buffer pool.
+    pub fn get_mut(
+        &self,
+        snapshot_id: SnapshotId,
+        page_id: PageId,
+    ) -> Result<PageMut<'_>, PageError> {
+        if page_id > self.page_count.load(Ordering::Relaxed) {
+            return Err(PageError::PageNotFound(page_id));
+        }
+        loop {
+            // Check if page is already in the cache
+            if let Some(frame_id) = self.page_table.get(&page_id) {
+                let frame = &self.frames[frame_id.0 as usize];
+                return unsafe { PageMut::from_ptr(page_id, snapshot_id, frame.ptr, self) }
+            }
+            // Otherwise, need to load the page from disk
+            if self.loading_page.insert(page_id) {
+                // This thread is the first to load this page
+                let Some(frame_id) = self.get_free_frame() else {
+                    self.loading_page.remove(&page_id);
+                    return Err(PageError::OutOfMemory);
+                };
+                let buf: *mut [u8; Page::SIZE] = self.frames[frame_id.0 as usize].ptr;
+                // As in `get`, clean up `loading_page` if the read fails.
+                let read_result = unsafe {
+                    self.file.read().read_exact_at(&mut *buf, page_id.as_offset() as u64)
+                };
+                if let Err(err) = read_result {
+                    self.loading_page.remove(&page_id);
+                    return Err(PageError::IO(err));
+                }
+                self.page_table.insert(page_id, frame_id);
+                self.lru_replacer
+                    .pin_write(frame_id, page_id)
+                    .map_err(|_| PageError::EvictionPolicy)?;
+                self.loading_page.remove(&page_id);
+                return unsafe { PageMut::from_ptr(page_id, snapshot_id, buf, self) }
+            } else {
+                // Another thread is already loading this page, spin/yield and retry
+                std::thread::yield_now();
+                continue;
+            }
+        }
+    }
+
+    /// Allocates a new page, backed by a free frame in the buffer pool.
+    ///
+    /// Returns an error if no frame can be obtained or the page limit is reached.
+    pub fn allocate(&self, snapshot_id: SnapshotId) -> Result<PageMut<'_>, PageError> {
+        let frame_id = self.get_free_frame().ok_or(PageError::OutOfMemory)?;
+        let (page_id, new_count) = self.next_page_id().ok_or(PageError::PageLimitReached)?;
+
+        self.grow_if_needed(new_count as u64 * Page::SIZE as u64)?;
+
+        self.page_table.insert(page_id, frame_id);
+        self.lru_replacer.pin_write(frame_id, page_id).map_err(|_| PageError::EvictionPolicy)?;
+
+        let data = self.frames[frame_id.0 as usize].ptr;
+        unsafe { PageMut::acquire_unchecked(page_id, snapshot_id, data, self) }
+    }
+
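+    // Typical lifecycle of a page through the pool (an illustrative sketch, not a test;
+    // the path is a placeholder):
+    //
+    //     let pm = PageManager::open("pages.db")?;
+    //     let mut page = pm.allocate(snapshot_id)?; // pins a frame for writing
+    //     let id = page.id();
+    //     page.contents_mut().fill(0xab);
+    //     drop(page); // releases the write pin
+    //     let page = pm.get(id)?; // served from the same frame, no disk read
+    //     pm.sync()?; // writes dirty frames back in contiguous batches
+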
+    /// Checks if a page is currently in the Dirty state.
+    ///
+    /// This method allows checking whether a page is being written to without
+    /// the overhead of acquiring the page.
+    pub fn is_dirty(&self, page_id: PageId) -> Result<bool, PageError> {
+        if page_id > self.page_count.load(Ordering::Relaxed) {
+            return Err(PageError::PageNotFound(page_id));
+        }
+        // A page can only be dirty if it is resident in the page table
+        if let Some(frame_id) = self.page_table.get(&page_id) {
+            let frame = &self.frames[frame_id.0 as usize];
+            // SAFETY: We're just reading the state atomically, respecting the memory model
+            let state = unsafe { RawPageState::from_ptr(frame.ptr.cast()) };
+
+            Ok(matches!(state.load(), PageState::Dirty(_)))
+        } else {
+            // Otherwise, the page is not dirty
+            Ok(false)
+        }
+    }
+
+    /// Syncs the buffer pool to the file.
+    ///
+    /// Could explore a parallel write strategy to improve performance.
+    pub fn sync(&self) -> io::Result<()> {
+        let file = &mut self.file.write();
+        // Take all dirty (write-pinned) frames and write them back in file order
+        let mut dirty_pages = self.lru_replacer.write_frames.lock();
+        dirty_pages.sort_by_key(|(_, page_id)| page_id.as_offset());
+
+        // Group contiguous pages together
+        let mut current_offset = None;
+        let mut batch: Vec<IoSlice> = Vec::new();
+
+        for (frame_id, page_id) in dirty_pages.iter() {
+            let offset = page_id.as_offset() as u64;
+            if let Some(prev_offset) = current_offset {
+                if offset != prev_offset + (batch.len() * Page::SIZE) as u64 {
+                    // The run of contiguous pages ended; write out the current batch
+                    self.write(&mut batch, file, prev_offset)?;
+                    batch.clear();
+                }
+            }
+            if batch.is_empty() {
+                current_offset = Some(offset);
+            }
+            let frame = &self.frames[frame_id.0 as usize];
+            unsafe {
+                let page_data = std::slice::from_raw_parts(frame.ptr as *const u8, Page::SIZE);
+                batch.push(IoSlice::new(page_data));
+            }
+        }
+        // Write final batch
+        if !batch.is_empty() {
+            self.write(&mut batch, file, current_offset.unwrap())?;
+        }
+        file.flush()?;
+        for (_, page_id) in dirty_pages.iter() {
+            self.lru_replacer
+                .unpin(*page_id)
+                .map_err(|e| io::Error::other(format!("eviction policy error: {e:?}")))?;
+        }
+        dirty_pages.clear();
+        Ok(())
+    }
+
+    #[inline]
+    fn write(&self, batch: &mut Vec<IoSlice>, file: &mut File, offset: u64) -> io::Result<()> {
+        file.seek(SeekFrom::Start(offset))?;
+        // Keep writing until every slice in the batch has been fully consumed
+        while !batch.is_empty() {
+            let written = file.write_vectored(batch)?;
+            if written == 0 {
+                return Err(io::Error::new(
+                    io::ErrorKind::WriteZero,
+                    "failed to write whole buffer",
+                ));
+            }
+            // Remove fully written slices from the batch
+            let mut bytes_left = written;
+            while !batch.is_empty() && bytes_left >= batch[0].len() {
+                bytes_left -= batch[0].len();
+                batch.remove(0);
+            }
+            // Adjust the first slice if it was partially written
+            if !batch.is_empty() && bytes_left > 0 {
+                // SAFETY: IoSlice only needs a reference for the duration of the write call,
+                // and batch[0] is still valid here.
+                let ptr = batch[0].as_ptr();
+                let len = batch[0].len();
+                let new_slice =
+                    unsafe { std::slice::from_raw_parts(ptr.add(bytes_left), len - bytes_left) };
+                batch[0] = IoSlice::new(new_slice);
+            }
+        }
+        Ok(())
+    }
+
+    /// Syncs and closes the buffer pool.
+    pub fn close(&self) -> io::Result<()> {
+        self.sync()
+    }
+
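+    // Worked example of the batching in `sync`, with illustrative numbers: assuming
+    // Page::SIZE = 4096 and dirty pages at offsets 4096, 8192, 12288, and 28672, the first
+    // three are contiguous (each offset equals the batch start plus batch.len() * Page::SIZE
+    // at the time it is checked), so they go out as a single vectored write at offset 4096;
+    // 28672 then starts a second batch.
+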
+    /// Returns the number of pages currently stored in the file.
+    #[inline]
+    pub fn size(&self) -> u32 {
+        self.page_count.load(Ordering::Relaxed)
+    }
+
+    #[inline]
+    pub fn capacity(&self) -> u32 {
+        self.num_frames
+    }
+
+    #[inline]
+    pub fn drop_page(&self, page_id: PageId) {
+        // unpin() must succeed; a failure here indicates a bug in the code
+        self.lru_replacer.unpin(page_id).unwrap();
+    }
+
+    fn next_page_id(&self) -> Option<(PageId, u32)> {
+        let mut old_count = self.page_count.load(Ordering::Relaxed);
+        loop {
+            let new_count = old_count.checked_add(1)?;
+            let page_id = PageId::try_from(new_count).ok()?;
+            match self.page_count.compare_exchange_weak(
+                old_count,
+                new_count,
+                Ordering::Relaxed,
+                Ordering::Relaxed,
+            ) {
+                Ok(_) => return Some((page_id, new_count)),
+                Err(val) => old_count = val, // Another thread modified page_count, retry.
+            }
+        }
+    }
+
+    fn get_free_frame(&self) -> Option<FrameId> {
+        let mut original_free_frame_idx = self.original_free_frame_idx.load(Ordering::Relaxed);
+        loop {
+            if original_free_frame_idx < self.num_frames {
+                match self.original_free_frame_idx.compare_exchange_weak(
+                    original_free_frame_idx,
+                    original_free_frame_idx + 1,
+                    Ordering::Relaxed,
+                    Ordering::Relaxed,
+                ) {
+                    Ok(_) => return Some(FrameId(original_free_frame_idx)),
+                    Err(val) => original_free_frame_idx = val, /* Another thread modified original_free_frame_idx, retry. */
+                }
+            } else {
+                let evicted_page = self.lru_replacer.evict();
+                if let Some(page_id) = evicted_page {
+                    return self.page_table.remove(&page_id).map(|(_, frame_id)| frame_id)
+                } else {
+                    return None
+                }
+            }
+        }
+    }
+
+    #[inline]
+    fn grow_if_needed(&self, min_len: u64) -> Result<(), PageError> {
+        if min_len <= self.file_len.load(Ordering::Relaxed) {
+            return Ok(());
+        }
+        // Acquire write lock to grow the file
+        let file = &mut self.file.write();
+        let cur_len = self.file_len.load(Ordering::Relaxed);
+        if min_len <= cur_len {
+            return Ok(());
+        }
+        // Grow the file by at least 12.5% of current size, or 4MiB, whichever is larger
+        let increment = (cur_len / 8).max(1024 * Page::SIZE as u64);
+        file.set_len(cur_len + increment).map_err(PageError::IO)?;
+        self.file_len.store(cur_len + increment, Ordering::Relaxed);
+        Ok(())
+    }
+}
+
+impl Drop for PageManager {
+    fn drop(&mut self) {
+        self.sync().expect("sync failed");
+    }
+}
+
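+// Worked numbers for `grow_if_needed` above, with `Page::SIZE` = 4096: the
+// floor `1024 * Page::SIZE` is 4 MiB and `cur_len / 8` is 12.5% of the current
+// length, so files up to 32 MiB grow in fixed 4 MiB steps and larger files
+// grow geometrically. A quick check of that arithmetic:
+//
+//     fn growth_increment(cur_len: u64) -> u64 {
+//         (cur_len / 8).max(1024 * 4096)
+//     }
+//     assert_eq!(growth_increment(0), 4 << 20);         // empty file: 4 MiB
+//     assert_eq!(growth_increment(16 << 20), 4 << 20);  // 16 MiB: still the floor
+//     assert_eq!(growth_increment(1 << 30), 128 << 20); // 1 GiB: 128 MiB (12.5%)
+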
+#[cfg(test)]
+mod tests {
+    use crate::page_id;
+
+    use super::*;
+    use std::{
+        io::Seek,
+        sync::{Arc, Barrier},
+        thread,
+    };
+
+    fn len(f: &File) -> usize {
+        f.metadata().expect("fetching file metadata failed").len().try_into().unwrap()
+    }
+
+    fn read(mut f: &File, n: usize) -> Vec<u8> {
+        use std::io::Read;
+        let mut buf = vec![0; n];
+        f.read_exact(&mut buf).expect("read failed");
+        buf
+    }
+
+    fn seek(mut f: &File, offset: u64) {
+        f.seek(SeekFrom::Start(offset)).expect("seek failed");
+    }
+
+    #[test]
+    fn test_is_dirty() {
+        let snapshot = 1234;
+        let f = tempfile::tempfile().expect("temporary file creation failed");
+        let mut opts = PageManagerOptions::new();
+        opts.num_frames(255);
+        let m = PageManager::from_file_with_options(&opts, f.try_clone().unwrap())
+            .expect("mmap creation failed");
+
+        let mut page = m.allocate(snapshot).unwrap();
+        page.contents_mut().iter_mut().for_each(|byte| *byte = 0x12);
+        assert!(m.is_dirty(page_id!(1)).unwrap());
+        drop(page);
+        assert!(!m.is_dirty(page_id!(1)).unwrap());
+        m.sync().expect("sync failed");
+        assert!(!m.is_dirty(page_id!(1)).unwrap());
+    }
+
+    #[test]
+    fn test_allocate_cache() {
+        let snapshot = 1234;
+        let f = tempfile::tempfile().expect("temporary file creation failed");
+        let mut opts = PageManagerOptions::new();
+        opts.num_frames(255);
+        let m = PageManager::from_file_with_options(&opts, f.try_clone().unwrap())
+            .expect("mmap creation failed");
+
+        for i in 1..=10 {
+            let i = PageId::new(i).unwrap();
+            let err = m.get(i).unwrap_err();
+            assert!(matches!(err, PageError::PageNotFound(page_id) if page_id == i));
+
+            let page = m.allocate(snapshot).unwrap();
+            assert_eq!(page.id(), i);
+            assert_eq!(page.contents(), &mut [0; Page::DATA_SIZE]);
+            assert_eq!(page.snapshot_id(), snapshot);
+            drop(page);
+        }
+
+        // Verify pages are in the cache, and are dirty after allocate
+        for i in 1..=10 {
+            let i = PageId::new(i).unwrap();
+            let frame_id = m.page_table.get(&i).expect("page not in cache");
+            let dirty_frames = m.lru_replacer.write_frames.lock();
+            assert!(dirty_frames.iter().any(|x| x.0 == *frame_id && x.1 == i));
+        }
+    }
+
+    #[test]
+    fn test_allocate_get() {
+        let snapshot = 1234;
+        let f = tempfile::tempfile().expect("temporary file creation failed");
+        let mut opts = PageManagerOptions::new();
+        opts.num_frames(255);
+        let m = PageManager::from_file_with_options(&opts, f.try_clone().unwrap())
+            .expect("mmap creation failed");
+
+        for i in 1..=10 {
+            let i = PageId::new(i).unwrap();
+            let err = m.get(i).unwrap_err();
+            assert!(matches!(err, PageError::PageNotFound(page_id) if page_id == i));
+
+            let mut page = m.allocate(snapshot).unwrap();
+            assert_eq!(page.id(), i);
+            assert_eq!(page.contents(), &mut [0; Page::DATA_SIZE]);
+            assert_eq!(page.snapshot_id(), snapshot);
+            page.contents_mut().iter_mut().for_each(|byte| *byte = 0x12);
+            drop(page);
+
+            // Verify the page content with get()
+            let page = m.get(i).unwrap();
+            assert_eq!(page.id(), i);
+            assert_eq!(page.contents(), &mut [0x12; Page::DATA_SIZE]);
+            assert_eq!(page.snapshot_id(), snapshot);
+        }
+
+        // TODO: verify the capacity of the frames
+    }
+
+    #[test]
+    fn test_allocate_get_mut() {
+        let snapshot = 1235;
+        let f = tempfile::tempfile().expect("temporary file creation failed");
+        let mut opts = PageManagerOptions::new();
+        opts.num_frames(255);
+        let m = PageManager::from_file_with_options(&opts, f.try_clone().unwrap())
+            .expect("mmap creation failed");
+
+        let mut page = m.allocate(snapshot).unwrap();
+        assert_eq!(page.id(), page_id!(1));
+        assert_eq!(page.contents(), &mut [0; Page::DATA_SIZE]);
+        assert_eq!(page.snapshot_id(), snapshot);
+        page.contents_mut().iter_mut().for_each(|byte| *byte = 0xab);
+        drop(page);
+
+        let p1 = m.get(page_id!(1)).unwrap();
+        assert_eq!(p1.id(), page_id!(1));
+        assert_eq!(p1.snapshot_id(), snapshot);
+        assert_eq!(p1.contents(), &mut [0xab; Page::DATA_SIZE]);
+
+        let p2 = m.allocate(snapshot).unwrap();
+        assert_eq!(p2.id(), page_id!(2));
+        drop(p2);
+        let p3 = m.allocate(snapshot).unwrap();
+        assert_eq!(p3.id(), page_id!(3));
+        drop(p3);
+
+        let mut p1 = m.get_mut(snapshot, page_id!(1)).unwrap();
+        p1.contents_mut().iter_mut().for_each(|byte| *byte = 0xcd);
+        drop(p1);
+
+        // Verify the page content with get after get_mut and modify
+        let p1 = m.get(page_id!(1)).unwrap();
+        assert_eq!(p1.id(), page_id!(1));
+        assert_eq!(p1.snapshot_id(), snapshot);
+        assert_eq!(p1.contents(), &mut [0xcd; Page::DATA_SIZE]);
+    }
+
+    #[test]
+    fn persistence() {
+        let snapshot = 123;
+        let f = tempfile::tempfile().expect("temporary file creation failed");
+        let mut opts = PageManagerOptions::new();
+        opts.num_frames(255);
+        let m = PageManager::from_file_with_options(&opts, f.try_clone().unwrap())
+            .expect("buffer pool creation failed");
+
+        // No page has been allocated; file should be empty
+        
assert_eq!(len(&f), 0); + + // Allocate a page; verify that the size of the file is `1024 * Page::SIZE` + let mut p = m.allocate(snapshot).expect("page allocation failed"); + p.contents_mut().iter_mut().for_each(|byte| *byte = 0xab); + drop(p); + m.sync().expect("sync failed"); + seek(&f, 0); + assert_eq!(len(&f), 1024 * Page::SIZE); + assert_eq!(read(&f, 8), snapshot.to_le_bytes()); + assert_eq!(read(&f, Page::DATA_SIZE - 8), [0xab; Page::DATA_SIZE - 8]); + + // Repeat the test with more pages + for i in 1..=255 { + let mut p = m.allocate(snapshot + i as u64).expect("page allocation failed"); + p.contents_mut().iter_mut().for_each(|byte| *byte = 0xab ^ (i as u8)); + } + m.sync().expect("sync failed"); + + assert_eq!(len(&f), 1024 * Page::SIZE); + for i in 1..=255 { + seek(&f, i * Page::SIZE as u64); + assert_eq!(read(&f, 8), (snapshot + i).to_le_bytes()); + assert_eq!(read(&f, Page::DATA_SIZE - 8), [0xab ^ (i as u8); Page::DATA_SIZE - 8]); + } + } + + #[test] + fn get_cache() { + let snapshot = 123; + let f = tempfile::tempfile().expect("temporary file creation failed"); + { + let mut opts = PageManagerOptions::new(); + opts.num_frames(255); + let m = PageManager::from_file_with_options(&opts, f.try_clone().unwrap()) + .expect("buffer pool creation failed"); + for i in 1..=255 { + let mut p = m.allocate(snapshot + i as u64).expect("page allocation failed"); + p.contents_mut().iter_mut().for_each(|byte| *byte = 0xab ^ (i as u8)); + } + m.sync().expect("sync failed"); + } + { + // get + let mut opts = PageManagerOptions::new(); + opts.num_frames(255).page_count(255); + let m = PageManager::from_file_with_options(&opts, f.try_clone().unwrap()) + .expect("buffer pool creation failed"); + for i in 1..=255 { + let page_id = PageId::new(i).unwrap(); + let page = m.get(page_id).expect("page not in cache"); + assert_eq!(page.contents(), &mut [0xab ^ (i as u8); Page::DATA_SIZE]); + let frame_id = m.page_table.get(&page_id).expect("page not in cache"); + let frame = &m.frames[frame_id.0 as usize]; + assert_eq!(frame.ptr as *const u8, page.all_contents().as_ptr()); + } + } + { + // get_mut + let mut opts = PageManagerOptions::new(); + opts.num_frames(255).page_count(255); + let m = PageManager::from_file_with_options(&opts, f.try_clone().unwrap()) + .expect("buffer pool creation failed"); + for i in 1..=255 { + let page_id = PageId::new(i).unwrap(); + let page = m.get_mut(snapshot + i as u64, page_id).expect("page not in cache"); + assert_eq!(page.contents(), &mut [0xab ^ (i as u8); Page::DATA_SIZE]); + let frame_id = m.page_table.get(&page_id).expect("page not in cache"); + let frame = &m.frames[frame_id.0 as usize]; + assert_eq!(frame.ptr as *const u8, page.all_contents().as_ptr()); + } + } + } + + #[test] + fn test_allocate_oom() { + let snapshot = 1234; + let f = tempfile::tempfile().expect("temporary file creation failed"); + let mut opts = PageManagerOptions::new(); + opts.num_frames(10); + let m = PageManager::from_file_with_options(&opts, f.try_clone().unwrap()) + .expect("mmap creation failed"); + + for _ in 1..=10 { + m.allocate(snapshot).expect("failed to allocate page"); + } + let page = m.allocate(snapshot).unwrap_err(); + assert!(matches!(page, PageError::OutOfMemory)); + } + + #[test] + fn pool_eviction() { + let snapshot = 123; + let temp_file = tempfile::NamedTempFile::new().expect("temporary file creation failed"); + { + let mut opts = PageManagerOptions::new(); + opts.num_frames(200); + let m = PageManager::open_with_options(&opts, temp_file.path()) + .expect("buffer pool creation 
failed"); + + (1..=200).for_each(|i| { + let mut p = m + .allocate(snapshot + i as u64) + .unwrap_or_else(|_| panic!("page allocation failed {i}")); + p.contents_mut().iter_mut().for_each(|byte| *byte = 0x10 + i as u8); + }); + m.sync().expect("sync failed"); + } + { + let mut opts = PageManagerOptions::new(); + opts.num_frames(10).page_count(200); + let m = PageManager::open_with_options(&opts, temp_file.path()) + .expect("buffer pool creation failed"); + (1..=200).for_each(|i| { + let page_id = PageId::new(i).unwrap(); + let page = m.get(page_id).unwrap_or_else(|_| panic!("failed to get page {i}")); + assert_eq!(page.contents(), &mut [0x10 + i as u8; Page::DATA_SIZE]); + }); + } + } + + #[test] + fn test_concurrent_get_same_page() { + // Test high contention race by having multiple threads accessing same pages with cache + // hits/misses + let snapshot = 1234; + let temp_file = tempfile::NamedTempFile::new().expect("temporary file creation failed"); + let total_pages = 50; + let num_frames = 200; // Plenty of frames to avoid eviction + + // Pre-populate the file with test data + { + let mut opts = PageManagerOptions::new(); + opts.num_frames(num_frames); + let m = PageManager::open_with_options(&opts, temp_file.path()) + .expect("buffer pool creation failed"); + + // Allocate and initialize test pages + for i in 1..=total_pages { + let mut page = m.allocate(snapshot + i as u64).expect("page allocation failed"); + page.contents_mut().iter_mut().for_each(|byte| *byte = i as u8); + drop(page); + } + m.sync().expect("sync failed"); + } + + // Test concurrent access to the same pages + { + let mut opts = PageManagerOptions::new(); + opts.num_frames(num_frames).page_count(total_pages); + let m = Arc::new( + PageManager::open_with_options(&opts, temp_file.path()) + .expect("buffer pool creation failed"), + ); + + let num_threads = 16; + let iterations = 100; + let barrier = Arc::new(Barrier::new(num_threads)); + + let handles: Vec<_> = (0..num_threads) + .map(|thread_id| { + let m = m.clone(); + let barrier = barrier.clone(); + + thread::spawn(move || { + barrier.wait(); // Synchronize start to maximize race conditions + + for iter in 0..iterations { + // Mix of different pages, but with high probability of conflicts + let page_id = + PageId::new(1 + (iter as u32 + thread_id as u32) % 10).unwrap(); + + match m.get(page_id) { + Ok(page) => { + // Verify page contents are correct + let expected = page_id.as_u32() as u8; + assert_eq!(page.contents(), &[expected; Page::DATA_SIZE]); + + // Hold the page for a random short time to increase contention + if (thread_id + iter) % 7 == 0 { + thread::sleep(std::time::Duration::from_micros(1)); + } + } + Err(e) => { + panic!("Unexpected error getting page {page_id}: {e:?}") + } + } + } + }) + }) + .collect(); + + for handle in handles { + handle.join().expect("thread panicked"); + } + + // Verify final state consistency + for i in 1..=total_pages { + let page_id = PageId::new(i).unwrap(); + let page = m.get(page_id).expect("page should exist"); + assert_eq!(page.contents(), &[i as u8; Page::DATA_SIZE]); + } + } + } + + #[test] + fn test_concurrent_get_different_pages_limited_frames() { + // Test eviction under pressure race condition + let snapshot = 1234; + let temp_file = tempfile::NamedTempFile::new().expect("temporary file creation failed"); + let total_pages = 1000; + + // Pre-populate the file with test data + { + let mut opts = PageManagerOptions::new(); + opts.num_frames(1000); + let m = PageManager::open_with_options(&opts, temp_file.path()) + 
.expect("buffer pool creation failed"); + + for i in 1..=total_pages { + let mut page = m.allocate(snapshot + i as u64).expect("page allocation failed"); + page.contents_mut().iter_mut().for_each(|byte| *byte = i as u8); + drop(page); + } + m.sync().expect("sync failed"); + } + + // Test with limited frames to force eviction + { + let num_threads = 16; + let iterations = 50; + let num_frames = 32; + let mut opts = PageManagerOptions::new(); + opts.num_frames(num_frames).page_count(total_pages); // Force frequent eviction + let m = Arc::new( + PageManager::open_with_options(&opts, temp_file.path()) + .expect("buffer pool creation failed"), + ); + + let barrier = Arc::new(Barrier::new(num_threads)); + + let handles: Vec<_> = (0..num_threads) + .map(|thread_id| { + let m = m.clone(); + let barrier = barrier.clone(); + + thread::spawn(move || { + barrier.wait(); + + for iter in 0..iterations { + // Access different pages to force frame reuse + let page_id = PageId::new( + 1 + (thread_id as u32 * iterations + iter) % total_pages, + ) + .unwrap(); + + match m.get(page_id) { + Ok(page) => { + let expected = page_id.as_u32() as u8; + assert_eq!(page.contents(), &[expected; Page::DATA_SIZE]); + } + Err(e) => { + panic!("Unexpected error getting page {page_id}: {e:?}") + } + } + } + }) + }) + .collect(); + + for handle in handles { + handle.join().expect("thread panicked"); + } + } + } + + #[test] + fn test_concurrent_allocate_and_get() { + // Test allocation vs get race condition + let snapshot = 1234; + let temp_file = tempfile::NamedTempFile::new().expect("temporary file creation failed"); + let num_threads = 8; + let pages_per_thread: usize = 64; + let mut opts = PageManagerOptions::new(); + opts.num_frames(pages_per_thread as u32 + 1); + let m = Arc::new( + PageManager::open_with_options(&opts, temp_file.path()) + .expect("buffer pool creation failed"), + ); + let barrier = Arc::new(Barrier::new(num_threads)); + + let handles: Vec<_> = (0..num_threads) + .map(|thread_id| { + let m = m.clone(); + let barrier = barrier.clone(); + + thread::spawn(move || { + barrier.wait(); + + if thread_id == 0 { + // Allocate new pages + for i in 0..pages_per_thread { + match m.allocate(snapshot + thread_id as u64 * 1000 + i as u64) { + Ok(mut page) => { + page.contents_mut().iter_mut().for_each(|byte| { + *byte = (thread_id as u8).wrapping_add(i as u8) + }); + } + Err(e) => panic!("Unexpected error allocating page: {e:?}"), + } + } + } else { + for i in 0..pages_per_thread { + // Try to get existing pages + let page_id = + PageId::new(1 + (thread_id as u32 + i as u32) % 20).unwrap(); + match m.get(page_id) { + Ok(_page) => { + // Expected + } + Err(PageError::PageNotFound(_)) => { + // Expected if page doesn't exist yet + } + Err(PageError::PageDirty(_)) => { + // Expected if page is dirty + } + Err(e) => { + panic!("Unexpected error getting page {page_id}: {e:?}") + } + } + } + } + }) + }) + .collect(); + + for handle in handles { + handle.join().expect("thread panicked"); + } + } +} diff --git a/src/page/manager/cache_evict.rs b/src/page/manager/cache_evict.rs new file mode 100644 index 00000000..f201782c --- /dev/null +++ b/src/page/manager/cache_evict.rs @@ -0,0 +1,51 @@ +use std::fmt; + +use evict::{EvictResult, EvictionPolicy, LruReplacer}; +use parking_lot::Mutex; + +use crate::page::{manager::buffer_pool::FrameId, PageId}; + +// TODO: Temporarily use LruReplacer as the eviction policy, replace with a better eviction policy +pub(crate) struct CacheEvict { + lru_replacer: LruReplacer, + read_frames: 
Mutex<Vec<PageId>>,
+    pub(crate) write_frames: Mutex<Vec<(FrameId, PageId)>>,
+}
+
+impl fmt::Debug for CacheEvict {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "CacheEvict")
+    }
+}
+
+impl CacheEvict {
+    pub(crate) fn new(capacity: usize) -> Self {
+        Self {
+            lru_replacer: LruReplacer::new(capacity),
+            read_frames: Mutex::new(Vec::with_capacity(capacity)),
+            write_frames: Mutex::new(Vec::with_capacity(capacity)),
+        }
+    }
+
+    pub(crate) fn evict(&self) -> Option<PageId> {
+        self.lru_replacer.evict()
+    }
+
+    pub(crate) fn touch(&self, page_id: PageId) -> EvictResult<(), PageId> {
+        self.lru_replacer.touch(page_id)
+    }
+
+    pub(crate) fn pin_read(&self, page_id: PageId) -> EvictResult<(), PageId> {
+        self.read_frames.lock().push(page_id);
+        self.lru_replacer.pin(page_id)
+    }
+
+    pub(crate) fn pin_write(&self, frame_id: FrameId, page_id: PageId) -> EvictResult<(), PageId> {
+        self.write_frames.lock().push((frame_id, page_id));
+        self.lru_replacer.pin(page_id)
+    }
+
+    pub(crate) fn unpin(&self, page_id: PageId) -> EvictResult<(), PageId> {
+        self.lru_replacer.unpin(page_id)
+    }
+}
diff --git a/src/page/manager/mmap.rs b/src/page/manager/mmap.rs
index 41d62087..4b829dde 100644
--- a/src/page/manager/mmap.rs
+++ b/src/page/manager/mmap.rs
@@ -117,6 +117,9 @@ impl PageManager {
         (self.mmap.len() / Page::SIZE).min(u32::MAX as usize) as u32
     }
 
+    #[inline]
+    pub fn drop_page(&self, _page_id: PageId) {}
+
     /// Grows the size of the underlying file to make room for additional pages.
     ///
     /// This will increase the file size by a constant factor of 1024 pages, or a relative factor
@@ -188,7 +191,7 @@ impl PageManager {
     }
 
     /// Retrieves a page from the memory mapped file.
-    pub fn get(&self, _snapshot_id: SnapshotId, page_id: PageId) -> Result<Page<'_>, PageError> {
+    pub fn get(&self, page_id: PageId) -> Result<Page<'_>, PageError> {
         if page_id > self.page_count.load(Ordering::Relaxed) {
             return Err(PageError::PageNotFound(page_id));
         }
@@ -199,7 +202,7 @@
 
         // SAFETY: All memory from the memory map is accessed through `Page` or `PageMut`, thus
         // respecting the page state access memory model.
-        unsafe { Page::from_ptr(page_id, data) }
+        unsafe { Page::from_ptr(page_id, data, self) }
     }
 
     /// Retrieves a mutable page from the memory mapped file.
@@ -218,7 +221,7 @@
 
         // TODO: This is actually unsafe, as it's possible to call `get()` arbitrary times before
         // calling this function (this will be fixed in a future commit).
-        unsafe { PageMut::from_ptr(page_id, snapshot_id, data) }
+        unsafe { PageMut::from_ptr(page_id, snapshot_id, data, self) }
     }
 
     /// Adds a new page.
@@ -243,7 +246,7 @@
     //   time, they would get a different `page_id`.
     // - All memory from the memory map is accessed through `Page` or `PageMut`, thus respecting
     //   the page state access memory model.
-        unsafe { PageMut::acquire_unchecked(page_id, snapshot_id, data) }
+        unsafe { PageMut::acquire_unchecked(page_id, snapshot_id, data, self) }
     }
 
     /// Checks if a page is currently in the Dirty state.
@@ -297,7 +300,7 @@ mod tests { for i in 1..=10 { let i = PageId::new(i).unwrap(); - let err = manager.get(42, i).unwrap_err(); + let err = manager.get(i).unwrap_err(); assert!(matches!(err, PageError::PageNotFound(page_id) if page_id == i)); let page = manager.allocate(42).unwrap(); @@ -306,7 +309,7 @@ mod tests { assert_eq!(page.snapshot_id(), 42); drop(page); - let page = manager.get(42, i).unwrap(); + let page = manager.get(i).unwrap(); assert_eq!(page.id(), i); assert_eq!(page.contents(), &mut [0; Page::DATA_SIZE]); assert_eq!(page.snapshot_id(), 42); @@ -328,7 +331,7 @@ mod tests { page.contents_mut()[0] = 1; drop(page); - let old_page = manager.get(42, page_id!(1)).unwrap(); + let old_page = manager.get(page_id!(1)).unwrap(); assert_eq!(old_page.id(), page_id!(1)); assert_eq!(old_page.contents()[0], 1); assert_eq!(old_page.snapshot_id(), 42); @@ -353,7 +356,7 @@ mod tests { assert_eq!(page2_mut.contents()[0], 2); drop(page2_mut); - let page2 = manager.get(42, page_id!(2)).unwrap(); + let page2 = manager.get(page_id!(2)).unwrap(); assert_eq!(page2.contents()[0], 2); } diff --git a/src/page/manager/options.rs b/src/page/manager/options.rs index f95cc20d..5b5de24d 100644 --- a/src/page/manager/options.rs +++ b/src/page/manager/options.rs @@ -9,9 +9,12 @@ pub struct PageManagerOptions { pub(super) open_options: OpenOptions, pub(super) page_count: u32, pub(super) max_pages: u32, + pub(super) num_frames: u32, // for buffer pool backend } impl PageManagerOptions { + pub const DEFAULT_NUM_FRAMES: u32 = 1024 * 1024 * 2; + pub fn new() -> Self { let mut open_options = File::options(); open_options.read(true).write(true).create(true).truncate(false); @@ -24,7 +27,14 @@ impl PageManagerOptions { Page::MAX_COUNT / 1024 }; - Self { open_options, page_count: 0, max_pages } + let num_frames = if cfg!(not(test)) { + Self::DEFAULT_NUM_FRAMES + } else { + // Use a smaller buffer pool for tests to reduce memory usage + 1024 + }; + + Self { open_options, page_count: 0, max_pages, num_frames } } /// Sets the option to create a new file, or open it if it already exists. @@ -61,6 +71,14 @@ impl PageManagerOptions { self } + /// Sets the number of frames for the buffer pool backend. + /// + /// The default is [`DEFAULT_NUM_FRAMES`]. + pub fn num_frames(&mut self, num_frames: u32) -> &mut Self { + self.num_frames = num_frames; + self + } + /// Causes the file length to be set to 0 after opening it. /// /// Note that if `wipe(true)` is set, then setting [`page_count()`](Self::page_count) with any diff --git a/src/page/page.rs b/src/page/page.rs index 6a28ad47..b5f93052 100644 --- a/src/page/page.rs +++ b/src/page/page.rs @@ -5,6 +5,7 @@ use crate::{ PageId, }, snapshot::SnapshotId, + PageManager, }; use std::{fmt, marker::PhantomData, mem, ops::Deref}; @@ -22,21 +23,22 @@ compile_error!("This code only supports little-endian platforms"); /// /// This struct mainly exists to allow safe transmutation from [`PageMut`] to [`Page`]. 
#[derive(Copy, Clone)]
-struct UnsafePage {
+struct UnsafePage<'p> {
     id: PageId,
     ptr: *mut [u8; Page::SIZE],
+    page_manager: &'p PageManager,
 }
 
 #[repr(transparent)]
-#[derive(Copy, Clone)]
+#[derive(Clone)]
 pub struct Page<'p> {
-    inner: UnsafePage,
+    inner: UnsafePage<'p>,
     phantom: PhantomData<&'p ()>,
 }
 
 #[repr(transparent)]
 pub struct PageMut<'p> {
-    inner: UnsafePage,
+    inner: UnsafePage<'p>,
     phantom: PhantomData<&'p ()>,
 }
 
@@ -48,7 +50,7 @@ fn fmt_page(name: &str, p: &Page<'_>, f: &mut fmt::Formatter<'_>) -> fmt::Result
         .finish()
 }
 
-impl Page<'_> {
+impl<'p> Page<'p> {
     pub const SIZE: usize = 4096;
     pub const HEADER_SIZE: usize = 8;
     pub const DATA_SIZE: usize = Self::SIZE - Self::HEADER_SIZE;
@@ -68,11 +70,15 @@ impl Page<'_> {
     ///
     /// [valid]: core::ptr#safety
     /// [memory model for page state access]: state#memory-model
-    pub unsafe fn from_ptr(id: PageId, ptr: *mut [u8; Page::SIZE]) -> Result<Self, PageError> {
+    pub unsafe fn from_ptr(
+        id: PageId,
+        ptr: *mut [u8; Page::SIZE],
+        page_manager: &'p PageManager,
+    ) -> Result<Self, PageError> {
         // SAFETY: guaranteed by the caller
         match RawPageState::from_ptr(ptr.cast()).load() {
             PageState::Occupied(_) => {
-                Ok(Self { inner: UnsafePage { id, ptr }, phantom: PhantomData })
+                Ok(Self { inner: UnsafePage { id, ptr, page_manager }, phantom: PhantomData })
             }
             PageState::Unused => Err(PageError::PageNotFound(id)),
             PageState::Dirty(_) => Err(PageError::PageDirty(id)),
@@ -111,6 +117,12 @@
     pub fn contents(&self) -> &[u8] {
         self.raw_contents()
     }
+
+    /// Returns all contents of the page, including the header
+    #[cfg(test)]
+    pub fn all_contents(&self) -> &[u8; Page::SIZE] {
+        unsafe { &*self.inner.ptr.cast() }
+    }
 }
 
 impl fmt::Debug for Page<'_> {
@@ -119,6 +131,12 @@
     }
 }
 
+impl Drop for Page<'_> {
+    fn drop(&mut self) {
+        self.inner.page_manager.drop_page(self.id());
+    }
+}
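+
+// A `Drop` impl and `Copy` are mutually exclusive, which is why `Page` (and,
+// later in this series, `SlottedPage`) switch from `derive(Copy, Clone)` to
+// `derive(Clone)`; call sites now clone the handle or capture `id()` before
+// moving it. A rough sketch of the lifecycle this enables, assuming a
+// hypothetical pin-counting manager (not the real API):
+//
+//     let page = manager.get(page_id)?;  // load pins the frame
+//     let id = page.id();
+//     drop(page);                        // Drop calls drop_page(id) -> unpin
+//
+// Once the last handle is dropped, the frame becomes an eviction candidate.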
+
 impl<'p> PageMut<'p> {
     /// Constructs a new `PageMut` from a pointer to an *occupied* page.
     ///
@@ -139,6 +157,7 @@ impl<'p> PageMut<'p> {
         id: PageId,
         snapshot_id: SnapshotId,
         ptr: *mut [u8; Page::SIZE],
+        page_manager: &'p PageManager,
     ) -> Result<Self, PageError> {
         let new_state = PageState::dirty(snapshot_id).expect("invalid value for `snapshot_id`");
 
@@ -147,7 +166,7 @@
             PageState::Unused | PageState::Occupied(_) => Some(new_state),
             PageState::Dirty(_) => None,
         }) {
-            Ok(_) => Ok(Self { inner: UnsafePage { id, ptr }, phantom: PhantomData }),
+            Ok(_) => Ok(Self { inner: UnsafePage { id, ptr, page_manager }, phantom: PhantomData }),
             Err(PageState::Unused) => Err(PageError::PageNotFound(id)),
             Err(PageState::Dirty(_)) => Err(PageError::PageDirty(id)),
             Err(PageState::Occupied(_)) => unreachable!(),
@@ -175,13 +194,15 @@
         id: PageId,
         snapshot_id: SnapshotId,
         ptr: *mut [u8; Page::SIZE],
+        page_manager: &'p PageManager,
     ) -> Result<Self, PageError> {
         let new_state = PageState::dirty(snapshot_id).expect("invalid value for `snapshot_id`");
 
         // SAFETY: guaranteed by the caller
         match RawPageStateMut::from_ptr(ptr.cast()).compare_exchange(PageState::Unused, new_state) {
             Ok(_) => {
-                let mut p = Self { inner: UnsafePage { id, ptr }, phantom: PhantomData };
+                let mut p =
+                    Self { inner: UnsafePage { id, ptr, page_manager }, phantom: PhantomData };
                 p.raw_contents_mut().fill(0);
                 Ok(p)
             }
@@ -219,11 +240,12 @@
         id: PageId,
         snapshot_id: SnapshotId,
         ptr: *mut [u8; Page::SIZE],
+        page_manager: &'p PageManager,
     ) -> Result<Self, PageError> {
         let new_state = PageState::dirty(snapshot_id).expect("invalid value for `snapshot_id`");
 
         RawPageStateMut::from_ptr(ptr.cast()).store(new_state);
-        let mut p = Self { inner: UnsafePage { id, ptr }, phantom: PhantomData };
+        let mut p = Self { inner: UnsafePage { id, ptr, page_manager }, phantom: PhantomData };
         p.raw_contents_mut().fill(0);
 
         Ok(p)
@@ -233,10 +255,15 @@
     ///
     /// This method is safe because the mutable reference ensures that there cannot be any other
     /// living reference to this page.
-    pub fn new(id: PageId, snapshot_id: SnapshotId, data: &'p mut [u8; Page::SIZE]) -> Self {
+    pub fn new(
+        id: PageId,
+        snapshot_id: SnapshotId,
+        data: &'p mut [u8; Page::SIZE],
+        page_manager: &'p PageManager,
+    ) -> Self {
         // SAFETY: `data` is behind a mutable reference, therefore we have exclusive access to the
         // data.
- unsafe { Self::acquire(id, snapshot_id, data) }.unwrap() + unsafe { Self::acquire(id, snapshot_id, data, page_manager) }.unwrap() } #[inline] @@ -327,8 +354,9 @@ mod tests { let snapshot = 123u64; let mut data = DataArray([0; Page::SIZE]); data.0[..8].copy_from_slice(&snapshot.to_le_bytes()); - - let page = unsafe { Page::from_ptr(id, &mut data.0).expect("loading page failed") }; + let page_manager = PageManager::open_temp_file().unwrap(); + let page = + unsafe { Page::from_ptr(id, &mut data.0, &page_manager).expect("loading page failed") }; assert_eq!(page.id(), 42); assert_eq!(page.snapshot_id(), snapshot); @@ -340,9 +368,10 @@ mod tests { let id = page_id!(42); let snapshot = 1337; let mut data = DataArray([0; Page::SIZE]); - + let page_manager = PageManager::open_temp_file().unwrap(); let page_mut = unsafe { - PageMut::from_ptr(id, snapshot, &mut data.0).expect("loading mutable page failed") + PageMut::from_ptr(id, snapshot, &mut data.0, &page_manager) + .expect("loading mutable page failed") }; assert_eq!(page_mut.id(), 42); diff --git a/src/page/slotted_page.rs b/src/page/slotted_page.rs index 7082004d..1ee14a35 100644 --- a/src/page/slotted_page.rs +++ b/src/page/slotted_page.rs @@ -16,7 +16,7 @@ pub const CELL_POINTER_SIZE: usize = 3; // where the pointers are stored in a contiguous array of 3-byte cell pointers from the // beginning of the page, and the values are added from the end of the page. #[repr(transparent)] -#[derive(Copy, Clone)] +#[derive(Clone)] pub struct SlottedPage<'p> { page: Page<'p>, } @@ -466,15 +466,16 @@ impl fmt::Debug for SlottedPageMut<'_> { #[cfg(test)] mod tests { use super::*; - use crate::page::page_id; + use crate::{page::page_id, PageManager}; #[repr(align(4096))] struct DataArray([u8; Page::SIZE]); #[test] fn test_insert_get_value() { + let page_manager = PageManager::open_temp_file().unwrap(); let mut data = DataArray([0; Page::SIZE]); - let page = PageMut::new(page_id!(42), 123, &mut data.0); + let page = PageMut::new(page_id!(42), 123, &mut data.0, &page_manager); let mut subtrie_page = SlottedPageMut::try_from(page).unwrap(); let v1 = String::from("hello"); @@ -501,8 +502,9 @@ mod tests { #[test] fn test_insert_set_value() { + let page_manager = PageManager::open_temp_file().unwrap(); let mut data = DataArray([0; Page::SIZE]); - let page = PageMut::new(page_id!(42), 123, &mut data.0); + let page = PageMut::new(page_id!(42), 123, &mut data.0, &page_manager); let mut subtrie_page = SlottedPageMut::try_from(page).unwrap(); let v1 = String::from("hello"); @@ -521,8 +523,9 @@ mod tests { #[test] fn test_set_value_same_length() { + let page_manager = PageManager::open_temp_file().unwrap(); let mut data = DataArray([0; Page::SIZE]); - let page = PageMut::new(page_id!(42), 123, &mut data.0); + let page = PageMut::new(page_id!(42), 123, &mut data.0, &page_manager); let mut subtrie_page = SlottedPageMut::try_from(page).unwrap(); let v1 = String::from("hello"); @@ -551,8 +554,9 @@ mod tests { #[test] fn test_set_value_shrink() { + let page_manager = PageManager::open_temp_file().unwrap(); let mut data = DataArray([0; Page::SIZE]); - let page = PageMut::new(page_id!(42), 123, &mut data.0); + let page = PageMut::new(page_id!(42), 123, &mut data.0, &page_manager); let mut subtrie_page = SlottedPageMut::try_from(page).unwrap(); let v1 = String::from("hello"); @@ -583,8 +587,9 @@ mod tests { #[test] fn test_set_value_shrink_with_neighbors() { + let page_manager = PageManager::open_temp_file().unwrap(); let mut data = DataArray([0; Page::SIZE]); - let page = 
PageMut::new(page_id!(42), 123, &mut data.0); + let page = PageMut::new(page_id!(42), 123, &mut data.0, &page_manager); let mut subtrie_page = SlottedPageMut::try_from(page).unwrap(); let v1 = String::from("one"); @@ -643,8 +648,9 @@ mod tests { #[test] fn test_set_value_grow() { + let page_manager = PageManager::open_temp_file().unwrap(); let mut data = DataArray([0; Page::SIZE]); - let page = PageMut::new(page_id!(42), 123, &mut data.0); + let page = PageMut::new(page_id!(42), 123, &mut data.0, &page_manager); let mut subtrie_page = SlottedPageMut::try_from(page).unwrap(); let v1 = String::from("this"); @@ -673,8 +679,9 @@ mod tests { #[test] fn test_set_value_grow_with_neighbors() { + let page_manager = PageManager::open_temp_file().unwrap(); let mut data = DataArray([0; Page::SIZE]); - let page = PageMut::new(page_id!(42), 123, &mut data.0); + let page = PageMut::new(page_id!(42), 123, &mut data.0, &page_manager); let mut subtrie_page = SlottedPageMut::try_from(page).unwrap(); let v1 = String::from("one"); @@ -733,8 +740,9 @@ mod tests { #[test] fn test_allocate_get_delete_cell_pointer() { + let page_manager = PageManager::open_temp_file().unwrap(); let mut data = DataArray([0; Page::SIZE]); - let page = PageMut::new(page_id!(42), 123, &mut data.0); + let page = PageMut::new(page_id!(42), 123, &mut data.0, &page_manager); let mut subtrie_page = SlottedPageMut::try_from(page).unwrap(); let cell_index = subtrie_page.insert_value(&String::from("foo")).unwrap(); assert_eq!(cell_index, 0); @@ -826,8 +834,9 @@ mod tests { #[test] fn test_allocate_reuse_deleted_space() { + let page_manager = PageManager::open_temp_file().unwrap(); let mut data = DataArray([0; Page::SIZE]); - let page = PageMut::new(page_id!(42), 123, &mut data.0); + let page = PageMut::new(page_id!(42), 123, &mut data.0, &page_manager); let mut subtrie_page = SlottedPageMut::try_from(page).unwrap(); let i0 = subtrie_page.insert_value(&String::from_iter(&['a'; 1020])).unwrap(); @@ -855,8 +864,9 @@ mod tests { #[test] fn test_allocate_reuse_deleted_spaces() { + let page_manager = PageManager::open_temp_file().unwrap(); let mut data = DataArray([0; Page::SIZE]); - let page = PageMut::new(page_id!(42), 123, &mut data.0); + let page = PageMut::new(page_id!(42), 123, &mut data.0, &page_manager); let mut subtrie_page = SlottedPageMut::try_from(page).unwrap(); // bytes 0-12 are used by the header, and the next 4072 are used by the first 4 cells @@ -913,8 +923,9 @@ mod tests { #[test] fn test_defragment_page() { + let page_manager = PageManager::open_temp_file().unwrap(); let mut data = DataArray([0; Page::SIZE]); - let page = PageMut::new(page_id!(42), 123, &mut data.0); + let page = PageMut::new(page_id!(42), 123, &mut data.0, &page_manager); let mut subtrie_page = SlottedPageMut::try_from(page).unwrap(); let i0 = subtrie_page.insert_value(&String::from_iter(&['a'; 814])).unwrap(); @@ -959,8 +970,9 @@ mod tests { #[test] fn test_defragment_page_cells_out_of_order() { + let page_manager = PageManager::open_temp_file().unwrap(); let mut data = DataArray([0; Page::SIZE]); - let page = PageMut::new(page_id!(42), 123, &mut data.0); + let page = PageMut::new(page_id!(42), 123, &mut data.0, &page_manager); let mut slotted_page = SlottedPageMut::try_from(page).unwrap(); slotted_page.set_num_cells(16); diff --git a/src/storage/debug.rs b/src/storage/debug.rs index fd4335f2..e755a72a 100644 --- a/src/storage/debug.rs +++ b/src/storage/debug.rs @@ -134,10 +134,11 @@ impl<'a> StorageDebugger<'a> { new_indent.push('\t'); if let 
Some(direct_child) = storage_root {
+            let slotted_page_id = slotted_page.id();
             let (new_slotted_page, cell_index) =
                 self.get_slotted_page_and_index(context, direct_child, slotted_page)?;
             // child is on different page, and we are only printing the current page
-            if new_slotted_page.id() != slotted_page.id() && !print_whole_db {
+            if new_slotted_page.id() != slotted_page_id && !print_whole_db {
                 let child_page_id = direct_child.location().page_id().unwrap();
                 writeln!(buf, "{new_indent}Child on new page: {child_page_id:?}")?;
                 Ok(())
@@ -165,7 +166,7 @@
 
             //check if child is on same page
             let (new_slotted_page, cell_index) =
-                self.get_slotted_page_and_index(context, child, slotted_page)?;
+                self.get_slotted_page_and_index(context, child, slotted_page.clone())?;
             // child is on new page, and we are only printing the current page
             if new_slotted_page.id() != slotted_page.id() && !print_whole_db {
                 let child_page_id = child.location().page_id().unwrap();
@@ -277,11 +278,12 @@
 
         match child_pointer {
             Some(child_pointer) => {
+                let slotted_page_id = slotted_page.id();
                 let (child_slotted_page, child_cell_index) =
                     self.get_slotted_page_and_index(context, child_pointer, slotted_page)?;
 
                 // If we're moving to a new page and extra_verbose is true, print the new page
-                if child_slotted_page.id() != slotted_page.id() {
+                if child_slotted_page.id() != slotted_page_id {
                     if verbosity_level == 2 {
                         //extra verbose; print new page contents
                         writeln!(buf, "\n\n\nNEW PAGE: {}\n", child_slotted_page.id())?;
@@ -421,10 +423,11 @@
             AccountLeaf { ref storage_root, .. } => {
                 //Note: direct child is not counted as part of stats.num_children
                 if let Some(direct_child) = storage_root {
+                    let slotted_page_id = slotted_page.id();
                     let (new_slotted_page, cell_index) =
                         self.get_slotted_page_and_index(context, direct_child, slotted_page)?;
                     //if we move to a new page, update relevant stats
-                    if new_slotted_page.id() != slotted_page.id() {
+                    if new_slotted_page.id() != slotted_page_id {
                         let occupied_bytes = new_slotted_page.num_occupied_bytes();
                         let occupied_cells = new_slotted_page.num_occupied_cells();
 
@@ -465,7 +468,7 @@
         for child in child_iter {
             //check if child is on same page
             let (new_slotted_page, cell_index) =
-                self.get_slotted_page_and_index(context, child, slotted_page)?;
+                self.get_slotted_page_and_index(context, child, slotted_page.clone())?;
             //update page depth if we move to a new page
             if new_slotted_page.id() != slotted_page.id() {
                 let occupied_bytes = new_slotted_page.num_occupied_bytes();
@@ -702,7 +705,7 @@
             Branch { ref children } => {
                 for child in children.iter().flatten() {
                     let (new_slotted_page, new_cell_index) =
-                        self.get_slotted_page_and_index(context, child, slotted_page)?;
+                        self.get_slotted_page_and_index(context, child, slotted_page.clone())?;
                     if new_slotted_page.id() != page_id {
                         reachable.insert(new_slotted_page.id());
                         self.consistency_check_helper(
@@ -772,8 +775,8 @@
     }
 
     /// Helper function to get a page from the page manager.
-    fn get_page(&self, context: &TransactionContext, page_id: PageId) -> Result<Page<'_>, Error> {
-        self.page_manager.get(context.snapshot_id, page_id).map_err(Error::PageError)
+    fn get_page(&self, _context: &TransactionContext, page_id: PageId) -> Result<Page<'_>, Error> {
+        self.page_manager.get(page_id).map_err(Error::PageError)
     }
 
     /// Prints information about the root page and database metadata.
diff --git a/src/storage/engine.rs b/src/storage/engine.rs
index 1e814ff9..aa5ebb21 100644
--- a/src/storage/engine.rs
+++ b/src/storage/engine.rs
@@ -108,7 +108,7 @@ impl StorageEngine {
         context: &mut TransactionContext,
         page_id: PageId,
     ) -> Result<PageMut<'_>, Error> {
-        let original_page = self.page_manager.get(context.snapshot_id, page_id)?;
+        let original_page = self.page_manager.get(page_id)?;
         context.transaction_metrics.inc_pages_read();
 
         // if the page already has the correct snapshot id, return it without cloning.
@@ -1496,7 +1496,7 @@ impl StorageEngine {
         context: &TransactionContext,
         page_id: PageId,
     ) -> Result<Page<'_>, Error> {
-        let page = self.page_manager.get(context.snapshot_id, page_id)?;
+        let page = self.page_manager.get(page_id)?;
         context.transaction_metrics.inc_pages_read();
         Ok(page)
     }
diff --git a/tests/ethereum_execution_spec.rs b/tests/ethereum_execution_spec.rs
index e7aadbc0..1413dfd7 100644
--- a/tests/ethereum_execution_spec.rs
+++ b/tests/ethereum_execution_spec.rs
@@ -17,6 +17,7 @@ use triedb::{
 };
 use walkdir::WalkDir;
 
+#[cfg(test)]
 #[test]
 fn run_ethereum_execution_spec_state_tests() {
     for test_spec_entry in
@@ -34,7 +35,8 @@ fn run_ethereum_execution_spec_state_tests() {
             .as_str()
             .replace("/", "_")[0..min(test_case_name.len(), 100)];
         let file_path = tmp_dir.path().join(database_file_name).to_str().unwrap().to_owned();
-        let test_database = Database::create_new(file_path).unwrap();
+        let test_database =
+            Database::options().create_new(true).num_frames(1024).open(file_path).unwrap();
 
         // will track accounts and storage that need to be deleted. this is essentially the
         // "diff" between the pre state and post state.

From c820e6582b3bf2d388855eccb203e6de5e9597ee Mon Sep 17 00:00:00 2001
From: nqd
Date: Wed, 1 Oct 2025 21:52:16 +0800
Subject: [PATCH 02/65] add example

---
 examples/insert/main.rs | 91 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 91 insertions(+)
 create mode 100644 examples/insert/main.rs

diff --git a/examples/insert/main.rs b/examples/insert/main.rs
new file mode 100644
index 00000000..3c59f86c
--- /dev/null
+++ b/examples/insert/main.rs
@@ -0,0 +1,91 @@
+use std::env;
+
+use alloy_primitives::{Address, StorageKey, StorageValue, U256};
+use alloy_trie::{EMPTY_ROOT_HASH, KECCAK_EMPTY};
+use rand::prelude::*;
+use triedb::{
+    account::Account,
+    path::{AddressPath, StoragePath},
+    transaction::TransactionError,
+    Database,
+};
+
+pub const DEFAULT_SETUP_DB_EOA_SIZE: usize = 1_000_000;
+pub const DEFAULT_SETUP_DB_CONTRACT_SIZE: usize = 100_000;
+pub const DEFAULT_SETUP_DB_STORAGE_PER_CONTRACT: usize = 10;
+const SEED_EOA: u64 = 42; // EOA seeding value
+const SEED_CONTRACT: u64 = 43; // contract account seeding value
+
+pub fn generate_random_address(rng: &mut StdRng) -> AddressPath {
+    let addr = Address::random_with(rng);
+    AddressPath::for_address(addr)
+}
+
+pub fn setup_database(
+    db: &Database,
+    repeat: usize,
+    eoa_size: usize,
+    contract_size: usize,
+    storage_per_contract: usize,
+) -> Result<(), TransactionError> {
+    // Populate database with initial accounts
+    let mut eoa_rng = StdRng::seed_from_u64(SEED_EOA);
+    let mut contract_rng = StdRng::seed_from_u64(SEED_CONTRACT);
+    for _i in 0..repeat {
+        let mut tx = db.begin_rw()?;
+        for i in 1..=eoa_size {
+            let address = generate_random_address(&mut eoa_rng);
+            let account =
+                Account::new(i as u64, U256::from(i as u64), EMPTY_ROOT_HASH, KECCAK_EMPTY);
+
+            tx.set_account(address, Some(account))?;
+        }
+
+        for i in 1..=contract_size {
+            let address = generate_random_address(&mut contract_rng);
+            let account =
+                Account::new(i as u64, U256::from(i as u64), EMPTY_ROOT_HASH, KECCAK_EMPTY);
+
+            tx.set_account(address.clone(), Some(account))?;
+
+            // add random storage to each account
+            for key in 1..=storage_per_contract {
+                let storage_key = StorageKey::from(U256::from(key));
+                let storage_path =
+                    StoragePath::for_address_path_and_slot(address.clone(), storage_key);
+                let storage_value =
+                    StorageValue::from_be_slice(storage_path.get_slot().pack().as_slice());
+
+                tx.set_storage_slot(storage_path, Some(storage_value))?;
+            }
+        }
+
+        tx.commit()?;
+    }
+    println!("root hash: {:?}", db.state_root());
+
+    Ok(())
+}
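+
+// Both account streams are seeded (`SEED_EOA`, `SEED_CONTRACT`), so every run
+// inserts the same keys in the same order; this is what lets the benchmark
+// script added later in this series compare database file hashes across
+// backends. The property relied on, as a sketch (rand 0.9 API; `gen()` on 0.8):
+//
+//     let mut a = StdRng::seed_from_u64(42);
+//     let mut b = StdRng::seed_from_u64(42);
+//     let xs: Vec<u64> = (0..4).map(|_| a.random()).collect();
+//     let ys: Vec<u64> = (0..4).map(|_| b.random()).collect();
+//     assert_eq!(xs, ys); // identical seed, identical stream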
+
+fn main() {
+    let args: Vec<String> = env::args().collect();
+
+    let db_path = args.get(1).unwrap();
+    let repeat = args.get(2).and_then(|s| s.parse::<usize>().ok()).unwrap_or(1);
+    let eoa_size =
+        args.get(3).and_then(|s| s.parse::<usize>().ok()).unwrap_or(DEFAULT_SETUP_DB_EOA_SIZE);
+    let contract_size =
+        args.get(4).and_then(|s| s.parse::<usize>().ok()).unwrap_or(DEFAULT_SETUP_DB_CONTRACT_SIZE);
+    let storage_per_contract = args
+        .get(5)
+        .and_then(|s| s.parse::<usize>().ok())
+        .unwrap_or(DEFAULT_SETUP_DB_STORAGE_PER_CONTRACT);
+
+    let db = Database::create_new(db_path).unwrap();
+
+    println!("eoa size: {eoa_size}");
+    println!("repeat {repeat} times");
+    println!("contract size: {contract_size}, storage per contract: {storage_per_contract}");
+
+    setup_database(&db, repeat, eoa_size, contract_size, storage_per_contract).unwrap();
+}

From edfcae997eafd28a7c07f920e48e5f4e075fa1a4 Mon Sep 17 00:00:00 2001
From: nqd
Date: Fri, 3 Oct 2025 10:33:30 +0800
Subject: [PATCH 03/65] Fix duplication and write the correct amount of data

---
 src/page/manager/buffer_pool.rs | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs
index 1e1738f2..70a55193 100644
--- a/src/page/manager/buffer_pool.rs
+++ b/src/page/manager/buffer_pool.rs
@@ -231,7 +231,9 @@ impl PageManager {
         let file = &mut self.file.write();
         // Get all values from write_frames
         let mut dirty_pages = self.lru_replacer.write_frames.lock();
+        // remove duplicate pages
         dirty_pages.sort_by_key(|(_, page_id)| page_id.as_offset());
+        dirty_pages.dedup_by_key(|(_, page_id)| *page_id);
 
         // Group contiguous pages together
         let mut current_offset = None;
         let mut batch: Vec<IoSlice> = Vec::new();
@@ -271,9 +273,10 @@
 
     #[inline]
     fn write(&self, batch: &mut Vec<IoSlice>, file: &mut File, offset: u64) -> io::Result<()> {
+        let total_len = batch.iter().map(|b| b.len()).sum::<usize>();
         file.seek(SeekFrom::Start(offset))?;
         let mut total_written: usize = 0;
-        while total_written < batch.iter().map(|b| b.len()).sum() {
+        while total_written < total_len {
             let written = file.write_vectored(batch)?;
             if written == 0 {
                 return Err(io::Error::new(
                     io::ErrorKind::WriteZero,
                     "failed to write whole buffer",
                 ));

From f08293b7a99b9f1712175ae7227c0313f29bff2a Mon Sep 17 00:00:00 2001
From: nqd
Date: Fri, 3 Oct 2025 18:04:14 +0800
Subject: [PATCH 04/65] Pin the frame when getting it from the page_table

---
 src/page/manager/buffer_pool.rs | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs
index 70a55193..7f0e4c27 100644
--- a/src/page/manager/buffer_pool.rs
+++ b/src/page/manager/buffer_pool.rs
@@ -159,6 +159,9 @@
         loop {
             // Check if page is already in the cache
             if let Some(frame_id) = self.page_table.get(&page_id) {
+                self.lru_replacer
+                    .pin_write(frame_id, page_id)
+                    .map_err(|_| PageError::EvictionPolicy)?;
                 let frame = &self.frames[frame_id.0 as usize];
                 return unsafe { PageMut::from_ptr(page_id, snapshot_id, frame.ptr, self) }
             }

From 785b6316bfaacce5786cebe2beab6dfd884f4009 Mon Sep 17 00:00:00 2001
From: nqd
Date: Fri, 3 Oct 2025 18:10:13 +0800
Subject: [PATCH 05/65] tmp update insert

---
 examples/insert/main.rs | 36 +++++++++++++++++-------------------
 examples/read/main.rs   | 25 +++++++++++++++++++++++++
 2 files changed, 42 insertions(+), 19 deletions(-)
 create mode 100644 examples/read/main.rs

diff --git a/examples/insert/main.rs b/examples/insert/main.rs
index 3c59f86c..fc54db04 100644
--- a/examples/insert/main.rs
+++ b/examples/insert/main.rs
@@ -30,7 +30,7 @@ pub fn setup_database(
 ) -> Result<(), TransactionError> {
     // Populate database with initial accounts
     let mut eoa_rng = StdRng::seed_from_u64(SEED_EOA);
-    let mut contract_rng = StdRng::seed_from_u64(SEED_CONTRACT);
+    // let mut contract_rng = StdRng::seed_from_u64(SEED_CONTRACT);
     for _i in 0..repeat {
         let mut tx = db.begin_rw()?;
         for i in 1..=eoa_size {
@@ -41,24 +41,24 @@
             tx.set_account(address, Some(account))?;
         }
 
-        for i in 1..=contract_size {
-            let address = generate_random_address(&mut contract_rng);
-            let account =
-                Account::new(i as u64, U256::from(i as u64), EMPTY_ROOT_HASH, KECCAK_EMPTY);
+        // for i in 1..=contract_size {
+        //     let address = generate_random_address(&mut contract_rng);
+        //     let account =
+        //         Account::new(i as u64, U256::from(i as u64), EMPTY_ROOT_HASH, KECCAK_EMPTY);
 
-            tx.set_account(address.clone(), Some(account))?;
+        //     tx.set_account(address.clone(), Some(account))?;
 
-            // add random storage to each account
-            for key in 1..=storage_per_contract {
-                let storage_key = StorageKey::from(U256::from(key));
-                let storage_path =
-                    StoragePath::for_address_path_and_slot(address.clone(), storage_key);
-                let storage_value =
-                    StorageValue::from_be_slice(storage_path.get_slot().pack().as_slice());
+        //     // add random storage to each account
+        //     for key in 1..=storage_per_contract {
+        //         let storage_key = StorageKey::from(U256::from(key));
+        //         let storage_path =
+        //             StoragePath::for_address_path_and_slot(address.clone(), storage_key);
+        //         let storage_value =
+        //             StorageValue::from_be_slice(storage_path.get_slot().pack().as_slice());
 
-                tx.set_storage_slot(storage_path, Some(storage_value))?;
-            }
-        }
+        //         tx.set_storage_slot(storage_path, Some(storage_value))?;
+        //     }
+        // }
 
         tx.commit()?;
     }
@@ -83,9 +83,7 @@
 
     let db = Database::create_new(db_path).unwrap();
 
-    println!("eoa size: {eoa_size}");
-    println!("repeat {repeat} times");
-    println!("contract size: {contract_size}, storage per contract: {storage_per_contract}");
+    println!("repeat {repeat} times, eoa size: {eoa_size}, contract size: {contract_size}, storage per contract: {storage_per_contract}");
 
     setup_database(&db, repeat, eoa_size, contract_size, storage_per_contract).unwrap();
 }
diff --git a/examples/read/main.rs b/examples/read/main.rs
new file mode 100644
index 00000000..bc732c39
--- /dev/null
+++ b/examples/read/main.rs
@@ -0,0 +1,25 @@
+use alloy_primitives::Address;
+use rand::prelude::*;
+use triedb::{path::AddressPath, Database};
+
+pub const SEED_EOA: u64 = 42; // EOA seeding value
+pub const BATCH_SIZE: usize = 10_000;
+
+pub fn generate_random_address(rng: &mut StdRng) -> AddressPath {
+    let addr = Address::random_with(rng);
+    AddressPath::for_address(addr)
+}
+
+fn main() {
+    let path = std::env::args().nth(1).unwrap();
+    let db = Database::open(path).unwrap();
+    let mut rng = StdRng::seed_from_u64(SEED_EOA);
+    let addresses: Vec<AddressPath> =
+        (0..BATCH_SIZE).map(|_| generate_random_address(&mut rng)).collect();
+    let mut tx = 
db.begin_ro().unwrap(); + addresses.iter().enumerate().for_each(|(i, addr)| { + let a = tx.get_account(addr).unwrap(); + assert!(a.is_some(), "{:?}: account not found for address: {:?}", i, addr); + }); + tx.commit().unwrap(); +} From e1b9d4af3e4452bfd737fbf158752f21d1790233 Mon Sep 17 00:00:00 2001 From: nqd Date: Fri, 3 Oct 2025 18:18:35 +0800 Subject: [PATCH 06/65] Revert "tmp update insert" This reverts commit 785b6316bfaacce5786cebe2beab6dfd884f4009. --- examples/insert/main.rs | 36 +++++++++++++++++++----------------- examples/read/main.rs | 25 ------------------------- 2 files changed, 19 insertions(+), 42 deletions(-) delete mode 100644 examples/read/main.rs diff --git a/examples/insert/main.rs b/examples/insert/main.rs index fc54db04..3c59f86c 100644 --- a/examples/insert/main.rs +++ b/examples/insert/main.rs @@ -30,7 +30,7 @@ pub fn setup_database( ) -> Result<(), TransactionError> { // Populate database with initial accounts let mut eoa_rng = StdRng::seed_from_u64(SEED_EOA); - // let mut contract_rng = StdRng::seed_from_u64(SEED_CONTRACT); + let mut contract_rng = StdRng::seed_from_u64(SEED_CONTRACT); for _i in 0..repeat { let mut tx = db.begin_rw()?; for i in 1..=eoa_size { @@ -41,24 +41,24 @@ pub fn setup_database( tx.set_account(address, Some(account))?; } - // for i in 1..=contract_size { - // let address = generate_random_address(&mut contract_rng); - // let account = - // Account::new(i as u64, U256::from(i as u64), EMPTY_ROOT_HASH, KECCAK_EMPTY); + for i in 1..=contract_size { + let address = generate_random_address(&mut contract_rng); + let account = + Account::new(i as u64, U256::from(i as u64), EMPTY_ROOT_HASH, KECCAK_EMPTY); - // tx.set_account(address.clone(), Some(account))?; + tx.set_account(address.clone(), Some(account))?; - // // add random storage to each account - // for key in 1..=storage_per_contract { - // let storage_key = StorageKey::from(U256::from(key)); - // let storage_path = - // StoragePath::for_address_path_and_slot(address.clone(), storage_key); - // let storage_value = - // StorageValue::from_be_slice(storage_path.get_slot().pack().as_slice()); + // add random storage to each account + for key in 1..=storage_per_contract { + let storage_key = StorageKey::from(U256::from(key)); + let storage_path = + StoragePath::for_address_path_and_slot(address.clone(), storage_key); + let storage_value = + StorageValue::from_be_slice(storage_path.get_slot().pack().as_slice()); - // tx.set_storage_slot(storage_path, Some(storage_value))?; - // } - // } + tx.set_storage_slot(storage_path, Some(storage_value))?; + } + } tx.commit()?; } @@ -83,7 +83,9 @@ fn main() { let db = Database::create_new(db_path).unwrap(); - println!("repeat {repeat} times, eoa size: {eoa_size}, contract size: {contract_size}, storage per contract: {storage_per_contract}"); + println!("eoa size: {eoa_size}"); + println!("repeat {repeat} times"); + println!("contract size: {contract_size}, storage per contract: {storage_per_contract}"); setup_database(&db, repeat, eoa_size, contract_size, storage_per_contract).unwrap(); } diff --git a/examples/read/main.rs b/examples/read/main.rs deleted file mode 100644 index bc732c39..00000000 --- a/examples/read/main.rs +++ /dev/null @@ -1,25 +0,0 @@ -use alloy_primitives::Address; -use rand::prelude::*; -use triedb::{path::AddressPath, Database}; - -pub const SEED_EOA: u64 = 42; // EOA seeding value -pub const BATCH_SIZE: usize = 10_000; - -pub fn generate_random_address(rng: &mut StdRng) -> AddressPath { - let addr = Address::random_with(rng); - 
AddressPath::for_address(addr)
-}
-
-fn main() {
-    let path = std::env::args().nth(1).unwrap();
-    let db = Database::open(path).unwrap();
-    let mut rng = StdRng::seed_from_u64(SEED_EOA);
-    let addresses: Vec<AddressPath> =
-        (0..BATCH_SIZE).map(|_| generate_random_address(&mut rng)).collect();
-    let mut tx = db.begin_ro().unwrap();
-    addresses.iter().enumerate().for_each(|(i, addr)| {
-        let a = tx.get_account(addr).unwrap();
-        assert!(a.is_some(), "{:?}: account not found for address: {:?}", i, addr);
-    });
-    tx.commit().unwrap();
-}

From 9caa462a3d7a898e1b81ba4f2963c5759f34c169 Mon Sep 17 00:00:00 2001
From: nqd
Date: Fri, 3 Oct 2025 18:19:22 +0800
Subject: [PATCH 07/65] fix

---
 src/page/manager/buffer_pool.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs
index 7f0e4c27..ba7581c4 100644
--- a/src/page/manager/buffer_pool.rs
+++ b/src/page/manager/buffer_pool.rs
@@ -160,7 +160,7 @@
             // Check if page is already in the cache
             if let Some(frame_id) = self.page_table.get(&page_id) {
                 self.lru_replacer
-                    .pin_write(frame_id, page_id)
+                    .pin_write(*frame_id, page_id)
                     .map_err(|_| PageError::EvictionPolicy)?;
                 let frame = &self.frames[frame_id.0 as usize];
                 return unsafe { PageMut::from_ptr(page_id, snapshot_id, frame.ptr, self) }

From 07a552c5a5abbb26cf8480e10d64b52a1a707957 Mon Sep 17 00:00:00 2001
From: nqd
Date: Sat, 4 Oct 2025 22:17:54 +0800
Subject: [PATCH 08/65] add run.sh for testing

---
 run.sh | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100755 run.sh

diff --git a/run.sh b/run.sh
new file mode 100755
index 00000000..e8c75284
--- /dev/null
+++ b/run.sh
@@ -0,0 +1,29 @@
+cargo build --release --example insert --no-default-features --features buffer_pool_backend
+mv ./target/release/examples/insert ./target/release/examples/insert-buffer-pool
+cargo build --release --example insert --no-default-features --features mmap_backend
+mv ./target/release/examples/insert ./target/release/examples/insert-mmap
+
+mkdir -p target/db/mmap
+mkdir -p target/db/buffer-pool
+
+for i in 1 2 1000 10000 100000; do
+    echo "=============================================================================="
+    echo "== Running for mmap/db_${i}"
+    time ./target/release/examples/insert-mmap ./target/db/mmap/db_${i} ${i} 1000 100 10
+    sleep 10
+    echo "== Running for buffer-pool/db_${i}"
+    time ./target/release/examples/insert-buffer-pool ./target/db/buffer-pool/db_${i} ${i} 1000 100 10
+    sleep 10
+
+    # Get and compare hashes
+    mmap_hash=$(shasum -a 256 ./target/db/mmap/db_${i} | cut -d' ' -f1)
+    buffer_hash=$(shasum -a 256 ./target/db/buffer-pool/db_${i} | cut -d' ' -f1)
+    echo "Comparing hashes for db_${i}:"
+    echo "MMAP: ${mmap_hash}"
+    echo "Buffer Pool: ${buffer_hash}"
+    if [ "$mmap_hash" = "$buffer_hash" ]; then
+        echo "✅ Hashes match"
+    else
+        echo "❌ Hashes differ"
+    fi
+done

From c4bab50d510f730c75e82818fb7773e69e999d61 Mon Sep 17 00:00:00 2001
From: nqd
Date: Mon, 6 Oct 2025 21:24:21 +0800
Subject: [PATCH 09/65] add time

---
 run.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/run.sh b/run.sh
index e8c75284..ca9c4cde 100755
--- a/run.sh
+++ b/run.sh
@@ -9,9 +9,11 @@ mkdir -p target/db/buffer-pool
 for i in 1 2 1000 10000 100000; do
     echo "=============================================================================="
     echo "== Running for mmap/db_${i}"
+    echo "Time: $(date)"
     time ./target/release/examples/insert-mmap ./target/db/mmap/db_${i} ${i} 1000 100 10
     sleep 10
     echo "== Running for buffer-pool/db_${i}"
buffer-pool/db_${i}" + echo "Time: $(date)" time ./target/release/examples/insert-buffer-pool ./target/db/buffer-pool/db_${i} ${i} 1000 100 10 sleep 10 From 3e26672ac3976e6b9479d05d57e7b483d935de27 Mon Sep 17 00:00:00 2001 From: nqd Date: Fri, 10 Oct 2025 09:51:53 +0800 Subject: [PATCH 10/65] increase buffer size --- examples/insert/main.rs | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/examples/insert/main.rs b/examples/insert/main.rs index 3c59f86c..828c73b5 100644 --- a/examples/insert/main.rs +++ b/examples/insert/main.rs @@ -5,6 +5,7 @@ use alloy_trie::{EMPTY_ROOT_HASH, KECCAK_EMPTY}; use rand::prelude::*; use triedb::{ account::Account, + database::DatabaseOptions, path::{AddressPath, StoragePath}, transaction::TransactionError, Database, @@ -81,11 +82,13 @@ fn main() { .and_then(|s| s.parse::().ok()) .unwrap_or(DEFAULT_SETUP_DB_STORAGE_PER_CONTRACT); - let db = Database::create_new(db_path).unwrap(); + let db = DatabaseOptions::default() + .create_new(true) + .num_frames(1024 * 1024 * 6) + .open(db_path) + .unwrap(); - println!("eoa size: {eoa_size}"); - println!("repeat {repeat} times"); - println!("contract size: {contract_size}, storage per contract: {storage_per_contract}"); + println!("eoa size: {eoa_size}, contract size: {contract_size}, storage per contract: {storage_per_contract}, repeat: {repeat}"); setup_database(&db, repeat, eoa_size, contract_size, storage_per_contract).unwrap(); } From d7f16a6f601f31d897b8c3ed8875e8c3249b0f93 Mon Sep 17 00:00:00 2001 From: nqd Date: Wed, 15 Oct 2025 18:40:05 +0800 Subject: [PATCH 11/65] Write pages in parallel via writer pool --- Cargo.toml | 3 +- src/page/manager/buffer_pool.rs | 268 ++++++++++++++++++++++---------- src/page/manager/cache_evict.rs | 40 ++++- src/page/manager/options.rs | 21 ++- 4 files changed, 242 insertions(+), 90 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index a53c581d..a1ca16d5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,9 +22,10 @@ rayon = "1.10.0" evict = "0.3.1" dashmap = "6.1.0" libc = "0.2.174" +crossbeam-channel = "0.5.15" [features] -default = ["mmap_backend"] +default = ["buffer_pool_backend"] buffer_pool_backend = [] mmap_backend = [] diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs index ba7581c4..91f9cee3 100644 --- a/src/page/manager/buffer_pool.rs +++ b/src/page/manager/buffer_pool.rs @@ -1,10 +1,15 @@ +use crossbeam_channel::{Receiver, Sender}; use std::{ ffi::CString, fs::File, io::{self, IoSlice, Seek, SeekFrom, Write}, os::{fd::FromRawFd, unix::fs::FileExt}, path::Path, - sync::atomic::{AtomicU32, AtomicU64, Ordering}, + sync::{ + atomic::{AtomicU32, AtomicU64, Ordering}, + Arc, + }, + thread::{self, JoinHandle}, }; use dashmap::{DashMap, DashSet}; @@ -33,19 +38,31 @@ unsafe impl Sync for Frame {} #[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)] pub(crate) struct FrameId(u32); +enum WriteMessage { + Page((FrameId, PageId)), + Batch(Vec<(FrameId, PageId)>), + Shutdown, +} + #[derive(Debug)] pub struct PageManager { num_frames: u32, page_count: AtomicU32, file: RwLock, file_len: AtomicU64, - frames: Vec, /* list of frames that hold pages' data, indexed by frame id with fix - * num_frames size */ + frames: Arc>, /* list of frames that hold pages' data, indexed by frame id with + * fix num_frames size */ page_table: DashMap, /* mapping between page id and buffer pool frames, * indexed by page id with fix num_frames size */ original_free_frame_idx: AtomicU32, lru_replacer: CacheEvict, /* the replacer to find unpinned/candidate 
pages for eviction */ loading_page: DashSet, /* set of pages that are being loaded from disk */ + + // write worker + num_workers: usize, + worker_handles: Vec>, + tx_job: Sender, + rx_result: Receiver>, } impl PageManager { @@ -95,17 +112,122 @@ impl PageManager { } let lru_replacer = CacheEvict::new(num_frames as usize); - Ok(PageManager { + let (tx_job, rx_job) = crossbeam_channel::unbounded(); + let (tx_result, rx_result) = crossbeam_channel::unbounded(); + + let mut page_manager = PageManager { num_frames, page_count, file: RwLock::new(file), file_len, - frames, + frames: Arc::new(frames), page_table, original_free_frame_idx: AtomicU32::new(0), lru_replacer, loading_page: DashSet::with_capacity(num_frames as usize), - }) + + num_workers: opts.num_workers, + worker_handles: Vec::with_capacity(opts.num_workers), + tx_job, + rx_result, + }; + page_manager.start_write_workers(rx_job, tx_result)?; + + Ok(page_manager) + } + + fn start_write_workers( + &mut self, + rx_job: Receiver, + tx_result: Sender>, + ) -> Result<(), PageError> { + let rx_job_arc = Arc::new(rx_job); + for _ in 0..self.num_workers { + let rx_job_arc = Arc::clone(&rx_job_arc); + let mut worker_file = self.file.write().try_clone().map_err(PageError::IO)?; + let frames = self.frames.clone(); + let tx_result = tx_result.clone(); + let handle = thread::spawn(move || loop { + match rx_job_arc.recv() { + Ok(WriteMessage::Page((frame_id, page_id))) => { + // write to file at specific offset + let offset = page_id.as_offset() as u64; + let frame = &frames[frame_id.0 as usize]; + unsafe { + let page_data = + std::slice::from_raw_parts(frame.ptr as *const u8, Page::SIZE); + let result = worker_file.write_at(page_data, offset); + tx_result.send(result.map(|_| ())).unwrap(); + } + } + Ok(WriteMessage::Batch(batch)) => { + if batch.is_empty() { + continue; + } + let mut batch_slices = Vec::with_capacity(batch.len()); + for (frame_id, _) in &batch { + let frame = &frames[frame_id.0 as usize]; + unsafe { + let page_data = + std::slice::from_raw_parts(frame.ptr as *const u8, Page::SIZE); + batch_slices.push(IoSlice::new(page_data)); + } + } + let result = Self::write_batch( + &mut batch_slices, + &mut worker_file, + batch[0].1.as_offset() as u64, + ); + tx_result.send(result).unwrap(); + } + Ok(WriteMessage::Shutdown) => { + break; // Graceful shutdown + } + Err(_) => { + break; // Channel closed + } + } + }); + self.worker_handles.push(handle); + } + Ok(()) + } + + #[inline] + fn write_batch(batch: &mut Vec, file: &mut File, offset: u64) -> io::Result<()> { + let total_len = batch.iter().map(|b| b.len()).sum::(); + file.seek(SeekFrom::Start(offset))?; + let mut total_written: usize = 0; + while total_written < total_len { + let written = file.write_vectored(batch)?; + if written == 0 { + return Err(io::Error::new( + io::ErrorKind::WriteZero, + "failed to write whole buffer", + )); + } + total_written += written; + // Remove fully written slices from the batch + let mut bytes_left = written; + while !batch.is_empty() && bytes_left >= batch[0].len() { + bytes_left -= batch[0].len(); + batch.remove(0); + } + // Adjust the first slice if it was partially written + if !batch.is_empty() && bytes_left > 0 { + // SAFETY: IoSlice only needs a reference for the duration of the write call, + // and batch[0] is still valid here. 
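+                // A partially written first slice is rebuilt to start at the first
+                // unwritten byte; slices that were written in full have already been
+                // removed from the batch above, so only batch[0] can be partial here.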
+ let ptr = batch[0].as_ptr(); + let len = batch[0].len(); + if bytes_left < len { + let new_slice = unsafe { + std::slice::from_raw_parts(ptr.add(bytes_left), len - bytes_left) + }; + batch[0] = IoSlice::new(new_slice); + } + } + } + Ok(()) } #[cfg(test)] @@ -160,7 +282,7 @@ impl PageManager { // Check if page is already in the cache if let Some(frame_id) = self.page_table.get(&page_id) { self.lru_replacer - .pin_write(*frame_id, page_id) + .pin_write_update_page(*frame_id, page_id) .map_err(|_| PageError::EvictionPolicy)?; let frame = &self.frames[frame_id.0 as usize]; return unsafe { PageMut::from_ptr(page_id, snapshot_id, frame.ptr, self) } @@ -178,7 +300,7 @@ impl PageManager { } self.page_table.insert(page_id, frame_id); self.lru_replacer - .pin_write(frame_id, page_id) + .pin_write_update_page(frame_id, page_id) .map_err(|_| PageError::EvictionPolicy)?; self.loading_page.remove(&page_id); return unsafe { PageMut::from_ptr(page_id, snapshot_id, buf, self) } @@ -200,7 +322,9 @@ impl PageManager { self.grow_if_needed(new_count as u64 * Page::SIZE as u64)?; self.page_table.insert(page_id, frame_id); - self.lru_replacer.pin_write(frame_id, page_id).map_err(|_| PageError::EvictionPolicy)?; + self.lru_replacer + .pin_write_new_page(frame_id, page_id) + .map_err(|_| PageError::EvictionPolicy)?; let data = self.frames[frame_id.0 as usize].ptr; unsafe { PageMut::acquire_unchecked(page_id, snapshot_id, data, self) } @@ -232,82 +356,44 @@ impl PageManager { /// Could explore the parallel write strategy to improve performance. pub fn sync(&self) -> io::Result<()> { let file = &mut self.file.write(); - // Get all value at write_frames - let mut dirty_pages = self.lru_replacer.write_frames.lock(); - // remove duplicate pages - dirty_pages.sort_by_key(|(_, page_id)| page_id.as_offset()); - dirty_pages.dedup_by_key(|(_, page_id)| *page_id); - - // Group contiguous pages together - let mut current_offset = None; - let mut batch: Vec = Vec::new(); - - for (frame_id, page_id) in dirty_pages.iter() { - let offset = page_id.as_offset() as u64; - if let Some(prev_offset) = current_offset { - if offset != prev_offset + (batch.len() * Page::SIZE) as u64 { - // write the current batch - self.write(&mut batch, file, prev_offset)?; - batch.clear(); - } - } - if batch.is_empty() { - current_offset = Some(offset); - } - let frame = &self.frames[frame_id.0 as usize]; - unsafe { - let page_data = std::slice::from_raw_parts(frame.ptr as *const u8, Page::SIZE); - batch.push(IoSlice::new(page_data)); - } - } - // Write final batch - if !batch.is_empty() { - self.write(&mut batch, file, current_offset.unwrap())?; - } - file.flush()?; - for (_, page_id) in dirty_pages.iter() { - self.lru_replacer - .unpin(*page_id) - .map_err(|e| io::Error::other(format!("eviction policy error: {e:?}")))?; + + // Get all value at update_frames and new_frames + let mut update_pages = self.lru_replacer.update_frames.lock(); + update_pages.sort_by_key(|(_, page_id)| page_id.as_offset()); + update_pages.dedup_by_key(|(_, page_id)| *page_id); + // New pages should be sorted by page_id ascending and no duplicate + let mut new_pages = self.lru_replacer.new_frames.lock(); + + let mut write_num = 0; + + // Write new pages first, since this group could be large and take time to write + if !new_pages.is_empty() { + write_num += 1; + let write_message = WriteMessage::Batch(new_pages.clone()); + self.tx_job.send(write_message).unwrap(); } - dirty_pages.clear(); - Ok(()) - } - #[inline] - fn write(&self, batch: &mut Vec, file: &mut File, offset: 
u64) -> io::Result<()> { - let total_len = batch.iter().map(|b| b.len()).sum::(); - file.seek(SeekFrom::Start(offset))?; - let mut total_written: usize = 0; - while total_written < total_len { - let written = file.write_vectored(batch)?; - if written == 0 { - return Err(io::Error::new( - io::ErrorKind::WriteZero, - "failed to write whole buffer", - )); - } - total_written += written; - // Remove fully written slices from the batch - let mut bytes_left = written; - while !batch.is_empty() && bytes_left >= batch[0].len() { - bytes_left -= batch[0].len(); - batch.remove(0); - } - // Adjust the first slice if it was partially written - if !batch.is_empty() && bytes_left > 0 { - // SAFETY: IoSlice only needs a reference for the duration of the write call, - // and batch[0] is still valid here. - let ptr = batch[0].as_ptr(); - let len = batch[0].len(); - if bytes_left < len { - let new_slice = unsafe { - std::slice::from_raw_parts(ptr.add(bytes_left), len - bytes_left) - }; - batch[0] = IoSlice::new(new_slice); - } + // Follow by the individual update pages + if !update_pages.is_empty() { + write_num += update_pages.len(); + for (frame_id, page_id) in update_pages.iter() { + let page = WriteMessage::Page((*frame_id, *page_id)); + self.tx_job.send(page).unwrap(); } } + + // Waiting for all write workers to finish + for _ in 0..write_num { + _ = self.rx_result.recv().unwrap(); + } + + file.flush()?; + new_pages + .iter() + .chain(update_pages.iter()) + .for_each(|(_, page_id)| self.lru_replacer.unpin(*page_id).unwrap()); + new_pages.clear(); + update_pages.clear(); Ok(()) } @@ -395,7 +481,23 @@ impl PageManager { impl Drop for PageManager { fn drop(&mut self) { - self.sync().expect("sync failed"); + // Sync the remaining work + if let Err(e) = self.sync() { + eprintln!("Warning: Sync failed during drop: {}", e); + } + // Then send shutdown messages to workers + for _ in 0..self.num_workers { + if let Err(_) = self.tx_job.send(WriteMessage::Shutdown) { + // Ignore the error + break; + } + } + // Wait for all worker threads to finish + for handle in self.worker_handles.drain(..) 
{ + if let Err(_) = handle.join() { + eprintln!("Warning: Worker thread panicked during shutdown"); + } + } } } @@ -468,7 +570,7 @@ mod tests { for i in 1..=10 { let i = PageId::new(i).unwrap(); let frame_id = m.page_table.get(&i).expect("page not in cache"); - let dirty_frames = m.lru_replacer.write_frames.lock(); + let dirty_frames = m.lru_replacer.new_frames.lock(); assert!(dirty_frames.iter().any(|x| x.0 == *frame_id && x.1 == i)); } } diff --git a/src/page/manager/cache_evict.rs b/src/page/manager/cache_evict.rs index f201782c..1f6206c9 100644 --- a/src/page/manager/cache_evict.rs +++ b/src/page/manager/cache_evict.rs @@ -9,7 +9,8 @@ use crate::page::{manager::buffer_pool::FrameId, PageId}; pub(crate) struct CacheEvict { lru_replacer: LruReplacer, read_frames: Mutex>, - pub(crate) write_frames: Mutex>, + pub(crate) update_frames: Mutex>, + pub(crate) new_frames: Mutex>, } impl fmt::Debug for CacheEvict { @@ -23,7 +24,8 @@ impl CacheEvict { Self { lru_replacer: LruReplacer::new(capacity), read_frames: Mutex::new(Vec::with_capacity(capacity)), - write_frames: Mutex::new(Vec::with_capacity(capacity)), + update_frames: Mutex::new(Vec::with_capacity(capacity)), + new_frames: Mutex::new(Vec::with_capacity(capacity)), } } @@ -40,8 +42,38 @@ impl CacheEvict { self.lru_replacer.pin(page_id) } - pub(crate) fn pin_write(&self, frame_id: FrameId, page_id: PageId) -> EvictResult<(), PageId> { - self.write_frames.lock().push((frame_id, page_id)); + pub(crate) fn pin_write_update_page( + &self, + frame_id: FrameId, + page_id: PageId, + ) -> EvictResult<(), PageId> { + if let Some((_, first_page_id)) = self.new_frames.lock().first() { + if page_id.as_u32() < first_page_id.as_u32() { + self.update_frames.lock().push((frame_id, page_id)); + } + } else { + self.update_frames.lock().push((frame_id, page_id)); + } + + self.lru_replacer.pin(page_id) + } + + pub(crate) fn pin_write_new_page( + &self, + frame_id: FrameId, + page_id: PageId, + ) -> EvictResult<(), PageId> { + let mut new_frames = self.new_frames.lock(); + if let Some((_, last_page_id)) = new_frames.last() { + debug_assert!( + last_page_id.as_u32() + 1 == page_id, + "page_id: {:?}, last_page_id: {:?}", + page_id, + last_page_id + ); + } + new_frames.push((frame_id, page_id)); + self.lru_replacer.pin(page_id) } diff --git a/src/page/manager/options.rs b/src/page/manager/options.rs index 5b5de24d..2f443e6f 100644 --- a/src/page/manager/options.rs +++ b/src/page/manager/options.rs @@ -9,11 +9,13 @@ pub struct PageManagerOptions { pub(super) open_options: OpenOptions, pub(super) page_count: u32, pub(super) max_pages: u32, - pub(super) num_frames: u32, // for buffer pool backend + pub(super) num_frames: u32, // for buffer pool backend + pub(super) num_workers: usize, // for buffer pool backend } impl PageManagerOptions { pub const DEFAULT_NUM_FRAMES: u32 = 1024 * 1024 * 2; + pub const DEFAULT_NUM_WORKERS: usize = 4; pub fn new() -> Self { let mut open_options = File::options(); @@ -34,7 +36,14 @@ impl PageManagerOptions { 1024 }; - Self { open_options, page_count: 0, max_pages, num_frames } + let num_workers = if cfg!(not(test)) { + Self::DEFAULT_NUM_WORKERS + } else { + // Use a smaller number of workers for tests to reduce memory usage + 1 + }; + + Self { open_options, page_count: 0, max_pages, num_frames, num_workers } } /// Sets the option to create a new file, or open it if it already exists. @@ -79,6 +88,14 @@ impl PageManagerOptions { self } + /// Sets the number of workers for the buffer pool backend. 
+ /// + /// The default is [`DEFAULT_NUM_WORKERS`]. + pub fn num_workers(&mut self, num_workers: usize) -> &mut Self { + self.num_workers = num_workers; + self + } + /// Causes the file length to be set to 0 after opening it. /// /// Note that if `wipe(true)` is set, then setting [`page_count()`](Self::page_count) with any From 4f25329623eda50922a2e3ee7a3c5fda9a2714e8 Mon Sep 17 00:00:00 2001 From: nqd Date: Sun, 26 Oct 2025 21:43:42 +0800 Subject: [PATCH 12/65] using iouring --- Cargo.toml | 2 +- src/page/manager/buffer_pool.rs | 111 ++++++++++++++++++-------------- 2 files changed, 63 insertions(+), 50 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index a1ca16d5..2c7fd7e5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,7 +22,7 @@ rayon = "1.10.0" evict = "0.3.1" dashmap = "6.1.0" libc = "0.2.174" -crossbeam-channel = "0.5.15" +io-uring = "0.7.10" [features] default = ["buffer_pool_backend"] diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs index 91f9cee3..a49fdb21 100644 --- a/src/page/manager/buffer_pool.rs +++ b/src/page/manager/buffer_pool.rs @@ -1,4 +1,5 @@ use crossbeam_channel::{Receiver, Sender}; +use io_uring::{opcode, types, IoUring}; use std::{ ffi::CString, fs::File, @@ -38,12 +39,6 @@ unsafe impl Sync for Frame {} #[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)] pub(crate) struct FrameId(u32); -enum WriteMessage { - Page((FrameId, PageId)), - Batch(Vec<(FrameId, PageId)>), - Shutdown, -} - #[derive(Debug)] pub struct PageManager { num_frames: u32, @@ -58,11 +53,7 @@ pub struct PageManager { lru_replacer: CacheEvict, /* the replacer to find unpinned/candidate pages for eviction */ loading_page: DashSet, /* set of pages that are being loaded from disk */ - // write worker - num_workers: usize, - worker_handles: Vec>, - tx_job: Sender, - rx_result: Receiver>, + io_uring: RwLock, } impl PageManager { @@ -112,8 +103,10 @@ impl PageManager { } let lru_replacer = CacheEvict::new(num_frames as usize); - let (tx_job, rx_job) = crossbeam_channel::unbounded(); - let (tx_result, rx_result) = crossbeam_channel::unbounded(); + // Initialize io)uring with queue depth base on num_frames + let queue_depth = num_frames.max(2048) as u32; + let io_uring = IoUring::new(queue_depth) + .map_err(|e| PageError::IO(io::Error::new(io::ErrorKind::Other, e)))?; let mut page_manager = PageManager { num_frames, @@ -126,10 +119,7 @@ impl PageManager { lru_replacer, loading_page: DashSet::with_capacity(num_frames as usize), - num_workers: opts.num_workers, - worker_handles: Vec::with_capacity(opts.num_workers), - tx_job, - rx_result, + io_uring: RwLock::new(io_uring), }; page_manager.start_write_workers(rx_job, tx_result)?; @@ -355,7 +345,7 @@ impl PageManager { /// /// Could explore the parallel write strategy to improve performance. 
pub fn sync(&self) -> io::Result<()> { - let file = &mut self.file.write(); + let file = &mut self.file.read(); // Get all value at update_frames and new_frames let mut update_pages = self.lru_replacer.update_frames.lock(); @@ -364,30 +354,66 @@ impl PageManager { // New pages should be sorted by page_id ascending and no duplicate let mut new_pages = self.lru_replacer.new_frames.lock(); - let mut write_num = 0; - - // Write new pages first, since this group could be large and take time to write - if !new_pages.is_empty() { - write_num += 1; - let write_message = WriteMessage::Batch(new_pages.clone()); - self.tx_job.send(write_message).unwrap(); + let total_writes = new_pages.len() + update_pages.len(); + if total_writes == 0 { + return Ok(()); } + // Combine all writes + let all_writes = new_pages.iter().chain(update_pages.iter()); - // Follow by the individual update pages - if !update_pages.is_empty() { - write_num += update_pages.len(); - for (frame_id, page_id) in update_pages.iter() { - let page = WriteMessage::Page((*frame_id, *page_id)); - self.tx_job.send(page).unwrap(); + // Submit writes to io_uring + let mut ring = self.io_uring.write(); + + // Build submission queue + for (i, (frame_id, page_id)) in all_writes.enumerate() { + let frame = &self.frames[frame_id.0 as usize]; + let offset = page_id.as_offset(); + + unsafe { + let page_data = std::slice::from_raw_parts(frame.ptr as *const u8, Page::SIZE); + // Create write operation + let write_op = + opcode::Write::new(types::Fd(fd), page_data.as_ptr(), Page::SIZE as u32) + .offset(offset as u64) + .build() + .user_data(i as u64); + // Submit to ring + loop { + let mut sq = ring.submission(); + match unsafe { sq.push(&write_op) } { + Ok(_) => break, + Err(_) => { + // Submission queue is full, submit and wait + drop(sq); + ring.submit_and_wait(1)?; + } + } + } } } - - // Waiting for all write workers to finish - for _ in 0..write_num { - _ = self.rx_result.recv().unwrap(); + // Submit all pending operations + ring.submit()?; + // Wait for all completions + let mut completed = 0; + while completed < total_writes { + let cq = ring.completion(); + for cqe in cq { + let result = cqe.result(); + if result < 0 { + return Err(io::Error::from_raw_os_error(-result)); + } + completed += 1; + } + if completed < total_writes { + // Wait for more completions + ring.submit_and_wait(1)?; + } } + // Drop the write lock on io_uring before calling file operations + drop(ring); + drop(file); - file.flush()?; + self.file.write().flush()?; new_pages .iter() .chain(update_pages.iter()) @@ -485,19 +511,6 @@ impl Drop for PageManager { if let Err(e) = self.sync() { eprintln!("Warning: Sync failed during drop: {}", e); } - // Then send shutdown messages to workers - for _ in 0..self.num_workers { - if let Err(_) = self.tx_job.send(WriteMessage::Shutdown) { - // Ignore the error - break; - } - } - // Wait for all worker threads to finish - for handle in self.worker_handles.drain(..) 
{ - if let Err(_) = handle.join() { - eprintln!("Warning: Worker thread panicked during shutdown"); - } - } } } From 2daaf75d947acfa8321d545bc1667f372fa81883 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 26 Oct 2025 13:57:03 +0000 Subject: [PATCH 13/65] cleanup --- src/page/manager/buffer_pool.rs | 60 +-------------------------------- 1 file changed, 1 insertion(+), 59 deletions(-) diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs index a49fdb21..0949aa13 100644 --- a/src/page/manager/buffer_pool.rs +++ b/src/page/manager/buffer_pool.rs @@ -1,4 +1,3 @@ -use crossbeam_channel::{Receiver, Sender}; use io_uring::{opcode, types, IoUring}; use std::{ ffi::CString, @@ -108,7 +107,7 @@ impl PageManager { let io_uring = IoUring::new(queue_depth) .map_err(|e| PageError::IO(io::Error::new(io::ErrorKind::Other, e)))?; - let mut page_manager = PageManager { + let page_manager = PageManager { num_frames, page_count, file: RwLock::new(file), @@ -121,67 +120,10 @@ impl PageManager { io_uring: RwLock::new(io_uring), }; - page_manager.start_write_workers(rx_job, tx_result)?; Ok(page_manager) } - fn start_write_workers( - &mut self, - rx_job: Receiver, - tx_result: Sender>, - ) -> Result<(), PageError> { - let rx_job_arc = Arc::new(rx_job); - for _ in 0..self.num_workers { - let rx_job_arc = Arc::clone(&rx_job_arc); - let mut worker_file = self.file.write().try_clone().map_err(PageError::IO)?; - let frames = self.frames.clone(); - let tx_result = tx_result.clone(); - let handle = thread::spawn(move || loop { - match rx_job_arc.recv() { - Ok(WriteMessage::Page((frame_id, page_id))) => { - // write to file at specific offset - let offset = page_id.as_offset() as u64; - let frame = &frames[frame_id.0 as usize]; - unsafe { - let page_data = - std::slice::from_raw_parts(frame.ptr as *const u8, Page::SIZE); - let result = worker_file.write_at(page_data, offset); - tx_result.send(result.map(|_| ())).unwrap(); - } - } - Ok(WriteMessage::Batch(batch)) => { - if batch.is_empty() { - continue; - } - let mut batch_slices = Vec::with_capacity(batch.len()); - for (frame_id, _) in &batch { - let frame = &frames[frame_id.0 as usize]; - unsafe { - let page_data = - std::slice::from_raw_parts(frame.ptr as *const u8, Page::SIZE); - batch_slices.push(IoSlice::new(page_data)); - } - } - let result = Self::write_batch( - &mut batch_slices, - &mut worker_file, - batch[0].1.as_offset() as u64, - ); - tx_result.send(result).unwrap(); - } - Ok(WriteMessage::Shutdown) => { - break; // Graceful shutdown - } - Err(_) => { - break; // Channel closed - } - } - }); - self.worker_handles.push(handle); - } - Ok(()) - } #[inline] fn write_batch(batch: &mut Vec, file: &mut File, offset: u64) -> io::Result<()> { From 11c143d2c6b6a2997981be094c12dd5e2a259438 Mon Sep 17 00:00:00 2001 From: nqd Date: Sun, 26 Oct 2025 14:21:48 +0000 Subject: [PATCH 14/65] cleanup --- src/page/manager/buffer_pool.rs | 79 +++++++++++++-------------------- 1 file changed, 30 insertions(+), 49 deletions(-) diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs index 0949aa13..5a12f3c1 100644 --- a/src/page/manager/buffer_pool.rs +++ b/src/page/manager/buffer_pool.rs @@ -3,13 +3,15 @@ use std::{ ffi::CString, fs::File, io::{self, IoSlice, Seek, SeekFrom, Write}, - os::{fd::FromRawFd, unix::fs::FileExt}, + os::{ + fd::{AsRawFd, FromRawFd}, + unix::fs::FileExt, + }, path::Path, sync::{ atomic::{AtomicU32, AtomicU64, Ordering}, Arc, }, - thread::{self, JoinHandle}, }; use dashmap::{DashMap, DashSet}; @@ -38,7 +40,6 
@@ unsafe impl Sync for Frame {} #[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)] pub(crate) struct FrameId(u32); -#[derive(Debug)] pub struct PageManager { num_frames: u32, page_count: AtomicU32, @@ -55,6 +56,23 @@ pub struct PageManager { io_uring: RwLock, } +impl std::fmt::Debug for PageManager { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("PageManager") + .field("num_frames", &self.num_frames) + .field("page_count", &self.page_count) + .field("file", &self.file) + .field("file_len", &self.file_len) + .field("frames", &self.frames) + .field("page_table", &self.page_table) + .field("original_free_frame_idx", &self.original_free_frame_idx) + .field("lru_replacer", &self.lru_replacer) + .field("loading_page", &self.loading_page) + .field("io_uring", &"") + .finish() + } +} + impl PageManager { pub fn options() -> PageManagerOptions { PageManagerOptions::new() @@ -124,44 +142,6 @@ impl PageManager { Ok(page_manager) } - - #[inline] - fn write_batch(batch: &mut Vec, file: &mut File, offset: u64) -> io::Result<()> { - let total_len = batch.iter().map(|b| b.len()).sum::(); - file.seek(SeekFrom::Start(offset))?; - let mut total_written: usize = 0; - while total_written < total_len { - let written = file.write_vectored(batch)?; - if written == 0 { - return Err(io::Error::new( - io::ErrorKind::WriteZero, - "failed to write whole buffer", - )); - } - total_written += written; - // Remove fully written slices from the batch - let mut bytes_left = written; - while !batch.is_empty() && bytes_left >= batch[0].len() { - bytes_left -= batch[0].len(); - batch.remove(0); - } - // Adjust the first slice if it was partially written - if !batch.is_empty() && bytes_left > 0 { - // SAFETY: IoSlice only needs a reference for the duration of the write call, - // and batch[0] is still valid here. 
- let ptr = batch[0].as_ptr(); - let len = batch[0].len(); - if bytes_left < len { - let new_slice = unsafe { - std::slice::from_raw_parts(ptr.add(bytes_left), len - bytes_left) - }; - batch[0] = IoSlice::new(new_slice); - } - } - } - Ok(()) - } - #[cfg(test)] pub fn open_temp_file() -> Result { Self::options().open_temp_file() @@ -177,7 +157,7 @@ impl PageManager { if let Some(frame_id) = self.page_table.get(&page_id) { let frame = &self.frames[frame_id.0 as usize]; self.lru_replacer.touch(page_id).map_err(|_| PageError::EvictionPolicy)?; - return unsafe { Page::from_ptr(page_id, frame.ptr, self) } + return unsafe { Page::from_ptr(page_id, frame.ptr, self) }; } // Otherwise, need to load the page from disk @@ -194,7 +174,7 @@ impl PageManager { self.page_table.insert(page_id, frame_id); self.lru_replacer.pin_read(page_id).map_err(|_| PageError::EvictionPolicy)?; self.loading_page.remove(&page_id); - return unsafe { Page::from_ptr(page_id, buf, self) } + return unsafe { Page::from_ptr(page_id, buf, self) }; } // Another thread is already loading this page, spin/yield and retry std::thread::yield_now(); @@ -217,7 +197,7 @@ impl PageManager { .pin_write_update_page(*frame_id, page_id) .map_err(|_| PageError::EvictionPolicy)?; let frame = &self.frames[frame_id.0 as usize]; - return unsafe { PageMut::from_ptr(page_id, snapshot_id, frame.ptr, self) } + return unsafe { PageMut::from_ptr(page_id, snapshot_id, frame.ptr, self) }; } // Otherwise, need to load the page from disk if self.loading_page.insert(page_id) { @@ -235,7 +215,7 @@ impl PageManager { .pin_write_update_page(frame_id, page_id) .map_err(|_| PageError::EvictionPolicy)?; self.loading_page.remove(&page_id); - return unsafe { PageMut::from_ptr(page_id, snapshot_id, buf, self) } + return unsafe { PageMut::from_ptr(page_id, snapshot_id, buf, self) }; } else { // Another thread is already loading this page, spin/yield and retry std::thread::yield_now(); @@ -287,7 +267,8 @@ impl PageManager { /// /// Could explore the parallel write strategy to improve performance. 
pub fn sync(&self) -> io::Result<()> { - let file = &mut self.file.read(); + let file = self.file.read(); + let fd = file.as_raw_fd(); // Get all value at update_frames and new_frames let mut update_pages = self.lru_replacer.update_frames.lock(); @@ -322,7 +303,7 @@ impl PageManager { // Submit to ring loop { let mut sq = ring.submission(); - match unsafe { sq.push(&write_op) } { + match sq.push(&write_op) { Ok(_) => break, Err(_) => { // Submission queue is full, submit and wait @@ -420,9 +401,9 @@ impl PageManager { } else { let evicted_page = self.lru_replacer.evict(); if let Some(page_id) = evicted_page { - return self.page_table.remove(&page_id).map(|(_, frame_id)| frame_id) + return self.page_table.remove(&page_id).map(|(_, frame_id)| frame_id); } else { - return None + return None; } } } From c9da483b50c1ce607bd78565a7d5b929de8be786 Mon Sep 17 00:00:00 2001 From: nqd Date: Wed, 29 Oct 2025 13:37:46 +0000 Subject: [PATCH 15/65] fix --- src/page/manager/buffer_pool.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs index 5a12f3c1..99d61101 100644 --- a/src/page/manager/buffer_pool.rs +++ b/src/page/manager/buffer_pool.rs @@ -121,7 +121,7 @@ impl PageManager { let lru_replacer = CacheEvict::new(num_frames as usize); // Initialize io)uring with queue depth base on num_frames - let queue_depth = num_frames.max(2048) as u32; + let queue_depth = num_frames.min(2048) as u32; let io_uring = IoUring::new(queue_depth) .map_err(|e| PageError::IO(io::Error::new(io::ErrorKind::Other, e)))?; From 436d36f8042cb4ccc43bff52cd3b1a89b6d3d267 Mon Sep 17 00:00:00 2001 From: nqd Date: Thu, 30 Oct 2025 14:52:54 +0000 Subject: [PATCH 16/65] use fxhash for dashmap/dashset --- src/page/manager/buffer_pool.rs | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs index 99d61101..ff7b8996 100644 --- a/src/page/manager/buffer_pool.rs +++ b/src/page/manager/buffer_pool.rs @@ -1,17 +1,12 @@ +use fxhash::{FxBuildHasher, FxHasher}; use io_uring::{opcode, types, IoUring}; use std::{ - ffi::CString, - fs::File, - io::{self, IoSlice, Seek, SeekFrom, Write}, - os::{ + ffi::CString, fs::File, hash::BuildHasherDefault, io::{self, IoSlice, Seek, SeekFrom, Write}, os::{ fd::{AsRawFd, FromRawFd}, unix::fs::FileExt, - }, - path::Path, - sync::{ - atomic::{AtomicU32, AtomicU64, Ordering}, - Arc, - }, + }, path::Path, sync::{ + Arc, atomic::{AtomicU32, AtomicU64, Ordering} + } }; use dashmap::{DashMap, DashSet}; @@ -47,11 +42,11 @@ pub struct PageManager { file_len: AtomicU64, frames: Arc>, /* list of frames that hold pages' data, indexed by frame id with * fix num_frames size */ - page_table: DashMap, /* mapping between page id and buffer pool frames, + page_table: DashMap>, /* mapping between page id and buffer pool frames, * indexed by page id with fix num_frames size */ original_free_frame_idx: AtomicU32, lru_replacer: CacheEvict, /* the replacer to find unpinned/candidate pages for eviction */ - loading_page: DashSet, /* set of pages that are being loaded from disk */ + loading_page: DashSet>, /* set of pages that are being loaded from disk */ io_uring: RwLock, } @@ -111,7 +106,7 @@ impl PageManager { let num_frames = opts.num_frames; let page_count = AtomicU32::new(opts.page_count); let file_len = AtomicU64::new(file.metadata().map_err(PageError::IO)?.len()); - let page_table = DashMap::with_capacity(num_frames as usize); + let 
page_table = DashMap::with_capacity_and_hasher(num_frames as usize, FxBuildHasher::default()); let mut frames = Vec::with_capacity(num_frames as usize); for _ in 0..num_frames { let boxed_array = Box::new([0; Page::SIZE]); @@ -119,6 +114,7 @@ impl PageManager { frames.push(Frame { ptr }); } let lru_replacer = CacheEvict::new(num_frames as usize); + let loading_page = DashSet::with_capacity_and_hasher(num_frames as usize, FxBuildHasher::default()); // Initialize io)uring with queue depth base on num_frames let queue_depth = num_frames.min(2048) as u32; @@ -134,7 +130,7 @@ impl PageManager { page_table, original_free_frame_idx: AtomicU32::new(0), lru_replacer, - loading_page: DashSet::with_capacity(num_frames as usize), + loading_page, io_uring: RwLock::new(io_uring), }; From d46b3ea09c8ff8c993e8225b0f99bc7eb5a467b4 Mon Sep 17 00:00:00 2001 From: nqd Date: Sat, 1 Nov 2025 14:12:07 +0000 Subject: [PATCH 17/65] write in batch --- src/page/manager/buffer_pool.rs | 78 ++++++++++++++++++++++++++++----- 1 file changed, 66 insertions(+), 12 deletions(-) diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs index ff7b8996..f36b236f 100644 --- a/src/page/manager/buffer_pool.rs +++ b/src/page/manager/buffer_pool.rs @@ -1,7 +1,7 @@ use fxhash::{FxBuildHasher, FxHasher}; use io_uring::{opcode, types, IoUring}; use std::{ - ffi::CString, fs::File, hash::BuildHasherDefault, io::{self, IoSlice, Seek, SeekFrom, Write}, os::{ + ffi::CString, fs::File, hash::BuildHasherDefault, io::{self, Write}, os::{ fd::{AsRawFd, FromRawFd}, unix::fs::FileExt, }, path::Path, sync::{ @@ -9,6 +9,9 @@ use std::{ } }; +#[cfg(test)] +use std::io::SeekFrom; + use dashmap::{DashMap, DashSet}; use parking_lot::RwLock; @@ -261,7 +264,8 @@ impl PageManager { /// Syncs the buffer pool to the file. /// - /// Could explore the parallel write strategy to improve performance. + /// New pages at the end of the file are batch written using vectored I/O Writev, since they are guaranteed to be contiguous. + /// Update pages are usually random pages scattered throughout the file, and written individually with Write. pub fn sync(&self) -> io::Result<()> { let file = self.file.read(); let fd = file.as_raw_fd(); @@ -273,18 +277,62 @@ impl PageManager { // New pages should be sorted by page_id ascending and no duplicate let mut new_pages = self.lru_replacer.new_frames.lock(); - let total_writes = new_pages.len() + update_pages.len(); - if total_writes == 0 { + if new_pages.is_empty() && update_pages.is_empty() { return Ok(()); } - // Combine all writes - let all_writes = new_pages.iter().chain(update_pages.iter()); // Submit writes to io_uring let mut ring = self.io_uring.write(); + let mut op_count = 0; + + // Write contiguous new pages as a batch using writev. 
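+        // Contiguity holds because pin_write_new_page only ever appends the next
+        // page id (see the debug_assert in cache_evict.rs), so a single vectored
+        // write starting at the first page's offset covers the whole run.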
+ // Note: iovecs must stay alive until operations complete, so we define it outside the scope + let _iovecs = if !new_pages.is_empty() { + // Collect iovec for new pages + let iovecs: Vec = new_pages + .iter() + .map(|(frame_id, _)| { + let frame = &self.frames[frame_id.0 as usize]; + libc::iovec { + iov_base: frame.ptr as *mut libc::c_void, + iov_len: Page::SIZE, + } + }) + .collect(); + + // Get the offset of the first new page + let first_offset = new_pages[0].1.as_offset() as u64; + + unsafe { + let writev_op = opcode::Writev::new(types::Fd(fd), iovecs.as_ptr(), iovecs.len() as u32) + .offset(first_offset) + .build() + .user_data(op_count); - // Build submission queue - for (i, (frame_id, page_id)) in all_writes.enumerate() { + // Submit to ring + loop { + let mut sq = ring.submission(); + match sq.push(&writev_op) { + Ok(_) => { + op_count += 1; + break; + } + Err(_) => { + // Submission queue is full, submit and wait + drop(sq); + ring.submit_and_wait(1)?; + } + } + } + } + + Some(iovecs) + } else { + None + }; + + // Write update_pages individually (they may not be contiguous) + for (frame_id, page_id) in update_pages.iter() { let frame = &self.frames[frame_id.0 as usize]; let offset = page_id.as_offset(); @@ -295,12 +343,15 @@ impl PageManager { opcode::Write::new(types::Fd(fd), page_data.as_ptr(), Page::SIZE as u32) .offset(offset as u64) .build() - .user_data(i as u64); + .user_data(op_count); // Submit to ring loop { let mut sq = ring.submission(); match sq.push(&write_op) { - Ok(_) => break, + Ok(_) => { + op_count += 1; + break; + } Err(_) => { // Submission queue is full, submit and wait drop(sq); @@ -310,11 +361,13 @@ impl PageManager { } } } + // Submit all pending operations ring.submit()?; + // Wait for all completions let mut completed = 0; - while completed < total_writes { + while completed < op_count { let cq = ring.completion(); for cqe in cq { let result = cqe.result(); @@ -323,11 +376,12 @@ impl PageManager { } completed += 1; } - if completed < total_writes { + if completed < op_count { // Wait for more completions ring.submit_and_wait(1)?; } } + // Drop the write lock on io_uring before calling file operations drop(ring); drop(file); From 614a6ac63caaae84ab079d5bce7f8b15288e8f15 Mon Sep 17 00:00:00 2001 From: nqd Date: Sun, 2 Nov 2025 13:11:23 +0000 Subject: [PATCH 18/65] format --- src/page/manager/buffer_pool.rs | 39 +++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs index f36b236f..76b68fb0 100644 --- a/src/page/manager/buffer_pool.rs +++ b/src/page/manager/buffer_pool.rs @@ -1,12 +1,19 @@ use fxhash::{FxBuildHasher, FxHasher}; use io_uring::{opcode, types, IoUring}; use std::{ - ffi::CString, fs::File, hash::BuildHasherDefault, io::{self, Write}, os::{ + ffi::CString, + fs::File, + hash::BuildHasherDefault, + io::{self, Write}, + os::{ fd::{AsRawFd, FromRawFd}, unix::fs::FileExt, - }, path::Path, sync::{ - Arc, atomic::{AtomicU32, AtomicU64, Ordering} - } + }, + path::Path, + sync::{ + atomic::{AtomicU32, AtomicU64, Ordering}, + Arc, + }, }; #[cfg(test)] @@ -46,7 +53,7 @@ pub struct PageManager { frames: Arc>, /* list of frames that hold pages' data, indexed by frame id with * fix num_frames size */ page_table: DashMap>, /* mapping between page id and buffer pool frames, - * indexed by page id with fix num_frames size */ + * indexed by page id with fix num_frames size */ original_free_frame_idx: AtomicU32, lru_replacer: CacheEvict, /* the replacer 
to find unpinned/candidate pages for eviction */ loading_page: DashSet>, /* set of pages that are being loaded from disk */ @@ -109,7 +116,8 @@ impl PageManager { let num_frames = opts.num_frames; let page_count = AtomicU32::new(opts.page_count); let file_len = AtomicU64::new(file.metadata().map_err(PageError::IO)?.len()); - let page_table = DashMap::with_capacity_and_hasher(num_frames as usize, FxBuildHasher::default()); + let page_table = + DashMap::with_capacity_and_hasher(num_frames as usize, FxBuildHasher::default()); let mut frames = Vec::with_capacity(num_frames as usize); for _ in 0..num_frames { let boxed_array = Box::new([0; Page::SIZE]); @@ -117,7 +125,8 @@ impl PageManager { frames.push(Frame { ptr }); } let lru_replacer = CacheEvict::new(num_frames as usize); - let loading_page = DashSet::with_capacity_and_hasher(num_frames as usize, FxBuildHasher::default()); + let loading_page = + DashSet::with_capacity_and_hasher(num_frames as usize, FxBuildHasher::default()); // Initialize io)uring with queue depth base on num_frames let queue_depth = num_frames.min(2048) as u32; @@ -293,10 +302,7 @@ impl PageManager { .iter() .map(|(frame_id, _)| { let frame = &self.frames[frame_id.0 as usize]; - libc::iovec { - iov_base: frame.ptr as *mut libc::c_void, - iov_len: Page::SIZE, - } + libc::iovec { iov_base: frame.ptr as *mut libc::c_void, iov_len: Page::SIZE } }) .collect(); @@ -304,10 +310,11 @@ impl PageManager { let first_offset = new_pages[0].1.as_offset() as u64; unsafe { - let writev_op = opcode::Writev::new(types::Fd(fd), iovecs.as_ptr(), iovecs.len() as u32) - .offset(first_offset) - .build() - .user_data(op_count); + let writev_op = + opcode::Writev::new(types::Fd(fd), iovecs.as_ptr(), iovecs.len() as u32) + .offset(first_offset) + .build() + .user_data(op_count); // Submit to ring loop { @@ -325,7 +332,7 @@ impl PageManager { } } } - + Some(iovecs) } else { None From d3c64a437b5384fc33a10b919f3cfec63f45dc3a Mon Sep 17 00:00:00 2001 From: nqd Date: Sun, 2 Nov 2025 14:11:15 +0000 Subject: [PATCH 19/65] change update_frames type --- src/page/manager/buffer_pool.rs | 44 ++++++++++++++++++++++----------- src/page/manager/cache_evict.rs | 15 +++++++---- 2 files changed, 39 insertions(+), 20 deletions(-) diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs index 76b68fb0..d3c63eca 100644 --- a/src/page/manager/buffer_pool.rs +++ b/src/page/manager/buffer_pool.rs @@ -3,7 +3,6 @@ use io_uring::{opcode, types, IoUring}; use std::{ ffi::CString, fs::File, - hash::BuildHasherDefault, io::{self, Write}, os::{ fd::{AsRawFd, FromRawFd}, @@ -45,6 +44,9 @@ unsafe impl Sync for Frame {} #[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)] pub(crate) struct FrameId(u32); +pub(crate) type FxMap = DashMap; +pub(crate) type FxSet = DashSet; + pub struct PageManager { num_frames: u32, page_count: AtomicU32, @@ -52,11 +54,11 @@ pub struct PageManager { file_len: AtomicU64, frames: Arc>, /* list of frames that hold pages' data, indexed by frame id with * fix num_frames size */ - page_table: DashMap>, /* mapping between page id and buffer pool frames, - * indexed by page id with fix num_frames size */ + page_table: FxMap, /* mapping between page id and buffer pool frames, + * indexed by page id with fix num_frames size */ original_free_frame_idx: AtomicU32, lru_replacer: CacheEvict, /* the replacer to find unpinned/candidate pages for eviction */ - loading_page: DashSet>, /* set of pages that are being loaded from disk */ + loading_page: FxSet, /* set of pages that are being loaded 
from disk */ io_uring: RwLock, } @@ -279,14 +281,8 @@ impl PageManager { let file = self.file.read(); let fd = file.as_raw_fd(); - // Get all value at update_frames and new_frames - let mut update_pages = self.lru_replacer.update_frames.lock(); - update_pages.sort_by_key(|(_, page_id)| page_id.as_offset()); - update_pages.dedup_by_key(|(_, page_id)| *page_id); - // New pages should be sorted by page_id ascending and no duplicate let mut new_pages = self.lru_replacer.new_frames.lock(); - - if new_pages.is_empty() && update_pages.is_empty() { + if new_pages.is_empty() && self.lru_replacer.update_frames.is_empty() { return Ok(()); } @@ -339,7 +335,9 @@ impl PageManager { }; // Write update_pages individually (they may not be contiguous) - for (frame_id, page_id) in update_pages.iter() { + for entry in self.lru_replacer.update_frames.iter() { + let frame_id = *entry.value(); + let page_id = *entry.key(); let frame = &self.frames[frame_id.0 as usize]; let offset = page_id.as_offset(); @@ -393,13 +391,21 @@ impl PageManager { drop(ring); drop(file); + // println!("sync, new_pages: {:?}", new_pages); + // println!("sync, update_pages: {:?}", update_pages); self.file.write().flush()?; new_pages .iter() - .chain(update_pages.iter()) .for_each(|(_, page_id)| self.lru_replacer.unpin(*page_id).unwrap()); new_pages.clear(); - update_pages.clear(); + + // TODO: is there any race condition here? + self.lru_replacer + .update_frames + .iter() + .for_each(|entry| self.lru_replacer.unpin(*entry.key()).unwrap()); + self.lru_replacer.update_frames.clear(); + Ok(()) } @@ -421,7 +427,15 @@ impl PageManager { #[inline] pub fn drop_page(&self, page_id: PageId) { - // unpin() must be successful, or an indication of a bug in the code + // println!("drop_page: {:?}", page_id); + // // if the drop_page is in the dirty page list, save it + // let updated_page = self.lru_replacer.new_frames.lock(); + // if (page_id.as_u32() >= updated_page[0].1.as_u32()) + // || (page_id.as_u32() <= updated_page[updated_page.len() - 1].1.as_u32()) + // { + // // todo: could check if this already inserted + // self.lru_replacer.drop_pages.insert(page_id); + // } self.lru_replacer.unpin(page_id).unwrap(); } diff --git a/src/page/manager/cache_evict.rs b/src/page/manager/cache_evict.rs index 1f6206c9..8b0050e3 100644 --- a/src/page/manager/cache_evict.rs +++ b/src/page/manager/cache_evict.rs @@ -1,15 +1,20 @@ use std::fmt; +use dashmap::DashMap; use evict::{EvictResult, EvictionPolicy, LruReplacer}; +use fxhash::FxBuildHasher; use parking_lot::Mutex; -use crate::page::{manager::buffer_pool::FrameId, PageId}; +use crate::page::{ + manager::buffer_pool::{FrameId, FxMap}, + PageId, +}; // TODO: Temporarily use LruReplacer as the eviction policy, replace with a better eviction policy pub(crate) struct CacheEvict { lru_replacer: LruReplacer, read_frames: Mutex>, - pub(crate) update_frames: Mutex>, + pub(crate) update_frames: FxMap, pub(crate) new_frames: Mutex>, } @@ -24,7 +29,7 @@ impl CacheEvict { Self { lru_replacer: LruReplacer::new(capacity), read_frames: Mutex::new(Vec::with_capacity(capacity)), - update_frames: Mutex::new(Vec::with_capacity(capacity)), + update_frames: DashMap::with_capacity_and_hasher(capacity, FxBuildHasher::default()), new_frames: Mutex::new(Vec::with_capacity(capacity)), } } @@ -49,10 +54,10 @@ impl CacheEvict { ) -> EvictResult<(), PageId> { if let Some((_, first_page_id)) = self.new_frames.lock().first() { if page_id.as_u32() < first_page_id.as_u32() { - self.update_frames.lock().push((frame_id, page_id)); + 
self.update_frames.insert(page_id, frame_id); } } else { - self.update_frames.lock().push((frame_id, page_id)); + self.update_frames.insert(page_id, frame_id); } self.lru_replacer.pin(page_id) From a1a4e5cb400ca1f1dbc044008b37eec61cbaba56 Mon Sep 17 00:00:00 2001 From: nqd Date: Sun, 2 Nov 2025 14:51:42 +0000 Subject: [PATCH 20/65] cleanup after change updated_frames from vec to dashmap --- src/page/manager/buffer_pool.rs | 44 +++++++++++++++------------------ 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs index d3c63eca..390e2f1c 100644 --- a/src/page/manager/buffer_pool.rs +++ b/src/page/manager/buffer_pool.rs @@ -341,27 +341,25 @@ impl PageManager { let frame = &self.frames[frame_id.0 as usize]; let offset = page_id.as_offset(); - unsafe { - let page_data = std::slice::from_raw_parts(frame.ptr as *const u8, Page::SIZE); - // Create write operation - let write_op = - opcode::Write::new(types::Fd(fd), page_data.as_ptr(), Page::SIZE as u32) - .offset(offset as u64) - .build() - .user_data(op_count); - // Submit to ring - loop { - let mut sq = ring.submission(); - match sq.push(&write_op) { - Ok(_) => { - op_count += 1; - break; - } - Err(_) => { - // Submission queue is full, submit and wait - drop(sq); - ring.submit_and_wait(1)?; - } + let page_data = + unsafe { std::slice::from_raw_parts(frame.ptr as *const u8, Page::SIZE) }; + // Create write operation + let write_op = opcode::Write::new(types::Fd(fd), page_data.as_ptr(), Page::SIZE as u32) + .offset(offset as u64) + .build() + .user_data(op_count); + // Submit to ring + loop { + let mut sq = ring.submission(); + match unsafe { sq.push(&write_op) } { + Ok(_) => { + op_count += 1; + break; + } + Err(_) => { + // Submission queue is full, submit and wait + drop(sq); + ring.submit_and_wait(1)?; } } } @@ -394,9 +392,7 @@ impl PageManager { // println!("sync, new_pages: {:?}", new_pages); // println!("sync, update_pages: {:?}", update_pages); self.file.write().flush()?; - new_pages - .iter() - .for_each(|(_, page_id)| self.lru_replacer.unpin(*page_id).unwrap()); + new_pages.iter().for_each(|(_, page_id)| self.lru_replacer.unpin(*page_id).unwrap()); new_pages.clear(); // TODO: is there any race condition here? From f6027c8d33b033ddd2145cb35bd41d4c9d7e81fe Mon Sep 17 00:00:00 2001 From: nqd Date: Mon, 3 Nov 2025 14:11:52 +0000 Subject: [PATCH 21/65] move cleanup to before checking job completion --- src/page/manager/buffer_pool.rs | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs index 390e2f1c..95165fd1 100644 --- a/src/page/manager/buffer_pool.rs +++ b/src/page/manager/buffer_pool.rs @@ -368,7 +368,17 @@ impl PageManager { // Submit all pending operations ring.submit()?; - // Wait for all completions + // Do cleanup work + new_pages.iter().for_each(|(_, page_id)| self.lru_replacer.unpin(*page_id).unwrap()); + new_pages.clear(); + // TODO: is there any race condition here? 
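+        // Unpinning before the completion loop below makes these frames evictable
+        // while their writes may still be queued in the kernel; an evicted and
+        // refilled frame could then be picked up by an in-flight write.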
+ self.lru_replacer + .update_frames + .iter() + .for_each(|entry| self.lru_replacer.unpin(*entry.key()).unwrap()); + self.lru_replacer.update_frames.clear(); + + // Wait for all jobs to complete let mut completed = 0; while completed < op_count { let cq = ring.completion(); @@ -380,7 +390,6 @@ impl PageManager { completed += 1; } if completed < op_count { - // Wait for more completions ring.submit_and_wait(1)?; } } @@ -392,15 +401,6 @@ impl PageManager { // println!("sync, new_pages: {:?}", new_pages); // println!("sync, update_pages: {:?}", update_pages); self.file.write().flush()?; - new_pages.iter().for_each(|(_, page_id)| self.lru_replacer.unpin(*page_id).unwrap()); - new_pages.clear(); - - // TODO: is there any race condition here? - self.lru_replacer - .update_frames - .iter() - .for_each(|entry| self.lru_replacer.unpin(*entry.key()).unwrap()); - self.lru_replacer.update_frames.clear(); Ok(()) } From 5cf08f3d736437f0b42ef15a845cdc277b715a0f Mon Sep 17 00:00:00 2001 From: nqd Date: Fri, 7 Nov 2025 12:43:20 +0000 Subject: [PATCH 22/65] Wip --- src/page/manager/buffer_pool.rs | 32 ++++++++++++++++++++++---------- src/page/manager/cache_evict.rs | 13 +++++++------ src/page/page.rs | 3 ++- 3 files changed, 31 insertions(+), 17 deletions(-) diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs index 95165fd1..ecee1dc4 100644 --- a/src/page/manager/buffer_pool.rs +++ b/src/page/manager/buffer_pool.rs @@ -1,4 +1,4 @@ -use fxhash::{FxBuildHasher, FxHasher}; +use fxhash::FxBuildHasher; use io_uring::{opcode, types, IoUring}; use std::{ ffi::CString, @@ -369,6 +369,16 @@ impl PageManager { ring.submit()?; // Do cleanup work + // println!( + // "drop len: {:?}, update len: {:?}, new len: {:?}", + // self.lru_replacer.drop_pages.len(), + // self.lru_replacer.update_frames.len(), + // new_pages.len() + // ); + // println!("drop_pages: {:?}", self.lru_replacer.drop_pages); + // println!("update_pages: {:?}", self.lru_replacer.update_frames); + // println!("new_pages: {:?}", new_pages); + new_pages.iter().for_each(|(_, page_id)| self.lru_replacer.unpin(*page_id).unwrap()); new_pages.clear(); // TODO: is there any race condition here? 
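The TODO above is worth pausing on: sync() now unpins and clears the dirty lists before draining the completion queue, so a frame could in principle be evicted, handed out again, and overwritten while the kernel is still reading it. Below is a minimal, self-contained sketch -- not part of this patch series, function name hypothetical, io-uring 0.7 API assumed -- of the conservative ordering, where every completion is reaped before any buffer is released:

    use io_uring::{opcode, types, IoUring};
    use std::{fs::File, io, os::fd::AsRawFd};

    /// Submit one write per buffer, then reap every completion *before* the
    /// buffers may be unpinned or reused. Buffers and offsets are illustrative.
    fn write_then_release(file: &File, bufs: &[Vec<u8>], offsets: &[u64]) -> io::Result<()> {
        let mut ring = IoUring::new(bufs.len().next_power_of_two() as u32)?;
        for (i, (buf, off)) in bufs.iter().zip(offsets).enumerate() {
            let op = opcode::Write::new(types::Fd(file.as_raw_fd()), buf.as_ptr(), buf.len() as u32)
                .offset(*off)
                .build()
                .user_data(i as u64);
            // SAFETY: `buf` stays alive until submit_and_wait returns below.
            unsafe { ring.submission().push(&op).expect("ring sized for all ops") };
        }
        ring.submit_and_wait(bufs.len())?;
        for cqe in ring.completion() {
            if cqe.result() < 0 {
                return Err(io::Error::from_raw_os_error(-cqe.result()));
            }
        }
        Ok(()) // only now is it safe to evict or reuse the buffers
    }

Deferring the unpin until after the completion loop costs some latency on large batches, but it keeps frame reuse strictly ordered after the kernel's reads.
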
@@ -377,6 +387,7 @@ impl PageManager { .iter() .for_each(|entry| self.lru_replacer.unpin(*entry.key()).unwrap()); self.lru_replacer.update_frames.clear(); + self.lru_replacer.drop_pages.clear(); // Wait for all jobs to complete let mut completed = 0; @@ -423,15 +434,16 @@ impl PageManager { #[inline] pub fn drop_page(&self, page_id: PageId) { - // println!("drop_page: {:?}", page_id); - // // if the drop_page is in the dirty page list, save it - // let updated_page = self.lru_replacer.new_frames.lock(); - // if (page_id.as_u32() >= updated_page[0].1.as_u32()) - // || (page_id.as_u32() <= updated_page[updated_page.len() - 1].1.as_u32()) - // { - // // todo: could check if this already inserted - // self.lru_replacer.drop_pages.insert(page_id); - // } + self.lru_replacer.unpin(page_id).unwrap(); + } + + #[inline] + pub fn drop_page_mut(&self, page_id: PageId) { + if self.lru_replacer.update_frames.get(&page_id).is_some() { + self.lru_replacer.drop_pages.insert(page_id); + // println!("drop_page from update_frames: {:?}", page_id); + } + self.lru_replacer.unpin(page_id).unwrap(); } diff --git a/src/page/manager/cache_evict.rs b/src/page/manager/cache_evict.rs index 8b0050e3..a9ec71dc 100644 --- a/src/page/manager/cache_evict.rs +++ b/src/page/manager/cache_evict.rs @@ -1,21 +1,21 @@ use std::fmt; -use dashmap::DashMap; +use dashmap::{DashMap, DashSet}; use evict::{EvictResult, EvictionPolicy, LruReplacer}; use fxhash::FxBuildHasher; use parking_lot::Mutex; use crate::page::{ - manager::buffer_pool::{FrameId, FxMap}, - PageId, + PageId, manager::buffer_pool::{FrameId, FxMap, FxSet} }; // TODO: Temporarily use LruReplacer as the eviction policy, replace with a better eviction policy pub(crate) struct CacheEvict { lru_replacer: LruReplacer, - read_frames: Mutex>, + pub(crate) read_frames: FxSet, pub(crate) update_frames: FxMap, pub(crate) new_frames: Mutex>, + pub(crate) drop_pages: FxSet, } impl fmt::Debug for CacheEvict { @@ -28,9 +28,10 @@ impl CacheEvict { pub(crate) fn new(capacity: usize) -> Self { Self { lru_replacer: LruReplacer::new(capacity), - read_frames: Mutex::new(Vec::with_capacity(capacity)), + read_frames: DashSet::with_capacity_and_hasher(capacity, FxBuildHasher::default()), update_frames: DashMap::with_capacity_and_hasher(capacity, FxBuildHasher::default()), new_frames: Mutex::new(Vec::with_capacity(capacity)), + drop_pages: DashSet::with_capacity_and_hasher(capacity, FxBuildHasher::default()) } } @@ -43,7 +44,7 @@ impl CacheEvict { } pub(crate) fn pin_read(&self, page_id: PageId) -> EvictResult<(), PageId> { - self.read_frames.lock().push(page_id); + self.read_frames.insert(page_id); self.lru_replacer.pin(page_id) } diff --git a/src/page/page.rs b/src/page/page.rs index b5f93052..f4c8225d 100644 --- a/src/page/page.rs +++ b/src/page/page.rs @@ -336,7 +336,8 @@ impl fmt::Debug for PageMut<'_> { impl Drop for PageMut<'_> { fn drop(&mut self) { - self.commit_internal() + self.commit_internal(); + self.inner.page_manager.drop_page_mut(self.id()); } } From 0d09f9997e0edb062e4d031b15036dee805b9ce2 Mon Sep 17 00:00:00 2001 From: nqd Date: Sun, 9 Nov 2025 14:31:30 +0000 Subject: [PATCH 23/65] debug send dropped pages to background job: --- Cargo.toml | 1 + src/page/manager/buffer_pool.rs | 134 ++++++++++++++++++++++++++++++-- 2 files changed, 130 insertions(+), 5 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 2c7fd7e5..2520067d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,6 +23,7 @@ evict = "0.3.1" dashmap = "6.1.0" libc = "0.2.174" io-uring = "0.7.10" +crossbeam-channel 
= "0.5.15" [features] default = ["buffer_pool_backend"] diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs index ecee1dc4..ec663d8b 100644 --- a/src/page/manager/buffer_pool.rs +++ b/src/page/manager/buffer_pool.rs @@ -1,3 +1,4 @@ +use crossbeam_channel::{Receiver, Sender}; use fxhash::FxBuildHasher; use io_uring::{opcode, types, IoUring}; use std::{ @@ -13,11 +14,9 @@ use std::{ atomic::{AtomicU32, AtomicU64, Ordering}, Arc, }, + thread, }; -#[cfg(test)] -use std::io::SeekFrom; - use dashmap::{DashMap, DashSet}; use parking_lot::RwLock; @@ -61,6 +60,13 @@ pub struct PageManager { loading_page: FxSet, /* set of pages that are being loaded from disk */ io_uring: RwLock, + tx_job: Sender, +} + +enum WriteMessage { + Pages(Vec<(PageId, FrameId)>), + Sync, + Shutdown, } impl std::fmt::Debug for PageManager { @@ -135,6 +141,7 @@ impl PageManager { let io_uring = IoUring::new(queue_depth) .map_err(|e| PageError::IO(io::Error::new(io::ErrorKind::Other, e)))?; + let (tx_job, rx_job) = crossbeam_channel::unbounded(); let page_manager = PageManager { num_frames, page_count, @@ -147,11 +154,113 @@ impl PageManager { loading_page, io_uring: RwLock::new(io_uring), + tx_job, }; + page_manager.start_write_worker(rx_job)?; + Ok(page_manager) } + fn start_write_worker(&self, rx_job: Receiver) -> Result<(), PageError> { + let rx_job = Arc::new(rx_job); + thread::spawn(move || { + loop { + match rx_job.recv() { + Ok(WriteMessage::Pages(pages)) => { + Self::write_updated_pages(&pages); + // let file = self.file.read(); + // let fd = file.as_raw_fd(); + // let mut op_count = 0; + // let mut ring = self.io_uring.write(); + // for page_id in self.lru_replacer.drop_pages.iter() { + // let frame_id = self.page_table.get(&page_id); + // if let Some(frame_id) = frame_id { + // let x = frame_id.value(); + // let frame = self.frames.get(x.0 as usize); + // if let Some(frame) = frame { + // let offset = page_id.as_offset(); + // let page_data = unsafe { + // std::slice::from_raw_parts( + // frame.ptr as *const u8, + // Page::SIZE, + // ) + // }; + // // Create write operation + // let write_op = opcode::Write::new( + // types::Fd(fd), + // page_data.as_ptr(), + // Page::SIZE as u32, + // ) + // .offset(offset as u64) + // .build() + // .user_data(op_count); + // // Submit to ring + // loop { + // let mut sq = ring.submission(); + // match unsafe { sq.push(&write_op) } { + // Ok(_) => { + // op_count += 1; + // break; + // } + // Err(_) => { + // // Submission queue is full, submit and wait + // drop(sq); + // // TODO: send back the error + // ring.submit_and_wait(1).unwrap(); + // } + // } + // } + // } + // } + // } + // // Submit all pending operations + // // TODO: send back the error + // ring.submit().unwrap(); + // // Wait for jobs to complete + // let mut completed = 0; + // while completed < op_count { + // let cq = ring.completion(); + // for cqe in cq { + // let result = cqe.result(); + // if result < 0 { + // // TODO: send back the error + // // return Err(io::Error::from_raw_os_error(-result)); + // panic!("failed to get entry result"); + // } + // completed += 1; + // } + // if completed < op_count { + // // TODO: send back the error + // ring.submit_and_wait(1).unwrap(); + // } + // } + } + Ok(WriteMessage::Sync) => { + Self::write_new_pages(); + } + Ok(WriteMessage::Shutdown) => { + println!("Shutdown"); + // ignore for now + } + Err(_) => { + // Channel closed + break; + } + } + } + }); + Ok(()) + } + + fn write_updated_pages(pages: &[(PageId, FrameId)]) { + println!("Write 
updated pages: {:?}", pages); + } + + fn write_new_pages() { + println!("Write new pages"); + } + #[cfg(test)] pub fn open_temp_file() -> Result { Self::options().open_temp_file() @@ -440,9 +549,23 @@ impl PageManager { #[inline] pub fn drop_page_mut(&self, page_id: PageId) { if self.lru_replacer.update_frames.get(&page_id).is_some() { - self.lru_replacer.drop_pages.insert(page_id); + if self.lru_replacer.drop_pages.insert(page_id) { + if self.lru_replacer.drop_pages.len() > 10 { + // iter thru all items in drop_pages and remove from the drop_pages + let mut pages = Vec::with_capacity(10); + self.lru_replacer.drop_pages.retain(|p| { + if let Some(f) = self.page_table.get(p) { + pages.push((*f.key(), *f.value())); + } + true + }); + + self.tx_job.send(WriteMessage::Pages(pages)).unwrap(); + + } + } // println!("drop_page from update_frames: {:?}", page_id); - } + } self.lru_replacer.unpin(page_id).unwrap(); } @@ -519,6 +642,7 @@ impl Drop for PageManager { #[cfg(test)] mod tests { use crate::page_id; + use std::io::SeekFrom; use super::*; use std::{ From c24b5495eeb6372bfa91159a1fa210b524c3fcf5 Mon Sep 17 00:00:00 2001 From: nqd Date: Mon, 10 Nov 2025 14:34:43 +0000 Subject: [PATCH 24/65] wip --- src/page/manager/buffer_pool.rs | 191 ++++++++++++++++---------------- 1 file changed, 95 insertions(+), 96 deletions(-) diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs index ec663d8b..7600cbb3 100644 --- a/src/page/manager/buffer_pool.rs +++ b/src/page/manager/buffer_pool.rs @@ -59,13 +59,15 @@ pub struct PageManager { lru_replacer: CacheEvict, /* the replacer to find unpinned/candidate pages for eviction */ loading_page: FxSet, /* set of pages that are being loaded from disk */ - io_uring: RwLock, + io_uring: Arc>, tx_job: Sender, } enum WriteMessage { Pages(Vec<(PageId, FrameId)>), + #[allow(dead_code)] Sync, + #[allow(dead_code)] Shutdown, } @@ -136,7 +138,7 @@ impl PageManager { let loading_page = DashSet::with_capacity_and_hasher(num_frames as usize, FxBuildHasher::default()); - // Initialize io)uring with queue depth base on num_frames + // Initialize io_uring with queue depth base on num_frames let queue_depth = num_frames.min(2048) as u32; let io_uring = IoUring::new(queue_depth) .map_err(|e| PageError::IO(io::Error::new(io::ErrorKind::Other, e)))?; @@ -153,7 +155,7 @@ impl PageManager { lru_replacer, loading_page, - io_uring: RwLock::new(io_uring), + io_uring: Arc::new(RwLock::new(io_uring)), tx_job, }; @@ -163,78 +165,24 @@ impl PageManager { } fn start_write_worker(&self, rx_job: Receiver) -> Result<(), PageError> { - let rx_job = Arc::new(rx_job); + // let rx_job = Arc::new(rx_job); + let worker_file = self.file.write().try_clone().map_err(PageError::IO)?; + let frames = self.frames.clone(); + let io_uring = self.io_uring.clone(); thread::spawn(move || { loop { match rx_job.recv() { Ok(WriteMessage::Pages(pages)) => { - Self::write_updated_pages(&pages); - // let file = self.file.read(); - // let fd = file.as_raw_fd(); - // let mut op_count = 0; - // let mut ring = self.io_uring.write(); - // for page_id in self.lru_replacer.drop_pages.iter() { - // let frame_id = self.page_table.get(&page_id); - // if let Some(frame_id) = frame_id { - // let x = frame_id.value(); - // let frame = self.frames.get(x.0 as usize); - // if let Some(frame) = frame { - // let offset = page_id.as_offset(); - // let page_data = unsafe { - // std::slice::from_raw_parts( - // frame.ptr as *const u8, - // Page::SIZE, - // ) - // }; - // // Create write operation - // let write_op = 
opcode::Write::new( - // types::Fd(fd), - // page_data.as_ptr(), - // Page::SIZE as u32, - // ) - // .offset(offset as u64) - // .build() - // .user_data(op_count); - // // Submit to ring - // loop { - // let mut sq = ring.submission(); - // match unsafe { sq.push(&write_op) } { - // Ok(_) => { - // op_count += 1; - // break; - // } - // Err(_) => { - // // Submission queue is full, submit and wait - // drop(sq); - // // TODO: send back the error - // ring.submit_and_wait(1).unwrap(); - // } - // } - // } - // } - // } - // } - // // Submit all pending operations - // // TODO: send back the error - // ring.submit().unwrap(); - // // Wait for jobs to complete - // let mut completed = 0; - // while completed < op_count { - // let cq = ring.completion(); - // for cqe in cq { - // let result = cqe.result(); - // if result < 0 { - // // TODO: send back the error - // // return Err(io::Error::from_raw_os_error(-result)); - // panic!("failed to get entry result"); - // } - // completed += 1; - // } - // if completed < op_count { - // // TODO: send back the error - // ring.submit_and_wait(1).unwrap(); - // } - // } + let result = Self::write_updated_pages( + io_uring.clone(), + frames.clone(), + &pages, + worker_file.try_clone().unwrap(), + ); + // TODO: need to handle error here + if result.is_err() { + panic!("{:?}", result); + } } Ok(WriteMessage::Sync) => { Self::write_new_pages(); @@ -253,8 +201,61 @@ impl PageManager { Ok(()) } - fn write_updated_pages(pages: &[(PageId, FrameId)]) { - println!("Write updated pages: {:?}", pages); + fn write_updated_pages(ring: Arc>, frames: Arc>, pages: &[(PageId, FrameId)], file: File) -> io::Result<()> { + println!("Write updated pages: {:?}", pages.len()); + let fd = file.as_raw_fd(); + let mut op_count = 0; + let mut ring_guard = ring.write(); + + for page in pages { + let page_id = page.0; + let frame_id = page.1; + let offset = page_id.as_offset(); + let frame = &frames[frame_id.0 as usize]; + let page_data = + unsafe { std::slice::from_raw_parts(frame.ptr as *const u8, Page::SIZE) }; + + // Create write operation + let write_op = opcode::Write::new(types::Fd(fd), page_data.as_ptr(), Page::SIZE as u32) + .offset(offset as u64) + .build() + .user_data(op_count); + // Submit to ring + loop { + let mut sq = ring_guard.submission(); + match unsafe { sq.push(&write_op) } { + Ok(_) => { + op_count += 1; + drop(sq); + break; + } + Err(_) => { + // Submission queue is full, submit and wait + drop(sq); + let _ = ring_guard.submit_and_wait(1); + } + } + } + } + // Submit all pending operations + let _ = ring_guard.submit(); + + // Wait for jobs to complete + let mut completed = 0; + while completed < op_count { + let cq = ring_guard.completion(); + for cqe in cq { + let result = cqe.result(); + if result < 0 { + return Err(io::Error::from_raw_os_error(-result)); + } + completed += 1; + } + if completed < op_count { + ring_guard.submit_and_wait(1)?; + } + } + Ok(()) } fn write_new_pages() { @@ -396,7 +397,7 @@ impl PageManager { } // Submit writes to io_uring - let mut ring = self.io_uring.write(); + let mut ring_guard = self.io_uring.write(); let mut op_count = 0; // Write contiguous new pages as a batch using writev. 
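The worker loop above and the sync() path below both follow the io-uring crate's standard flow: push SQEs until the submission queue fills, submit, then drain the completion queue until op_count CQEs have arrived, turning negative results into io::Error. A minimal, self-contained sketch of that flow for a single write (the helper name write_page_at and the tiny fixed queue depth are illustrative assumptions, not code from this series):

    use std::{fs::File, io, os::unix::io::AsRawFd};

    use io_uring::{opcode, types, IoUring};

    // Hypothetical helper: one write, one completion, same error convention as
    // the worker above (a negative CQE result becomes an io::Error).
    fn write_page_at(file: &File, buf: &[u8], offset: u64) -> io::Result<()> {
        let mut ring = IoUring::new(8)?; // the pool sizes its ring from num_frames instead
        let sqe = opcode::Write::new(types::Fd(file.as_raw_fd()), buf.as_ptr(), buf.len() as u32)
            .offset(offset)
            .build()
            .user_data(0);
        // SAFETY: `buf` must stay alive and unmoved until the completion arrives.
        unsafe { ring.submission().push(&sqe).expect("fresh queue cannot be full") };
        ring.submit_and_wait(1)?;
        let cqe = ring.completion().next().expect("one completion was awaited");
        if cqe.result() < 0 {
            return Err(io::Error::from_raw_os_error(-cqe.result()));
        }
        Ok(())
    }
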
@@ -423,7 +424,7 @@ impl PageManager {
 
         // Submit to ring
         loop {
-            let mut sq = ring.submission();
+            let mut sq = ring_guard.submission();
             match sq.push(&writev_op) {
                 Ok(_) => {
                     op_count += 1;
@@ -432,7 +433,7 @@ impl PageManager {
                 Err(_) => {
                     // Submission queue is full, submit and wait
                     drop(sq);
-                    ring.submit_and_wait(1)?;
+                    ring_guard.submit_and_wait(1)?;
                 }
             }
         }
@@ -459,7 +460,7 @@ impl PageManager {
                 .user_data(op_count);
             // Submit to ring
             loop {
-                let mut sq = ring.submission();
+                let mut sq = ring_guard.submission();
                 match unsafe { sq.push(&write_op) } {
                     Ok(_) => {
                         op_count += 1;
@@ -468,14 +469,14 @@ impl PageManager {
                     Err(_) => {
                         // Submission queue is full, submit and wait
                         drop(sq);
-                        ring.submit_and_wait(1)?;
+                        ring_guard.submit_and_wait(1)?;
                     }
                 }
             }
         }
 
         // Submit all pending operations
-        ring.submit()?;
+        ring_guard.submit()?;
 
         // Do cleanup work
         // println!(
@@ -501,7 +502,7 @@ impl PageManager {
         // Wait for all jobs to complete
         let mut completed = 0;
         while completed < op_count {
-            let cq = ring.completion();
+            let cq = ring_guard.completion();
             for cqe in cq {
                 let result = cqe.result();
                 if result < 0 {
@@ -510,12 +511,12 @@ impl PageManager {
                 completed += 1;
             }
             if completed < op_count {
-                ring.submit_and_wait(1)?;
+                ring_guard.submit_and_wait(1)?;
             }
         }
 
         // Drop the write lock on io_uring before calling file operations
-        drop(ring);
+        drop(ring_guard);
         drop(file);
 
         // println!("sync, new_pages: {:?}", new_pages);

From dfc4cb79ec7b1a408e681b161554d2d532a887b0 Mon Sep 17 00:00:00 2001
From: nqd
Date: Mon, 10 Nov 2025 15:00:27 +0000
Subject: [PATCH 25/65] wip

---
 src/page/manager/buffer_pool.rs | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs
index 7600cbb3..f6185ec6 100644
--- a/src/page/manager/buffer_pool.rs
+++ b/src/page/manager/buffer_pool.rs
@@ -56,7 +56,7 @@ pub struct PageManager {
     page_table: FxMap<PageId, FrameId>, /* mapping between page id and buffer pool frames,
                                          * indexed by page id with fix num_frames size */
     original_free_frame_idx: AtomicU32,
-    lru_replacer: CacheEvict, /* the replacer to find unpinned/candidate pages for eviction */
+    lru_replacer: Arc<CacheEvict>, /* the replacer to find unpinned/candidate pages for eviction */
     loading_page: FxSet<PageId>,   /* set of pages that are being loaded from disk */
     io_uring: Arc<RwLock<IoUring>>,
     tx_job: Sender<WriteMessage>,
@@ -134,7 +134,7 @@ impl PageManager {
             let ptr = Box::into_raw(boxed_array);
             frames.push(Frame { ptr });
         }
-        let lru_replacer = CacheEvict::new(num_frames as usize);
+        let lru_replacer = Arc::new(CacheEvict::new(num_frames as usize));
         let loading_page =
             DashSet::with_capacity_and_hasher(num_frames as usize, FxBuildHasher::default());
@@ -169,6 +169,7 @@ impl PageManager {
         let worker_file = self.file.write().try_clone().map_err(PageError::IO)?;
         let frames = self.frames.clone();
         let io_uring = self.io_uring.clone();
+        let lru_replacer = self.lru_replacer.clone();
         thread::spawn(move || {
             loop {
                 match rx_job.recv() {
@@ -183,6 +184,12 @@ impl PageManager {
                         if result.is_err() {
                             panic!("{:?}", result);
                         }
+                        // Note: it's possible that when a mut page gets dropped, before it's written to the disk, the same page is used again as mut page.
+                        // If the page_id is removed from update_frames while its data is being updated, we will lose the data.
+                        // Though the current scheme doesn't allow this, any further change needs to consider this.
+                        pages.iter().for_each(|(page_id, _)| {
+                            lru_replacer.update_frames.remove(page_id);
+                        });
                     }
                     Ok(WriteMessage::Sync) => {
                         Self::write_new_pages();

From dbaaa3e5690f52e572423a6a968cf13ad01d8c1a Mon Sep 17 00:00:00 2001
From: nqd
Date: Tue, 11 Nov 2025 13:32:51 +0000
Subject: [PATCH 26/65] log

---
 src/page/manager/buffer_pool.rs | 44 ++++++++++++++++-----------------
 src/page/manager/cache_evict.rs |  5 ++--
 2 files changed, 25 insertions(+), 24 deletions(-)

diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs
index f6185ec6..9023d888 100644
--- a/src/page/manager/buffer_pool.rs
+++ b/src/page/manager/buffer_pool.rs
@@ -57,7 +57,7 @@ pub struct PageManager {
                                          * indexed by page id with fix num_frames size */
     original_free_frame_idx: AtomicU32,
     lru_replacer: Arc<CacheEvict>, /* the replacer to find unpinned/candidate pages for eviction */
-    loading_page: FxSet<PageId>,   /* set of pages that are being loaded from disk */
+    loading_page: FxSet<PageId>, /* set of pages that are being loaded from disk */
     io_uring: Arc<RwLock<IoUring>>,
     tx_job: Sender<WriteMessage>,
@@ -66,8 +66,6 @@ pub struct PageManager {
 
 enum WriteMessage {
     Pages(Vec<(PageId, FrameId)>),
     #[allow(dead_code)]
-    Sync,
-    #[allow(dead_code)]
     Shutdown,
 }
 
@@ -165,7 +163,6 @@ impl PageManager {
     }
 
     fn start_write_worker(&self, rx_job: Receiver<WriteMessage>) -> Result<(), PageError> {
-        // let rx_job = Arc::new(rx_job);
         let worker_file = self.file.write().try_clone().map_err(PageError::IO)?;
         let frames = self.frames.clone();
         let io_uring = self.io_uring.clone();
@@ -184,16 +181,16 @@ impl PageManager {
                         if result.is_err() {
                             panic!("{:?}", result);
                         }
-                        // Note: it's possible that when a mut page gets dropped, before it's written to the disk, the same page is used again as mut page.
-                        // If the page_id is removed from update_frames while its data is being updated, we will lose the data.
-                        // Though the current scheme doesn't allow this, any further change needs to consider this.
+                        // Note: it's possible that when a mut page gets dropped, before it's written
+                        // to the disk, the same page is used again as mut page. If the page_id is
+                        // removed from update_frames while its data is being updated, we will lose
+                        // the data. Though the current scheme doesn't allow this, any further
+                        // change needs to consider this.
+ println!("Wrote to disk: {:?}", pages.len()); pages.iter().for_each(|(page_id, _)| { lru_replacer.update_frames.remove(page_id); }); } - Ok(WriteMessage::Sync) => { - Self::write_new_pages(); - } Ok(WriteMessage::Shutdown) => { println!("Shutdown"); // ignore for now @@ -208,7 +205,12 @@ impl PageManager { Ok(()) } - fn write_updated_pages(ring: Arc>, frames: Arc>, pages: &[(PageId, FrameId)], file: File) -> io::Result<()> { + fn write_updated_pages( + ring: Arc>, + frames: Arc>, + pages: &[(PageId, FrameId)], + file: File, + ) -> io::Result<()> { println!("Write updated pages: {:?}", pages.len()); let fd = file.as_raw_fd(); let mut op_count = 0; @@ -265,10 +267,6 @@ impl PageManager { Ok(()) } - fn write_new_pages() { - println!("Write new pages"); - } - #[cfg(test)] pub fn open_temp_file() -> Result { Self::options().open_temp_file() @@ -392,8 +390,9 @@ impl PageManager { /// Syncs the buffer pool to the file. /// - /// New pages at the end of the file are batch written using vectored I/O Writev, since they are guaranteed to be contiguous. - /// Update pages are usually random pages scattered throughout the file, and written individually with Write. + /// New pages at the end of the file are batch written using vectored I/O Writev, since they are + /// guaranteed to be contiguous. Update pages are usually random pages scattered throughout + /// the file, and written individually with Write. pub fn sync(&self) -> io::Result<()> { let file = self.file.read(); let fd = file.as_raw_fd(); @@ -492,9 +491,10 @@ impl PageManager { // self.lru_replacer.update_frames.len(), // new_pages.len() // ); - // println!("drop_pages: {:?}", self.lru_replacer.drop_pages); - // println!("update_pages: {:?}", self.lru_replacer.update_frames); - // println!("new_pages: {:?}", new_pages); + println!("syncing"); + println!("drop_pages: {:?}", self.lru_replacer.drop_pages.len()); + println!("update_pages: {:?}", self.lru_replacer.update_frames.len()); + println!("new_pages: {:?}", new_pages.len()); new_pages.iter().for_each(|(_, page_id)| self.lru_replacer.unpin(*page_id).unwrap()); new_pages.clear(); @@ -557,8 +557,8 @@ impl PageManager { #[inline] pub fn drop_page_mut(&self, page_id: PageId) { if self.lru_replacer.update_frames.get(&page_id).is_some() { - if self.lru_replacer.drop_pages.insert(page_id) - && self.lru_replacer.drop_pages.len() >= 10 + if self.lru_replacer.drop_pages.insert(page_id) && + self.lru_replacer.drop_pages.len() >= 10 { // iter thru all items in drop_pages and remove from the drop_pages let mut pages = Vec::with_capacity(10); diff --git a/src/page/manager/cache_evict.rs b/src/page/manager/cache_evict.rs index a9ec71dc..f1d1b502 100644 --- a/src/page/manager/cache_evict.rs +++ b/src/page/manager/cache_evict.rs @@ -6,7 +6,8 @@ use fxhash::FxBuildHasher; use parking_lot::Mutex; use crate::page::{ - PageId, manager::buffer_pool::{FrameId, FxMap, FxSet} + manager::buffer_pool::{FrameId, FxMap, FxSet}, + PageId, }; // TODO: Temporarily use LruReplacer as the eviction policy, replace with a better eviction policy @@ -31,7 +32,7 @@ impl CacheEvict { read_frames: DashSet::with_capacity_and_hasher(capacity, FxBuildHasher::default()), update_frames: DashMap::with_capacity_and_hasher(capacity, FxBuildHasher::default()), new_frames: Mutex::new(Vec::with_capacity(capacity)), - drop_pages: DashSet::with_capacity_and_hasher(capacity, FxBuildHasher::default()) + drop_pages: DashSet::with_capacity_and_hasher(capacity, FxBuildHasher::default()), } } From 31d275fa10dc0f8ae231863c1a2db4083ef0bf51 Mon Sep 
17 00:00:00 2001
From: nqd
Date: Wed, 12 Nov 2025 14:18:46 +0000
Subject: [PATCH 27/65] drop pages is a vector

---
 src/page/manager/buffer_pool.rs | 36 +++++++++++++--------------------
 src/page/manager/cache_evict.rs |  2 --
 2 files changed, 14 insertions(+), 24 deletions(-)

diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs
index 9023d888..da4a25e3 100644
--- a/src/page/manager/buffer_pool.rs
+++ b/src/page/manager/buffer_pool.rs
@@ -18,7 +18,7 @@ use std::{
 };
 
 use dashmap::{DashMap, DashSet};
-use parking_lot::RwLock;
+use parking_lot::{Mutex, RwLock};
 
 use crate::{
     page::{
@@ -61,6 +61,7 @@ pub struct PageManager {
     io_uring: Arc<RwLock<IoUring>>,
 
     tx_job: Sender<WriteMessage>,
+    drop_pages: Mutex<Vec<PageId>>,
 }
 
 enum WriteMessage {
@@ -155,6 +156,7 @@ impl PageManager {
 
             io_uring: Arc::new(RwLock::new(io_uring)),
             tx_job,
+            drop_pages: Mutex::new(Vec::new()),
         };
 
         page_manager.start_write_worker(rx_job)?;
@@ -186,7 +188,6 @@ impl PageManager {
                         // removed from update_frames while its data is being updated, we will lose
                         // the data. Though the current scheme doesn't allow this, any further
                         // change needs to consider this.
-                        println!("Wrote to disk: {:?}", pages.len());
                         pages.iter().for_each(|(page_id, _)| {
                             lru_replacer.update_frames.remove(page_id);
                         });
@@ -211,7 +212,7 @@ impl PageManager {
         pages: &[(PageId, FrameId)],
         file: File,
     ) -> io::Result<()> {
-        println!("Write updated pages: {:?}", pages.len());
+        // println!("Write updated pages: {:?}", pages.len());
         let fd = file.as_raw_fd();
         let mut op_count = 0;
         let mut ring_guard = ring.write();
@@ -485,16 +486,10 @@ impl PageManager {
         ring_guard.submit()?;
 
         // Do cleanup work
-        // println!(
-        //     "drop len: {:?}, update len: {:?}, new len: {:?}",
-        //     self.lru_replacer.drop_pages.len(),
-        //     self.lru_replacer.update_frames.len(),
-        //     new_pages.len()
-        // );
-        println!("syncing");
-        println!("drop_pages: {:?}", self.lru_replacer.drop_pages.len());
-        println!("update_pages: {:?}", self.lru_replacer.update_frames.len());
-        println!("new_pages: {:?}", new_pages.len());
+        // println!("sync");
+        // println!("\tdrop_pages: {:?}", self.lru_replacer.drop_pages.len());
+        // println!("\tupdate_pages: {:?}", self.lru_replacer.update_frames.len());
+        // println!("\tnew_pages: {:?}", new_pages.len());
 
         new_pages.iter().for_each(|(_, page_id)| self.lru_replacer.unpin(*page_id).unwrap());
         new_pages.clear();
@@ -504,7 +499,7 @@ impl PageManager {
             .iter()
             .for_each(|entry| self.lru_replacer.unpin(*entry.key()).unwrap());
         self.lru_replacer.update_frames.clear();
-        self.lru_replacer.drop_pages.clear();
+        self.drop_pages.lock().clear();
 
         // Wait for all jobs to complete
         let mut completed = 0;
@@ -557,23 +552,20 @@ impl PageManager {
     #[inline]
     pub fn drop_page_mut(&self, page_id: PageId) {
         if self.lru_replacer.update_frames.get(&page_id).is_some() {
-            if self.lru_replacer.drop_pages.insert(page_id) &&
-                self.lru_replacer.drop_pages.len() >= 10
-            {
+            let mut drop_pages = self.drop_pages.lock();
+            drop_pages.push(page_id);
+            if drop_pages.len() >= 10 {
                 // iter thru all items in drop_pages and remove from the drop_pages
                 let mut pages = Vec::with_capacity(10);
-                self.lru_replacer.drop_pages.retain(|p| {
+                drop_pages.iter().for_each(|p| {
                     if let Some(f) = self.page_table.get(p) {
                         pages.push((*f.key(), *f.value()));
                     }
-                    false
                 });
-                self.tx_job.send(WriteMessage::Pages(pages)).unwrap();
+                self.tx_job.send(WriteMessage::Pages(pages)).unwrap();
+                drop_pages.clear();
             }
         }
-
-        self.lru_replacer.unpin(page_id).unwrap();
     }
 
     fn next_page_id(&self) -> Option<(PageId, u32)> {
diff --git a/src/page/manager/cache_evict.rs
b/src/page/manager/cache_evict.rs index f1d1b502..25c2b19b 100644 --- a/src/page/manager/cache_evict.rs +++ b/src/page/manager/cache_evict.rs @@ -16,7 +16,6 @@ pub(crate) struct CacheEvict { pub(crate) read_frames: FxSet, pub(crate) update_frames: FxMap, pub(crate) new_frames: Mutex>, - pub(crate) drop_pages: FxSet, } impl fmt::Debug for CacheEvict { @@ -32,7 +31,6 @@ impl CacheEvict { read_frames: DashSet::with_capacity_and_hasher(capacity, FxBuildHasher::default()), update_frames: DashMap::with_capacity_and_hasher(capacity, FxBuildHasher::default()), new_frames: Mutex::new(Vec::with_capacity(capacity)), - drop_pages: DashSet::with_capacity_and_hasher(capacity, FxBuildHasher::default()), } } From 518e3fc4f197b39869d57e8a8b4de7a136f67487 Mon Sep 17 00:00:00 2001 From: nqd Date: Fri, 21 Nov 2025 23:05:57 +0800 Subject: [PATCH 28/65] wip --- src/page/manager.rs | 2 + src/page/manager/buffer_pool.rs | 17 +++++ src/page/manager/clock_replacer.rs | 99 ++++++++++++++++++++++++++++++ 3 files changed, 118 insertions(+) create mode 100644 src/page/manager/clock_replacer.rs diff --git a/src/page/manager.rs b/src/page/manager.rs index c7e15d3e..bde800f2 100644 --- a/src/page/manager.rs +++ b/src/page/manager.rs @@ -4,6 +4,8 @@ use crate::page::PageId; pub(super) mod buffer_pool; #[cfg(feature = "buffer_pool_backend")] pub(super) mod cache_evict; +// #[cfg(feature = "buffer_pool_backend")] +pub(super) mod clock_replacer; #[cfg(feature = "mmap_backend")] pub(super) mod mmap; pub(super) mod options; diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs index da4a25e3..b6883b8e 100644 --- a/src/page/manager/buffer_pool.rs +++ b/src/page/manager/buffer_pool.rs @@ -43,6 +43,23 @@ unsafe impl Sync for Frame {} #[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)] pub(crate) struct FrameId(u32); +impl FrameId { + #[inline] + pub const fn as_u32(&self) -> u32 { + self.0 + } + + #[inline] + pub const fn as_usize(&self) -> usize { + self.0 as usize + } + + #[inline] + pub const fn from_usize(value: usize) -> Self { + FrameId(value as u32) + } +} + pub(crate) type FxMap = DashMap; pub(crate) type FxSet = DashSet; diff --git a/src/page/manager/clock_replacer.rs b/src/page/manager/clock_replacer.rs new file mode 100644 index 00000000..739ac615 --- /dev/null +++ b/src/page/manager/clock_replacer.rs @@ -0,0 +1,99 @@ +use std::sync::atomic::{AtomicBool, AtomicU8, Ordering}; + +use parking_lot::Mutex; + +use crate::page::manager::buffer_pool::FrameId; + +struct FrameState { + // The Second Chance bit + ref_bit: AtomicBool, + // If > 0, this frame cannot be evicted + pin_count: AtomicU8, +} + +struct ClockReplacer { + frames: Vec, + hand: Mutex, +} + +impl ClockReplacer { + fn new(num_frames: usize) -> Self { + let mut frames = Vec::with_capacity(num_frames); + for _ in 0..num_frames { + frames + .push(FrameState { ref_bit: AtomicBool::new(false), pin_count: AtomicU8::new(0) }); + } + + ClockReplacer { frames, hand: Mutex::new(0) } + } + + pub fn pin(&self, frame_id: FrameId) { + // Safety check + if frame_id.as_usize() >= self.frames.len() { + return; + } + + let frame = &self.frames[frame_id.as_usize()]; + // First increment pin count + frame.pin_count.fetch_add(1, Ordering::SeqCst); + // Then set usage bit to true (give it a second chance) + frame.ref_bit.store(true, Ordering::SeqCst); + } + + pub fn unpin(&self, frame_id: FrameId) { + if frame_id.as_usize() >= self.frames.len() { + return + } + + let frame = &self.frames[frame_id.as_usize()]; + frame.pin_count.fetch_sub(1, Ordering::SeqCst); + 
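+        // Caution: fetch_sub wraps on underflow, so an unpin without a matching
+        // pin leaves pin_count huge and the frame effectively unevictable.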
} + + // Find a frame to evict + pub fn victim(&self) -> Option { + let mut hand = self.hand.lock(); + let start_index = *hand; + let num_frames = self.frames.len(); + + for _ in 0..(num_frames * 3) { + let current_idx = *hand; + let frame = &self.frames[current_idx]; + + // Move hand forward for next iteration + *hand = (*hand + 1) % num_frames; + + let current_pins = frame.pin_count.load(Ordering::SeqCst); + if current_pins > 0 { + // This page is being used. Cannot evict. Skip it. + continue; + } + // Check reference bit: swap atomically returns old value and sets to false + if frame.ref_bit.swap(false, Ordering::SeqCst) { + // Had a second chance (was true, now set to false) + continue; + } + + return Some(FrameId::from_usize(current_idx)); + } + + // If get here, literally every single frame is Pinned. The buffer pool is exhausted. + None + } + + #[cfg(test)] + pub fn count_pinned(&self) -> usize { + self.frames.iter().filter(|f| f.pin_count.load(Ordering::SeqCst) > 0).count() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_pin() { + let r = ClockReplacer::new(5); + r.pin(FrameId::from_usize(value)); + assert_eq!(r.count_pinned(), 1); + } +} From a4a504d88ae514ae04b7f074a9cb282632ec9faa Mon Sep 17 00:00:00 2001 From: nqd Date: Sat, 22 Nov 2025 09:38:32 +0800 Subject: [PATCH 29/65] update flag --- Cargo.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 2520067d..88bd87c2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,12 +22,12 @@ rayon = "1.10.0" evict = "0.3.1" dashmap = "6.1.0" libc = "0.2.174" -io-uring = "0.7.10" +io-uring = { version = "0.7.10", optional = true } crossbeam-channel = "0.5.15" [features] -default = ["buffer_pool_backend"] -buffer_pool_backend = [] +default = ["mmap_backend"] +buffer_pool_backend = ["io-uring"] mmap_backend = [] [dev-dependencies] From 84bb90f02157bfc6017d7270e73f9748654046ef Mon Sep 17 00:00:00 2001 From: nqd Date: Sat, 22 Nov 2025 02:14:00 +0000 Subject: [PATCH 30/65] adding test --- Cargo.toml | 2 +- src/page/manager/clock_replacer.rs | 44 ++++++++++++++++++++++++++---- 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 88bd87c2..08dcdde3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,7 +26,7 @@ io-uring = { version = "0.7.10", optional = true } crossbeam-channel = "0.5.15" [features] -default = ["mmap_backend"] +default = ["buffer_pool_backend"] buffer_pool_backend = ["io-uring"] mmap_backend = [] diff --git a/src/page/manager/clock_replacer.rs b/src/page/manager/clock_replacer.rs index 739ac615..2bac71ff 100644 --- a/src/page/manager/clock_replacer.rs +++ b/src/page/manager/clock_replacer.rs @@ -17,7 +17,7 @@ struct ClockReplacer { } impl ClockReplacer { - fn new(num_frames: usize) -> Self { + pub fn new(num_frames: usize) -> Self { let mut frames = Vec::with_capacity(num_frames); for _ in 0..num_frames { frames @@ -42,7 +42,7 @@ impl ClockReplacer { pub fn unpin(&self, frame_id: FrameId) { if frame_id.as_usize() >= self.frames.len() { - return + return; } let frame = &self.frames[frame_id.as_usize()]; @@ -52,7 +52,6 @@ impl ClockReplacer { // Find a frame to evict pub fn victim(&self) -> Option { let mut hand = self.hand.lock(); - let start_index = *hand; let num_frames = self.frames.len(); for _ in 0..(num_frames * 3) { @@ -81,7 +80,7 @@ impl ClockReplacer { } #[cfg(test)] - pub fn count_pinned(&self) -> usize { + fn count_pinned(&self) -> usize { self.frames.iter().filter(|f| f.pin_count.load(Ordering::SeqCst) > 
0).count()
     }
 }
@@ -93,7 +92,42 @@ mod tests {
 
     #[test]
     fn test_pin() {
         let r = ClockReplacer::new(5);
-        r.pin(FrameId::from_usize(value));
+        r.pin(FrameId::from_usize(0));
+        assert_eq!(r.count_pinned(), 1);
+        r.pin(FrameId::from_usize(0));
         assert_eq!(r.count_pinned(), 1);
+        r.pin(FrameId::from_usize(4));
+        assert_eq!(r.count_pinned(), 2);
+    }
+
+    #[test]
+    fn test_unpin() {
+        let r = ClockReplacer::new(5);
+        r.pin(FrameId::from_usize(0));
+        r.pin(FrameId::from_usize(1));
+        r.pin(FrameId::from_usize(1));
+        r.pin(FrameId::from_usize(2));
+        assert_eq!(r.count_pinned(), 3);
+
+        r.unpin(FrameId::from_usize(2));
+        assert_eq!(r.count_pinned(), 2);
+
+        r.unpin(FrameId::from_usize(1));
+        assert_eq!(r.count_pinned(), 2);
+        r.unpin(FrameId::from_usize(1));
+        assert_eq!(r.count_pinned(), 1);
+    }
+
+    #[test]
+    fn test_evict() {
+        let r = ClockReplacer::new(5);
+        (0..5).for_each(|i| {
+            assert_eq!(r.victim(), Some(FrameId::from_usize(i)));
+            r.pin(FrameId::from_usize(i));
+        });
+        assert_eq!(r.victim(), None);
+
+        r.unpin(FrameId::from_usize(4));
+        assert_eq!(r.victim(), Some(FrameId::from_usize(4)));
     }
 }

From 8ce522cefce1618bcda39d049fb7292b704e0806 Mon Sep 17 00:00:00 2001
From: nqd
Date: Sat, 22 Nov 2025 06:10:29 +0000
Subject: [PATCH 31/65] wip

---
 src/page/manager/buffer_pool.rs    | 34 ++++++++++++++++--------------
 src/page/manager/clock_replacer.rs |  2 +-
 2 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs
index b6883b8e..74344ee6 100644
--- a/src/page/manager/buffer_pool.rs
+++ b/src/page/manager/buffer_pool.rs
@@ -22,9 +22,7 @@ use std::{
 
 use crate::{
     page::{
-        manager::cache_evict::CacheEvict,
-        state::{PageState, RawPageState},
-        Page, PageError, PageId, PageManagerOptions, PageMut,
+        Page, PageError, PageId, PageManagerOptions, PageMut, manager::{cache_evict::CacheEvict, clock_replacer::ClockReplacer}, state::{PageState, RawPageState}
     },
     snapshot::SnapshotId,
 };
@@ -72,9 +70,11 @@ pub struct PageManager {
                                          * fix num_frames size */
     page_table: FxMap<PageId, FrameId>, /* mapping between page id and buffer pool frames,
                                          * indexed by page id with fix num_frames size */
-    original_free_frame_idx: AtomicU32,
-    lru_replacer: Arc<CacheEvict>, /* the replacer to find unpinned/candidate pages for eviction */
-    loading_page: FxSet<PageId>, /* set of pages that are being loaded from disk */
+
+    // original_free_frame_idx: AtomicU32,
+    // lru_replacer: Arc<CacheEvict>, /* the replacer to find unpinned/candidate pages for eviction */
+    // loading_page: FxSet<PageId>, /* set of pages that are being loaded from disk */
+    replacer: Arc<ClockReplacer>,
 
     io_uring: Arc<RwLock<IoUring>>,
     tx_job: Sender<WriteMessage>,
@@ -96,9 +96,9 @@ impl std::fmt::Debug for PageManager {
             .field("file_len", &self.file_len)
             .field("frames", &self.frames)
             .field("page_table", &self.page_table)
-            .field("original_free_frame_idx", &self.original_free_frame_idx)
-            .field("lru_replacer", &self.lru_replacer)
-            .field("loading_page", &self.loading_page)
+            // .field("original_free_frame_idx", &self.original_free_frame_idx)
+            // .field("lru_replacer", &self.lru_replacer)
+            // .field("loading_page", &self.loading_page)
             .field("io_uring", &"")
             .finish()
     }
@@ -150,9 +150,10 @@ impl PageManager {
             let ptr = Box::into_raw(boxed_array);
             frames.push(Frame { ptr });
         }
-        let lru_replacer = Arc::new(CacheEvict::new(num_frames as usize));
-        let loading_page =
-
DashSet::with_capacity_and_hasher(num_frames as usize, FxBuildHasher::default()); + let replacer = ClockReplacer::new(num_frames as usize); // Initialize io_uring with queue depth base on num_frames let queue_depth = num_frames.min(2048) as u32; @@ -167,9 +168,10 @@ impl PageManager { file_len, frames: Arc::new(frames), page_table, - original_free_frame_idx: AtomicU32::new(0), - lru_replacer, - loading_page, + // original_free_frame_idx: AtomicU32::new(0), + // lru_replacer, + // loading_page, + replacer: Arc::new(replacer), io_uring: Arc::new(RwLock::new(io_uring)), tx_job, @@ -299,7 +301,7 @@ impl PageManager { // Check if page is already in the cache if let Some(frame_id) = self.page_table.get(&page_id) { let frame = &self.frames[frame_id.0 as usize]; - self.lru_replacer.touch(page_id).map_err(|_| PageError::EvictionPolicy)?; + self.replacer.pin(*frame_id); return unsafe { Page::from_ptr(page_id, frame.ptr, self) }; } diff --git a/src/page/manager/clock_replacer.rs b/src/page/manager/clock_replacer.rs index 2bac71ff..27f484a0 100644 --- a/src/page/manager/clock_replacer.rs +++ b/src/page/manager/clock_replacer.rs @@ -11,7 +11,7 @@ struct FrameState { pin_count: AtomicU8, } -struct ClockReplacer { +pub struct ClockReplacer { frames: Vec, hand: Mutex, } From 366abfa992a2d54ea82eafca1131ec2059f475b1 Mon Sep 17 00:00:00 2001 From: nqd Date: Sat, 22 Nov 2025 06:45:02 +0000 Subject: [PATCH 32/65] wip --- src/page/manager/buffer_pool.rs | 30 ++++++++----------------- src/page/manager/clock_replacer.rs | 36 ++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 21 deletions(-) diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs index 74344ee6..a6204bfc 100644 --- a/src/page/manager/buffer_pool.rs +++ b/src/page/manager/buffer_pool.rs @@ -22,7 +22,9 @@ use parking_lot::{Mutex, RwLock}; use crate::{ page::{ - Page, PageError, PageId, PageManagerOptions, PageMut, manager::{cache_evict::CacheEvict, clock_replacer::ClockReplacer}, state::{PageState, RawPageState} + manager::{cache_evict::CacheEvict, clock_replacer::ClockReplacer}, + state::{PageState, RawPageState}, + Page, PageError, PageId, PageManagerOptions, PageMut, }, snapshot::SnapshotId, }; @@ -308,7 +310,7 @@ impl PageManager { // Otherwise, need to load the page from disk if self.loading_page.insert(page_id) { // This thread is the first to load this page - let frame_id = self.get_free_frame().ok_or(PageError::OutOfMemory)?; + let frame_id = self.replacer.victim_and_pin().ok_or(PageError::OutOfMemory)?; let buf: *mut [u8; Page::SIZE] = self.frames[frame_id.0 as usize].ptr; unsafe { self.file @@ -317,7 +319,6 @@ impl PageManager { .map_err(PageError::IO)?; } self.page_table.insert(page_id, frame_id); - self.lru_replacer.pin_read(page_id).map_err(|_| PageError::EvictionPolicy)?; self.loading_page.remove(&page_id); return unsafe { Page::from_ptr(page_id, buf, self) }; } @@ -338,16 +339,14 @@ impl PageManager { loop { // Check if page is already in the cache if let Some(frame_id) = self.page_table.get(&page_id) { - self.lru_replacer - .pin_write_update_page(*frame_id, page_id) - .map_err(|_| PageError::EvictionPolicy)?; let frame = &self.frames[frame_id.0 as usize]; + self.replacer.pin(*frame_id); return unsafe { PageMut::from_ptr(page_id, snapshot_id, frame.ptr, self) }; } // Otherwise, need to load the page from disk if self.loading_page.insert(page_id) { // This thread is the first to load this page - let frame_id = self.get_free_frame().ok_or(PageError::OutOfMemory)?; + let frame_id = 
self.replacer.victim_and_pin().ok_or(PageError::OutOfMemory)?; let buf: *mut [u8; Page::SIZE] = self.frames[frame_id.0 as usize].ptr; unsafe { self.file @@ -356,16 +355,11 @@ impl PageManager { .map_err(PageError::IO)?; } self.page_table.insert(page_id, frame_id); - self.lru_replacer - .pin_write_update_page(frame_id, page_id) - .map_err(|_| PageError::EvictionPolicy)?; self.loading_page.remove(&page_id); return unsafe { PageMut::from_ptr(page_id, snapshot_id, buf, self) }; - } else { - // Another thread is already loading this page, spin/yield and retry - std::thread::yield_now(); - continue; } + // Another thread is already loading this page, spin/yield and retry + std::thread::yield_now(); } } @@ -373,16 +367,10 @@ impl PageManager { /// /// Returns an error if the buffer pool is full. pub fn allocate(&self, snapshot_id: SnapshotId) -> Result, PageError> { - let frame_id = self.get_free_frame().ok_or(PageError::OutOfMemory)?; + let frame_id = self.replacer.victim_and_pin().ok_or(PageError::OutOfMemory)?; let (page_id, new_count) = self.next_page_id().ok_or(PageError::PageLimitReached)?; - self.grow_if_needed(new_count as u64 * Page::SIZE as u64)?; - self.page_table.insert(page_id, frame_id); - self.lru_replacer - .pin_write_new_page(frame_id, page_id) - .map_err(|_| PageError::EvictionPolicy)?; - let data = self.frames[frame_id.0 as usize].ptr; unsafe { PageMut::acquire_unchecked(page_id, snapshot_id, data, self) } } diff --git a/src/page/manager/clock_replacer.rs b/src/page/manager/clock_replacer.rs index 27f484a0..50e4d615 100644 --- a/src/page/manager/clock_replacer.rs +++ b/src/page/manager/clock_replacer.rs @@ -27,6 +27,7 @@ impl ClockReplacer { ClockReplacer { frames, hand: Mutex::new(0) } } + #[inline] pub fn pin(&self, frame_id: FrameId) { // Safety check if frame_id.as_usize() >= self.frames.len() { @@ -79,6 +80,41 @@ impl ClockReplacer { None } + // Find a frame to evict and pin it + pub fn victim_and_pin(&self) -> Option { + let mut hand = self.hand.lock(); + let num_frames = self.frames.len(); + + for _ in 0..(num_frames * 3) { + let current_idx = *hand; + let frame = &self.frames[current_idx]; + + // Move hand forward for next iteration + *hand = (*hand + 1) % num_frames; + + let current_pins = frame.pin_count.load(Ordering::SeqCst); + if current_pins > 0 { + // This page is being used. Cannot evict. Skip it. + continue; + } + // Check reference bit: swap atomically returns old value and sets to false + if frame.ref_bit.swap(false, Ordering::SeqCst) { + // Had a second chance (was true, now set to false) + continue; + } + + // Pin the frame + let frame = &self.frames[current_idx]; + frame.pin_count.fetch_add(1, Ordering::SeqCst); + frame.ref_bit.store(true, Ordering::SeqCst); + return Some(FrameId::from_usize(current_idx)); + } + + // If get here, literally every single frame is Pinned. The buffer pool is exhausted. 
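+        // Callers turn this None into PageError::OutOfMemory; frames only become
+        // evictable again once a sync or write-back unpins them.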
+ None + + } + #[cfg(test)] fn count_pinned(&self) -> usize { self.frames.iter().filter(|f| f.pin_count.load(Ordering::SeqCst) > 0).count() From ea9d155ea2ab6c5485c2ddebaf7ba6f236fc5665 Mon Sep 17 00:00:00 2001 From: nqd Date: Sat, 22 Nov 2025 13:56:15 +0000 Subject: [PATCH 33/65] wip --- src/page/manager/buffer_pool.rs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs index a6204bfc..01ce64ed 100644 --- a/src/page/manager/buffer_pool.rs +++ b/src/page/manager/buffer_pool.rs @@ -77,6 +77,8 @@ pub struct PageManager { // lru_replacer: Arc, /* the replacer to find unpinned/candidate pages for eviction */ // loading_page: FxSet, /* set of pages that are being loaded from disk */ replacer: Arc, + updated_pages: FxMap, + new_pages: Mutex>, io_uring: Arc>, tx_job: Sender, @@ -173,7 +175,10 @@ impl PageManager { // original_free_frame_idx: AtomicU32::new(0), // lru_replacer, // loading_page, + replacer: Arc::new(replacer), + updated_pages: DashMap::with_hasher(FxBuildHasher::default()), + new_pages: Mutex::new(Vec::new()), io_uring: Arc::new(RwLock::new(io_uring)), tx_job, @@ -341,6 +346,8 @@ impl PageManager { if let Some(frame_id) = self.page_table.get(&page_id) { let frame = &self.frames[frame_id.0 as usize]; self.replacer.pin(*frame_id); + self.add_updated_page(page_id, frame_id); + return unsafe { PageMut::from_ptr(page_id, snapshot_id, frame.ptr, self) }; } // Otherwise, need to load the page from disk @@ -363,12 +370,24 @@ impl PageManager { } } + #[inline] + fn add_updated_page(&self, page_id: PageId, frame_id: FrameId) { + if let Some((_, first_page_id)) = self.new_pages.lock().first() { + if page_id.as_u32() < first_page_id.as_u32() { + self.updated_pages.insert(page_id, frame_id); + } + } else { + self.updated_pages.insert(page_id, frame_id); + } + } + /// Adds a new page to the buffer pool. /// /// Returns an error if the buffer pool is full. 
pub fn allocate(&self, snapshot_id: SnapshotId) -> Result, PageError> { let frame_id = self.replacer.victim_and_pin().ok_or(PageError::OutOfMemory)?; let (page_id, new_count) = self.next_page_id().ok_or(PageError::PageLimitReached)?; + self.new_pages.lock().push((frame_id, page_id)); self.grow_if_needed(new_count as u64 * Page::SIZE as u64)?; self.page_table.insert(page_id, frame_id); let data = self.frames[frame_id.0 as usize].ptr; From b859aff32147f9ed2afa9765deb031125a531b38 Mon Sep 17 00:00:00 2001 From: nqd Date: Sun, 23 Nov 2025 06:17:50 +0000 Subject: [PATCH 34/65] use new replacer --- src/page/manager/buffer_pool.rs | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs index 01ce64ed..71e75440 100644 --- a/src/page/manager/buffer_pool.rs +++ b/src/page/manager/buffer_pool.rs @@ -346,7 +346,7 @@ impl PageManager { if let Some(frame_id) = self.page_table.get(&page_id) { let frame = &self.frames[frame_id.0 as usize]; self.replacer.pin(*frame_id); - self.add_updated_page(page_id, frame_id); + self.add_updated_page(page_id, *frame_id); return unsafe { PageMut::from_ptr(page_id, snapshot_id, frame.ptr, self) }; } @@ -370,7 +370,7 @@ impl PageManager { } } - #[inline] + #[inline(always)] fn add_updated_page(&self, page_id: PageId, frame_id: FrameId) { if let Some((_, first_page_id)) = self.new_pages.lock().first() { if page_id.as_u32() < first_page_id.as_u32() { @@ -424,8 +424,8 @@ impl PageManager { let file = self.file.read(); let fd = file.as_raw_fd(); - let mut new_pages = self.lru_replacer.new_frames.lock(); - if new_pages.is_empty() && self.lru_replacer.update_frames.is_empty() { + let mut new_pages = self.new_pages.lock(); + if new_pages.is_empty() && self.updated_pages.is_empty() { return Ok(()); } @@ -478,7 +478,7 @@ impl PageManager { }; // Write update_pages individually (they may not be contiguous) - for entry in self.lru_replacer.update_frames.iter() { + for entry in self.updated_pages.iter() { let frame_id = *entry.value(); let page_id = *entry.key(); let frame = &self.frames[frame_id.0 as usize]; @@ -517,14 +517,12 @@ impl PageManager { // println!("\tupdate_pages: {:?}", self.lru_replacer.update_frames.len()); // println!("\tnew_pages: {:?}", new_pages.len()); - new_pages.iter().for_each(|(_, page_id)| self.lru_replacer.unpin(*page_id).unwrap()); + new_pages.iter().for_each(|(frame_id, _)| self.replacer.unpin(*frame_id)); new_pages.clear(); - // TODO: is there any race condition here? 
- self.lru_replacer - .update_frames - .iter() - .for_each(|entry| self.lru_replacer.unpin(*entry.key()).unwrap()); - self.lru_replacer.update_frames.clear(); + + self.updated_pages.iter().for_each(|entry| self.replacer.unpin(*entry.key)); + self.updated_pages.clear(); + self.drop_pages.lock().clear(); // Wait for all jobs to complete @@ -547,8 +545,6 @@ impl PageManager { drop(ring_guard); drop(file); - // println!("sync, new_pages: {:?}", new_pages); - // println!("sync, update_pages: {:?}", update_pages); self.file.write().flush()?; Ok(()) From cb1969d0c9d81edcc79e635104e24607fa5ca6ef Mon Sep 17 00:00:00 2001 From: nqd Date: Sun, 23 Nov 2025 07:02:06 +0000 Subject: [PATCH 35/65] load page atomically --- src/page/manager/buffer_pool.rs | 126 ++++++++++++-------------------- 1 file changed, 46 insertions(+), 80 deletions(-) diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs index 71e75440..58a82226 100644 --- a/src/page/manager/buffer_pool.rs +++ b/src/page/manager/buffer_pool.rs @@ -72,10 +72,6 @@ pub struct PageManager { * fix num_frames size */ page_table: FxMap, /* mapping between page id and buffer pool frames, * indexed by page id with fix num_frames size */ - - // original_free_frame_idx: AtomicU32, - // lru_replacer: Arc, /* the replacer to find unpinned/candidate pages for eviction */ - // loading_page: FxSet, /* set of pages that are being loaded from disk */ replacer: Arc, updated_pages: FxMap, new_pages: Mutex>, @@ -175,7 +171,6 @@ impl PageManager { // original_free_frame_idx: AtomicU32::new(0), // lru_replacer, // loading_page, - replacer: Arc::new(replacer), updated_pages: DashMap::with_hasher(FxBuildHasher::default()), new_pages: Mutex::new(Vec::new()), @@ -304,32 +299,20 @@ impl PageManager { if page_id > self.page_count.load(Ordering::Relaxed) { return Err(PageError::PageNotFound(page_id)); } - loop { - // Check if page is already in the cache - if let Some(frame_id) = self.page_table.get(&page_id) { - let frame = &self.frames[frame_id.0 as usize]; - self.replacer.pin(*frame_id); - return unsafe { Page::from_ptr(page_id, frame.ptr, self) }; - } - // Otherwise, need to load the page from disk - if self.loading_page.insert(page_id) { - // This thread is the first to load this page - let frame_id = self.replacer.victim_and_pin().ok_or(PageError::OutOfMemory)?; - let buf: *mut [u8; Page::SIZE] = self.frames[frame_id.0 as usize].ptr; - unsafe { - self.file - .read() - .read_exact_at(&mut *buf, page_id.as_offset() as u64) - .map_err(PageError::IO)?; - } - self.page_table.insert(page_id, frame_id); - self.loading_page.remove(&page_id); - return unsafe { Page::from_ptr(page_id, buf, self) }; + // Atomically get or load the page - load_page_from_disk is called only once per page_id + let frame_id = match self.page_table.entry(page_id) { + dashmap::mapref::entry::Entry::Occupied(entry) => entry.get().clone(), + dashmap::mapref::entry::Entry::Vacant(entry) => { + let frame_id = self.load_page_from_disk(page_id)?; + entry.insert(frame_id).clone() } - // Another thread is already loading this page, spin/yield and retry - std::thread::yield_now(); - } + }; + + // frame_id could be pinned 2 times when loaded from disk, but still be good since pin is a bool value + self.replacer.pin(frame_id); + let frame = &self.frames[frame_id.0 as usize]; + unsafe { Page::from_ptr(page_id, frame.ptr, self) } } /// Retrieves a mutable page from the buffer pool. 
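
The entry() match above is what makes the load atomic: dashmap's entry() holds the shard's write lock for that key while the Vacant arm runs, so load_page_from_disk executes at most once per page_id. A standalone sketch of the same get-or-load shape (the types and function name here are illustrative, not this crate's API):

    use std::io;

    use dashmap::{mapref::entry::Entry, DashMap};

    /// Illustrative get-or-load: `load` runs at most once per key because the
    /// Vacant arm executes while the shard write lock for `key` is held.
    fn get_or_load(
        cache: &DashMap<u32, Vec<u8>>,
        key: u32,
        load: impl FnOnce(u32) -> io::Result<Vec<u8>>,
    ) -> io::Result<Vec<u8>> {
        match cache.entry(key) {
            Entry::Occupied(e) => Ok(e.get().clone()),
            Entry::Vacant(e) => {
                // Other threads asking for `key` block here until the insert below.
                let value = load(key)?;
                Ok(e.insert(value).clone())
            }
        }
    }

One caveat: because entry() keeps the shard locked, calling back into the same map before the guard is dropped (as later revisions do when they remove the evicted page's old mapping) can deadlock if both keys hash to the same shard.
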
@@ -341,33 +324,38 @@ impl PageManager { if page_id > self.page_count.load(Ordering::Relaxed) { return Err(PageError::PageNotFound(page_id)); } - loop { - // Check if page is already in the cache - if let Some(frame_id) = self.page_table.get(&page_id) { - let frame = &self.frames[frame_id.0 as usize]; - self.replacer.pin(*frame_id); - self.add_updated_page(page_id, *frame_id); - - return unsafe { PageMut::from_ptr(page_id, snapshot_id, frame.ptr, self) }; - } - // Otherwise, need to load the page from disk - if self.loading_page.insert(page_id) { - // This thread is the first to load this page - let frame_id = self.replacer.victim_and_pin().ok_or(PageError::OutOfMemory)?; - let buf: *mut [u8; Page::SIZE] = self.frames[frame_id.0 as usize].ptr; - unsafe { - self.file - .read() - .read_exact_at(&mut *buf, page_id.as_offset() as u64) - .map_err(PageError::IO)?; - } - self.page_table.insert(page_id, frame_id); - self.loading_page.remove(&page_id); - return unsafe { PageMut::from_ptr(page_id, snapshot_id, buf, self) }; + + // Atomically get or load the page - load_page_from_disk is called only once per page_id + let frame_id = match self.page_table.entry(page_id) { + dashmap::mapref::entry::Entry::Occupied(entry) => entry.get().clone(), + dashmap::mapref::entry::Entry::Vacant(entry) => { + let frame_id = self.load_page_from_disk(page_id)?; + entry.insert(frame_id).clone() } - // Another thread is already loading this page, spin/yield and retry - std::thread::yield_now(); + }; + + self.add_updated_page(page_id, frame_id); + // frame_id could be pinned 2 times when loaded from disk, but still be good since pin is a bool value + self.replacer.pin(frame_id); + let frame = &self.frames[frame_id.0 as usize]; + unsafe { PageMut::from_ptr(page_id, snapshot_id, frame.ptr, self) } + } + + /// Loads a page from disk atomically. This method is called only once per page_id + /// by the entry().or_insert_with() pattern in get() and get_mut(). + fn load_page_from_disk(&self, page_id: PageId) -> Result { + let frame_id = self + .replacer + .victim_and_pin() + .ok_or(PageError::OutOfMemory)?; + let buf: *mut [u8; Page::SIZE] = self.frames[frame_id.0 as usize].ptr; + unsafe { + self.file + .read() + .read_exact_at(&mut *buf, page_id.as_offset() as u64) + .map_err(PageError::IO)?; } + Ok(frame_id) } #[inline(always)] @@ -568,12 +556,14 @@ impl PageManager { #[inline] pub fn drop_page(&self, page_id: PageId) { - self.lru_replacer.unpin(page_id).unwrap(); + if let Some(frame_id) = self.page_table.get(&page_id) { + self.replacer.unpin(*frame_id); + } } #[inline] pub fn drop_page_mut(&self, page_id: PageId) { - if self.lru_replacer.update_frames.get(&page_id).is_some() { + if self.updated_pages.get(&page_id).is_some() { let mut drop_pages = self.drop_pages.lock(); drop_pages.push(page_id); if drop_pages.len() >= 10 { @@ -607,30 +597,6 @@ impl PageManager { } } - fn get_free_frame(&self) -> Option { - let mut original_free_frame_idx = self.original_free_frame_idx.load(Ordering::Relaxed); - loop { - if original_free_frame_idx < self.num_frames { - match self.original_free_frame_idx.compare_exchange_weak( - original_free_frame_idx, - original_free_frame_idx + 1, - Ordering::Relaxed, - Ordering::Relaxed, - ) { - Ok(_) => return Some(FrameId(original_free_frame_idx)), - Err(val) => original_free_frame_idx = val, /* Another thread modified original_free_frame_idx, retry. 
*/ - } - } else { - let evicted_page = self.lru_replacer.evict(); - if let Some(page_id) = evicted_page { - return self.page_table.remove(&page_id).map(|(_, frame_id)| frame_id); - } else { - return None; - } - } - } - } - #[inline] fn grow_if_needed(&self, min_len: u64) -> Result<(), PageError> { if min_len <= self.file_len.load(Ordering::Relaxed) { From 274d0bc9a52a5ef1edd794fadfb76d94841edb3d Mon Sep 17 00:00:00 2001 From: nqd Date: Sun, 23 Nov 2025 09:25:42 +0000 Subject: [PATCH 36/65] fix race --- src/page/manager/buffer_pool.rs | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs index 58a82226..b1586e56 100644 --- a/src/page/manager/buffer_pool.rs +++ b/src/page/manager/buffer_pool.rs @@ -22,7 +22,7 @@ use parking_lot::{Mutex, RwLock}; use crate::{ page::{ - manager::{cache_evict::CacheEvict, clock_replacer::ClockReplacer}, + manager::clock_replacer::ClockReplacer, state::{PageState, RawPageState}, Page, PageError, PageId, PageManagerOptions, PageMut, }, @@ -32,6 +32,7 @@ use crate::{ #[derive(Debug, Clone)] struct Frame { ptr: *mut [u8; Page::SIZE], + page_id: Option, } // SAFETY: Frame contains a pointer to heap-allocated memory that we own exclusively. @@ -148,7 +149,7 @@ impl PageManager { for _ in 0..num_frames { let boxed_array = Box::new([0; Page::SIZE]); let ptr = Box::into_raw(boxed_array); - frames.push(Frame { ptr }); + frames.push(Frame { ptr, page_id: None }); } // let lru_replacer = Arc::new(CacheEvict::new(num_frames as usize)); // let loading_page = @@ -344,10 +345,17 @@ impl PageManager { /// Loads a page from disk atomically. This method is called only once per page_id /// by the entry().or_insert_with() pattern in get() and get_mut(). fn load_page_from_disk(&self, page_id: PageId) -> Result { - let frame_id = self - .replacer - .victim_and_pin() - .ok_or(PageError::OutOfMemory)?; + let frame_id = self.replacer.victim_and_pin().ok_or(PageError::OutOfMemory)?; + // Todo: Could have race condition here + // Remove the current pageid, frame_id from page_table. + let current_frame = self.frames.get(frame_id.as_usize()); + if let Some(current_frame) = current_frame { + if let Some(page_id) = current_frame.page_id { + self.page_table.remove(&page_id); + } + } + + self.frames[frame_id.0 as usize].page_id = Some(page_id); let buf: *mut [u8; Page::SIZE] = self.frames[frame_id.0 as usize].ptr; unsafe { self.file @@ -374,6 +382,15 @@ impl PageManager { /// Returns an error if the buffer pool is full. pub fn allocate(&self, snapshot_id: SnapshotId) -> Result, PageError> { let frame_id = self.replacer.victim_and_pin().ok_or(PageError::OutOfMemory)?; + // Todo: Could have race condition here + // Remove the current pageid, frame_id from page_table. 
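+        // The window: until the remove below completes, page_table still maps the
+        // evicted page to this frame, so a concurrent get() could pin it and read
+        // bytes that are being overwritten.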
+ let current_frame = self.frames.get(frame_id.as_usize()); + if let Some(current_frame) = current_frame { + if let Some(page_id) = current_frame.page_id { + self.page_table.remove(&page_id); + } + } + let (page_id, new_count) = self.next_page_id().ok_or(PageError::PageLimitReached)?; self.new_pages.lock().push((frame_id, page_id)); self.grow_if_needed(new_count as u64 * Page::SIZE as u64)?; From 0135df8b756a76222789b9579b1172b271ffa4ae Mon Sep 17 00:00:00 2001 From: nqd Date: Sun, 23 Nov 2025 13:06:12 +0000 Subject: [PATCH 37/65] cleanup old frame after evict --- src/page/manager/buffer_pool.rs | 51 +++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 19 deletions(-) diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs index b1586e56..06931854 100644 --- a/src/page/manager/buffer_pool.rs +++ b/src/page/manager/buffer_pool.rs @@ -29,10 +29,19 @@ use crate::{ snapshot::SnapshotId, }; -#[derive(Debug, Clone)] +#[derive(Debug)] struct Frame { ptr: *mut [u8; Page::SIZE], - page_id: Option, + page_id: AtomicU32, // 0 means None, otherwise it's the page_id + 1 +} + +impl Clone for Frame { + fn clone(&self) -> Self { + Frame { + ptr: self.ptr, + page_id: AtomicU32::new(self.page_id.load(Ordering::Acquire)), + } + } } // SAFETY: Frame contains a pointer to heap-allocated memory that we own exclusively. @@ -149,7 +158,7 @@ impl PageManager { for _ in 0..num_frames { let boxed_array = Box::new([0; Page::SIZE]); let ptr = Box::into_raw(boxed_array); - frames.push(Frame { ptr, page_id: None }); + frames.push(Frame { ptr, page_id: AtomicU32::new(0) }); } // let lru_replacer = Arc::new(CacheEvict::new(num_frames as usize)); // let loading_page = @@ -190,7 +199,7 @@ impl PageManager { let worker_file = self.file.write().try_clone().map_err(PageError::IO)?; let frames = self.frames.clone(); let io_uring = self.io_uring.clone(); - let lru_replacer = self.lru_replacer.clone(); + let updated_pages = self.updated_pages.clone(); thread::spawn(move || { loop { match rx_job.recv() { @@ -206,13 +215,12 @@ impl PageManager { panic!("{:?}", result); } // Note: it's possible that when a mut page get dropped, before it's wrote - // to the disk, the same page is used again as mut page. If the page_id is - // removed from update_frames while its data is being updated, we will lost - // the data. Thought in the current schema doesn't allow this, any further - // change needs to consider this. + // to the disk, the same page is used again as mut page. We rely on page_table + // to track which pages are currently in the buffer pool. pages.iter().for_each(|(page_id, _)| { - lru_replacer.update_frames.remove(page_id); + updated_pages.remove(page_id); }); + // TODO: unpin the frame? } Ok(WriteMessage::Shutdown) => { println!("Shutdown"); @@ -350,12 +358,15 @@ impl PageManager { // Remove the current pageid, frame_id from page_table. 
let current_frame = self.frames.get(frame_id.as_usize()); if let Some(current_frame) = current_frame { - if let Some(page_id) = current_frame.page_id { - self.page_table.remove(&page_id); + let stored_id = current_frame.page_id.load(Ordering::Acquire); + if stored_id != 0 { + if let Some(old_page_id) = PageId::new(stored_id - 1) { + self.page_table.remove(&old_page_id); + } } } - self.frames[frame_id.0 as usize].page_id = Some(page_id); + self.frames[frame_id.0 as usize].page_id.store(page_id.as_u32() + 1, Ordering::Release); let buf: *mut [u8; Page::SIZE] = self.frames[frame_id.0 as usize].ptr; unsafe { self.file @@ -386,8 +397,11 @@ impl PageManager { // Remove the current pageid, frame_id from page_table. let current_frame = self.frames.get(frame_id.as_usize()); if let Some(current_frame) = current_frame { - if let Some(page_id) = current_frame.page_id { - self.page_table.remove(&page_id); + let stored_id = current_frame.page_id.load(Ordering::Acquire); + if stored_id != 0 { + if let Some(old_page_id) = PageId::new(stored_id - 1) { + self.page_table.remove(&old_page_id); + } } } @@ -525,7 +539,7 @@ impl PageManager { new_pages.iter().for_each(|(frame_id, _)| self.replacer.unpin(*frame_id)); new_pages.clear(); - self.updated_pages.iter().for_each(|entry| self.replacer.unpin(*entry.key)); + self.updated_pages.iter().for_each(|entry| self.replacer.unpin(*entry.value())); self.updated_pages.clear(); self.drop_pages.lock().clear(); @@ -708,12 +722,11 @@ mod tests { drop(page); } - // Verify pages are in the cache, and are dirty after allocate + // Verify pages are in the cache for i in 1..=10 { let i = PageId::new(i).unwrap(); - let frame_id = m.page_table.get(&i).expect("page not in cache"); - let dirty_frames = m.lru_replacer.new_frames.lock(); - assert!(dirty_frames.iter().any(|x| x.0 == *frame_id && x.1 == i)); + let _frame_id = m.page_table.get(&i).expect("page not in cache"); + // Verify page exists in page_table } } From 4e8ce9cf8f91ca021eeb7ca1e047dcf3c8774938 Mon Sep 17 00:00:00 2001 From: nqd Date: Mon, 24 Nov 2025 14:00:49 +0000 Subject: [PATCH 38/65] update pool --- src/page/manager/buffer_pool.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs index 06931854..16f52faf 100644 --- a/src/page/manager/buffer_pool.rs +++ b/src/page/manager/buffer_pool.rs @@ -200,6 +200,7 @@ impl PageManager { let frames = self.frames.clone(); let io_uring = self.io_uring.clone(); let updated_pages = self.updated_pages.clone(); + let replacer = self.replacer.clone(); thread::spawn(move || { loop { match rx_job.recv() { @@ -217,10 +218,10 @@ impl PageManager { // Note: it's possible that when a mut page get dropped, before it's wrote // to the disk, the same page is used again as mut page. We rely on page_table // to track which pages are currently in the buffer pool. - pages.iter().for_each(|(page_id, _)| { + pages.iter().for_each(|(page_id, frame_id)| { updated_pages.remove(page_id); + replacer.unpin(*frame_id); }); - // TODO: unpin the frame? 
} Ok(WriteMessage::Shutdown) => { println!("Shutdown"); @@ -366,7 +367,7 @@ impl PageManager { } } - self.frames[frame_id.0 as usize].page_id.store(page_id.as_u32() + 1, Ordering::Release); + self.frames[frame_id.0 as usize].page_id.store(page_id.as_u32() + 1, Ordering::Relaxed); let buf: *mut [u8; Page::SIZE] = self.frames[frame_id.0 as usize].ptr; unsafe { self.file @@ -409,6 +410,7 @@ impl PageManager { self.new_pages.lock().push((frame_id, page_id)); self.grow_if_needed(new_count as u64 * Page::SIZE as u64)?; self.page_table.insert(page_id, frame_id); + self.frames[frame_id.0 as usize].page_id.store(page_id.as_u32() + 1, Ordering::Relaxed); let data = self.frames[frame_id.0 as usize].ptr; unsafe { PageMut::acquire_unchecked(page_id, snapshot_id, data, self) } } From 47d3d96790f6ab6b4a9311ba99c9ec891e19e2c1 Mon Sep 17 00:00:00 2001 From: nqd Date: Mon, 24 Nov 2025 14:11:20 +0000 Subject: [PATCH 39/65] clock replacer use pin as bool val --- src/page/manager/clock_replacer.rs | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/src/page/manager/clock_replacer.rs b/src/page/manager/clock_replacer.rs index 50e4d615..56df7091 100644 --- a/src/page/manager/clock_replacer.rs +++ b/src/page/manager/clock_replacer.rs @@ -7,8 +7,8 @@ use crate::page::manager::buffer_pool::FrameId; struct FrameState { // The Second Chance bit ref_bit: AtomicBool, - // If > 0, this frame cannot be evicted - pin_count: AtomicU8, + // If true, this frame cannot be evicted + pin: AtomicBool, } pub struct ClockReplacer { @@ -20,8 +20,10 @@ impl ClockReplacer { pub fn new(num_frames: usize) -> Self { let mut frames = Vec::with_capacity(num_frames); for _ in 0..num_frames { - frames - .push(FrameState { ref_bit: AtomicBool::new(false), pin_count: AtomicU8::new(0) }); + frames.push(FrameState { + ref_bit: AtomicBool::new(false), + pin: AtomicBool::new(false), + }); } ClockReplacer { frames, hand: Mutex::new(0) } @@ -36,7 +38,7 @@ impl ClockReplacer { let frame = &self.frames[frame_id.as_usize()]; // First increment pin count - frame.pin_count.fetch_add(1, Ordering::SeqCst); + frame.pin.store(true, Ordering::SeqCst); // Then set usage bit to true (give it a second chance) frame.ref_bit.store(true, Ordering::SeqCst); } @@ -47,7 +49,7 @@ impl ClockReplacer { } let frame = &self.frames[frame_id.as_usize()]; - frame.pin_count.fetch_sub(1, Ordering::SeqCst); + frame.pin.store(false, Ordering::SeqCst); } // Find a frame to evict @@ -62,8 +64,8 @@ impl ClockReplacer { // Move hand forward for next iteration *hand = (*hand + 1) % num_frames; - let current_pins = frame.pin_count.load(Ordering::SeqCst); - if current_pins > 0 { + let current_pins = frame.pin.load(Ordering::SeqCst); + if current_pins { // This page is being used. Cannot evict. Skip it. continue; } @@ -92,8 +94,8 @@ impl ClockReplacer { // Move hand forward for next iteration *hand = (*hand + 1) % num_frames; - let current_pins = frame.pin_count.load(Ordering::SeqCst); - if current_pins > 0 { + let current_pins = frame.pin.load(Ordering::SeqCst); + if current_pins { // This page is being used. Cannot evict. Skip it. continue; } @@ -105,19 +107,18 @@ impl ClockReplacer { // Pin the frame let frame = &self.frames[current_idx]; - frame.pin_count.fetch_add(1, Ordering::SeqCst); + frame.pin.store(true, Ordering::SeqCst); frame.ref_bit.store(true, Ordering::SeqCst); return Some(FrameId::from_usize(current_idx)); } // If get here, literally every single frame is Pinned. The buffer pool is exhausted. 
None - } #[cfg(test)] fn count_pinned(&self) -> usize { - self.frames.iter().filter(|f| f.pin_count.load(Ordering::SeqCst) > 0).count() + self.frames.iter().filter(|f| f.pin.load(Ordering::SeqCst)).count() } } @@ -149,7 +150,7 @@ mod tests { assert_eq!(r.count_pinned(), 2); r.unpin(FrameId::from_usize(1)); - assert_eq!(r.count_pinned(), 2); + assert_eq!(r.count_pinned(), 1); r.unpin(FrameId::from_usize(1)); assert_eq!(r.count_pinned(), 1); } From 6872fd98a064c28d6729ac03e8f6e0e745faf450 Mon Sep 17 00:00:00 2001 From: nqd Date: Mon, 24 Nov 2025 14:44:10 +0000 Subject: [PATCH 40/65] cleanup --- src/page/manager/buffer_pool.rs | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs index 16f52faf..70851a16 100644 --- a/src/page/manager/buffer_pool.rs +++ b/src/page/manager/buffer_pool.rs @@ -32,15 +32,12 @@ use crate::{ #[derive(Debug)] struct Frame { ptr: *mut [u8; Page::SIZE], - page_id: AtomicU32, // 0 means None, otherwise it's the page_id + 1 + page_id: AtomicU32, // 0 means None, otherwise it's the page_id } impl Clone for Frame { fn clone(&self) -> Self { - Frame { - ptr: self.ptr, - page_id: AtomicU32::new(self.page_id.load(Ordering::Acquire)), - } + Frame { ptr: self.ptr, page_id: AtomicU32::new(self.page_id.load(Ordering::Acquire)) } } } @@ -361,13 +358,13 @@ impl PageManager { if let Some(current_frame) = current_frame { let stored_id = current_frame.page_id.load(Ordering::Acquire); if stored_id != 0 { - if let Some(old_page_id) = PageId::new(stored_id - 1) { + if let Some(old_page_id) = PageId::new(stored_id) { self.page_table.remove(&old_page_id); } } } - self.frames[frame_id.0 as usize].page_id.store(page_id.as_u32() + 1, Ordering::Relaxed); + self.frames[frame_id.0 as usize].page_id.store(page_id.as_u32(), Ordering::Relaxed); let buf: *mut [u8; Page::SIZE] = self.frames[frame_id.0 as usize].ptr; unsafe { self.file @@ -400,7 +397,7 @@ impl PageManager { if let Some(current_frame) = current_frame { let stored_id = current_frame.page_id.load(Ordering::Acquire); if stored_id != 0 { - if let Some(old_page_id) = PageId::new(stored_id - 1) { + if let Some(old_page_id) = PageId::new(stored_id) { self.page_table.remove(&old_page_id); } } @@ -410,7 +407,7 @@ impl PageManager { self.new_pages.lock().push((frame_id, page_id)); self.grow_if_needed(new_count as u64 * Page::SIZE as u64)?; self.page_table.insert(page_id, frame_id); - self.frames[frame_id.0 as usize].page_id.store(page_id.as_u32() + 1, Ordering::Relaxed); + self.frames[frame_id.0 as usize].page_id.store(page_id.as_u32(), Ordering::Relaxed); let data = self.frames[frame_id.0 as usize].ptr; unsafe { PageMut::acquire_unchecked(page_id, snapshot_id, data, self) } } @@ -717,18 +714,20 @@ mod tests { let err = m.get(i).unwrap_err(); assert!(matches!(err, PageError::PageNotFound(page_id) if page_id == i)); - let page = m.allocate(snapshot).unwrap(); + let mut page = m.allocate(snapshot).unwrap(); assert_eq!(page.id(), i); assert_eq!(page.contents(), &mut [0; Page::DATA_SIZE]); assert_eq!(page.snapshot_id(), snapshot); + page.contents_mut().iter_mut().for_each(|byte| *byte = 0x11); drop(page); } // Verify pages are in the cache for i in 1..=10 { - let i = PageId::new(i).unwrap(); - let _frame_id = m.page_table.get(&i).expect("page not in cache"); - // Verify page exists in page_table + let page_id = PageId::new(i).unwrap(); + let frame_id = m.page_table.get(&page_id).expect("page not in cache"); + let frame = 
m.frames.get(frame_id.as_usize()).unwrap();
+            assert_eq!(frame.page_id.load(Ordering::Relaxed), page_id.as_u32());
         }
     }
 

From 9825b65c1fdbe4e44cec44e3dfc189240f2a9a1c Mon Sep 17 00:00:00 2001
From: nqd
Date: Tue, 25 Nov 2025 14:02:35 +0000
Subject: [PATCH 41/65] cleanup inside victim()

---
 src/page/manager/buffer_pool.rs    | 37 +++++++++++++++---------------
 src/page/manager/clock_replacer.rs | 15 +++++++-----
 2 files changed, 28 insertions(+), 24 deletions(-)

diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs
index 70851a16..41e287bb 100644
--- a/src/page/manager/buffer_pool.rs
+++ b/src/page/manager/buffer_pool.rs
@@ -348,12 +348,8 @@ impl PageManager {
         unsafe { PageMut::from_ptr(page_id, snapshot_id, frame.ptr, self) }
     }
 
-    /// Loads a page from disk atomically. This method is called only once per page_id
-    /// by the entry().or_insert_with() pattern in get() and get_mut().
-    fn load_page_from_disk(&self, page_id: PageId) -> Result<FrameId, PageError> {
-        let frame_id = self.replacer.victim_and_pin().ok_or(PageError::OutOfMemory)?;
-        // Todo: Could have race condition here
-        // Remove the current pageid, frame_id from page_table.
+    // Remove the current pageid, frame_id from page_table.
+    fn _cleanup_victim_page(&self, frame_id: FrameId) {
         let current_frame = self.frames.get(frame_id.as_usize());
         if let Some(current_frame) = current_frame {
             let stored_id = current_frame.page_id.load(Ordering::Acquire);
@@ -363,6 +359,17 @@ impl PageManager {
             }
         }
     }
+
+    /// Loads a page from disk atomically. This method is called only once per page_id
+    /// by the entry().or_insert_with() pattern in get() and get_mut().
+    fn load_page_from_disk(&self, page_id: PageId) -> Result<FrameId, PageError> {
+        let frame_id = self
+            .replacer
+            .victim_and_pin(|fid| {
+                self._cleanup_victim_page(fid);
+            })
+            .ok_or(PageError::OutOfMemory)?;
 
         self.frames[frame_id.0 as usize].page_id.store(page_id.as_u32(), Ordering::Relaxed);
         let buf: *mut [u8; Page::SIZE] = self.frames[frame_id.0 as usize].ptr;
@@ -390,18 +397,12 @@ impl PageManager {
     ///
     /// Returns an error if the buffer pool is full.
     pub fn allocate(&self, snapshot_id: SnapshotId) -> Result<PageMut<'_>, PageError> {
-        let frame_id = self.replacer.victim_and_pin().ok_or(PageError::OutOfMemory)?;
-        // Todo: Could have race condition here
-        // Remove the current pageid, frame_id from page_table.
-        let current_frame = self.frames.get(frame_id.as_usize());
-        if let Some(current_frame) = current_frame {
-            let stored_id = current_frame.page_id.load(Ordering::Acquire);
-            if stored_id != 0 {
-                if let Some(old_page_id) = PageId::new(stored_id) {
-                    self.page_table.remove(&old_page_id);
-                }
-            }
-        }
+        let frame_id = self
+            .replacer
+            .victim_and_pin(|fid| {
+                self._cleanup_victim_page(fid);
+            })
+            .ok_or(PageError::OutOfMemory)?;
 
         let (page_id, new_count) = self.next_page_id().ok_or(PageError::PageLimitReached)?;
         self.new_pages.lock().push((frame_id, page_id));
diff --git a/src/page/manager/clock_replacer.rs b/src/page/manager/clock_replacer.rs
index 56df7091..ac3af4f2 100644
--- a/src/page/manager/clock_replacer.rs
+++ b/src/page/manager/clock_replacer.rs
@@ -20,10 +20,8 @@ impl ClockReplacer {
     pub fn new(num_frames: usize) -> Self {
         let mut frames = Vec::with_capacity(num_frames);
         for _ in 0..num_frames {
-            frames.push(FrameState {
-                ref_bit: AtomicBool::new(false),
-                pin: AtomicBool::new(false),
-            });
+            frames
+                .push(FrameState { ref_bit: AtomicBool::new(false), pin: AtomicBool::new(false) });
         }
 
         ClockReplacer { frames, hand: Mutex::new(0) }
@@ -83,7 +81,10 @@ impl ClockReplacer {
     }
 
     // Find a frame to evict and pin it
-    pub fn victim_and_pin(&self) -> Option<FrameId> {
+    pub fn victim_and_pin<F>(&self, cleanup: F) -> Option<FrameId>
+    where
+        F: FnOnce(FrameId),
+    {
         let mut hand = self.hand.lock();
         let num_frames = self.frames.len();
 
@@ -109,7 +110,9 @@ impl ClockReplacer {
         let frame = &self.frames[current_idx];
         frame.pin.store(true, Ordering::SeqCst);
         frame.ref_bit.store(true, Ordering::SeqCst);
-        return Some(FrameId::from_usize(current_idx));
+        let frame_id = FrameId::from_usize(current_idx);
+        cleanup(frame_id);
+        return Some(frame_id);
     }
 
     // If get here, literally every single frame is Pinned. The buffer pool is exhausted.
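Note on the patch above: the closure argument added to victim_and_pin() is the heart of this change. The page_table cleanup now runs inside the replacer, after the victim frame has been pinned but before its id is returned to the caller, which closes the window that the removed "Todo: Could have race condition here" comment pointed at. The following stand-alone sketch illustrates the contract; the names and the pin-only frame state are illustrative only (the real ClockReplacer in this series also keeps a second-chance ref bit and uses parking_lot's Mutex):

    use std::sync::atomic::{AtomicBool, Ordering};
    use std::sync::Mutex;

    #[derive(Clone, Copy, Debug, PartialEq)]
    struct FrameId(usize);

    // Toy replacer: one pin flag per frame, sweeps serialized by a hand lock.
    struct ToyReplacer {
        pins: Vec<AtomicBool>,
        hand: Mutex<usize>,
    }

    impl ToyReplacer {
        fn new(n: usize) -> Self {
            Self { pins: (0..n).map(|_| AtomicBool::new(false)).collect(), hand: Mutex::new(0) }
        }

        // Sweep for an unpinned frame; pin it, then run `cleanup` before the
        // frame id escapes, so the caller's bookkeeping (here: dropping the
        // victim's stale page_table entry) happens before the frame can be
        // handed out again by a concurrent sweep.
        fn victim_and_pin<F: FnOnce(FrameId)>(&self, cleanup: F) -> Option<FrameId> {
            let mut hand = self.hand.lock().unwrap();
            for _ in 0..self.pins.len() {
                let idx = *hand;
                *hand = (*hand + 1) % self.pins.len();
                // swap returns the previous value; false means the frame was free
                if !self.pins[idx].swap(true, Ordering::AcqRel) {
                    let id = FrameId(idx);
                    cleanup(id);
                    return Some(id);
                }
            }
            None // every frame is pinned: the pool is exhausted
        }
    }

    fn main() {
        let r = ToyReplacer::new(2);
        assert_eq!(r.victim_and_pin(|f| println!("cleanup {f:?}")), Some(FrameId(0)));
        assert_eq!(r.victim_and_pin(|f| println!("cleanup {f:?}")), Some(FrameId(1)));
        assert_eq!(r.victim_and_pin(|_| {}), None);
    }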
From 260726b706fe473128959f94b41b4ec87c65d9c5 Mon Sep 17 00:00:00 2001
From: nqd
Date: Wed, 26 Nov 2025 14:22:51 +0000
Subject: [PATCH 42/65] add log

---
 src/page/manager/buffer_pool.rs    |  7 +++----
 src/page/manager/clock_replacer.rs | 15 +++++++--------
 2 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs
index 41e287bb..4d16a39f 100644
--- a/src/page/manager/buffer_pool.rs
+++ b/src/page/manager/buffer_pool.rs
@@ -219,6 +219,7 @@ impl PageManager {
                                 updated_pages.remove(page_id);
                                 replacer.unpin(*frame_id);
                             });
+                            println!("\tworker, write {:?}", pages.len());
                         }
                         Ok(WriteMessage::Shutdown) => {
                             println!("Shutdown");
@@ -531,10 +532,7 @@ impl PageManager {
         ring_guard.submit()?;
 
         // Do cleanup work
-        // println!("sync");
-        // println!("\tdrop_pages: {:?}", self.lru_replacer.drop_pages.len());
-        // println!("\tupdate_pages: {:?}", self.lru_replacer.update_frames.len());
-        // println!("\tnew_pages: {:?}", new_pages.len());
+        println!("Sync, new_pages: {:?}, updated_pages: {:?}", new_pages.len(), self.updated_pages.len());
 
         new_pages.iter().for_each(|(frame_id, _)| self.replacer.unpin(*frame_id));
         new_pages.clear();
@@ -564,6 +562,7 @@ impl PageManager {
 
         drop(ring_guard);
         drop(file);
+
         self.file.write().flush()?;
 
         Ok(())
diff --git a/src/page/manager/clock_replacer.rs b/src/page/manager/clock_replacer.rs
index ac3af4f2..68863a94 100644
--- a/src/page/manager/clock_replacer.rs
+++ b/src/page/manager/clock_replacer.rs
@@ -1,4 +1,4 @@
-use std::sync::atomic::{AtomicBool, AtomicU8, Ordering};
+use std::sync::atomic::{AtomicBool, Ordering};
 
 use parking_lot::Mutex;
 
@@ -11,13 +11,13 @@ struct FrameState {
     pin: AtomicBool,
 }
 
-pub struct ClockReplacer {
+pub(crate) struct ClockReplacer {
     frames: Vec<FrameState>,
     hand: Mutex<usize>,
 }
 
 impl ClockReplacer {
-    pub fn new(num_frames: usize) -> Self {
+    pub(super) fn new(num_frames: usize) -> Self {
         let mut frames = Vec::with_capacity(num_frames);
         for _ in 0..num_frames {
             frames
@@ -27,8 +27,7 @@ impl ClockReplacer {
         ClockReplacer { frames, hand: Mutex::new(0) }
     }
 
-    #[inline]
-    pub fn pin(&self, frame_id: FrameId) {
+    pub(super) fn pin(&self, frame_id: FrameId) {
         // Safety check
         if frame_id.as_usize() >= self.frames.len() {
             return;
@@ -41,7 +40,7 @@ impl ClockReplacer {
         frame.ref_bit.store(true, Ordering::SeqCst);
     }
 
-    pub fn unpin(&self, frame_id: FrameId) {
+    pub(super) fn unpin(&self, frame_id: FrameId) {
         if frame_id.as_usize() >= self.frames.len() {
             return;
         }
@@ -51,7 +50,7 @@ impl ClockReplacer {
     }
 
     // Find a frame to evict
-    pub fn victim(&self) -> Option<FrameId> {
+    pub(super) fn victim(&self) -> Option<FrameId> {
         let mut hand = self.hand.lock();
         let num_frames = self.frames.len();
 

From ea6ea65a36130e8f670bc3fa798f12ea31bd3738 Mon Sep 17 00:00:00 2001
From: nqd
Date: Thu, 27 Nov 2025 08:54:09 +0000
Subject: [PATCH 43/65] use arc

---
 src/page/manager/buffer_pool.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs
index 4d16a39f..7e13d5f6 100644
--- a/src/page/manager/buffer_pool.rs
+++ b/src/page/manager/buffer_pool.rs
@@ -80,7 +80,7 @@ pub struct PageManager {
     page_table: FxMap<PageId, FrameId>, /* mapping between page id and buffer pool frames,
                         * indexed by page id with fix num_frames size */
     replacer: Arc<ClockReplacer>,
-    updated_pages: FxMap<PageId, FrameId>,
+
updated_pages: Arc<FxMap<PageId, FrameId>>,
     new_pages: Mutex<Vec<(FrameId, PageId)>>,
 
     io_uring: Arc<RwLock<IoUring>>,
@@ -179,7 +179,7 @@ impl PageManager {
             // lru_replacer,
             // loading_page,
             replacer: Arc::new(replacer),
-            updated_pages: DashMap::with_hasher(FxBuildHasher::default()),
+            updated_pages: Arc::new(DashMap::with_hasher(FxBuildHasher::default())),
             new_pages: Mutex::new(Vec::new()),
 
             io_uring: Arc::new(RwLock::new(io_uring)),

From 84f3553011d534db981d3f64842bd74168a43807 Mon Sep 17 00:00:00 2001
From: nqd
Date: Thu, 27 Nov 2025 14:29:16 +0000
Subject: [PATCH 44/65] cleanup

---
 src/page/manager/buffer_pool.rs    | 11 +++++------
 src/page/manager/clock_replacer.rs |  2 +-
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs
index 7e13d5f6..c7638a99 100644
--- a/src/page/manager/buffer_pool.rs
+++ b/src/page/manager/buffer_pool.rs
@@ -219,7 +219,6 @@ impl PageManager {
                                 updated_pages.remove(page_id);
                                 replacer.unpin(*frame_id);
                             });
-                            println!("\tworker, write {:?}", pages.len());
                         }
                         Ok(WriteMessage::Shutdown) => {
                             println!("Shutdown");
@@ -241,7 +240,6 @@ impl PageManager {
         pages: &[(PageId, FrameId)],
         file: File,
     ) -> io::Result<()> {
-        // println!("Write updated pages: {:?}", pages.len());
         let fd = file.as_raw_fd();
         let mut op_count = 0;
         let mut ring_guard = ring.write();
@@ -531,9 +529,6 @@ impl PageManager {
         // Submit all pending operations
         ring_guard.submit()?;
 
-        // Do cleanup work
-        println!("Sync, new_pages: {:?}, updated_pages: {:?}", new_pages.len(), self.updated_pages.len());
-
         new_pages.iter().for_each(|(frame_id, _)| self.replacer.unpin(*frame_id));
         new_pages.clear();
 
@@ -562,7 +557,6 @@ impl PageManager {
 
         drop(ring_guard);
         drop(file);
-
         self.file.write().flush()?;
 
         Ok(())
@@ -817,7 +811,10 @@ mod tests {
         let mut p = m.allocate(snapshot).expect("page allocation failed");
         p.contents_mut().iter_mut().for_each(|byte| *byte = 0xab);
         drop(p);
+        assert_eq!(m.replacer.count_pinned(), 1);
         m.sync().expect("sync failed");
+        assert_eq!(m.replacer.count_pinned(), 0);
+
         seek(&f, 0);
         assert_eq!(len(&f), 1024 * Page::SIZE);
         assert_eq!(read(&f, 8), snapshot.to_le_bytes());
@@ -828,7 +825,9 @@ mod tests {
             let mut p = m.allocate(snapshot + i as u64).expect("page allocation failed");
             p.contents_mut().iter_mut().for_each(|byte| *byte = 0xab ^ (i as u8));
         }
+        assert_eq!(m.replacer.count_pinned(), 255);
         m.sync().expect("sync failed");
+        assert_eq!(m.replacer.count_pinned(), 0);
 
         assert_eq!(len(&f), 1024 * Page::SIZE);
         for i in 1..=255 {
diff --git a/src/page/manager/clock_replacer.rs b/src/page/manager/clock_replacer.rs
index 68863a94..8e81bdac 100644
--- a/src/page/manager/clock_replacer.rs
+++ b/src/page/manager/clock_replacer.rs
@@ -119,7 +119,7 @@ impl ClockReplacer {
     }
 
     #[cfg(test)]
-    fn count_pinned(&self) -> usize {
+    pub(super) fn count_pinned(&self) -> usize {
         self.frames.iter().filter(|f| f.pin.load(Ordering::SeqCst)).count()
     }
 }

From 39a13b50a1dc2daa9426bf47018c14a631f11cfb Mon Sep 17 00:00:00 2001
From: nqd
Date: Sat, 29 Nov 2025 03:06:03 +0000
Subject: [PATCH 45/65] resolve deadlock

---
 src/page/manager/buffer_pool.rs | 55 +++++++++++++++++++--------------
 1 file changed, 31 insertions(+), 24 deletions(-)

diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs
index c7638a99..58c69ed1 100644
--- a/src/page/manager/buffer_pool.rs
+++ b/src/page/manager/buffer_pool.rs
@@ -82,6 +82,7 @@ pub struct PageManager {
     replacer: Arc<ClockReplacer>,
     updated_pages: Arc<FxMap<PageId, FrameId>>,
     new_pages: Mutex<Vec<(FrameId, PageId)>>,
+    loading_pages: FxSet<PageId>,
     io_uring: Arc<RwLock<IoUring>>,
 
     tx_job: Sender<WriteMessage>,
@@ -181,6 +182,7
replacer: Arc::new(replacer),
             updated_pages: Arc::new(DashMap::with_hasher(FxBuildHasher::default())),
             new_pages: Mutex::new(Vec::new()),
+            loading_pages: DashSet::with_hasher(FxBuildHasher::default()),
 
             io_uring: Arc::new(RwLock::new(io_uring)),
             tx_job,
@@ -306,17 +308,9 @@ impl PageManager {
             return Err(PageError::PageNotFound(page_id));
         }
 
-        // Atomically get or load the page - load_page_from_disk is called only once per page_id
-        let frame_id = match self.page_table.entry(page_id) {
-            dashmap::mapref::entry::Entry::Occupied(entry) => entry.get().clone(),
-            dashmap::mapref::entry::Entry::Vacant(entry) => {
-                let frame_id = self.load_page_from_disk(page_id)?;
-                entry.insert(frame_id).clone()
-            }
-        };
+        // Atomically get or load the page
+        let frame_id = self.select_frame_id(page_id)?;
 
-        // frame_id could be pinned 2 times when loaded from disk, but still be good since pin is a bool value
-        self.replacer.pin(frame_id);
         let frame = &self.frames[frame_id.0 as usize];
         unsafe { Page::from_ptr(page_id, frame.ptr, self) }
     }
@@ -331,24 +325,36 @@ impl PageManager {
             return Err(PageError::PageNotFound(page_id));
         }
 
-        // Atomically get or load the page - load_page_from_disk is called only once per page_id
-        let frame_id = match self.page_table.entry(page_id) {
-            dashmap::mapref::entry::Entry::Occupied(entry) => entry.get().clone(),
-            dashmap::mapref::entry::Entry::Vacant(entry) => {
-                let frame_id = self.load_page_from_disk(page_id)?;
-                entry.insert(frame_id).clone()
-            }
-        };
+        // Atomically get or load the page
+        let frame_id = self.select_frame_id(page_id)?;
         self.add_updated_page(page_id, frame_id);
 
-        // frame_id could be pinned 2 times when loaded from disk, but still be good since pin is a bool value
-        self.replacer.pin(frame_id);
         let frame = &self.frames[frame_id.0 as usize];
         unsafe { PageMut::from_ptr(page_id, snapshot_id, frame.ptr, self) }
     }
 
+    fn select_frame_id(&self, page_id: PageId) -> Result<FrameId, PageError> {
+        loop {
+            // Check if page is in the cache
+            if let Some(frame_id) = self.page_table.get(&page_id) {
+                self.replacer.pin(*frame_id);
+                return Ok(*frame_id);
+            }
+            // Otherwise, need to load page from disk
+            if self.loading_pages.insert(page_id) {
+                // This thread is the first to load the page
+                let frame_id = self.load_page_from_disk(page_id)?;
+                self.page_table.insert(page_id, frame_id);
+                self.loading_pages.remove(&page_id);
+                return Ok(frame_id);
+            }
+            // Another thread is already loading this page, spin/yield and retry
+            std::thread::yield_now();
+        }
+    }
+
     // Remove the current pageid, frame_id from page_table.
-    fn _cleanup_victim_page(&self, frame_id: FrameId) {
+    fn cleanup_victim_page(&self, frame_id: FrameId) {
         let current_frame = self.frames.get(frame_id.as_usize());
         if let Some(current_frame) = current_frame {
             let stored_id = current_frame.page_id.load(Ordering::Acquire);
@@ -361,12 +367,13 @@ impl PageManager {
         }
     }
 
     /// Loads a page from disk atomically. This method is called only once per page_id
-    /// by the entry().or_insert_with() pattern in get() and get_mut().
+    /// by the select_frame_id pattern in get() and get_mut().
+    /// The result FrameId get pinned after this function.
fn load_page_from_disk(&self, page_id: PageId) -> Result<FrameId, PageError> {
         let frame_id = self
             .replacer
             .victim_and_pin(|fid| {
-                self._cleanup_victim_page(fid);
+                self.cleanup_victim_page(fid);
             })
             .ok_or(PageError::OutOfMemory)?;
@@ -399,7 +406,7 @@ impl PageManager {
         let frame_id = self
             .replacer
             .victim_and_pin(|fid| {
-                self._cleanup_victim_page(fid);
+                self.cleanup_victim_page(fid);
             })
             .ok_or(PageError::OutOfMemory)?;

From cd01f536c7b019c41c2a0755bb404fd6c4c9fb75 Mon Sep 17 00:00:00 2001
From: nqd
Date: Sun, 30 Nov 2025 01:47:18 +0000
Subject: [PATCH 46/65] build for mmap

---
 src/page/manager.rs      | 2 +-
 src/page/manager/mmap.rs | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/page/manager.rs b/src/page/manager.rs
index bde800f2..f31f608f 100644
--- a/src/page/manager.rs
+++ b/src/page/manager.rs
@@ -4,7 +4,7 @@ use crate::page::PageId;
 pub(super) mod buffer_pool;
 #[cfg(feature = "buffer_pool_backend")]
 pub(super) mod cache_evict;
-// #[cfg(feature = "buffer_pool_backend")]
+#[cfg(feature = "buffer_pool_backend")]
 pub(super) mod clock_replacer;
 #[cfg(feature = "mmap_backend")]
 pub(super) mod mmap;
diff --git a/src/page/manager/mmap.rs b/src/page/manager/mmap.rs
index 4b829dde..0818f76f 100644
--- a/src/page/manager/mmap.rs
+++ b/src/page/manager/mmap.rs
@@ -120,6 +120,9 @@ impl PageManager {
     #[inline]
     pub fn drop_page(&self, _page_id: PageId) {}
 
+    #[inline]
+    pub fn drop_page_mut(&self, _page_id: PageId) {}
+
     /// Grows the size of the underlying file to make room for additional pages.
     ///
     /// This will increase the file size by a constant factor of 1024 pages, or a relative factor

From 25600bf445421e94984a1bf24d1ddbd0bd9afe2d Mon Sep 17 00:00:00 2001
From: nqd
Date: Sat, 6 Dec 2025 11:40:44 +0000
Subject: [PATCH 47/65] cleanup

---
 src/page/manager.rs             |  2 -
 src/page/manager/buffer_pool.rs |  9 +---
 src/page/manager/cache_evict.rs | 88 ---------------------------------
 3 files changed, 2 insertions(+), 97 deletions(-)
 delete mode 100644 src/page/manager/cache_evict.rs

diff --git a/src/page/manager.rs b/src/page/manager.rs
index f31f608f..a3511398 100644
--- a/src/page/manager.rs
+++ b/src/page/manager.rs
@@ -3,8 +3,6 @@ use crate::page::PageId;
 #[cfg(feature = "buffer_pool_backend")]
 pub(super) mod buffer_pool;
 #[cfg(feature = "buffer_pool_backend")]
-pub(super) mod cache_evict;
-#[cfg(feature = "buffer_pool_backend")]
 pub(super) mod clock_replacer;
 #[cfg(feature = "mmap_backend")]
 pub(super) mod mmap;
diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs
index 58c69ed1..3fe446ed 100644
--- a/src/page/manager/buffer_pool.rs
+++ b/src/page/manager/buffer_pool.rs
@@ -52,17 +52,12 @@ pub(crate) struct FrameId(u32);
 
 impl FrameId {
     #[inline]
-    pub const fn as_u32(&self) -> u32 {
-        self.0
-    }
-
-    #[inline]
-    pub const fn as_usize(&self) -> usize {
+    pub(crate) const fn as_usize(&self) -> usize {
         self.0 as usize
     }
 
     #[inline]
-    pub const fn from_usize(value: usize) -> Self {
+    pub(crate) const fn from_usize(value: usize) -> Self {
         FrameId(value as u32)
     }
 }
diff --git a/src/page/manager/cache_evict.rs b/src/page/manager/cache_evict.rs
deleted file mode 100644
index 25c2b19b..00000000
--- a/src/page/manager/cache_evict.rs
+++ /dev/null
@@ -1,88 +0,0 @@
-use std::fmt;
-
-use dashmap::{DashMap, DashSet};
-use evict::{EvictResult, EvictionPolicy, LruReplacer};
-use fxhash::FxBuildHasher;
-use parking_lot::Mutex;
-
-use crate::page::{
-    manager::buffer_pool::{FrameId, FxMap, FxSet},
-    PageId,
-};
-
-// TODO: Temporarily use LruReplacer as the eviction policy, replace
with a better eviction policy
-pub(crate) struct CacheEvict {
-    lru_replacer: LruReplacer<PageId>,
-    pub(crate) read_frames: FxSet<PageId>,
-    pub(crate) update_frames: FxMap<PageId, FrameId>,
-    pub(crate) new_frames: Mutex<Vec<(FrameId, PageId)>>,
-}
-
-impl fmt::Debug for CacheEvict {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        write!(f, "CacheEvict")
-    }
-}
-
-impl CacheEvict {
-    pub(crate) fn new(capacity: usize) -> Self {
-        Self {
-            lru_replacer: LruReplacer::new(capacity),
-            read_frames: DashSet::with_capacity_and_hasher(capacity, FxBuildHasher::default()),
-            update_frames: DashMap::with_capacity_and_hasher(capacity, FxBuildHasher::default()),
-            new_frames: Mutex::new(Vec::with_capacity(capacity)),
-        }
-    }
-
-    pub(crate) fn evict(&self) -> Option<PageId> {
-        self.lru_replacer.evict()
-    }
-
-    pub(crate) fn touch(&self, page_id: PageId) -> EvictResult<(), PageId> {
-        self.lru_replacer.touch(page_id)
-    }
-
-    pub(crate) fn pin_read(&self, page_id: PageId) -> EvictResult<(), PageId> {
-        self.read_frames.insert(page_id);
-        self.lru_replacer.pin(page_id)
-    }
-
-    pub(crate) fn pin_write_update_page(
-        &self,
-        frame_id: FrameId,
-        page_id: PageId,
-    ) -> EvictResult<(), PageId> {
-        if let Some((_, first_page_id)) = self.new_frames.lock().first() {
-            if page_id.as_u32() < first_page_id.as_u32() {
-                self.update_frames.insert(page_id, frame_id);
-            }
-        } else {
-            self.update_frames.insert(page_id, frame_id);
-        }
-
-        self.lru_replacer.pin(page_id)
-    }
-
-    pub(crate) fn pin_write_new_page(
-        &self,
-        frame_id: FrameId,
-        page_id: PageId,
-    ) -> EvictResult<(), PageId> {
-        let mut new_frames = self.new_frames.lock();
-        if let Some((_, last_page_id)) = new_frames.last() {
-            debug_assert!(
-                last_page_id.as_u32() + 1 == page_id,
-                "page_id: {:?}, last_page_id: {:?}",
-                page_id,
-                last_page_id
-            );
-        }
-        new_frames.push((frame_id, page_id));
-
-        self.lru_replacer.pin(page_id)
-    }
-
-    pub(crate) fn unpin(&self, page_id: PageId) -> EvictResult<(), PageId> {
-        self.lru_replacer.unpin(page_id)
-    }
-}

From 359a1ad5f75857ba67a209bcb20de937d6627136 Mon Sep 17 00:00:00 2001
From: nqd
Date: Sun, 7 Dec 2025 13:52:27 +0000
Subject: [PATCH 48/65] fix race condition

---
 src/page/manager/buffer_pool.rs | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs
index 3fe446ed..a6b6e62d 100644
--- a/src/page/manager/buffer_pool.rs
+++ b/src/page/manager/buffer_pool.rs
@@ -209,12 +209,23 @@ impl PageManager {
                             if result.is_err() {
                                 panic!("{:?}", result);
                             }
-                            // Note: it's possible that when a mut page get dropped, before it's wrote
-                            // to the disk, the same page is used again as mut page. We rely on page_table
-                            // to track which pages are currently in the buffer pool.
                             pages.iter().for_each(|(page_id, frame_id)| {
-                                updated_pages.remove(page_id);
+                                let frame = &frames[frame_id.0 as usize];
+                                if let PageState::Dirty(_) =
+                                    unsafe { RawPageState::from_ptr(frame.ptr.cast()).load() }
+                                {
+                                    return;
+                                }
                                 replacer.unpin(*frame_id);
+                                // It's possible that between checking for dirty state and unpin, the
+                                // page is get_mut again. In that case, re-pin the frame.
+ if let PageState::Dirty(_) = + unsafe { RawPageState::from_ptr(frame.ptr.cast()).load() } + { + replacer.pin(*frame_id); + return; + } + updated_pages.remove(page_id); }); } Ok(WriteMessage::Shutdown) => { From 73732f0367fd86c4226d26e46cb9abe7186675f8 Mon Sep 17 00:00:00 2001 From: nqd Date: Mon, 8 Dec 2025 12:25:04 +0000 Subject: [PATCH 49/65] cleanup cache eviction --- src/page/manager/clock_replacer.rs | 50 ++++++------------------------ 1 file changed, 10 insertions(+), 40 deletions(-) diff --git a/src/page/manager/clock_replacer.rs b/src/page/manager/clock_replacer.rs index 8e81bdac..b2cd4311 100644 --- a/src/page/manager/clock_replacer.rs +++ b/src/page/manager/clock_replacer.rs @@ -35,9 +35,9 @@ impl ClockReplacer { let frame = &self.frames[frame_id.as_usize()]; // First increment pin count - frame.pin.store(true, Ordering::SeqCst); + frame.pin.store(true, Ordering::Release); // Then set usage bit to true (give it a second chance) - frame.ref_bit.store(true, Ordering::SeqCst); + frame.ref_bit.store(true, Ordering::Release); } pub(super) fn unpin(&self, frame_id: FrameId) { @@ -46,37 +46,7 @@ impl ClockReplacer { } let frame = &self.frames[frame_id.as_usize()]; - frame.pin.store(false, Ordering::SeqCst); - } - - // Find a frame to evict - pub(super) fn victim(&self) -> Option { - let mut hand = self.hand.lock(); - let num_frames = self.frames.len(); - - for _ in 0..(num_frames * 3) { - let current_idx = *hand; - let frame = &self.frames[current_idx]; - - // Move hand forward for next iteration - *hand = (*hand + 1) % num_frames; - - let current_pins = frame.pin.load(Ordering::SeqCst); - if current_pins { - // This page is being used. Cannot evict. Skip it. - continue; - } - // Check reference bit: swap atomically returns old value and sets to false - if frame.ref_bit.swap(false, Ordering::SeqCst) { - // Had a second chance (was true, now set to false) - continue; - } - - return Some(FrameId::from_usize(current_idx)); - } - - // If get here, literally every single frame is Pinned. The buffer pool is exhausted. - None + frame.pin.store(false, Ordering::Release); } // Find a frame to evict and pin it @@ -94,21 +64,21 @@ impl ClockReplacer { // Move hand forward for next iteration *hand = (*hand + 1) % num_frames; - let current_pins = frame.pin.load(Ordering::SeqCst); + let current_pins = frame.pin.load(Ordering::Relaxed); if current_pins { // This page is being used. Cannot evict. Skip it. 
continue; } // Check reference bit: swap atomically returns old value and sets to false - if frame.ref_bit.swap(false, Ordering::SeqCst) { + if frame.ref_bit.swap(false, Ordering::Relaxed) { // Had a second chance (was true, now set to false) continue; } // Pin the frame let frame = &self.frames[current_idx]; - frame.pin.store(true, Ordering::SeqCst); - frame.ref_bit.store(true, Ordering::SeqCst); + frame.pin.store(true, Ordering::Relaxed); + frame.ref_bit.store(true, Ordering::Relaxed); let frame_id = FrameId::from_usize(current_idx); cleanup(frame_id); return Some(frame_id); @@ -161,12 +131,12 @@ mod tests { fn test_evict() { let r = ClockReplacer::new(5); (0..5).for_each(|i| { - assert_eq!(r.victim(), Some(FrameId::from_usize(i))); + assert_eq!(r.victim_and_pin(|_| {}), Some(FrameId::from_usize(i))); r.pin(FrameId::from_usize(i)); }); - assert_eq!(r.victim(), None); + assert_eq!(r.victim_and_pin(|_| {}), None); r.unpin(FrameId::from_usize(4)); - assert_eq!(r.victim(), Some(FrameId::from_usize(4))); + assert_eq!(r.victim_and_pin(|_| {}), Some(FrameId::from_usize(4))); } } From e86c350e2561390dc4862f2d74e62e4476fa0e48 Mon Sep 17 00:00:00 2001 From: nqd Date: Mon, 8 Dec 2025 12:49:43 +0000 Subject: [PATCH 50/65] update ordering --- src/page/manager/clock_replacer.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/page/manager/clock_replacer.rs b/src/page/manager/clock_replacer.rs index b2cd4311..7cc9b517 100644 --- a/src/page/manager/clock_replacer.rs +++ b/src/page/manager/clock_replacer.rs @@ -34,7 +34,7 @@ impl ClockReplacer { } let frame = &self.frames[frame_id.as_usize()]; - // First increment pin count + // First set pin to true frame.pin.store(true, Ordering::Release); // Then set usage bit to true (give it a second chance) frame.ref_bit.store(true, Ordering::Release); @@ -64,21 +64,21 @@ impl ClockReplacer { // Move hand forward for next iteration *hand = (*hand + 1) % num_frames; - let current_pins = frame.pin.load(Ordering::Relaxed); + let current_pins = frame.pin.load(Ordering::Acquire); if current_pins { // This page is being used. Cannot evict. Skip it. 
continue; } // Check reference bit: swap atomically returns old value and sets to false - if frame.ref_bit.swap(false, Ordering::Relaxed) { + if frame.ref_bit.swap(false, Ordering::Acquire) { // Had a second chance (was true, now set to false) continue; } // Pin the frame let frame = &self.frames[current_idx]; - frame.pin.store(true, Ordering::Relaxed); - frame.ref_bit.store(true, Ordering::Relaxed); + frame.pin.store(true, Ordering::Release); + frame.ref_bit.store(true, Ordering::Release); let frame_id = FrameId::from_usize(current_idx); cleanup(frame_id); return Some(frame_id); From d37926f3ace460871612806854f4dda8b0deb0f1 Mon Sep 17 00:00:00 2001 From: nqd Date: Tue, 9 Dec 2025 14:50:29 +0000 Subject: [PATCH 51/65] using single atomicU8 --- src/page/manager/buffer_pool.rs | 8 +++--- src/page/manager/clock_replacer.rs | 42 +++++++++++++++--------------- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs index a6b6e62d..a2c3af1a 100644 --- a/src/page/manager/buffer_pool.rs +++ b/src/page/manager/buffer_pool.rs @@ -341,10 +341,10 @@ impl PageManager { fn select_frame_id(&self, page_id: PageId) -> Result { loop { - // Check if page is in the cache - if let Some(frame_id) = self.page_table.get(&page_id) { - self.replacer.pin(*frame_id); - return Ok(*frame_id); + // Use get().map() to release the DashMap lock immediately before pinning + if let Some(frame_id) = self.page_table.get(&page_id).map(|r| *r) { + self.replacer.pin(frame_id); + return Ok(frame_id); } // Otherwise, need to load page from disk if self.loading_pages.insert(page_id) { diff --git a/src/page/manager/clock_replacer.rs b/src/page/manager/clock_replacer.rs index 7cc9b517..570eb5a4 100644 --- a/src/page/manager/clock_replacer.rs +++ b/src/page/manager/clock_replacer.rs @@ -1,14 +1,13 @@ -use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::atomic::{AtomicU8, Ordering}; use parking_lot::Mutex; use crate::page::manager::buffer_pool::FrameId; struct FrameState { - // The Second Chance bit - ref_bit: AtomicBool, - // If true, this frame cannot be evicted - pin: AtomicBool, + // Bit 0: pin (If true, this frame cannot be evicted) + // Bit 1: ref_bit (Second Chance bit) + state: AtomicU8, } pub(crate) struct ClockReplacer { @@ -20,13 +19,13 @@ impl ClockReplacer { pub(super) fn new(num_frames: usize) -> Self { let mut frames = Vec::with_capacity(num_frames); for _ in 0..num_frames { - frames - .push(FrameState { ref_bit: AtomicBool::new(false), pin: AtomicBool::new(false) }); + frames.push(FrameState { state: AtomicU8::new(0) }); } ClockReplacer { frames, hand: Mutex::new(0) } } + #[inline] pub(super) fn pin(&self, frame_id: FrameId) { // Safety check if frame_id.as_usize() >= self.frames.len() { @@ -34,10 +33,8 @@ impl ClockReplacer { } let frame = &self.frames[frame_id.as_usize()]; - // First set pin to true - frame.pin.store(true, Ordering::Release); - // Then set usage bit to true (give it a second chance) - frame.ref_bit.store(true, Ordering::Release); + // Set both pin (bit 0) and ref_bit (bit 1) to true + frame.state.store(0b11, Ordering::Relaxed); } pub(super) fn unpin(&self, frame_id: FrameId) { @@ -46,7 +43,9 @@ impl ClockReplacer { } let frame = &self.frames[frame_id.as_usize()]; - frame.pin.store(false, Ordering::Release); + // Clear pin bit (bit 0), keep ref_bit unchanged + let current = frame.state.load(Ordering::Relaxed); + frame.state.store(current & 0b10, Ordering::Release); } // Find a frame to evict and pin it @@ -64,21 +63,22 
@@ impl ClockReplacer {
 
         // Move hand forward for next iteration
         *hand = (*hand + 1) % num_frames;
 
-        let current_pins = frame.pin.load(Ordering::Acquire);
-        if current_pins {
+        // Check if pin bit (bit 0) is set
+        let current_state = frame.state.load(Ordering::Relaxed);
+        if (current_state & 0b01) != 0 {
             // This page is being used. Cannot evict. Skip it.
             continue;
         }
-        // Check reference bit: swap atomically returns old value and sets to false
-        if frame.ref_bit.swap(false, Ordering::Acquire) {
-            // Had a second chance (was true, now set to false)
+        // Check reference bit (bit 1): swap atomically returns old value and sets to false
+        let old_state = frame.state.swap(current_state & 0b01, Ordering::Relaxed);
+        if (old_state & 0b10) != 0 {
+            // Had a second chance (ref_bit was true, now set to false)
             continue;
         }
 
-        // Pin the frame
+        // Pin the frame: set both pin (bit 0) and ref_bit (bit 1) to true
         let frame = &self.frames[current_idx];
-        frame.pin.store(true, Ordering::Release);
-        frame.ref_bit.store(true, Ordering::Release);
+        frame.state.store(0b11, Ordering::Relaxed);
         let frame_id = FrameId::from_usize(current_idx);
         cleanup(frame_id);
         return Some(frame_id);
@@ -90,7 +90,7 @@ impl ClockReplacer {
 
     #[cfg(test)]
     pub(super) fn count_pinned(&self) -> usize {
-        self.frames.iter().filter(|f| f.pin.load(Ordering::SeqCst)).count()
+        self.frames.iter().filter(|f| (f.state.load(Ordering::SeqCst) & 0b01) != 0).count()
     }
 }

From fec4bd8ab5dd0ee38579f726f8985bb0d3aa0192 Mon Sep 17 00:00:00 2001
From: nqd
Date: Wed, 10 Dec 2025 08:51:14 +0000
Subject: [PATCH 52/65] use atomic func

---
 src/page/manager/clock_replacer.rs | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/page/manager/clock_replacer.rs b/src/page/manager/clock_replacer.rs
index 570eb5a4..0dff68a0 100644
--- a/src/page/manager/clock_replacer.rs
+++ b/src/page/manager/clock_replacer.rs
@@ -44,8 +44,7 @@ impl ClockReplacer {
 
         let frame = &self.frames[frame_id.as_usize()];
         // Clear pin bit (bit 0), keep ref_bit unchanged
-        let current = frame.state.load(Ordering::Relaxed);
-        frame.state.store(current & 0b10, Ordering::Release);
+        frame.state.fetch_and(0b10, Ordering::Release);
     }
 
     // Find a frame to evict and pin it
@@ -77,7 +76,6 @@ impl ClockReplacer {
     }
 
         // Pin the frame: set both pin (bit 0) and ref_bit (bit 1) to true
-        let frame = &self.frames[current_idx];
         frame.state.store(0b11, Ordering::Relaxed);
         let frame_id = FrameId::from_usize(current_idx);
         cleanup(frame_id);

From 1c04c871eb21456f6bc7537aa5309b0869516c96 Mon Sep 17 00:00:00 2001
From: nqd
Date: Wed, 10 Dec 2025 14:36:39 +0000
Subject: [PATCH 53/65] reorder field

---
 src/page/manager/buffer_pool.rs | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs
index a2c3af1a..543cddd2 100644
--- a/src/page/manager/buffer_pool.rs
+++ b/src/page/manager/buffer_pool.rs
@@ -66,19 +66,20 @@
 pub struct PageManager {
-    num_frames: u32,
-    page_count: AtomicU32,
-    file: RwLock<File>,
-    file_len: AtomicU64,
-    frames: Arc<Vec<Frame>>, /* list of frames that hold pages' data, indexed by frame id with
-                   * fix num_frames size */
     page_table: FxMap<PageId, FrameId>, /* mapping between page id and buffer pool frames,
                         * indexed by page id with fix num_frames size */
+    frames: Arc<Vec<Frame>>, /* list of frames that hold pages' data, indexed by frame id with
+                   * fix num_frames size */
     replacer: Arc<ClockReplacer>,
+    loading_pages: FxSet<PageId>,
+    page_count: AtomicU32,
+    num_frames: u32,
+
updated_pages: Arc<FxMap<PageId, FrameId>>,
     new_pages: Mutex<Vec<(FrameId, PageId)>>,
-    loading_pages: FxSet<PageId>,
+    file: RwLock<File>,
+    file_len: AtomicU64,
 
     io_uring: Arc<RwLock<IoUring>>,
     tx_job: Sender<WriteMessage>,
     drop_pages: Mutex<Vec<PageId>>,

From edaab0fb76830b638c12154ad121b9836ee055c7 Mon Sep 17 00:00:00 2001
From: nqd
Date: Thu, 18 Dec 2025 03:31:23 +0000
Subject: [PATCH 54/65] wip

---
 src/page/manager/buffer_pool.rs | 87 ++++++++++++++++++-------------
 1 file changed, 52 insertions(+), 35 deletions(-)

diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs
index 543cddd2..32fc74d4 100644
--- a/src/page/manager/buffer_pool.rs
+++ b/src/page/manager/buffer_pool.rs
@@ -1,6 +1,7 @@
 use crossbeam_channel::{Receiver, Sender};
 use fxhash::FxBuildHasher;
 use io_uring::{opcode, types, IoUring};
+use proptest::num;
 use std::{
     ffi::CString,
     fs::File,
@@ -33,11 +34,17 @@
 struct Frame {
     ptr: *mut [u8; Page::SIZE],
     page_id: AtomicU32, // 0 means None, otherwise it's the page_id
+    state: AtomicU32, // Bit 0: pin (If true, this frame cannot be evicted)
+    // Bit 1: ref_bit (Second Chance bit)
 }
 
 impl Clone for Frame {
     fn clone(&self) -> Self {
-        Frame { ptr: self.ptr, page_id: AtomicU32::new(self.page_id.load(Ordering::Acquire)) }
+        Frame {
+            ptr: self.ptr,
+            page_id: AtomicU32::new(self.page_id.load(Ordering::SeqCst)),
+            state: AtomicU32::new(self.state.load(Ordering::SeqCst)),
+        }
     }
 }
 
@@ -62,15 +69,47 @@ impl FrameId {
     }
 }
 
+struct Frames(Vec<Frame>);
+impl Frames {
+    fn allocate(num_frames: usize) -> Self {
+        let mut frames = Vec::with_capacity(num_frames);
+        (0..num_frames).into_iter().for_each(|_| {
+            let boxed_array = Box::new([0; Page::SIZE]);
+            let ptr = Box::into_raw(boxed_array);
+            frames.push(Frame { ptr, page_id: AtomicU32::new(0), state: AtomicU32::new(0) });
+        });
+        Self(frames)
+    }
+
+    // Pin a frame so that it will not be evicted.
+    fn pin(&self, frame_id: FrameId) {
+        todo!()
+    }
+
+    // Unpin a frame.
+    fn unpin(&self, frame_id: FrameId) {
+        todo!()
+    }
+
+    // Find a frame to be evicted, also pin that frame and running cleanup F function.
+    fn victim_and_pin<F>(&self, cleanup: F) -> Option<FrameId>
+    where
+        F: FnOnce(FrameId),
+    {
+        todo!()
+    }
+}
+
 pub(crate) type FxMap<K, V> = DashMap<K, V, FxBuildHasher>;
 pub(crate) type FxSet<V> = DashSet<V, FxBuildHasher>;
 
 pub struct PageManager {
     page_table: FxMap<PageId, FrameId>, /* mapping between page id and buffer pool frames,
                         * indexed by page id with fix num_frames size */
-    frames: Arc<Vec<Frame>>, /* list of frames that hold pages' data, indexed by frame id with
-                   * fix num_frames size */
-    replacer: Arc<ClockReplacer>,
+    frames: Arc<Frames>, /* list of frames that hold pages' data, indexed by frame id with
+                   * fix num_frames size */
+    frame_lock_replacer_hand: Mutex<usize>, /* CLOCK cache replacer hand */
+    // replacer: Arc<ClockReplacer>,
     loading_pages: FxSet<PageId>,
     page_count: AtomicU32,
     num_frames: u32,
@@ -93,18 +132,7 @@ enum WriteMessage {
 
 impl std::fmt::Debug for PageManager {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("PageManager")
-            .field("num_frames", &self.num_frames)
-            .field("page_count", &self.page_count)
-            .field("file", &self.file)
-            .field("file_len", &self.file_len)
-            .field("frames", &self.frames)
-            .field("page_table", &self.page_table)
-            // .field("original_free_frame_idx", &self.original_free_frame_idx)
-            // .field("lru_replacer", &self.lru_replacer)
-            // .field("loading_page", &self.loading_page)
-            .field("io_uring", &"")
-            .finish()
+        f.debug_struct("PageManager").field("num_frames", &self.num_frames).finish()
     }
 }
 
@@ -148,18 +176,9 @@ impl PageManager {
         let file_len = AtomicU64::new(file.metadata().map_err(PageError::IO)?.len());
         let page_table =
             DashMap::with_capacity_and_hasher(num_frames as usize, FxBuildHasher::default());
-        let mut frames = Vec::with_capacity(num_frames as usize);
-        for _ in 0..num_frames {
-            let boxed_array = Box::new([0; Page::SIZE]);
-            let ptr = Box::into_raw(boxed_array);
-            frames.push(Frame { ptr, page_id: AtomicU32::new(0) });
-        }
-        // let lru_replacer = Arc::new(CacheEvict::new(num_frames as usize));
-        // let loading_page =
-        //     DashSet::with_capacity_and_hasher(num_frames as usize, FxBuildHasher::default());
-        let replacer = ClockReplacer::new(num_frames as usize);
+        let frames = Frames::allocate(num_frames as usize);
 
-        // Initialize io_uring with queue depth base on num_frames
+        // Initialize io_uring with queue depth
         let queue_depth = num_frames.min(2048) as u32;
         let io_uring = IoUring::new(queue_depth)
             .map_err(|e| PageError::IO(io::Error::new(io::ErrorKind::Other, e)))?;
@@ -171,11 +190,8 @@ impl PageManager {
             file: RwLock::new(file),
             file_len,
             frames: Arc::new(frames),
+            frame_lock_replacer_hand: Mutex::new(0),
             page_table,
-            // original_free_frame_idx: AtomicU32::new(0),
-            // lru_replacer,
-            // loading_page,
-            replacer: Arc::new(replacer),
             updated_pages: Arc::new(DashMap::with_hasher(FxBuildHasher::default())),
             new_pages: Mutex::new(Vec::new()),
             loading_pages: DashSet::with_hasher(FxBuildHasher::default()),
@@ -195,11 +211,11 @@ impl PageManager {
         let frames = self.frames.clone();
         let io_uring = self.io_uring.clone();
         let updated_pages = self.updated_pages.clone();
-        let replacer = self.replacer.clone();
         thread::spawn(move || {
             loop {
                 match rx_job.recv() {
                     Ok(WriteMessage::Pages(pages)) => {
+                        // First, write pages to disk
                        let result = Self::write_updated_pages(
                             io_uring.clone(),
                             frames.clone(),
@@ -210,6 +226,7 @@
                             if result.is_err() {
                                 panic!("{:?}", result);
                             }
+                            // Then unpin those writtern pages so that the pages could be evicted and reused
                             pages.iter().for_each(|(page_id, frame_id)| {
                                 let frame = &frames[frame_id.0 as usize];
                                 if let PageState::Dirty(_) =
@@ -217,7 +234,7 @@ impl
PageManager { { return; } - replacer.unpin(*frame_id); + frames.unpin(*frame_id); // It's possible that between checking for dirty state and unpin, the // page is get_mut again. In that case, re-pin the frame. if let PageState::Dirty(_) = @@ -245,7 +262,7 @@ impl PageManager { fn write_updated_pages( ring: Arc>, - frames: Arc>, + frames: Arc, pages: &[(PageId, FrameId)], file: File, ) -> io::Result<()> { @@ -257,7 +274,7 @@ impl PageManager { let page_id = page.0; let frame_id = page.1; let offset = page_id.as_offset(); - let frame = &frames[frame_id.0 as usize]; + let frame = &frames.0[frame_id.0 as usize]; let page_data = unsafe { std::slice::from_raw_parts(frame.ptr as *const u8, Page::SIZE) }; From 3ed31c58fc5c06374262f6b6ae38ed49c683695e Mon Sep 17 00:00:00 2001 From: nqd Date: Thu, 18 Dec 2025 06:56:38 +0000 Subject: [PATCH 55/65] wip --- src/page/manager/buffer_pool.rs | 119 ++++++++++++++++++-------------- 1 file changed, 67 insertions(+), 52 deletions(-) diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs index 32fc74d4..5a1f3047 100644 --- a/src/page/manager/buffer_pool.rs +++ b/src/page/manager/buffer_pool.rs @@ -48,6 +48,18 @@ impl Clone for Frame { } } +impl Frame { + // Unpin the frame if the page is not dirty. + // If the page is in dirty state, returns false. + fn unpin(&self) -> bool { + todo!() + } + + fn pin(&self) { + todo!() + } +} + // SAFETY: Frame contains a pointer to heap-allocated memory that we own exclusively. // The memory is allocated via Box and we manage its lifetime, so it's safe to send // between threads. @@ -81,15 +93,20 @@ impl Frames { Self(frames) } - // Pin a frame so that it will not be evicted. - fn pin(&self, frame_id: FrameId) { - todo!() + #[inline(always)] + fn get(&self, frame_id: FrameId) -> &Frame { + &self.0[frame_id.0 as usize] } - // Unpin a frame. - fn unpin(&self, frame_id: FrameId) { - todo!() - } + // // Pin a frame so that it will not be evicted. + // fn pin(&self, frame_id: FrameId) { + // todo!() + // } + + // // Unpin a frame. + // fn unpin(&self, frame_id: FrameId) { + // todo!() + // } // Find a frame to be evicted, also pin that frame and running cleanup F function. fn victim_and_pin(&self, cleanup: F) -> Option @@ -228,22 +245,24 @@ impl PageManager { } // Then unpin those writtern pages so that the pages could be evicted and reused pages.iter().for_each(|(page_id, frame_id)| { - let frame = &frames[frame_id.0 as usize]; - if let PageState::Dirty(_) = - unsafe { RawPageState::from_ptr(frame.ptr.cast()).load() } - { - return; - } - frames.unpin(*frame_id); - // It's possible that between checking for dirty state and unpin, the - // page is get_mut again. In that case, re-pin the frame. - if let PageState::Dirty(_) = - unsafe { RawPageState::from_ptr(frame.ptr.cast()).load() } - { - replacer.pin(*frame_id); - return; + let frame = frames.get(*frame_id); + // if let PageState::Dirty(_) = + // unsafe { RawPageState::from_ptr(frame.ptr.cast()).load() } + // { + // return; + // } + // frames.unpin(*frame_id); + // // It's possible that between checking for dirty state and unpin, the + // // page is get_mut again. In that case, re-pin the frame. 
+ // if let PageState::Dirty(_) = + // unsafe { RawPageState::from_ptr(frame.ptr.cast()).load() } + // { + // frame.pin(*frame_id); + // return; + // } + if frame.unpin() { + updated_pages.remove(page_id); } - updated_pages.remove(page_id); }); } Ok(WriteMessage::Shutdown) => { @@ -274,7 +293,7 @@ impl PageManager { let page_id = page.0; let frame_id = page.1; let offset = page_id.as_offset(); - let frame = &frames.0[frame_id.0 as usize]; + let frame = frames.get(frame_id); let page_data = unsafe { std::slice::from_raw_parts(frame.ptr as *const u8, Page::SIZE) }; @@ -331,11 +350,8 @@ impl PageManager { if page_id > self.page_count.load(Ordering::Relaxed) { return Err(PageError::PageNotFound(page_id)); } - - // Atomically get or load the page - let frame_id = self.select_frame_id(page_id)?; - - let frame = &self.frames[frame_id.0 as usize]; + // Get or load the page from disk. + let (_, frame) = self.select_frame(page_id)?; unsafe { Page::from_ptr(page_id, frame.ptr, self) } } @@ -348,29 +364,30 @@ impl PageManager { if page_id > self.page_count.load(Ordering::Relaxed) { return Err(PageError::PageNotFound(page_id)); } - - // Atomically get or load the page - let frame_id = self.select_frame_id(page_id)?; - + // Get or load the page from disk. + let (frame_id, frame) = self.select_frame(page_id)?; self.add_updated_page(page_id, frame_id); - let frame = &self.frames[frame_id.0 as usize]; unsafe { PageMut::from_ptr(page_id, snapshot_id, frame.ptr, self) } } - fn select_frame_id(&self, page_id: PageId) -> Result { + // Return frame for given page_id. + // If the page_id is in frames, return from the cache. Otherwise, get an evicted frame_id and load data from the disk. In both case, pin the frame index. + fn select_frame(&self, page_id: PageId) -> Result<(FrameId, &Frame), PageError> { loop { - // Use get().map() to release the DashMap lock immediately before pinning - if let Some(frame_id) = self.page_table.get(&page_id).map(|r| *r) { - self.replacer.pin(frame_id); - return Ok(frame_id); + // If the page_id is in the cache. Hold dashmap lock until returning. + if let Some(frame_id) = self.page_table.get(&page_id) { + let frame = self.frames.get(*frame_id); + frame.pin(); + return Ok((*frame_id, frame)); } - // Otherwise, need to load page from disk + // Otherwise, need to load page from disk. if self.loading_pages.insert(page_id) { - // This thread is the first to load the page + // This thread is the first to load the page. Load page from disk and also pin the frame. let frame_id = self.load_page_from_disk(page_id)?; self.page_table.insert(page_id, frame_id); + let frame = self.frames.get(frame_id); self.loading_pages.remove(&page_id); - return Ok(frame_id); + return Ok((frame_id, frame)); } // Another thread is already loading this page, spin/yield and retry std::thread::yield_now(); @@ -379,13 +396,11 @@ impl PageManager { // Remove the current pageid, frame_id from page_table. 
fn cleanup_victim_page(&self, frame_id: FrameId) { - let current_frame = self.frames.get(frame_id.as_usize()); - if let Some(current_frame) = current_frame { - let stored_id = current_frame.page_id.load(Ordering::Acquire); - if stored_id != 0 { - if let Some(old_page_id) = PageId::new(stored_id) { - self.page_table.remove(&old_page_id); - } + let current_frame = self.frames.get(frame_id); + let stored_id = current_frame.page_id.load(Ordering::Acquire); + if stored_id != 0 { + if let Some(old_page_id) = PageId::new(stored_id) { + self.page_table.remove(&old_page_id); } } } @@ -395,18 +410,18 @@ impl PageManager { /// The result FrameId get pinned after this function. fn load_page_from_disk(&self, page_id: PageId) -> Result { let frame_id = self - .replacer + .frames .victim_and_pin(|fid| { self.cleanup_victim_page(fid); }) .ok_or(PageError::OutOfMemory)?; - self.frames[frame_id.0 as usize].page_id.store(page_id.as_u32(), Ordering::Relaxed); - let buf: *mut [u8; Page::SIZE] = self.frames[frame_id.0 as usize].ptr; + let frame = self.frames.get(frame_id); + frame.page_id.store(page_id.as_u32(), Ordering::Relaxed); unsafe { self.file .read() - .read_exact_at(&mut *buf, page_id.as_offset() as u64) + .read_exact_at(&mut *frame.ptr, page_id.as_offset() as u64) .map_err(PageError::IO)?; } Ok(frame_id) From 6b9f193b6d19a4d1e942367eed11cb7494a9fb7f Mon Sep 17 00:00:00 2001 From: nqd Date: Thu, 18 Dec 2025 08:15:51 +0000 Subject: [PATCH 56/65] wip --- src/page/manager/buffer_pool.rs | 35 ++++++++++++++++----------------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs index 5a1f3047..0a168aca 100644 --- a/src/page/manager/buffer_pool.rs +++ b/src/page/manager/buffer_pool.rs @@ -103,13 +103,13 @@ impl Frames { // todo!() // } - // // Unpin a frame. - // fn unpin(&self, frame_id: FrameId) { - // todo!() - // } + // Unpin a frame. + fn unpin(&self, frame_id: FrameId) { + todo!() + } // Find a frame to be evicted, also pin that frame and running cleanup F function. - fn victim_and_pin(&self, cleanup: F) -> Option + fn victim_and_pin(&self, cleanup: F) -> Option<(FrameId, &Frame)> where F: FnOnce(FrameId), { @@ -409,14 +409,13 @@ impl PageManager { /// by the select_frame_id pattern in get() and get_mut(). /// The result FrameId get pinned after this function. fn load_page_from_disk(&self, page_id: PageId) -> Result { - let frame_id = self + let (frame_id, frame) = self .frames .victim_and_pin(|fid| { self.cleanup_victim_page(fid); }) .ok_or(PageError::OutOfMemory)?; - let frame = self.frames.get(frame_id); frame.page_id.store(page_id.as_u32(), Ordering::Relaxed); unsafe { self.file @@ -442,8 +441,8 @@ impl PageManager { /// /// Returns an error if the buffer pool is full. 
pub fn allocate(&self, snapshot_id: SnapshotId) -> Result, PageError> { - let frame_id = self - .replacer + let (frame_id, frame) = self + .frames .victim_and_pin(|fid| { self.cleanup_victim_page(fid); }) @@ -453,9 +452,9 @@ impl PageManager { self.new_pages.lock().push((frame_id, page_id)); self.grow_if_needed(new_count as u64 * Page::SIZE as u64)?; self.page_table.insert(page_id, frame_id); - self.frames[frame_id.0 as usize].page_id.store(page_id.as_u32(), Ordering::Relaxed); - let data = self.frames[frame_id.0 as usize].ptr; - unsafe { PageMut::acquire_unchecked(page_id, snapshot_id, data, self) } + + frame.page_id.store(page_id.as_u32(), Ordering::Relaxed); + unsafe { PageMut::acquire_unchecked(page_id, snapshot_id, frame.ptr, self) } } /// Checks if a page is currently in the Dirty state. @@ -468,7 +467,7 @@ impl PageManager { } // A page is dirty if it is in the page_table if let Some(frame_id) = self.page_table.get(&page_id) { - let frame = &self.frames[frame_id.0 as usize]; + let frame = self.frames.get(*frame_id); // SAFETY: We're just reading the state atomically, respecting the memory model let state = unsafe { RawPageState::from_ptr(frame.ptr.cast()) }; @@ -504,7 +503,7 @@ impl PageManager { let iovecs: Vec = new_pages .iter() .map(|(frame_id, _)| { - let frame = &self.frames[frame_id.0 as usize]; + let frame = self.frames.get(*frame_id); libc::iovec { iov_base: frame.ptr as *mut libc::c_void, iov_len: Page::SIZE } }) .collect(); @@ -545,7 +544,7 @@ impl PageManager { for entry in self.updated_pages.iter() { let frame_id = *entry.value(); let page_id = *entry.key(); - let frame = &self.frames[frame_id.0 as usize]; + let frame = self.frames.get(frame_id); let offset = page_id.as_offset(); let page_data = @@ -575,10 +574,10 @@ impl PageManager { // Submit all pending operations ring_guard.submit()?; - new_pages.iter().for_each(|(frame_id, _)| self.replacer.unpin(*frame_id)); + new_pages.iter().for_each(|(frame_id, _)| self.frames.unpin(*frame_id)); new_pages.clear(); - self.updated_pages.iter().for_each(|entry| self.replacer.unpin(*entry.value())); + self.updated_pages.iter().for_each(|entry| self.frames.unpin(*entry.value())); self.updated_pages.clear(); self.drop_pages.lock().clear(); @@ -627,7 +626,7 @@ impl PageManager { #[inline] pub fn drop_page(&self, page_id: PageId) { if let Some(frame_id) = self.page_table.get(&page_id) { - self.replacer.unpin(*frame_id); + self.frames.unpin(*frame_id); } } From 0554a22a781104b10573143da3cde89b38b5fa81 Mon Sep 17 00:00:00 2001 From: nqd Date: Thu, 18 Dec 2025 09:37:37 +0000 Subject: [PATCH 57/65] update pin/unpin --- src/page/manager/buffer_pool.rs | 38 +++++++++++++++++++++------------ src/page/state.rs | 28 ++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 14 deletions(-) diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs index 0a168aca..b670859d 100644 --- a/src/page/manager/buffer_pool.rs +++ b/src/page/manager/buffer_pool.rs @@ -52,11 +52,20 @@ impl Frame { // Unpin the frame if the page is not dirty. // If the page is in dirty state, returns false. 
fn unpin(&self) -> bool { - todo!() + let is_dirty = unsafe { PageState::is_dirty(self.ptr.cast()) }; + + if is_dirty { + return false; + } + // Clear pin bit (bit 0), keep ref_bit (bit 1) unchanged + self.state.fetch_and(0b10, Ordering::Release); + true } + #[inline(always)] fn pin(&self) { - todo!() + // Set both pin (bit 0) and ref_bit (bit 1) to true + self.state.store(0b11, Ordering::Release); } } @@ -70,7 +79,7 @@ unsafe impl Sync for Frame {} pub(crate) struct FrameId(u32); impl FrameId { - #[inline] + #[inline(always)] pub(crate) const fn as_usize(&self) -> usize { self.0 as usize } @@ -98,14 +107,11 @@ impl Frames { &self.0[frame_id.0 as usize] } - // // Pin a frame so that it will not be evicted. - // fn pin(&self, frame_id: FrameId) { - // todo!() - // } - - // Unpin a frame. - fn unpin(&self, frame_id: FrameId) { - todo!() + // Unpin the frame if the occupied page is not dirty. + // If the page is in dirty state, returns false. + fn unpin(&self, frame_id: FrameId) -> bool { + let frame = self.get(frame_id); + frame.unpin() } // Find a frame to be evicted, also pin that frame and running cleanup F function. @@ -574,10 +580,14 @@ impl PageManager { // Submit all pending operations ring_guard.submit()?; - new_pages.iter().for_each(|(frame_id, _)| self.frames.unpin(*frame_id)); + new_pages.iter().for_each(|(frame_id, _)| { + self.frames.unpin(*frame_id); + }); new_pages.clear(); - self.updated_pages.iter().for_each(|entry| self.frames.unpin(*entry.value())); + self.updated_pages.iter().for_each(|entry| { + self.frames.unpin(*entry.value()); + }); self.updated_pages.clear(); self.drop_pages.lock().clear(); @@ -765,7 +775,7 @@ mod tests { for i in 1..=10 { let page_id = PageId::new(i).unwrap(); let frame_id = m.page_table.get(&page_id).expect("page not in cache"); - let frame = m.frames.get(frame_id.as_usize()).unwrap(); + let frame = m.frames.get(*frame_id); assert_eq!(frame.page_id.load(Ordering::Relaxed), page_id.as_u32()); } } diff --git a/src/page/state.rs b/src/page/state.rs index 821c04b7..8e357171 100644 --- a/src/page/state.rs +++ b/src/page/state.rs @@ -151,6 +151,22 @@ impl PageState { } } + /// Check if value from a raw pointer is in a dirty state. + /// + /// # Safety + /// + /// * `ptr` must be properly aligned. + /// * Access to the data pointed by `ptr` must adhere to the [memory model for page state + /// access]. 
+ /// + /// [valid]: core::ptr#safety + /// [memory model for page state access]: self#memory-model + #[inline] + pub(super) unsafe fn is_dirty(ptr: *mut u64) -> bool { + // SAFETY: guaranteed by the caller + *ptr & Self::DIRTY_MASK != 0 + } + //#[inline] //pub(super) fn occupied(snapshot_id: SnapshotId) -> Option { // if snapshot_id & Self::DIRTY_MASK != 0 { @@ -218,6 +234,18 @@ mod tests { assert_eq!(PageState::from(0x8000000000000123), PageState::Dirty(0x123)); } + #[test] + fn state_is_dirty_from_ptr() { + let mut data = 123u64; + + let is_dirty = unsafe { PageState::is_dirty(&raw mut data) }; + assert_eq!(is_dirty, false); + + data = data | PageState::DIRTY_MASK; + let is_dirty = unsafe { PageState::is_dirty(&raw mut data) }; + assert_eq!(is_dirty, true); + } + #[test] fn raw_page_state_mutations() { let mut data = 123u64; From 962cf24b4bdc939e029c9d96e81916802ef5278c Mon Sep 17 00:00:00 2001 From: nqd Date: Thu, 18 Dec 2025 14:05:59 +0000 Subject: [PATCH 58/65] update test --- src/page/manager/buffer_pool.rs | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs index b670859d..62d8e706 100644 --- a/src/page/manager/buffer_pool.rs +++ b/src/page/manager/buffer_pool.rs @@ -121,6 +121,11 @@ impl Frames { { todo!() } + + #[cfg(test)] + fn count_pinned(&self) -> usize { + self.0.iter().filter(|i| (i.state.load(Ordering::SeqCst) & 0b01) != 0).count() + } } pub(crate) type FxMap = DashMap; @@ -866,9 +871,9 @@ mod tests { let mut p = m.allocate(snapshot).expect("page allocation failed"); p.contents_mut().iter_mut().for_each(|byte| *byte = 0xab); drop(p); - assert_eq!(m.replacer.count_pinned(), 1); + assert_eq!(m.frames.count_pinned(), 1); m.sync().expect("sync failed"); - assert_eq!(m.replacer.count_pinned(), 0); + assert_eq!(m.frames.count_pinned(), 0); seek(&f, 0); assert_eq!(len(&f), 1024 * Page::SIZE); @@ -880,9 +885,9 @@ mod tests { let mut p = m.allocate(snapshot + i as u64).expect("page allocation failed"); p.contents_mut().iter_mut().for_each(|byte| *byte = 0xab ^ (i as u8)); } - assert_eq!(m.replacer.count_pinned(), 255); + assert_eq!(m.frames.count_pinned(), 255); m.sync().expect("sync failed"); - assert_eq!(m.replacer.count_pinned(), 0); + assert_eq!(m.frames.count_pinned(), 0); assert_eq!(len(&f), 1024 * Page::SIZE); for i in 1..=255 { @@ -918,7 +923,7 @@ mod tests { let page = m.get(page_id).expect("page not in cache"); assert_eq!(page.contents(), &mut [0xab ^ (i as u8); Page::DATA_SIZE]); let frame_id = m.page_table.get(&page_id).expect("page not in cache"); - let frame = &m.frames[frame_id.0 as usize]; + let frame = m.frames.get(*frame_id); assert_eq!(frame.ptr as *const u8, page.all_contents().as_ptr()); } } @@ -933,7 +938,7 @@ mod tests { let page = m.get_mut(snapshot + i as u64, page_id).expect("page not in cache"); assert_eq!(page.contents(), &mut [0xab ^ (i as u8); Page::DATA_SIZE]); let frame_id = m.page_table.get(&page_id).expect("page not in cache"); - let frame = &m.frames[frame_id.0 as usize]; + let frame = m.frames.get(*frame_id); assert_eq!(frame.ptr as *const u8, page.all_contents().as_ptr()); } } From 000af8895026c241b3a44be41113ff21ca801632 Mon Sep 17 00:00:00 2001 From: nqd Date: Thu, 18 Dec 2025 15:05:02 +0000 Subject: [PATCH 59/65] refactor --- src/page/manager.rs | 2 +- src/page/manager/buffer_pool.rs | 104 +---------------------- src/page/manager/frame.rs | 142 ++++++++++++++++++++++++++++++++ 3 files changed, 144 insertions(+), 104 deletions(-) create mode 
100644 src/page/manager/frame.rs diff --git a/src/page/manager.rs b/src/page/manager.rs index a3511398..0b8a5d23 100644 --- a/src/page/manager.rs +++ b/src/page/manager.rs @@ -3,7 +3,7 @@ use crate::page::PageId; #[cfg(feature = "buffer_pool_backend")] pub(super) mod buffer_pool; #[cfg(feature = "buffer_pool_backend")] -pub(super) mod clock_replacer; +pub(super) mod frame; #[cfg(feature = "mmap_backend")] pub(super) mod mmap; pub(super) mod options; diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs index 62d8e706..a6e11303 100644 --- a/src/page/manager/buffer_pool.rs +++ b/src/page/manager/buffer_pool.rs @@ -1,7 +1,6 @@ use crossbeam_channel::{Receiver, Sender}; use fxhash::FxBuildHasher; use io_uring::{opcode, types, IoUring}; -use proptest::num; use std::{ ffi::CString, fs::File, @@ -23,111 +22,13 @@ use parking_lot::{Mutex, RwLock}; use crate::{ page::{ - manager::clock_replacer::ClockReplacer, + manager::frame::{Frame, FrameId, Frames}, state::{PageState, RawPageState}, Page, PageError, PageId, PageManagerOptions, PageMut, }, snapshot::SnapshotId, }; -#[derive(Debug)] -struct Frame { - ptr: *mut [u8; Page::SIZE], - page_id: AtomicU32, // 0 means None, otherwise it's the page_id - state: AtomicU32, // Bit 0: pin (If true, this frame cannot be evicted) - // Bit 1: ref_bit (Second Chance bit) -} - -impl Clone for Frame { - fn clone(&self) -> Self { - Frame { - ptr: self.ptr, - page_id: AtomicU32::new(self.page_id.load(Ordering::SeqCst)), - state: AtomicU32::new(self.state.load(Ordering::SeqCst)), - } - } -} - -impl Frame { - // Unpin the frame if the page is not dirty. - // If the page is in dirty state, returns false. - fn unpin(&self) -> bool { - let is_dirty = unsafe { PageState::is_dirty(self.ptr.cast()) }; - - if is_dirty { - return false; - } - // Clear pin bit (bit 0), keep ref_bit (bit 1) unchanged - self.state.fetch_and(0b10, Ordering::Release); - true - } - - #[inline(always)] - fn pin(&self) { - // Set both pin (bit 0) and ref_bit (bit 1) to true - self.state.store(0b11, Ordering::Release); - } -} - -// SAFETY: Frame contains a pointer to heap-allocated memory that we own exclusively. -// The memory is allocated via Box and we manage its lifetime, so it's safe to send -// between threads. -unsafe impl Send for Frame {} -unsafe impl Sync for Frame {} - -#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)] -pub(crate) struct FrameId(u32); - -impl FrameId { - #[inline(always)] - pub(crate) const fn as_usize(&self) -> usize { - self.0 as usize - } - - #[inline] - pub(crate) const fn from_usize(value: usize) -> Self { - FrameId(value as u32) - } -} - -struct Frames(Vec); -impl Frames { - fn allocate(num_frames: usize) -> Self { - let mut frames = Vec::with_capacity(num_frames); - (0..num_frames).into_iter().for_each(|_| { - let boxed_array = Box::new([0; Page::SIZE]); - let ptr = Box::into_raw(boxed_array); - frames.push(Frame { ptr, page_id: AtomicU32::new(0), state: AtomicU32::new(0) }); - }); - Self(frames) - } - - #[inline(always)] - fn get(&self, frame_id: FrameId) -> &Frame { - &self.0[frame_id.0 as usize] - } - - // Unpin the frame if the occupied page is not dirty. - // If the page is in dirty state, returns false. - fn unpin(&self, frame_id: FrameId) -> bool { - let frame = self.get(frame_id); - frame.unpin() - } - - // Find a frame to be evicted, also pin that frame and running cleanup F function. 
- fn victim_and_pin(&self, cleanup: F) -> Option<(FrameId, &Frame)> - where - F: FnOnce(FrameId), - { - todo!() - } - - #[cfg(test)] - fn count_pinned(&self) -> usize { - self.0.iter().filter(|i| (i.state.load(Ordering::SeqCst) & 0b01) != 0).count() - } -} - pub(crate) type FxMap = DashMap; pub(crate) type FxSet = DashSet; @@ -136,8 +37,6 @@ pub struct PageManager { * indexed by page id with fix num_frames size */ frames: Arc, /* list of frames that hold pages' data, indexed by frame id with * fix num_frames size */ - frame_lock_replacer_hand: Mutex, /* CLOCK cache replacer hand */ - // replacer: Arc, loading_pages: FxSet, page_count: AtomicU32, num_frames: u32, @@ -218,7 +117,6 @@ impl PageManager { file: RwLock::new(file), file_len, frames: Arc::new(frames), - frame_lock_replacer_hand: Mutex::new(0), page_table, updated_pages: Arc::new(DashMap::with_hasher(FxBuildHasher::default())), new_pages: Mutex::new(Vec::new()), diff --git a/src/page/manager/frame.rs b/src/page/manager/frame.rs new file mode 100644 index 00000000..0e9824f8 --- /dev/null +++ b/src/page/manager/frame.rs @@ -0,0 +1,142 @@ +use crate::{ + page::{ + state::{PageState, RawPageState}, + Page, PageError, PageId, PageManagerOptions, PageMut, + }, + snapshot::SnapshotId, +}; +use parking_lot::Mutex; +use std::sync::atomic::AtomicU32; + +#[derive(Debug)] +struct Frame { + ptr: *mut [u8; Page::SIZE], + page_id: AtomicU32, // 0 means None, otherwise it's the page_id + state: AtomicU32, // Bit 0: pin (If true, this frame cannot be evicted) + // Bit 1: ref_bit (Second Chance bit) +} + +impl Clone for Frame { + fn clone(&self) -> Self { + Frame { + ptr: self.ptr, + page_id: AtomicU32::new(self.page_id.load(Ordering::SeqCst)), + state: AtomicU32::new(self.state.load(Ordering::SeqCst)), + } + } +} + +impl Frame { + // Unpin the frame if the page is not dirty. + // If the page is in dirty state, returns false. + fn unpin(&self) -> bool { + let is_dirty = unsafe { PageState::is_dirty(self.ptr.cast()) }; + + if is_dirty { + return false; + } + // Clear pin bit (bit 0), keep ref_bit (bit 1) unchanged + self.state.fetch_and(0b10, Ordering::Release); + true + } + + #[inline(always)] + fn pin(&self) { + // Set both pin (bit 0) and ref_bit (bit 1) to true + self.state.store(0b11, Ordering::Release); + } +} + +// SAFETY: Frame contains a pointer to heap-allocated memory that we own exclusively. +// The memory is allocated via Box and we manage its lifetime, so it's safe to send +// between threads. +unsafe impl Send for Frame {} +unsafe impl Sync for Frame {} + +#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)] +pub(crate) struct FrameId(u32); + +impl FrameId { + #[inline(always)] + pub(crate) const fn as_usize(&self) -> usize { + self.0 as usize + } + + #[inline] + pub(crate) const fn from_usize(value: usize) -> Self { + FrameId(value as u32) + } +} + +struct Frames { + inner: Vec, + hand: Mutex, +} + +impl Frames { + fn allocate(num_frames: usize) -> Self { + let mut frames = Vec::with_capacity(num_frames); + (0..num_frames).into_iter().for_each(|_| { + let boxed_array = Box::new([0; Page::SIZE]); + let ptr = Box::into_raw(boxed_array); + frames.push(Frame { ptr, page_id: AtomicU32::new(0), state: AtomicU32::new(0) }); + }); + Self { inner: frames, hand: Mutex::new(0) } + } + + #[inline(always)] + fn get(&self, frame_id: FrameId) -> &Frame { + &self.inner[frame_id.0 as usize] + } + + // Unpin the frame if the occupied page is not dirty. + // If the page is in dirty state, returns false. 
+ fn unpin(&self, frame_id: FrameId) -> bool { + let frame = self.get(frame_id); + frame.unpin() + } + + // Find a frame to be evicted, also pin that frame and running cleanup F function. + fn victim_and_pin(&self, cleanup: F) -> Option<(FrameId, &Frame)> + where + F: FnOnce(FrameId), + { + let mut hand = self.hand.lock(); + let num_frames = self.inner.len(); + + for _ in 0..(num_frames * 3) { + let current_idx = *hand; + let frame = &self.inner[current_idx]; + + // Move hand forward for next iteration + *hand = (*hand + 1) % num_frames; + + // Check if pin bit (bit 0) is set + let current_state = frame.state.load(Ordering::Relaxed); + if (current_state & 0b01) != 0 { + // This page is being used. Cannot evict. Skip it. + continue; + } + // Check reference bit (bit 1): swap atomically returns old value and sets to false + let old_state = frame.state.swap(current_state & 0b01, Ordering::Relaxed); + if (old_state & 0b10) != 0 { + // Had a second chance (ref_bit was true, now set to false) + continue; + } + + // Pin the frame: set both pin (bit 0) and ref_bit (bit 1) to true + frame.state.store(0b11, Ordering::Relaxed); + let frame_id = FrameId::from_usize(current_idx); + cleanup(frame_id); + return Some((frame_id, frame)); + } + + // If get here, literally every single frame is pinned. The buffer pool is exhausted. + None + } + + #[cfg(test)] + fn count_pinned(&self) -> usize { + self.inner.iter().filter(|i| (i.state.load(Ordering::SeqCst) & 0b01) != 0).count() + } +} From 350949c01ae3fd451f9cc483bbecc68fedee0066 Mon Sep 17 00:00:00 2001 From: nqd Date: Fri, 19 Dec 2025 03:42:37 +0000 Subject: [PATCH 60/65] wip --- src/page/manager/frame.rs | 69 +++++++++++++++++++++++++++++++++------ 1 file changed, 59 insertions(+), 10 deletions(-) diff --git a/src/page/manager/frame.rs b/src/page/manager/frame.rs index 0e9824f8..3938c0fa 100644 --- a/src/page/manager/frame.rs +++ b/src/page/manager/frame.rs @@ -6,10 +6,10 @@ use crate::{ snapshot::SnapshotId, }; use parking_lot::Mutex; -use std::sync::atomic::AtomicU32; +use std::sync::atomic::{AtomicU32, Ordering}; #[derive(Debug)] -struct Frame { +pub(crate) struct Frame { ptr: *mut [u8; Page::SIZE], page_id: AtomicU32, // 0 means None, otherwise it's the page_id state: AtomicU32, // Bit 0: pin (If true, this frame cannot be evicted) @@ -29,7 +29,8 @@ impl Clone for Frame { impl Frame { // Unpin the frame if the page is not dirty. // If the page is in dirty state, returns false. - fn unpin(&self) -> bool { + #[inline] + pub(crate) fn unpin(&self) -> bool { let is_dirty = unsafe { PageState::is_dirty(self.ptr.cast()) }; if is_dirty { @@ -40,8 +41,8 @@ impl Frame { true } - #[inline(always)] - fn pin(&self) { + #[inline] + pub(crate) fn pin(&self) { // Set both pin (bit 0) and ref_bit (bit 1) to true self.state.store(0b11, Ordering::Release); } @@ -68,13 +69,13 @@ impl FrameId { } } -struct Frames { +pub(crate) struct Frames { inner: Vec, hand: Mutex, } impl Frames { - fn allocate(num_frames: usize) -> Self { + pub(crate) fn allocate(num_frames: usize) -> Self { let mut frames = Vec::with_capacity(num_frames); (0..num_frames).into_iter().for_each(|_| { let boxed_array = Box::new([0; Page::SIZE]); @@ -85,19 +86,19 @@ impl Frames { } #[inline(always)] - fn get(&self, frame_id: FrameId) -> &Frame { + pub(crate) fn get(&self, frame_id: FrameId) -> &Frame { &self.inner[frame_id.0 as usize] } // Unpin the frame if the occupied page is not dirty. // If the page is in dirty state, returns false. 
-    fn unpin(&self, frame_id: FrameId) -> bool {
+    pub(crate) fn unpin(&self, frame_id: FrameId) -> bool {
         let frame = self.get(frame_id);
         frame.unpin()
     }
 
     // Find a frame to be evicted, also pin that frame and running cleanup F function.
-    fn victim_and_pin(&self, cleanup: F) -> Option<(FrameId, &Frame)>
+    pub(crate) fn victim_and_pin(&self, cleanup: F) -> Option<(FrameId, &Frame)>
     where
         F: FnOnce(FrameId),
     {
@@ -140,3 +141,51 @@ impl Frames {
         self.inner.iter().filter(|i| (i.state.load(Ordering::SeqCst) & 0b01) != 0).count()
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_pin() {
+        let r = ClockReplacer::new(5);
+        r.pin(FrameId::from_usize(0));
+        assert_eq!(r.count_pinned(), 1);
+        r.pin(FrameId::from_usize(0));
+        assert_eq!(r.count_pinned(), 1);
+        r.pin(FrameId::from_usize(4));
+        assert_eq!(r.count_pinned(), 2);
+    }
+
+    #[test]
+    fn test_upin() {
+        let r = ClockReplacer::new(5);
+        r.pin(FrameId::from_usize(0));
+        r.pin(FrameId::from_usize(1));
+        r.pin(FrameId::from_usize(1));
+        r.pin(FrameId::from_usize(2));
+        assert_eq!(r.count_pinned(), 3);
+
+        r.unpin(FrameId::from_usize(2));
+        assert_eq!(r.count_pinned(), 2);
+
+        r.unpin(FrameId::from_usize(1));
+        assert_eq!(r.count_pinned(), 1);
+        r.unpin(FrameId::from_usize(1));
+        assert_eq!(r.count_pinned(), 1);
+    }
+
+    #[test]
+    fn test_evict() {
+        let r = ClockReplacer::new(5);
+        (0..5).for_each(|i| {
+            assert_eq!(r.victim_and_pin(|_| {}), Some(FrameId::from_usize(i)));
+            r.pin(FrameId::from_usize(i));
+        });
+        assert_eq!(r.victim_and_pin(|_| {}), None);
+
+        r.unpin(FrameId::from_usize(4));
+        assert_eq!(r.victim_and_pin(|_| {}), Some(FrameId::from_usize(4)));
+    }
+}
+
From 7f2eb7b075d3d2aa76e6c587d7c0b245e9fe4e8e Mon Sep 17 00:00:00 2001
From: nqd
Date: Fri, 19 Dec 2025 07:27:10 +0000
Subject: [PATCH 61/65] wip

---
 src/page/manager/buffer_pool.rs |  86 ++++++++++++-------------
 src/page/manager/frame.rs       | 107 ++++++++++++++++----------------
 2 files changed, 91 insertions(+), 102 deletions(-)

diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs
index a6e11303..e9358b70 100644
--- a/src/page/manager/buffer_pool.rs
+++ b/src/page/manager/buffer_pool.rs
@@ -64,6 +64,8 @@ impl std::fmt::Debug for PageManager {
 }
 
 impl PageManager {
+    const MAX_IO_URING_QUEUE_DEPTH: u32 = 2048;
+
     pub fn options() -> PageManagerOptions {
         PageManagerOptions::new()
     }
@@ -106,7 +108,7 @@ impl PageManager {
         let frames = Frames::allocate(num_frames as usize);
 
         // Initialize io_uring with queue depth
-        let queue_depth = num_frames.min(2048) as u32;
+        let queue_depth = num_frames.min(Self::MAX_IO_URING_QUEUE_DEPTH) as u32;
         let io_uring = IoUring::new(queue_depth)
             .map_err(|e| PageError::IO(io::Error::new(io::ErrorKind::Other, e)))?;
 
@@ -154,22 +156,8 @@ impl PageManager {
                     }
                     // Then unpin those written pages so that the pages could be evicted and reused
                     pages.iter().for_each(|(page_id, frame_id)| {
-                        let frame = frames.get(*frame_id);
-                        // if let PageState::Dirty(_) =
-                        //     unsafe { RawPageState::from_ptr(frame.ptr.cast()).load() }
-                        // {
-                        //     return;
-                        // }
-                        // frames.unpin(*frame_id);
-                        // // It's possible that between checking for dirty state and unpin, the
-                        // // page is get_mut again. In that case, re-pin the frame.
-                        // if let PageState::Dirty(_) =
-                        //     unsafe { RawPageState::from_ptr(frame.ptr.cast()).load() }
-                        // {
-                        //     frame.pin(*frame_id);
-                        //     return;
-                        // }
-                        if frame.unpin() {
+                        // TODO: Could have race condition between unpin and remove actions.
+ if let Some(true) = frames.unpin(*frame_id) { updated_pages.remove(page_id); } }); @@ -202,28 +190,30 @@ impl PageManager { let page_id = page.0; let frame_id = page.1; let offset = page_id.as_offset(); - let frame = frames.get(frame_id); - let page_data = - unsafe { std::slice::from_raw_parts(frame.ptr as *const u8, Page::SIZE) }; - - // Create write operation - let write_op = opcode::Write::new(types::Fd(fd), page_data.as_ptr(), Page::SIZE as u32) - .offset(offset as u64) - .build() - .user_data(op_count); - // Submit to ring - loop { - let mut sq = ring_guard.submission(); - match unsafe { sq.push(&write_op) } { - Ok(_) => { - op_count += 1; - drop(sq); - break; - } - Err(_) => { - // Submission queue is full, submit and wait - drop(sq); - let _ = ring_guard.submit_and_wait(1); + if let Some(frame) = frames.get(frame_id) { + let page_data = + unsafe { std::slice::from_raw_parts(frame.ptr as *const u8, Page::SIZE) }; + + // Create write operation + let write_op = + opcode::Write::new(types::Fd(fd), page_data.as_ptr(), Page::SIZE as u32) + .offset(offset as u64) + .build() + .user_data(op_count); + // Submit to ring + loop { + let mut sq = ring_guard.submission(); + match unsafe { sq.push(&write_op) } { + Ok(_) => { + op_count += 1; + drop(sq); + break; + } + Err(_) => { + // Submission queue is full, submit and wait + drop(sq); + let _ = ring_guard.submit_and_wait(1); + } } } } @@ -285,7 +275,7 @@ impl PageManager { loop { // If the page_id is in the cache. Hold dashmap lock until returning. if let Some(frame_id) = self.page_table.get(&page_id) { - let frame = self.frames.get(*frame_id); + let frame = self.frames.get(*frame_id).unwrap(); frame.pin(); return Ok((*frame_id, frame)); } @@ -294,7 +284,7 @@ impl PageManager { // This thread is the first to load the page. Load page from disk and also pin the frame. let frame_id = self.load_page_from_disk(page_id)?; self.page_table.insert(page_id, frame_id); - let frame = self.frames.get(frame_id); + let frame = self.frames.get(frame_id).unwrap(); self.loading_pages.remove(&page_id); return Ok((frame_id, frame)); } @@ -305,7 +295,7 @@ impl PageManager { // Remove the current pageid, frame_id from page_table. 
fn cleanup_victim_page(&self, frame_id: FrameId) { - let current_frame = self.frames.get(frame_id); + let current_frame = self.frames.get(frame_id).unwrap(); let stored_id = current_frame.page_id.load(Ordering::Acquire); if stored_id != 0 { if let Some(old_page_id) = PageId::new(stored_id) { @@ -376,7 +366,7 @@ impl PageManager { } // A page is dirty if it is in the page_table if let Some(frame_id) = self.page_table.get(&page_id) { - let frame = self.frames.get(*frame_id); + let frame = self.frames.get(*frame_id).unwrap(); // SAFETY: We're just reading the state atomically, respecting the memory model let state = unsafe { RawPageState::from_ptr(frame.ptr.cast()) }; @@ -412,7 +402,7 @@ impl PageManager { let iovecs: Vec = new_pages .iter() .map(|(frame_id, _)| { - let frame = self.frames.get(*frame_id); + let frame = self.frames.get(*frame_id).unwrap(); libc::iovec { iov_base: frame.ptr as *mut libc::c_void, iov_len: Page::SIZE } }) .collect(); @@ -453,7 +443,7 @@ impl PageManager { for entry in self.updated_pages.iter() { let frame_id = *entry.value(); let page_id = *entry.key(); - let frame = self.frames.get(frame_id); + let frame = self.frames.get(frame_id).unwrap(); let offset = page_id.as_offset(); let page_data = @@ -678,7 +668,7 @@ mod tests { for i in 1..=10 { let page_id = PageId::new(i).unwrap(); let frame_id = m.page_table.get(&page_id).expect("page not in cache"); - let frame = m.frames.get(*frame_id); + let frame = m.frames.get(*frame_id).unwrap(); assert_eq!(frame.page_id.load(Ordering::Relaxed), page_id.as_u32()); } } @@ -821,7 +811,7 @@ mod tests { let page = m.get(page_id).expect("page not in cache"); assert_eq!(page.contents(), &mut [0xab ^ (i as u8); Page::DATA_SIZE]); let frame_id = m.page_table.get(&page_id).expect("page not in cache"); - let frame = m.frames.get(*frame_id); + let frame = m.frames.get(*frame_id).unwrap(); assert_eq!(frame.ptr as *const u8, page.all_contents().as_ptr()); } } @@ -836,7 +826,7 @@ mod tests { let page = m.get_mut(snapshot + i as u64, page_id).expect("page not in cache"); assert_eq!(page.contents(), &mut [0xab ^ (i as u8); Page::DATA_SIZE]); let frame_id = m.page_table.get(&page_id).expect("page not in cache"); - let frame = m.frames.get(*frame_id); + let frame = m.frames.get(*frame_id).unwrap(); assert_eq!(frame.ptr as *const u8, page.all_contents().as_ptr()); } } diff --git a/src/page/manager/frame.rs b/src/page/manager/frame.rs index 3938c0fa..9167cab8 100644 --- a/src/page/manager/frame.rs +++ b/src/page/manager/frame.rs @@ -1,19 +1,13 @@ -use crate::{ - page::{ - state::{PageState, RawPageState}, - Page, PageError, PageId, PageManagerOptions, PageMut, - }, - snapshot::SnapshotId, -}; +use crate::page::{state::PageState, Page}; use parking_lot::Mutex; use std::sync::atomic::{AtomicU32, Ordering}; #[derive(Debug)] pub(crate) struct Frame { - ptr: *mut [u8; Page::SIZE], - page_id: AtomicU32, // 0 means None, otherwise it's the page_id - state: AtomicU32, // Bit 0: pin (If true, this frame cannot be evicted) - // Bit 1: ref_bit (Second Chance bit) + pub(crate) ptr: *mut [u8; Page::SIZE], + pub(crate) page_id: AtomicU32, // 0 means None, otherwise it's the page_id + state: AtomicU32, // Bit 0: pin (If true, this frame cannot be evicted) + // Bit 1: ref_bit (Second Chance bit) } impl Clone for Frame { @@ -58,14 +52,10 @@ unsafe impl Sync for Frame {} pub(crate) struct FrameId(u32); impl FrameId { - #[inline(always)] - pub(crate) const fn as_usize(&self) -> usize { - self.0 as usize - } - + /// Constructs a new `FrameId` from `u32` #[inline] - 
pub(crate) const fn from_usize(value: usize) -> Self { - FrameId(value as u32) + pub(crate) const fn new(id: u32) -> Self { + FrameId(id) } } @@ -85,16 +75,24 @@ impl Frames { Self { inner: frames, hand: Mutex::new(0) } } - #[inline(always)] - pub(crate) fn get(&self, frame_id: FrameId) -> &Frame { - &self.inner[frame_id.0 as usize] + #[inline] + pub(crate) fn get(&self, frame_id: FrameId) -> Option<&Frame> { + self.inner.get(frame_id.0 as usize) + } + + #[cfg(test)] + #[inline] + pub(crate) fn pin(&self, frame_id: FrameId) -> Option<()> { + let frame = self.get(frame_id)?; + Some(frame.pin()) } // Unpin the frame if the occupied page is not dirty. // If the page is in dirty state, returns false. - pub(crate) fn unpin(&self, frame_id: FrameId) -> bool { - let frame = self.get(frame_id); - frame.unpin() + #[inline] + pub(crate) fn unpin(&self, frame_id: FrameId) -> Option { + let frame = self.get(frame_id)?; + Some(frame.unpin()) } // Find a frame to be evicted, also pin that frame and running cleanup F function. @@ -127,7 +125,7 @@ impl Frames { // Pin the frame: set both pin (bit 0) and ref_bit (bit 1) to true frame.state.store(0b11, Ordering::Relaxed); - let frame_id = FrameId::from_usize(current_idx); + let frame_id = FrameId::new(current_idx as u32); cleanup(frame_id); return Some((frame_id, frame)); } @@ -137,7 +135,7 @@ impl Frames { } #[cfg(test)] - fn count_pinned(&self) -> usize { + pub(crate) fn count_pinned(&self) -> usize { self.inner.iter().filter(|i| (i.state.load(Ordering::SeqCst) & 0b01) != 0).count() } } @@ -148,44 +146,45 @@ mod tests { #[test] fn test_pin() { - let r = ClockReplacer::new(5); - r.pin(FrameId::from_usize(0)); - assert_eq!(r.count_pinned(), 1); - r.pin(FrameId::from_usize(0)); - assert_eq!(r.count_pinned(), 1); - r.pin(FrameId::from_usize(4)); - assert_eq!(r.count_pinned(), 2); + let f = Frames::allocate(5); + f.pin(FrameId::new(0)); + assert_eq!(f.count_pinned(), 1); + f.pin(FrameId::new(0)); + assert_eq!(f.count_pinned(), 1); + f.pin(FrameId::new(4)); + assert_eq!(f.count_pinned(), 2); } #[test] fn test_upin() { - let r = ClockReplacer::new(5); - r.pin(FrameId::from_usize(0)); - r.pin(FrameId::from_usize(1)); - r.pin(FrameId::from_usize(1)); - r.pin(FrameId::from_usize(2)); - assert_eq!(r.count_pinned(), 3); - - r.unpin(FrameId::from_usize(2)); - assert_eq!(r.count_pinned(), 2); - - r.unpin(FrameId::from_usize(1)); - assert_eq!(r.count_pinned(), 1); - r.unpin(FrameId::from_usize(1)); - assert_eq!(r.count_pinned(), 1); + let f = Frames::allocate(5); + f.pin(FrameId::new(0)); + f.pin(FrameId::new(0)); + f.pin(FrameId::new(1)); + f.pin(FrameId::new(2)); + assert_eq!(f.count_pinned(), 3); + + f.unpin(FrameId::new(2)); + assert_eq!(f.count_pinned(), 2); + + f.unpin(FrameId::new(1)); + assert_eq!(f.count_pinned(), 1); + f.unpin(FrameId::new(1)); + assert_eq!(f.count_pinned(), 1); } #[test] fn test_evict() { - let r = ClockReplacer::new(5); + let f = Frames::allocate(5); (0..5).for_each(|i| { - assert_eq!(r.victim_and_pin(|_| {}), Some(FrameId::from_usize(i))); - r.pin(FrameId::from_usize(i)); + let result = f.victim_and_pin(|_| {}).unwrap(); + assert_eq!(result.0, FrameId::new(i)); + f.pin(FrameId::new(i)); }); - assert_eq!(r.victim_and_pin(|_| {}), None); + assert_eq!(f.victim_and_pin(|_| {}).is_none(), true); - r.unpin(FrameId::from_usize(4)); - assert_eq!(r.victim_and_pin(|_| {}), Some(FrameId::from_usize(4))); + f.unpin(FrameId::new(4)); + let evicted = f.victim_and_pin(|_| {}).unwrap(); + assert_eq!(evicted.0, FrameId::new(4)); } } - From 
f704185d08df10e867fc12b00b83805ced773ce2 Mon Sep 17 00:00:00 2001 From: nqd Date: Sun, 25 Jan 2026 07:20:23 +0000 Subject: [PATCH 62/65] write read example --- examples/write_read/main.rs | 147 ++++++++++++++++++++++++++++++++++++ 1 file changed, 147 insertions(+) create mode 100644 examples/write_read/main.rs diff --git a/examples/write_read/main.rs b/examples/write_read/main.rs new file mode 100644 index 00000000..cbe6463c --- /dev/null +++ b/examples/write_read/main.rs @@ -0,0 +1,147 @@ +use std::env; + +use alloy_primitives::{Address, StorageKey, StorageValue, U256}; +use alloy_trie::{EMPTY_ROOT_HASH, KECCAK_EMPTY}; +use rand::prelude::*; +use triedb::{ + account::Account, + database::DatabaseOptions, + path::{AddressPath, StoragePath}, + transaction::TransactionError, + Database, +}; + +pub const DEFAULT_SETUP_DB_EOA_SIZE: usize = 1_000_000; +pub const DEFAULT_SETUP_DB_CONTRACT_SIZE: usize = 100_000; +pub const DEFAULT_SETUP_DB_STORAGE_PER_CONTRACT: usize = 10; +const SEED_EOA: u64 = 42; // EOA seeding value +const SEED_CONTRACT: u64 = 43; // contract account seeding value + +pub fn generate_random_address(rng: &mut StdRng) -> AddressPath { + let addr = Address::random_with(rng); + AddressPath::for_address(addr) +} + +pub fn write( + db: &Database, + repeat: usize, + eoa_size: usize, + contract_size: usize, + storage_per_contract: usize, +) -> Result<(), TransactionError> { + // Populate database with initial accounts + let mut eoa_rng = StdRng::seed_from_u64(SEED_EOA); + let mut contract_rng = StdRng::seed_from_u64(SEED_CONTRACT); + for _i in 0..repeat { + let mut tx = db.begin_rw()?; + for i in 1..=eoa_size { + let address = generate_random_address(&mut eoa_rng); + let account = + Account::new(i as u64, U256::from(i as u64), EMPTY_ROOT_HASH, KECCAK_EMPTY); + + tx.set_account(address, Some(account))?; + } + + for i in 1..=contract_size { + let address = generate_random_address(&mut contract_rng); + let account = + Account::new(i as u64, U256::from(i as u64), EMPTY_ROOT_HASH, KECCAK_EMPTY); + + tx.set_account(address.clone(), Some(account))?; + + // add random storage to each account + for key in 1..=storage_per_contract { + let storage_key = StorageKey::from(U256::from(key)); + let storage_path = + StoragePath::for_address_path_and_slot(address.clone(), storage_key); + let storage_value = + StorageValue::from_be_slice(storage_path.get_slot().pack().as_slice()); + + tx.set_storage_slot(storage_path, Some(storage_value))?; + } + } + + tx.commit()?; + } + println!("root hash: {:?}", db.state_root()); + + Ok(()) +} + +pub fn read( + db: &Database, + repeat: usize, + eoa_size: usize, + contract_size: usize, + storage_per_contract: usize, +) -> Result<(), TransactionError> { + // Populate database with initial accounts + let mut eoa_rng = StdRng::seed_from_u64(SEED_EOA); + let mut contract_rng = StdRng::seed_from_u64(SEED_CONTRACT); + for _i in 0..repeat { + let mut tx = db.begin_rw()?; + for i in 1..=eoa_size { + let address = generate_random_address(&mut eoa_rng); + let acc = tx.get_account(&address)?; + assert_eq!( + acc, + Some(Account::new(i as u64, U256::from(i as u64), EMPTY_ROOT_HASH, KECCAK_EMPTY)) + ); + } + + for i in 1..=contract_size { + let address = generate_random_address(&mut contract_rng); + let account = + Account::new(i as u64, U256::from(i as u64), EMPTY_ROOT_HASH, KECCAK_EMPTY); + + tx.set_account(address.clone(), Some(account))?; + let acc = tx.get_account(&address)?; + assert_eq!( + acc, + Some(Account::new(i as u64, U256::from(i as u64), EMPTY_ROOT_HASH, 
KECCAK_EMPTY))
+            );
+
+            // verify the storage slots written for each account
+            for key in 1..=storage_per_contract {
+                let storage_key = StorageKey::from(U256::from(key));
+                let storage_path =
+                    StoragePath::for_address_path_and_slot(address.clone(), storage_key);
+
+                let val = tx.get_storage_slot(&storage_path)?;
+                assert_eq!(
+                    val,
+                    Some(StorageValue::from_be_slice(storage_path.get_slot().pack().as_slice()))
+                );
+            }
+        }
+
+        tx.commit()?;
+    }
+
+    Ok(())
+}
+
+fn main() {
+    let args: Vec<String> = env::args().collect();
+
+    let db_path = args.get(1).unwrap();
+    let repeat = args.get(2).and_then(|s| s.parse::<usize>().ok()).unwrap_or(1);
+    let eoa_size =
+        args.get(3).and_then(|s| s.parse::<usize>().ok()).unwrap_or(DEFAULT_SETUP_DB_EOA_SIZE);
+    let contract_size =
+        args.get(4).and_then(|s| s.parse::<usize>().ok()).unwrap_or(DEFAULT_SETUP_DB_CONTRACT_SIZE);
+    let storage_per_contract = args
+        .get(5)
+        .and_then(|s| s.parse::<usize>().ok())
+        .unwrap_or(DEFAULT_SETUP_DB_STORAGE_PER_CONTRACT);
+
+    let db = DatabaseOptions::default()
+        .create_new(true)
+        .num_frames(1024 * 1024 * 6)
+        .open(db_path)
+        .unwrap();
+
+    println!("eoa size: {eoa_size}, contract size: {contract_size}, storage per contract: {storage_per_contract}, repeat: {repeat}");
+
+    write(&db, repeat, eoa_size, contract_size, storage_per_contract).unwrap();
+}
From 1a53cf4f58afd1cf6f2ee0d1bca5c1d774d2f5b8 Mon Sep 17 00:00:00 2001
From: nqd
Date: Sun, 25 Jan 2026 14:54:00 +0000
Subject: [PATCH 63/65] fix batch of iovec

---
 src/page/manager/buffer_pool.rs | 81 ++++++++++++++++++---------------
 1 file changed, 44 insertions(+), 37 deletions(-)

diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs
index e9358b70..88f928cd 100644
--- a/src/page/manager/buffer_pool.rs
+++ b/src/page/manager/buffer_pool.rs
@@ -397,6 +397,8 @@ impl PageManager {
 
         // Write contiguous new pages as a batch using writev.
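+        // A single writev submission covers many pages at once: the new pages are allocated with
+        // consecutive page ids, so their buffers are written back-to-back starting at the first
+        // page's file offset.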
// Note: iovecs must stay alive until operations complete, so we define it outside the scope + // IOV_MAX is typically 1024 on most systems, so we batch writes to respect this limit + const IOV_MAX: usize = 1024; let _iovecs = if !new_pages.is_empty() { // Collect iovec for new pages let iovecs: Vec = new_pages @@ -407,28 +409,33 @@ impl PageManager { }) .collect(); - // Get the offset of the first new page - let first_offset = new_pages[0].1.as_offset() as u64; - - unsafe { - let writev_op = - opcode::Writev::new(types::Fd(fd), iovecs.as_ptr(), iovecs.len() as u32) - .offset(first_offset) - .build() - .user_data(op_count); - - // Submit to ring - loop { - let mut sq = ring_guard.submission(); - match sq.push(&writev_op) { - Ok(_) => { - op_count += 1; - break; - } - Err(_) => { - // Submission queue is full, submit and wait - drop(sq); - ring_guard.submit_and_wait(1)?; + // Split into batches of IOV_MAX to avoid EINVAL errors + for (batch_idx, iovec_chunk) in iovecs.chunks(IOV_MAX).enumerate() { + let batch_offset = new_pages[batch_idx * IOV_MAX].1.as_offset() as u64; + + unsafe { + let writev_op = opcode::Writev::new( + types::Fd(fd), + iovec_chunk.as_ptr(), + iovec_chunk.len() as u32, + ) + .offset(batch_offset) + .build() + .user_data(op_count); + + // Submit to ring + loop { + let mut sq = ring_guard.submission(); + match sq.push(&writev_op) { + Ok(_) => { + op_count += 1; + break; + } + Err(_) => { + // Submission queue is full, submit and wait + drop(sq); + ring_guard.submit_and_wait(1)?; + } } } } @@ -535,21 +542,21 @@ impl PageManager { #[inline] pub fn drop_page_mut(&self, page_id: PageId) { - if self.updated_pages.get(&page_id).is_some() { - let mut drop_pages = self.drop_pages.lock(); - drop_pages.push(page_id); - if drop_pages.len() >= 10 { - // iter thru all items in drop_pages and remove from the drop_pages - let mut pages = Vec::with_capacity(10); - drop_pages.iter().for_each(|p| { - if let Some(f) = self.page_table.get(p) { - pages.push((*f.key(), *f.value())); - } - }); - self.tx_job.send(WriteMessage::Pages(pages)).unwrap(); - drop_pages.clear(); - } - } + // if self.updated_pages.get(&page_id).is_some() { + // let mut drop_pages = self.drop_pages.lock(); + // drop_pages.push(page_id); + // if drop_pages.len() >= 10 { + // // iter thru all items in drop_pages and remove from the drop_pages + // let mut pages = Vec::with_capacity(10); + // drop_pages.iter().for_each(|p| { + // if let Some(f) = self.page_table.get(p) { + // pages.push((*f.key(), *f.value())); + // } + // }); + // self.tx_job.send(WriteMessage::Pages(pages)).unwrap(); + // drop_pages.clear(); + // } + // } } fn next_page_id(&self) -> Option<(PageId, u32)> { From 37ea6a221ea5e0cb5807423e0723ea9eaaab747c Mon Sep 17 00:00:00 2001 From: nqd Date: Thu, 5 Feb 2026 13:46:44 +0000 Subject: [PATCH 64/65] update background --- src/page/manager/buffer_pool.rs | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs index 88f928cd..03bf8d80 100644 --- a/src/page/manager/buffer_pool.rs +++ b/src/page/manager/buffer_pool.rs @@ -542,21 +542,21 @@ impl PageManager { #[inline] pub fn drop_page_mut(&self, page_id: PageId) { - // if self.updated_pages.get(&page_id).is_some() { - // let mut drop_pages = self.drop_pages.lock(); - // drop_pages.push(page_id); - // if drop_pages.len() >= 10 { - // // iter thru all items in drop_pages and remove from the drop_pages - // let mut pages = Vec::with_capacity(10); - // 
drop_pages.iter().for_each(|p| {
+        //                 if let Some(f) = self.page_table.get(p) {
+        //                     pages.push((*f.key(), *f.value()));
+        //                 }
+        //             });
+        //             self.tx_job.send(WriteMessage::Pages(pages)).unwrap();
+        //             drop_pages.clear();
+        //         }
+        // }
     }
 
     fn next_page_id(&self) -> Option<(PageId, u32)> {
From 37ea6a221ea5e0cb5807423e0723ea9eaaab747c Mon Sep 17 00:00:00 2001
From: nqd
Date: Thu, 5 Feb 2026 13:46:44 +0000
Subject: [PATCH 64/65] update background

---
 src/page/manager/buffer_pool.rs | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs
index 88f928cd..03bf8d80 100644
--- a/src/page/manager/buffer_pool.rs
+++ b/src/page/manager/buffer_pool.rs
@@ -542,21 +542,21 @@ impl PageManager {
 
     #[inline]
     pub fn drop_page_mut(&self, page_id: PageId) {
-        // if self.updated_pages.get(&page_id).is_some() {
-        //     let mut drop_pages = self.drop_pages.lock();
-        //     drop_pages.push(page_id);
-        //     if drop_pages.len() >= 10 {
-        //         // iter thru all items in drop_pages and remove from the drop_pages
-        //         let mut pages = Vec::with_capacity(10);
-        //         drop_pages.iter().for_each(|p| {
-        //             if let Some(f) = self.page_table.get(p) {
-        //                 pages.push((*f.key(), *f.value()));
-        //             }
-        //         });
-        //         self.tx_job.send(WriteMessage::Pages(pages)).unwrap();
-        //         drop_pages.clear();
-        //     }
-        // }
+        if self.updated_pages.get(&page_id).is_some() {
+            let mut drop_pages = self.drop_pages.lock();
+            drop_pages.push(page_id);
+            if drop_pages.len() >= 10 {
+                // iterate through all items in drop_pages and remove them from drop_pages
+                let mut pages = Vec::with_capacity(8);
+                drop_pages.iter().for_each(|p| {
+                    if let Some(f) = self.page_table.get(p) {
+                        pages.push((*f.key(), *f.value()));
+                    }
+                });
+                self.tx_job.send(WriteMessage::Pages(pages)).unwrap();
+                drop_pages.clear();
+            }
+        }
     }
 
From ffb95774fca76d328904117b310a41650394538d Mon Sep 17 00:00:00 2001
From: nqd
Date: Sun, 8 Feb 2026 14:15:15 +0000
Subject: [PATCH 65/65] need to update pin status

---
 examples/insert/main.rs         |  2 +-
 src/page/manager/buffer_pool.rs | 46 +++++++++++++++++++--------------
 2 files changed, 28 insertions(+), 20 deletions(-)

diff --git a/examples/insert/main.rs b/examples/insert/main.rs
index 828c73b5..cc4ddcf9 100644
--- a/examples/insert/main.rs
+++ b/examples/insert/main.rs
@@ -84,7 +84,7 @@ fn main() {
 
     let db = DatabaseOptions::default()
         .create_new(true)
-        .num_frames(1024 * 1024 * 6)
+        .num_frames(128 * 1)
         .open(db_path)
         .unwrap();
 
diff --git a/src/page/manager/buffer_pool.rs b/src/page/manager/buffer_pool.rs
index 03bf8d80..51c79f26 100644
--- a/src/page/manager/buffer_pool.rs
+++ b/src/page/manager/buffer_pool.rs
@@ -41,8 +41,8 @@ pub struct PageManager {
     page_count: AtomicU32,
     num_frames: u32,
 
-    updated_pages: Arc<FxMap<PageId, FrameId>>,
-    new_pages: Mutex<Vec<(FrameId, PageId)>>,
+    updated_pages: Arc<FxMap<PageId, FrameId>>, // pages updated during the current write transaction
+    new_pages: Mutex<Vec<(FrameId, PageId)>>, // new pages created for the current transaction
 
     file: RwLock<File>,
     file_len: AtomicU64,
@@ -65,6 +65,7 @@ impl std::fmt::Debug for PageManager {
 
 impl PageManager {
     const MAX_IO_URING_QUEUE_DEPTH: u32 = 2048;
+    const BACKGROUND_WRITING_PAGES_THRESHOLD: usize = 10;
 
     pub fn options() -> PageManagerOptions {
         PageManagerOptions::new()
@@ -156,9 +157,13 @@ impl PageManager {
                     }
                     // Then unpin those written pages so that the pages could be evicted and reused
                     pages.iter().for_each(|(page_id, frame_id)| {
-                        // TODO: Could have race condition between unpin and remove actions.
+                        // TODO: checking updated_pages alone will not work, since the same page/frame could be read again.
+                        // To close this race, we could track an explicit pin status: PIN, UNPIN_UNWRITTEN, UNPIN.
+                        // When a mutable page is dropped: PIN -> UNPIN_UNWRITTEN.
+                        // When the page is used again: UNPIN_UNWRITTEN/UNPIN -> PIN.
+                        // After the background write completes: UNPIN_UNWRITTEN -> UNPIN.
+                        if updated_pages.get(page_id).is_none() {
+                            frames.unpin(*frame_id);
+                        }
                     });
                 }
@@ -542,20 +547,23 @@ impl PageManager {
 
     #[inline]
     pub fn drop_page_mut(&self, page_id: PageId) {
-        if self.updated_pages.get(&page_id).is_some() {
-            let mut drop_pages = self.drop_pages.lock();
-            drop_pages.push(page_id);
-            if drop_pages.len() >= 10 {
-                // iterate through all items in drop_pages and remove them from drop_pages
-                let mut pages = Vec::with_capacity(8);
-                drop_pages.iter().for_each(|p| {
-                    if let Some(f) = self.page_table.get(p) {
-                        pages.push((*f.key(), *f.value()));
-                    }
-                });
-                self.tx_job.send(WriteMessage::Pages(pages)).unwrap();
-                drop_pages.clear();
-            }
-        }
+        if self.updated_pages.get(&page_id).is_none() {
+            return;
+        }
+
+        let mut drop_pages = self.drop_pages.lock();
+        drop_pages.push(page_id);
+        if drop_pages.len() >= Self::BACKGROUND_WRITING_PAGES_THRESHOLD {
+            // Don't unpin these pages until they have been written to disk.
+            let mut pages = Vec::with_capacity(Self::BACKGROUND_WRITING_PAGES_THRESHOLD);
+            drop_pages.iter().for_each(|p| {
+                let f = self.page_table.get(p).unwrap();
+                pages.push((*f.key(), *f.value()));
+                let updated_page = self.updated_pages.remove(p);
+                debug_assert!(updated_page.is_some());
+            });
+            self.tx_job.send(WriteMessage::Pages(pages)).unwrap();
+            drop_pages.clear();
+        }
     }
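The TODO in patch 65 proposes a three-state pin lifecycle (PIN, UNPIN_UNWRITTEN, UNPIN) to close the race between the background writer and concurrent readers. A minimal sketch of what those transitions could look like over a single atomic follows; the `PinState` type, its method names, and the state encoding are illustrative only and are not part of this series, which currently packs pin and ref bits into `Frame::state`.

use std::sync::atomic::{AtomicU32, Ordering};

// Illustrative pin lifecycle states from the TODO; not the encoding used by Frame::state.
const PIN: u32 = 0; // frame is in use and must not be evicted
const UNPIN_UNWRITTEN: u32 = 1; // mutable page dropped, but not yet flushed to disk
const UNPIN: u32 = 2; // flushed; the frame may be evicted and reused

struct PinState(AtomicU32);

impl PinState {
    fn new() -> Self {
        PinState(AtomicU32::new(UNPIN))
    }

    // A mutable page guard is dropped: PIN -> UNPIN_UNWRITTEN.
    fn release(&self) {
        let _ = self.0.compare_exchange(PIN, UNPIN_UNWRITTEN, Ordering::AcqRel, Ordering::Acquire);
    }

    // The page is used again: UNPIN_UNWRITTEN/UNPIN -> PIN.
    fn acquire(&self) {
        self.0.store(PIN, Ordering::Release);
    }

    // The background writer finished flushing the page: UNPIN_UNWRITTEN -> UNPIN.
    // Returns false if the page was re-pinned in the meantime, in which case the
    // frame must stay pinned and cannot be handed to the evictor.
    fn mark_written(&self) -> bool {
        self.0
            .compare_exchange(UNPIN_UNWRITTEN, UNPIN, Ordering::AcqRel, Ordering::Acquire)
            .is_ok()
    }
}

fn main() {
    let s = PinState::new();
    s.acquire();
    s.release();
    assert!(s.mark_written());
}

With this encoding the evictor treats anything other than UNPIN as pinned, so a frame whose flush races with a re-pin is never reclaimed, and a failed `mark_written` tells the background writer to leave the frame alone.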