From 93ec7503e08f126e180579f02dcdb6e7a95724ba Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 7 Sep 2024 16:11:36 +0300 Subject: [PATCH 01/21] Lock the correct revision of rust-postgres crates (#8960) We modified the crate in an incompatible way and upgraded to the new version in PR #8076. However, it was reverted in #8654. The revert reverted the Cargo.lock reference to it, but since Cargo.toml still points to the (tip of the) 'neon' branch, every time you make any other unrelated changes to Cargo.toml, it also tries to update the rust-postgres crates to the tip of the 'neon' branch again, which doesn't work. To fix, lock the crates to the exact commit SHA that works. --- Cargo.lock | 8 ++++---- Cargo.toml | 21 ++++++++++++++++----- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 634af671981a..cf3031c6d026 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4121,7 +4121,7 @@ dependencies = [ [[package]] name = "postgres" version = "0.19.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "bytes", "fallible-iterator", @@ -4134,7 +4134,7 @@ dependencies = [ [[package]] name = "postgres-protocol" version = "0.6.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "base64 0.20.0", "byteorder", @@ -4153,7 +4153,7 @@ dependencies = [ [[package]] name = "postgres-types" version = "0.2.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" +source = 
"git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "bytes", "fallible-iterator", @@ -6409,7 +6409,7 @@ dependencies = [ [[package]] name = "tokio-postgres" version = "0.7.7" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "async-trait", "byteorder", diff --git a/Cargo.toml b/Cargo.toml index 5045ee0d4d2b..920392097137 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -201,10 +201,21 @@ env_logger = "0.10" log = "0.4" ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed -postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } -postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } -postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } + +# We want to use the 'neon' branch for these, but there's currently one +# incompatible change on the branch. See: +# +# - PR #8076 which contained changes that depended on the new changes in +# the rust-postgres crate, and +# - PR #8654 which reverted those changes and made the code in proxy incompatible +# with the tip of the 'neon' branch again. +# +# When those proxy changes are re-applied (see PR #8747), we can switch using +# the tip of the 'neon' branch again. 
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" } +postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" } +postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" } ## Local libraries compute_api = { version = "0.1", path = "./libs/compute_api/" } @@ -241,7 +252,7 @@ tonic-build = "0.9" [patch.crates-io] # Needed to get `tokio-postgres-rustls` to depend on our fork. -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" } # bug fixes for UUID parquet = { git = "https://github.com/apache/arrow-rs", branch = "master" } From 89c5e80b3ff55f0f316aebca0bba497eba7fbec8 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sun, 8 Sep 2024 21:47:23 +0300 Subject: [PATCH 02/21] Update toml and toml_edit crates (#8963) Eliminates a few duplicate versions from the dependency tree. 
--- Cargo.lock | 57 +++++++------------------------ Cargo.toml | 4 +-- control_plane/src/pageserver.rs | 10 +++--- libs/remote_storage/src/config.rs | 2 +- libs/utils/src/toml_edit_ext.rs | 2 +- pageserver/ctl/src/main.rs | 2 +- pageserver/src/tenant/config.rs | 3 +- workspace_hack/Cargo.toml | 2 ++ 8 files changed, 26 insertions(+), 56 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cf3031c6d026..30c9f7e08053 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1246,7 +1246,7 @@ dependencies = [ "tokio-postgres", "tokio-stream", "tokio-util", - "toml_edit 0.19.10", + "toml_edit", "tracing", "tracing-opentelemetry", "tracing-subscriber", @@ -1360,8 +1360,8 @@ dependencies = [ "tokio", "tokio-postgres", "tokio-util", - "toml 0.7.4", - "toml_edit 0.19.10", + "toml", + "toml_edit", "tracing", "url", "utils", @@ -3144,7 +3144,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fd01039851e82f8799046eabbb354056283fb265c8ec0996af940f4e85a380ff" dependencies = [ "serde", - "toml 0.8.14", + "toml", ] [[package]] @@ -3660,7 +3660,7 @@ dependencies = [ "thiserror", "tokio", "tokio-util", - "toml_edit 0.19.10", + "toml_edit", "utils", "workspace_hack", ] @@ -3747,7 +3747,7 @@ dependencies = [ "tokio-stream", "tokio-tar", "tokio-util", - "toml_edit 0.19.10", + "toml_edit", "tracing", "twox-hash", "url", @@ -4812,7 +4812,7 @@ dependencies = [ "tokio", "tokio-stream", "tokio-util", - "toml_edit 0.19.10", + "toml_edit", "tracing", "utils", ] @@ -5322,7 +5322,7 @@ dependencies = [ "tokio-stream", "tokio-tar", "tokio-util", - "toml_edit 0.19.10", + "toml_edit", "tracing", "tracing-subscriber", "url", @@ -6520,18 +6520,6 @@ dependencies = [ "tracing", ] -[[package]] -name = "toml" -version = "0.7.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6135d499e69981f9ff0ef2167955a5333c35e36f6937d382974566b3d5b94ec" -dependencies = [ - "serde", - "serde_spanned", - "toml_datetime", - "toml_edit 0.19.10", -] - [[package]] name = 
"toml" version = "0.8.14" @@ -6541,7 +6529,7 @@ dependencies = [ "serde", "serde_spanned", "toml_datetime", - "toml_edit 0.22.14", + "toml_edit", ] [[package]] @@ -6553,19 +6541,6 @@ dependencies = [ "serde", ] -[[package]] -name = "toml_edit" -version = "0.19.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2380d56e8670370eee6566b0bfd4265f65b3f432e8c6d85623f728d4fa31f739" -dependencies = [ - "indexmap 1.9.3", - "serde", - "serde_spanned", - "toml_datetime", - "winnow 0.4.6", -] - [[package]] name = "toml_edit" version = "0.22.14" @@ -6576,7 +6551,7 @@ dependencies = [ "serde", "serde_spanned", "toml_datetime", - "winnow 0.6.13", + "winnow", ] [[package]] @@ -6989,7 +6964,7 @@ dependencies = [ "tokio-stream", "tokio-tar", "tokio-util", - "toml_edit 0.19.10", + "toml_edit", "tracing", "tracing-error", "tracing-subscriber", @@ -7535,15 +7510,6 @@ version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" -[[package]] -name = "winnow" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61de7bac303dc551fe038e2b3cef0f571087a47571ea6e79a87692ac99b99699" -dependencies = [ - "memchr", -] - [[package]] name = "winnow" version = "0.6.13" @@ -7651,6 +7617,7 @@ dependencies = [ "tokio", "tokio-rustls 0.24.0", "tokio-util", + "toml_edit", "tonic", "tower", "tracing", diff --git a/Cargo.toml b/Cargo.toml index 920392097137..107cd6cd44c3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -177,8 +177,8 @@ tokio-rustls = "0.25" tokio-stream = "0.1" tokio-tar = "0.3" tokio-util = { version = "0.7.10", features = ["io", "rt"] } -toml = "0.7" -toml_edit = "0.19" +toml = "0.8" +toml_edit = "0.22" tonic = {version = "0.9", features = ["tls", "tls-roots"]} tower-service = "0.3.2" tracing = "0.1" diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 31777eb7a569..33ca70af96c5 100644 
--- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -75,14 +75,14 @@ impl PageServerNode { } } - fn pageserver_make_identity_toml(&self, node_id: NodeId) -> toml_edit::Document { - toml_edit::Document::from_str(&format!("id={node_id}")).unwrap() + fn pageserver_make_identity_toml(&self, node_id: NodeId) -> toml_edit::DocumentMut { + toml_edit::DocumentMut::from_str(&format!("id={node_id}")).unwrap() } fn pageserver_init_make_toml( &self, conf: NeonLocalInitPageserverConf, - ) -> anyhow::Result { + ) -> anyhow::Result { assert_eq!(&PageServerConf::from(&conf), &self.conf, "during neon_local init, we derive the runtime state of ps conf (self.conf) from the --config flag fully"); // TODO(christian): instead of what we do here, create a pageserver_api::config::ConfigToml (PR #7656) @@ -137,9 +137,9 @@ impl PageServerNode { // Turn `overrides` into a toml document. // TODO: above code is legacy code, it should be refactored to use toml_edit directly. - let mut config_toml = toml_edit::Document::new(); + let mut config_toml = toml_edit::DocumentMut::new(); for fragment_str in overrides { - let fragment = toml_edit::Document::from_str(&fragment_str) + let fragment = toml_edit::DocumentMut::from_str(&fragment_str) .expect("all fragments in `overrides` are valid toml documents, this function controls that"); for (key, item) in fragment.iter() { config_toml.insert(key, item.clone()); diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs index f819a1572a5a..d0e92411dabe 100644 --- a/libs/remote_storage/src/config.rs +++ b/libs/remote_storage/src/config.rs @@ -185,7 +185,7 @@ mod tests { use super::*; fn parse(input: &str) -> anyhow::Result { - let toml = input.parse::().unwrap(); + let toml = input.parse::().unwrap(); RemoteStorageConfig::from_toml(toml.as_item()) } diff --git a/libs/utils/src/toml_edit_ext.rs b/libs/utils/src/toml_edit_ext.rs index ab5f7bdd95ab..1359e27b771d 100644 --- a/libs/utils/src/toml_edit_ext.rs 
+++ b/libs/utils/src/toml_edit_ext.rs @@ -10,7 +10,7 @@ pub fn deserialize_item(item: &toml_edit::Item) -> Result where T: serde::de::DeserializeOwned, { - let document: toml_edit::Document = match item { + let document: toml_edit::DocumentMut = match item { toml_edit::Item::Table(toml) => toml.clone().into(), toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => { toml.clone().into_table().into() diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index 3b66b0c4aa62..cf001ef0d5d4 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -174,7 +174,7 @@ async fn main() -> anyhow::Result<()> { println!("specified prefix '{}' failed validation", cmd.prefix); return Ok(()); }; - let toml_document = toml_edit::Document::from_str(&cmd.config_toml_str)?; + let toml_document = toml_edit::DocumentMut::from_str(&cmd.config_toml_str)?; let toml_item = toml_document .get("remote_storage") .expect("need remote_storage"); diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 7e0344666b45..547b43a39934 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -452,7 +452,8 @@ impl TryFrom for TenantConfOpt { .map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message())); } toml_edit::Item::Table(table) => { - let deserializer = toml_edit::de::Deserializer::new(table.into()); + let deserializer = + toml_edit::de::Deserializer::from(toml_edit::DocumentMut::from(table)); return serde_path_to_error::deserialize(deserializer) .map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message())); } diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 94f4c0f22f87..411ca8103254 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -83,6 +83,7 @@ time = { version = "0.3", features = ["macros", "serde-well-known"] } tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", 
"test-util"] } tokio-rustls = { version = "0.24" } tokio-util = { version = "0.7", features = ["codec", "compat", "io", "rt"] } +toml_edit = { version = "0.22", features = ["serde"] } tonic = { version = "0.9", features = ["tls-roots"] } tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "log", "timeout", "util"] } tracing = { version = "0.1", features = ["log"] } @@ -126,6 +127,7 @@ serde = { version = "1", features = ["alloc", "derive"] } syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full", "visit"] } syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } time-macros = { version = "0.2", default-features = false, features = ["formatting", "parsing", "serde"] } +toml_edit = { version = "0.22", features = ["serde"] } zstd = { version = "0.13" } zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] } zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] } From 2d885ac07ae0207ab886fd4dda84701ae33893f1 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sun, 8 Sep 2024 21:47:57 +0300 Subject: [PATCH 03/21] Update strum (#8962) I wanted to use some features from the newer version. The PR that needed the new version is not ready yet (and might never be), but seems nice to stay up in any case. 
--- Cargo.lock | 40 ++++++++++--------------------- Cargo.toml | 6 ++--- libs/pageserver_api/src/models.rs | 2 +- libs/utils/src/logging.rs | 6 ++--- pageserver/src/metrics.rs | 4 ++-- 5 files changed, 21 insertions(+), 37 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 30c9f7e08053..4fb3ac72230c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1189,9 +1189,9 @@ dependencies = [ [[package]] name = "comfy-table" -version = "6.1.4" +version = "7.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e7b787b0dc42e8111badfdbe4c3059158ccb2db8780352fa1b01e8ccf45cc4d" +checksum = "b34115915337defe99b2aff5c2ce6771e5fbc4079f4b506301f5cf394c8452f7" dependencies = [ "crossterm", "strum", @@ -1485,25 +1485,22 @@ checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" [[package]] name = "crossterm" -version = "0.25.0" +version = "0.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e64e6c0fbe2c17357405f7c758c1ef960fce08bdfb2c03d88d2a18d7e09c4b67" +checksum = "f476fe445d41c9e991fd07515a6f463074b782242ccf4a5b7b1d1012e70824df" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.4.1", "crossterm_winapi", "libc", - "mio", "parking_lot 0.12.1", - "signal-hook", - "signal-hook-mio", "winapi", ] [[package]] name = "crossterm_winapi" -version = "0.9.0" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ae1b35a484aa10e07fe0638d02301c5ad24de82d310ccbd2f3693da5f09bf1c" +checksum = "acdd7c62a3665c7f6830a51635d9ac9b23ed385797f70a83bb8bafe9c572ab2b" dependencies = [ "winapi", ] @@ -5731,17 +5728,6 @@ dependencies = [ "signal-hook-registry", ] -[[package]] -name = "signal-hook-mio" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29ad2e15f37ec9a6cc544097b78a1ec90001e9f71b81338ca39f430adaca99af" -dependencies = [ - "libc", - "mio", - "signal-hook", -] - [[package]] name = "signal-hook-registry" version = "1.4.1" @@ 
-6054,21 +6040,21 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" [[package]] name = "strum" -version = "0.24.1" +version = "0.26.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "063e6045c0e62079840579a7e47a355ae92f60eb74daaf156fb1e84ba164e63f" +checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" [[package]] name = "strum_macros" -version = "0.24.3" +version = "0.26.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59" +checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" dependencies = [ - "heck 0.4.1", + "heck 0.5.0", "proc-macro2", "quote", "rustversion", - "syn 1.0.109", + "syn 2.0.52", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 107cd6cd44c3..40e399619d18 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -73,7 +73,7 @@ camino = "1.1.6" cfg-if = "1.0.0" chrono = { version = "0.4", default-features = false, features = ["clock"] } clap = { version = "4.0", features = ["derive"] } -comfy-table = "6.1" +comfy-table = "7.1" const_format = "0.2" crc32c = "0.6" crossbeam-deque = "0.8.5" @@ -158,8 +158,8 @@ signal-hook = "0.3" smallvec = "1.11" smol_str = { version = "0.2.0", features = ["serde"] } socket2 = "0.5" -strum = "0.24" -strum_macros = "0.24" +strum = "0.26" +strum_macros = "0.26" "subtle" = "2.5.0" svg_fmt = "0.4.3" sync_wrapper = "0.1.2" diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index ffe79c8350da..45e84baa1f5d 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -62,7 +62,7 @@ use bytes::{Buf, BufMut, Bytes, BytesMut}; serde::Serialize, serde::Deserialize, strum_macros::Display, - strum_macros::EnumVariantNames, + strum_macros::VariantNames, strum_macros::AsRefStr, strum_macros::IntoStaticStr, )] diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index 
2ea0781667bc..e205d60d747d 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -3,11 +3,9 @@ use std::str::FromStr; use anyhow::Context; use metrics::{IntCounter, IntCounterVec}; use once_cell::sync::Lazy; -use strum_macros::{EnumString, EnumVariantNames}; +use strum_macros::{EnumString, VariantNames}; -#[derive( - EnumString, strum_macros::Display, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy, -)] +#[derive(EnumString, strum_macros::Display, VariantNames, Eq, PartialEq, Debug, Clone, Copy)] #[strum(serialize_all = "snake_case")] pub enum LogFormat { Plain, diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index c4011d593c93..9197505876f3 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -9,7 +9,7 @@ use metrics::{ use once_cell::sync::Lazy; use pageserver_api::shard::TenantShardId; use strum::{EnumCount, VariantNames}; -use strum_macros::{EnumVariantNames, IntoStaticStr}; +use strum_macros::{IntoStaticStr, VariantNames}; use tracing::warn; use utils::id::TimelineId; @@ -27,7 +27,7 @@ const CRITICAL_OP_BUCKETS: &[f64] = &[ ]; // Metrics collected on operations on the storage repository. -#[derive(Debug, EnumVariantNames, IntoStaticStr)] +#[derive(Debug, VariantNames, IntoStaticStr)] #[strum(serialize_all = "kebab_case")] pub(crate) enum StorageTimeOperation { #[strum(serialize = "layer flush")] From c8f67eed8f0e3ed182ebe85753389ae5b1c161ea Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 9 Sep 2024 10:34:56 +0300 Subject: [PATCH 04/21] Remove TEST_SHARED_FIXTURES (#8965) I wish it worked, but it's been broken for a long time, so let's admit defeat and remove it. The idea of sharing the same pageserver and safekeeper environment between tests is still sound, and it could save a lot of time in our CI. We should perhaps put some time into doing that, but we're better off starting from scratch than trying to make TEST_SHARED_FIXTURES work in its current form. 
--- test_runner/README.md | 12 ++--- test_runner/fixtures/neon_fixtures.py | 68 +++------------------------ 2 files changed, 10 insertions(+), 70 deletions(-) diff --git a/test_runner/README.md b/test_runner/README.md index 73aa29d4bb05..647b930b26ac 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -18,8 +18,7 @@ Prerequisites: Regression tests are in the 'regress' directory. They can be run in parallel to minimize total runtime. Most regression test sets up their -environment with its own pageservers and safekeepers (but see -`TEST_SHARED_FIXTURES`). +environment with its own pageservers and safekeepers. 'pg_clients' contains tests for connecting with various client libraries. Each client test uses a Dockerfile that pulls an image that @@ -74,7 +73,6 @@ This is used to construct full path to the postgres binaries. Format is 2-digit major version nubmer, i.e. `DEFAULT_PG_VERSION=16` `TEST_OUTPUT`: Set the directory where test state and test output files should go. -`TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests. `RUST_LOG`: logging configuration to pass into Neon CLI Useful parameters and commands: @@ -259,11 +257,9 @@ compute Postgres nodes. The connections between them can be configured to use JW authentication tokens, and some other configuration options can be tweaked too. The easiest way to get access to a Neon Environment is by using the `neon_simple_env` -fixture. The 'simple' env may be shared across multiple tests, so don't shut down the nodes -or make other destructive changes in that environment. Also don't assume that -there are no tenants or branches or data in the cluster. For convenience, there is a -branch called `empty`, though. The convention is to create a test-specific branch of -that and load any test data there, instead of the 'main' branch. +fixture. For convenience, there is a branch called `empty` in environments created with +'neon_simple_env'. 
The convention is to create a test-specific branch of that and load any +test data there, instead of the 'main' branch. For more complicated cases, you can build a custom Neon Environment, with the `neon_env` fixture: diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 5a600dd0a196..3047dcc4f78a 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -221,33 +221,6 @@ def neon_api(neon_api_key: str, neon_api_base_url: str) -> NeonAPI: return NeonAPI(neon_api_key, neon_api_base_url) -def shareable_scope(fixture_name: str, config: Config) -> Literal["session", "function"]: - """Return either session of function scope, depending on TEST_SHARED_FIXTURES envvar. - - This function can be used as a scope like this: - @pytest.fixture(scope=shareable_scope) - def myfixture(...) - ... - """ - scope: Literal["session", "function"] - - if os.environ.get("TEST_SHARED_FIXTURES") is None: - # Create the environment in the per-test output directory - scope = "function" - elif ( - os.environ.get("BUILD_TYPE") is not None - and os.environ.get("DEFAULT_PG_VERSION") is not None - ): - scope = "session" - else: - pytest.fail( - "Shared environment(TEST_SHARED_FIXTURES) requires BUILD_TYPE and DEFAULT_PG_VERSION to be set", - pytrace=False, - ) - - return scope - - @pytest.fixture(scope="session") def worker_port_num(): return (32768 - BASE_PORT) // int(os.environ.get("PYTEST_XDIST_WORKER_COUNT", "1")) @@ -1431,8 +1404,8 @@ def generate_endpoint_id(self) -> str: return "ep-" + str(self.endpoint_counter) -@pytest.fixture(scope=shareable_scope) -def _shared_simple_env( +@pytest.fixture(scope="function") +def neon_simple_env( request: FixtureRequest, pytestconfig: Config, port_distributor: PortDistributor, @@ -1450,19 +1423,13 @@ def _shared_simple_env( pageserver_io_buffer_alignment: Optional[int], ) -> Iterator[NeonEnv]: """ - # Internal fixture backing the `neon_simple_env` fixture. 
If TEST_SHARED_FIXTURES - is set, this is shared by all tests using `neon_simple_env`. + Simple Neon environment, with no authentication and no safekeepers. This fixture will use RemoteStorageKind.LOCAL_FS with pageserver. """ - if os.environ.get("TEST_SHARED_FIXTURES") is None: - # Create the environment in the per-test output directory - repo_dir = get_test_repo_dir(request, top_output_dir) - else: - # We're running shared fixtures. Share a single directory. - repo_dir = top_output_dir / "shared_repo" - shutil.rmtree(repo_dir, ignore_errors=True) + # Create the environment in the per-test output directory + repo_dir = get_test_repo_dir(request, top_output_dir) with NeonEnvBuilder( top_output_dir=top_output_dir, @@ -1489,22 +1456,6 @@ def _shared_simple_env( yield env - -@pytest.fixture(scope="function") -def neon_simple_env(_shared_simple_env: NeonEnv) -> Iterator[NeonEnv]: - """ - Simple Neon environment, with no authentication and no safekeepers. - - If TEST_SHARED_FIXTURES environment variable is set, we reuse the same - environment for all tests that use 'neon_simple_env', keeping the - page server and safekeepers running. Any compute nodes are stopped after - each the test, however. - """ - yield _shared_simple_env - - _shared_simple_env.endpoints.stop_all() - - @pytest.fixture(scope="function") def neon_env_builder( pytestconfig: Config, @@ -4898,14 +4849,7 @@ def pytest_addoption(parser: Parser): # This is autouse, so the test output directory always gets created, even -# if a test doesn't put anything there. It also solves a problem with the -# neon_simple_env fixture: if TEST_SHARED_FIXTURES is not set, it -# creates the repo in the test output directory. But it cannot depend on -# 'test_output_dir' fixture, because when TEST_SHARED_FIXTURES is not set, -# it has 'session' scope and cannot access fixtures with 'function' -# scope. So it uses the get_test_output_dir() function to get the path, and -# this fixture ensures that the directory exists. 
That works because -# 'autouse' fixtures are run before other fixtures. +# if a test doesn't put anything there. # # NB: we request the overlay dir fixture so the fixture does its cleanups @pytest.fixture(scope="function", autouse=True) From 723c0971e818848696984fd66c562c9d0cbff948 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 9 Sep 2024 10:35:12 +0300 Subject: [PATCH 05/21] Don't create 'empty' branch in neon_simple_env (#8965) Now that we've given up hope on sharing the neon_simple_env between tests, there's no reason to not use the 'main' branch directly. --- test_runner/README.md | 5 ++-- test_runner/fixtures/neon_fixtures.py | 5 +--- .../performance/test_logical_replication.py | 4 +-- test_runner/regress/test_basebackup_error.py | 3 +-- test_runner/regress/test_clog_truncate.py | 5 ++-- test_runner/regress/test_compute_catalog.py | 3 +-- test_runner/regress/test_config.py | 3 +-- test_runner/regress/test_createdropdb.py | 17 ++++--------- test_runner/regress/test_createuser.py | 5 ++-- test_runner/regress/test_ddl_forwarding.py | 3 +-- .../regress/test_explain_with_lfc_stats.py | 6 ++--- test_runner/regress/test_lfc_resize.py | 3 +-- .../test_lfc_working_set_approximation.py | 6 ++--- test_runner/regress/test_local_file_cache.py | 7 ++---- .../regress/test_logical_replication.py | 11 +++----- test_runner/regress/test_migrations.py | 3 +-- test_runner/regress/test_multixact.py | 7 +++--- test_runner/regress/test_neon_superuser.py | 2 +- test_runner/regress/test_parallel_copy.py | 3 +-- .../regress/test_pg_query_cancellation.py | 4 +-- test_runner/regress/test_pg_waldump.py | 4 +-- test_runner/regress/test_read_validation.py | 12 ++------- test_runner/regress/test_readonly_node.py | 25 ++++++++----------- test_runner/regress/test_subxacts.py | 3 +-- test_runner/regress/test_timeline_delete.py | 7 ++++-- test_runner/regress/test_timeline_size.py | 4 +-- test_runner/regress/test_twophase.py | 7 ++---- test_runner/regress/test_unlogged.py | 5 ++-- 
test_runner/regress/test_vm_bits.py | 5 ++-- test_runner/test_broken.py | 3 +-- 30 files changed, 65 insertions(+), 115 deletions(-) diff --git a/test_runner/README.md b/test_runner/README.md index 647b930b26ac..d754e60d1737 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -257,9 +257,8 @@ compute Postgres nodes. The connections between them can be configured to use JW authentication tokens, and some other configuration options can be tweaked too. The easiest way to get access to a Neon Environment is by using the `neon_simple_env` -fixture. For convenience, there is a branch called `empty` in environments created with -'neon_simple_env'. The convention is to create a test-specific branch of that and load any -test data there, instead of the 'main' branch. +fixture. For convenience, there is a branch called `main` in environments created with +'neon_simple_env', ready to be used in the test. For more complicated cases, you can build a custom Neon Environment, with the `neon_env` fixture: diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 3047dcc4f78a..60887b9aed36 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -57,7 +57,6 @@ from psycopg2.extensions import connection as PgConnection from psycopg2.extensions import cursor as PgCursor from psycopg2.extensions import make_dsn, parse_dsn -from typing_extensions import Literal from urllib3.util.retry import Retry from fixtures import overlayfs @@ -1451,11 +1450,9 @@ def neon_simple_env( ) as builder: env = builder.init_start() - # For convenience in tests, create a branch from the freshly-initialized cluster. 
- env.neon_cli.create_branch("empty", ancestor_branch_name=DEFAULT_BRANCH_NAME) - yield env + @pytest.fixture(scope="function") def neon_env_builder( pytestconfig: Config, diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index 077f73ac06ea..29a038052472 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -22,10 +22,8 @@ def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg): env = neon_simple_env - env.neon_cli.create_branch("test_logical_replication", "empty") - endpoint = env.endpoints.create_start("test_logical_replication") + endpoint = env.endpoints.create_start("main") - log.info("postgres is running on 'test_logical_replication' branch") pg_bin.run_capture(["pgbench", "-i", "-s10", endpoint.connstr()]) endpoint.safe_psql("create publication pub1 for table pgbench_accounts, pgbench_history") diff --git a/test_runner/regress/test_basebackup_error.py b/test_runner/regress/test_basebackup_error.py index 170b49488420..13c080ea0e60 100644 --- a/test_runner/regress/test_basebackup_error.py +++ b/test_runner/regress/test_basebackup_error.py @@ -8,11 +8,10 @@ # def test_basebackup_error(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_basebackup_error", "empty") pageserver_http = env.pageserver.http_client() # Introduce failpoint pageserver_http.configure_failpoints(("basebackup-before-control-file", "return")) with pytest.raises(Exception, match="basebackup-before-control-file"): - env.endpoints.create_start("test_basebackup_error") + env.endpoints.create_start("main") diff --git a/test_runner/regress/test_clog_truncate.py b/test_runner/regress/test_clog_truncate.py index 26e6e336b972..6e4880841a00 100644 --- a/test_runner/regress/test_clog_truncate.py +++ b/test_runner/regress/test_clog_truncate.py @@ -11,7 +11,6 @@ # def test_clog_truncate(neon_simple_env: 
NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_clog_truncate", "empty") # set aggressive autovacuum to make sure that truncation will happen config = [ @@ -24,7 +23,7 @@ def test_clog_truncate(neon_simple_env: NeonEnv): "autovacuum_freeze_max_age=100000", ] - endpoint = env.endpoints.create_start("test_clog_truncate", config_lines=config) + endpoint = env.endpoints.create_start("main", config_lines=config) # Install extension containing function needed for test endpoint.safe_psql("CREATE EXTENSION neon_test_utils") @@ -58,7 +57,7 @@ def test_clog_truncate(neon_simple_env: NeonEnv): # create new branch after clog truncation and start a compute node on it log.info(f"create branch at lsn_after_truncation {lsn_after_truncation}") env.neon_cli.create_branch( - "test_clog_truncate_new", "test_clog_truncate", ancestor_start_lsn=lsn_after_truncation + "test_clog_truncate_new", "main", ancestor_start_lsn=lsn_after_truncation ) endpoint2 = env.endpoints.create_start("test_clog_truncate_new") diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py index dd36190fcd48..8b8c970357e5 100644 --- a/test_runner/regress/test_compute_catalog.py +++ b/test_runner/regress/test_compute_catalog.py @@ -4,9 +4,8 @@ def test_compute_catalog(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_config", "empty") - endpoint = env.endpoints.create_start("test_config", config_lines=["log_min_messages=debug1"]) + endpoint = env.endpoints.create_start("main", config_lines=["log_min_messages=debug1"]) client = endpoint.http_client() objects = client.dbs_and_roles() diff --git a/test_runner/regress/test_config.py b/test_runner/regress/test_config.py index 2ef28eb94b5d..d8ef0b8dbda9 100644 --- a/test_runner/regress/test_config.py +++ b/test_runner/regress/test_config.py @@ -9,10 +9,9 @@ # def test_config(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_config", "empty") 
# change config - endpoint = env.endpoints.create_start("test_config", config_lines=["log_min_messages=debug1"]) + endpoint = env.endpoints.create_start("main", config_lines=["log_min_messages=debug1"]) with closing(endpoint.connect()) as conn: with conn.cursor() as cur: diff --git a/test_runner/regress/test_createdropdb.py b/test_runner/regress/test_createdropdb.py index f741a9fc8740..af643f45d7ae 100644 --- a/test_runner/regress/test_createdropdb.py +++ b/test_runner/regress/test_createdropdb.py @@ -17,9 +17,7 @@ def test_createdb(neon_simple_env: NeonEnv, strategy: str): if env.pg_version == PgVersion.V14 and strategy == "wal_log": pytest.skip("wal_log strategy not supported on PostgreSQL 14") - env.neon_cli.create_branch("test_createdb", "empty") - - endpoint = env.endpoints.create_start("test_createdb") + endpoint = env.endpoints.create_start("main") with endpoint.cursor() as cur: # Cause a 'relmapper' change in the original branch @@ -33,7 +31,7 @@ def test_createdb(neon_simple_env: NeonEnv, strategy: str): lsn = query_scalar(cur, "SELECT pg_current_wal_insert_lsn()") # Create a branch - env.neon_cli.create_branch("test_createdb2", "test_createdb", ancestor_start_lsn=lsn) + env.neon_cli.create_branch("test_createdb2", "main", ancestor_start_lsn=lsn) endpoint2 = env.endpoints.create_start("test_createdb2") # Test that you can connect to the new database on both branches @@ -62,8 +60,7 @@ def test_createdb(neon_simple_env: NeonEnv, strategy: str): # def test_dropdb(neon_simple_env: NeonEnv, test_output_dir): env = neon_simple_env - env.neon_cli.create_branch("test_dropdb", "empty") - endpoint = env.endpoints.create_start("test_dropdb") + endpoint = env.endpoints.create_start("main") with endpoint.cursor() as cur: cur.execute("CREATE DATABASE foodb") @@ -80,14 +77,10 @@ def test_dropdb(neon_simple_env: NeonEnv, test_output_dir): lsn_after_drop = query_scalar(cur, "SELECT pg_current_wal_insert_lsn()") # Create two branches before and after database drop. 
- env.neon_cli.create_branch( - "test_before_dropdb", "test_dropdb", ancestor_start_lsn=lsn_before_drop - ) + env.neon_cli.create_branch("test_before_dropdb", "main", ancestor_start_lsn=lsn_before_drop) endpoint_before = env.endpoints.create_start("test_before_dropdb") - env.neon_cli.create_branch( - "test_after_dropdb", "test_dropdb", ancestor_start_lsn=lsn_after_drop - ) + env.neon_cli.create_branch("test_after_dropdb", "main", ancestor_start_lsn=lsn_after_drop) endpoint_after = env.endpoints.create_start("test_after_dropdb") # Test that database exists on the branch before drop diff --git a/test_runner/regress/test_createuser.py b/test_runner/regress/test_createuser.py index 17d9824f5203..d6f138e1266c 100644 --- a/test_runner/regress/test_createuser.py +++ b/test_runner/regress/test_createuser.py @@ -7,8 +7,7 @@ # def test_createuser(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_createuser", "empty") - endpoint = env.endpoints.create_start("test_createuser") + endpoint = env.endpoints.create_start("main") with endpoint.cursor() as cur: # Cause a 'relmapper' change in the original branch @@ -19,7 +18,7 @@ def test_createuser(neon_simple_env: NeonEnv): lsn = query_scalar(cur, "SELECT pg_current_wal_insert_lsn()") # Create a branch - env.neon_cli.create_branch("test_createuser2", "test_createuser", ancestor_start_lsn=lsn) + env.neon_cli.create_branch("test_createuser2", "main", ancestor_start_lsn=lsn) endpoint2 = env.endpoints.create_start("test_createuser2") # Test that you can connect to new branch as a new user diff --git a/test_runner/regress/test_ddl_forwarding.py b/test_runner/regress/test_ddl_forwarding.py index 50da673d87d0..65f310c27abc 100644 --- a/test_runner/regress/test_ddl_forwarding.py +++ b/test_runner/regress/test_ddl_forwarding.py @@ -290,9 +290,8 @@ def assert_db_connlimit(endpoint: Any, db_name: str, connlimit: int, msg: str): # Here we test the latter. 
The first one is tested in test_ddl_forwarding def test_ddl_forwarding_invalid_db(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_ddl_forwarding_invalid_db", "empty") endpoint = env.endpoints.create_start( - "test_ddl_forwarding_invalid_db", + "main", # Some non-existent url config_lines=["neon.console_url=http://localhost:9999/unknown/api/v0/roles_and_databases"], ) diff --git a/test_runner/regress/test_explain_with_lfc_stats.py b/test_runner/regress/test_explain_with_lfc_stats.py index 5231dedcdac2..0217c9ac7b62 100644 --- a/test_runner/regress/test_explain_with_lfc_stats.py +++ b/test_runner/regress/test_explain_with_lfc_stats.py @@ -10,11 +10,9 @@ def test_explain_with_lfc_stats(neon_simple_env: NeonEnv): cache_dir = Path(env.repo_dir) / "file_cache" cache_dir.mkdir(exist_ok=True) - branchname = "test_explain_with_lfc_stats" - env.neon_cli.create_branch(branchname, "empty") - log.info(f"Creating endopint with 1MB shared_buffers and 64 MB LFC for branch {branchname}") + log.info("Creating endpoint with 1MB shared_buffers and 64 MB LFC") endpoint = env.endpoints.create_start( - branchname, + "main", config_lines=[ "shared_buffers='1MB'", f"neon.file_cache_path='{cache_dir}/file.cache'", diff --git a/test_runner/regress/test_lfc_resize.py b/test_runner/regress/test_lfc_resize.py index 1b2c7f808f37..cb0b30d9c6e8 100644 --- a/test_runner/regress/test_lfc_resize.py +++ b/test_runner/regress/test_lfc_resize.py @@ -16,9 +16,8 @@ @pytest.mark.timeout(600) def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin): env = neon_simple_env - env.neon_cli.create_branch("test_lfc_resize", "empty") endpoint = env.endpoints.create_start( - "test_lfc_resize", + "main", config_lines=[ "neon.file_cache_path='file.cache'", "neon.max_file_cache_size=512MB", diff --git a/test_runner/regress/test_lfc_working_set_approximation.py b/test_runner/regress/test_lfc_working_set_approximation.py index 4c53e4e2fd35..4a3a949d1a17 100644 --- 
a/test_runner/regress/test_lfc_working_set_approximation.py +++ b/test_runner/regress/test_lfc_working_set_approximation.py @@ -12,11 +12,9 @@ def test_lfc_working_set_approximation(neon_simple_env: NeonEnv): cache_dir = Path(env.repo_dir) / "file_cache" cache_dir.mkdir(exist_ok=True) - branchname = "test_approximate_working_set_size" - env.neon_cli.create_branch(branchname, "empty") - log.info(f"Creating endopint with 1MB shared_buffers and 64 MB LFC for branch {branchname}") + log.info("Creating endpoint with 1MB shared_buffers and 64 MB LFC") endpoint = env.endpoints.create_start( - branchname, + "main", config_lines=[ "shared_buffers='1MB'", f"neon.file_cache_path='{cache_dir}/file.cache'", diff --git a/test_runner/regress/test_local_file_cache.py b/test_runner/regress/test_local_file_cache.py index 3c404c3b231e..9c38200937c7 100644 --- a/test_runner/regress/test_local_file_cache.py +++ b/test_runner/regress/test_local_file_cache.py @@ -5,7 +5,7 @@ import time from typing import List -from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnvBuilder +from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.utils import query_scalar @@ -15,11 +15,8 @@ def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder): cache_dir = os.path.join(env.repo_dir, "file_cache") os.mkdir(cache_dir) - env.neon_cli.create_branch("empty", ancestor_branch_name=DEFAULT_BRANCH_NAME) - env.neon_cli.create_branch("test_local_file_cache_unlink", "empty") - endpoint = env.endpoints.create_start( - "test_local_file_cache_unlink", + "main", config_lines=[ "shared_buffers='1MB'", f"neon.file_cache_path='{cache_dir}/file.cache'", diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index f83a833ddae2..15a3719e0b82 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -36,10 +36,8 @@ def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg): env 
= neon_simple_env tenant_id = env.initial_tenant - timeline_id = env.neon_cli.create_branch("test_logical_replication", "empty") - endpoint = env.endpoints.create_start( - "test_logical_replication", config_lines=["log_statement=all"] - ) + timeline_id = env.initial_timeline + endpoint = env.endpoints.create_start("main", config_lines=["log_statement=all"]) pg_conn = endpoint.connect() cur = pg_conn.cursor() @@ -185,10 +183,9 @@ def slot_removed(ep): env = neon_simple_env - env.neon_cli.create_branch("test_logical_replication", "empty") # set low neon.logical_replication_max_snap_files endpoint = env.endpoints.create_start( - "test_logical_replication", + "main", config_lines=["log_statement=all", "neon.logical_replication_max_snap_files=1"], ) @@ -472,7 +469,7 @@ def test_slots_and_branching(neon_simple_env: NeonEnv): def test_replication_shutdown(neon_simple_env: NeonEnv): # Ensure Postgres can exit without stuck when a replication job is active + neon extension installed env = neon_simple_env - env.neon_cli.create_branch("test_replication_shutdown_publisher", "empty") + env.neon_cli.create_branch("test_replication_shutdown_publisher", "main") pub = env.endpoints.create("test_replication_shutdown_publisher") env.neon_cli.create_branch("test_replication_shutdown_subscriber") diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py index bdc5ca907ec1..e88e56d030a8 100644 --- a/test_runner/regress/test_migrations.py +++ b/test_runner/regress/test_migrations.py @@ -9,9 +9,8 @@ def test_migrations(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_migrations", "empty") - endpoint = env.endpoints.create("test_migrations") + endpoint = env.endpoints.create("main") endpoint.respec(skip_pg_catalog_updates=False) endpoint.start() diff --git a/test_runner/regress/test_multixact.py b/test_runner/regress/test_multixact.py index 88f7a5db59ff..8a00f8835fe4 100644 --- a/test_runner/regress/test_multixact.py +++ 
b/test_runner/regress/test_multixact.py @@ -14,8 +14,7 @@ # def test_multixact(neon_simple_env: NeonEnv, test_output_dir): env = neon_simple_env - env.neon_cli.create_branch("test_multixact", "empty") - endpoint = env.endpoints.create_start("test_multixact") + endpoint = env.endpoints.create_start("main") cur = endpoint.connect().cursor() cur.execute( @@ -73,7 +72,9 @@ def test_multixact(neon_simple_env: NeonEnv, test_output_dir): assert int(next_multixact_id) > int(next_multixact_id_old) # Branch at this point - env.neon_cli.create_branch("test_multixact_new", "test_multixact", ancestor_start_lsn=lsn) + env.neon_cli.create_branch( + "test_multixact_new", ancestor_branch_name="main", ancestor_start_lsn=lsn + ) endpoint_new = env.endpoints.create_start("test_multixact_new") next_multixact_id_new = endpoint_new.safe_psql( diff --git a/test_runner/regress/test_neon_superuser.py b/test_runner/regress/test_neon_superuser.py index fd31df84da2f..7825ec772c9f 100644 --- a/test_runner/regress/test_neon_superuser.py +++ b/test_runner/regress/test_neon_superuser.py @@ -6,7 +6,7 @@ def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion): env = neon_simple_env - env.neon_cli.create_branch("test_neon_superuser_publisher", "empty") + env.neon_cli.create_branch("test_neon_superuser_publisher", "main") pub = env.endpoints.create("test_neon_superuser_publisher") env.neon_cli.create_branch("test_neon_superuser_subscriber") diff --git a/test_runner/regress/test_parallel_copy.py b/test_runner/regress/test_parallel_copy.py index b33e387a6687..a5037e869448 100644 --- a/test_runner/regress/test_parallel_copy.py +++ b/test_runner/regress/test_parallel_copy.py @@ -41,8 +41,7 @@ async def parallel_load_same_table(endpoint: Endpoint, n_parallel: int): # Load data into one table with COPY TO from 5 parallel connections def test_parallel_copy(neon_simple_env: NeonEnv, n_parallel=5): env = neon_simple_env - env.neon_cli.create_branch("test_parallel_copy", "empty") - endpoint = 
env.endpoints.create_start("test_parallel_copy") + endpoint = env.endpoints.create_start("main") # Create test table conn = endpoint.connect() diff --git a/test_runner/regress/test_pg_query_cancellation.py b/test_runner/regress/test_pg_query_cancellation.py index bad2e5865e81..c6b4eff51617 100644 --- a/test_runner/regress/test_pg_query_cancellation.py +++ b/test_runner/regress/test_pg_query_cancellation.py @@ -42,11 +42,9 @@ def test_cancellations(neon_simple_env: NeonEnv): ps_http = ps.http_client() ps_http.is_testing_enabled_or_skip() - env.neon_cli.create_branch("test_config", "empty") - # We don't want to have any racy behaviour with autovacuum IOs ep = env.endpoints.create_start( - "test_config", + "main", config_lines=[ "autovacuum = off", "shared_buffers = 128MB", diff --git a/test_runner/regress/test_pg_waldump.py b/test_runner/regress/test_pg_waldump.py index 8e80efd9ba0e..1990d69b6aea 100644 --- a/test_runner/regress/test_pg_waldump.py +++ b/test_runner/regress/test_pg_waldump.py @@ -22,8 +22,8 @@ def check_wal_segment(pg_waldump_path: str, segment_path: str, test_output_dir): def test_pg_waldump(neon_simple_env: NeonEnv, test_output_dir, pg_bin: PgBin): env = neon_simple_env tenant_id = env.initial_tenant - timeline_id = env.neon_cli.create_branch("test_pg_waldump", "empty") - endpoint = env.endpoints.create_start("test_pg_waldump") + timeline_id = env.initial_timeline + endpoint = env.endpoints.create_start("main") cur = endpoint.connect().cursor() cur.execute( diff --git a/test_runner/regress/test_read_validation.py b/test_runner/regress/test_read_validation.py index 1ac881553fbc..78798c5abf4c 100644 --- a/test_runner/regress/test_read_validation.py +++ b/test_runner/regress/test_read_validation.py @@ -15,12 +15,8 @@ # def test_read_validation(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_read_validation", "empty") - - endpoint = env.endpoints.create_start( - "test_read_validation", - ) + endpoint = 
env.endpoints.create_start("main") with closing(endpoint.connect()) as con: with con.cursor() as c: for e in extensions: @@ -131,13 +127,9 @@ def test_read_validation(neon_simple_env: NeonEnv): def test_read_validation_neg(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_read_validation_neg", "empty") - env.pageserver.allowed_errors.append(".*invalid LSN\\(0\\) in request.*") - endpoint = env.endpoints.create_start( - "test_read_validation_neg", - ) + endpoint = env.endpoints.create_start("main") with closing(endpoint.connect()) as con: with con.cursor() as c: diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index 368f60127e51..347fc3a04ddb 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -22,8 +22,7 @@ # def test_readonly_node(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_readonly_node", "empty") - endpoint_main = env.endpoints.create_start("test_readonly_node") + endpoint_main = env.endpoints.create_start("main") env.pageserver.allowed_errors.extend( [ @@ -74,12 +73,12 @@ def test_readonly_node(neon_simple_env: NeonEnv): # Create first read-only node at the point where only 100 rows were inserted endpoint_hundred = env.endpoints.create_start( - branch_name="test_readonly_node", endpoint_id="ep-readonly_node_hundred", lsn=lsn_a + branch_name="main", endpoint_id="ep-readonly_node_hundred", lsn=lsn_a ) # And another at the point where 200100 rows were inserted endpoint_more = env.endpoints.create_start( - branch_name="test_readonly_node", endpoint_id="ep-readonly_node_more", lsn=lsn_b + branch_name="main", endpoint_id="ep-readonly_node_more", lsn=lsn_b ) # On the 'hundred' node, we should see only 100 rows @@ -100,7 +99,7 @@ def test_readonly_node(neon_simple_env: NeonEnv): # Check creating a node at segment boundary endpoint = env.endpoints.create_start( - 
branch_name="test_readonly_node", + branch_name="main", endpoint_id="ep-branch_segment_boundary", lsn=Lsn("0/3000000"), ) @@ -112,7 +111,7 @@ def test_readonly_node(neon_simple_env: NeonEnv): with pytest.raises(Exception, match="invalid basebackup lsn"): # compute node startup with invalid LSN should fail env.endpoints.create_start( - branch_name="test_readonly_node", + branch_name="main", endpoint_id="ep-readonly_node_preinitdb", lsn=Lsn("0/42"), ) @@ -218,14 +217,10 @@ def generate_updates_on_main( # Similar test, but with more data, and we force checkpoints def test_timetravel(neon_simple_env: NeonEnv): env = neon_simple_env - pageserver_http_client = env.pageserver.http_client() - env.neon_cli.create_branch("test_timetravel", "empty") - endpoint = env.endpoints.create_start("test_timetravel") - + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline client = env.pageserver.http_client() - - tenant_id = endpoint.safe_psql("show neon.tenant_id")[0][0] - timeline_id = endpoint.safe_psql("show neon.timeline_id")[0][0] + endpoint = env.endpoints.create_start("main") lsns = [] @@ -249,7 +244,7 @@ def test_timetravel(neon_simple_env: NeonEnv): wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn) # run checkpoint manually to force a new layer file - pageserver_http_client.timeline_checkpoint(tenant_id, timeline_id) + client.timeline_checkpoint(tenant_id, timeline_id) ##### Restart pageserver env.endpoints.stop_all() @@ -258,7 +253,7 @@ def test_timetravel(neon_simple_env: NeonEnv): for i, lsn in lsns: endpoint_old = env.endpoints.create_start( - branch_name="test_timetravel", endpoint_id=f"ep-old_lsn_{i}", lsn=lsn + branch_name="main", endpoint_id=f"ep-old_lsn_{i}", lsn=lsn ) with endpoint_old.cursor() as cur: assert query_scalar(cur, f"select count(*) from testtab where iteration={i}") == 100000 diff --git a/test_runner/regress/test_subxacts.py b/test_runner/regress/test_subxacts.py index 10cb00c7805e..82075bd723c7 100644 --- 
a/test_runner/regress/test_subxacts.py +++ b/test_runner/regress/test_subxacts.py @@ -9,8 +9,7 @@ # CLOG. def test_subxacts(neon_simple_env: NeonEnv, test_output_dir): env = neon_simple_env - env.neon_cli.create_branch("test_subxacts", "empty") - endpoint = env.endpoints.create_start("test_subxacts") + endpoint = env.endpoints.create_start("main") pg_conn = endpoint.connect() cur = pg_conn.cursor() diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 328131cd082d..711fcd50166e 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -68,10 +68,13 @@ def test_timeline_delete(neon_simple_env: NeonEnv): # construct pair of branches to validate that pageserver prohibits # deletion of ancestor timelines when they have child branches - parent_timeline_id = env.neon_cli.create_branch("test_ancestor_branch_delete_parent", "empty") + parent_timeline_id = env.neon_cli.create_branch( + new_branch_name="test_ancestor_branch_delete_parent", ancestor_branch_name="main" + ) leaf_timeline_id = env.neon_cli.create_branch( - "test_ancestor_branch_delete_branch1", "test_ancestor_branch_delete_parent" + new_branch_name="test_ancestor_branch_delete_branch1", + ancestor_branch_name="test_ancestor_branch_delete_parent", ) timeline_path = env.pageserver.timeline_dir(env.initial_tenant, parent_timeline_id) diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 9bf5f8680b59..f2265dd3d937 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -36,7 +36,7 @@ def test_timeline_size(neon_simple_env: NeonEnv): env = neon_simple_env - new_timeline_id = env.neon_cli.create_branch("test_timeline_size", "empty") + new_timeline_id = env.neon_cli.create_branch("test_timeline_size", "main") client = env.pageserver.http_client() client.timeline_wait_logical_size(env.initial_tenant, 
new_timeline_id) @@ -68,7 +68,7 @@ def test_timeline_size(neon_simple_env: NeonEnv): def test_timeline_size_createdropdb(neon_simple_env: NeonEnv): env = neon_simple_env - new_timeline_id = env.neon_cli.create_branch("test_timeline_size_createdropdb", "empty") + new_timeline_id = env.neon_cli.create_branch("test_timeline_size_createdropdb", "main") client = env.pageserver.http_client() client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id) diff --git a/test_runner/regress/test_twophase.py b/test_runner/regress/test_twophase.py index dd76689008dd..ea900b07b86c 100644 --- a/test_runner/regress/test_twophase.py +++ b/test_runner/regress/test_twophase.py @@ -9,10 +9,7 @@ # def test_twophase(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_twophase", "empty") - endpoint = env.endpoints.create_start( - "test_twophase", config_lines=["max_prepared_transactions=5"] - ) + endpoint = env.endpoints.create_start("main", config_lines=["max_prepared_transactions=5"]) conn = endpoint.connect() cur = conn.cursor() @@ -56,7 +53,7 @@ def test_twophase(neon_simple_env: NeonEnv): assert len(twophase_files) == 2 # Create a branch with the transaction in prepared state - fork_at_current_lsn(env, endpoint, "test_twophase_prepared", "test_twophase") + fork_at_current_lsn(env, endpoint, "test_twophase_prepared", "main") # Start compute on the new branch endpoint2 = env.endpoints.create_start( diff --git a/test_runner/regress/test_unlogged.py b/test_runner/regress/test_unlogged.py index 137d28b9fa0e..deba29536c4d 100644 --- a/test_runner/regress/test_unlogged.py +++ b/test_runner/regress/test_unlogged.py @@ -9,8 +9,7 @@ # def test_unlogged(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_unlogged", "empty") - endpoint = env.endpoints.create_start("test_unlogged") + endpoint = env.endpoints.create_start("main") conn = endpoint.connect() cur = conn.cursor() @@ -22,7 +21,7 @@ def 
test_unlogged(neon_simple_env: NeonEnv): cur.execute("INSERT INTO iut (id) values (42);") # create another compute to fetch inital empty contents from pageserver - fork_at_current_lsn(env, endpoint, "test_unlogged_basebackup", "test_unlogged") + fork_at_current_lsn(env, endpoint, "test_unlogged_basebackup", "main") endpoint2 = env.endpoints.create_start("test_unlogged_basebackup") conn2 = endpoint2.connect() diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py index 7272979c4a2e..3075211ada95 100644 --- a/test_runner/regress/test_vm_bits.py +++ b/test_runner/regress/test_vm_bits.py @@ -13,8 +13,7 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_vm_bit_clear", "empty") - endpoint = env.endpoints.create_start("test_vm_bit_clear") + endpoint = env.endpoints.create_start("main") pg_conn = endpoint.connect() cur = pg_conn.cursor() @@ -58,7 +57,7 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv): cur.execute("UPDATE vmtest_cold_update2 SET id = 5000, filler=repeat('x', 200) WHERE id = 1") # Branch at this point, to test that later - fork_at_current_lsn(env, endpoint, "test_vm_bit_clear_new", "test_vm_bit_clear") + fork_at_current_lsn(env, endpoint, "test_vm_bit_clear_new", "main") # Clear the buffer cache, to force the VM page to be re-fetched from # the page server diff --git a/test_runner/test_broken.py b/test_runner/test_broken.py index 7e8aef5a5f05..d710b53528e7 100644 --- a/test_runner/test_broken.py +++ b/test_runner/test_broken.py @@ -23,8 +23,7 @@ def test_broken(neon_simple_env: NeonEnv, pg_bin): env = neon_simple_env - env.neon_cli.create_branch("test_broken", "empty") - env.endpoints.create_start("test_broken") + env.endpoints.create_start("main") log.info("postgres is running") log.info("THIS NEXT COMMAND WILL FAIL:") From e158df4e86318fa3fd5ee9516f3e7ac91dd14283 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." 
<4198311+skyzh@users.noreply.github.com> Date: Tue, 10 Sep 2024 05:03:27 +0800 Subject: [PATCH 06/21] feat(pageserver): split delta writer automatically determines key range (#8850) close https://github.com/neondatabase/neon/issues/8838 ## Summary of changes This patch modifies the split delta layer writer to avoid taking start_key and end_key when creating/finishing the layer writer. The start_key for the delta layers will be the first key provided to the layer writer, and the end_key would be the `last_key.next()`. This simplifies the delta layer writer API. On that, the layer key hack is removed. Image layers now use the full key range, and delta layers use the first/last key provided by the user. --------- Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/key.rs | 9 - pageserver/src/tenant.rs | 6 +- .../src/tenant/storage_layer/split_writer.rs | 158 ++++++++++++------ pageserver/src/tenant/timeline/compaction.rs | 7 +- 4 files changed, 109 insertions(+), 71 deletions(-) diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 77d744e4da21..8929ccb41d6f 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -263,15 +263,6 @@ impl Key { field5: u8::MAX, field6: u32::MAX, }; - /// A key slightly smaller than [`Key::MAX`] for use in layer key ranges to avoid them to be confused with L0 layers - pub const NON_L0_MAX: Key = Key { - field1: u8::MAX, - field2: u32::MAX, - field3: u32::MAX, - field4: u32::MAX, - field5: u8::MAX, - field6: u32::MAX - 1, - }; pub fn from_hex(s: &str) -> Result { if s.len() != 36 { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index fd2520a42eb3..c6f0e481017e 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -7091,13 +7091,13 @@ mod tests { vec![ // Image layer at GC horizon PersistentLayerKey { - key_range: Key::MIN..Key::NON_L0_MAX, + key_range: Key::MIN..Key::MAX, lsn_range: Lsn(0x30)..Lsn(0x31), is_delta: false }, - // The delta layer 
covers the full range (with the layer key hack to avoid being recognized as L0) + // The delta layer below the horizon PersistentLayerKey { - key_range: Key::MIN..Key::NON_L0_MAX, + key_range: get_key(3)..get_key(4), lsn_range: Lsn(0x30)..Lsn(0x48), is_delta: true }, diff --git a/pageserver/src/tenant/storage_layer/split_writer.rs b/pageserver/src/tenant/storage_layer/split_writer.rs index 7c1ac863bf2e..40a6a77a5013 100644 --- a/pageserver/src/tenant/storage_layer/split_writer.rs +++ b/pageserver/src/tenant/storage_layer/split_writer.rs @@ -188,7 +188,7 @@ impl SplitImageLayerWriter { .await } - /// When split writer fails, the caller should call this function and handle partially generated layers. + /// This function will be deprecated with #8841. pub(crate) fn take(self) -> anyhow::Result<(Vec, ImageLayerWriter)> { Ok((self.generated_layers, self.inner)) } @@ -204,7 +204,7 @@ impl SplitImageLayerWriter { /// will split them into multiple files based on size. #[must_use] pub struct SplitDeltaLayerWriter { - inner: DeltaLayerWriter, + inner: Option<(Key, DeltaLayerWriter)>, target_layer_size: u64, generated_layers: Vec, conf: &'static PageServerConf, @@ -212,7 +212,6 @@ pub struct SplitDeltaLayerWriter { tenant_shard_id: TenantShardId, lsn_range: Range, last_key_written: Key, - start_key: Key, } impl SplitDeltaLayerWriter { @@ -220,29 +219,18 @@ impl SplitDeltaLayerWriter { conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, - start_key: Key, lsn_range: Range, target_layer_size: u64, - ctx: &RequestContext, ) -> anyhow::Result { Ok(Self { target_layer_size, - inner: DeltaLayerWriter::new( - conf, - timeline_id, - tenant_shard_id, - start_key, - lsn_range.clone(), - ctx, - ) - .await?, + inner: None, generated_layers: Vec::new(), conf, timeline_id, tenant_shard_id, lsn_range, last_key_written: Key::MIN, - start_key, }) } @@ -265,9 +253,26 @@ impl SplitDeltaLayerWriter { // // Also, keep all updates of a single key in a single 
file. TODO: split them using the legacy compaction // strategy. https://github.com/neondatabase/neon/issues/8837 + + if self.inner.is_none() { + self.inner = Some(( + key, + DeltaLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + key, + self.lsn_range.clone(), + ctx, + ) + .await?, + )); + } + let (_, inner) = self.inner.as_mut().unwrap(); + let addition_size_estimation = KEY_SIZE as u64 + 8 /* LSN u64 size */ + 80 /* value size estimation */; - if self.inner.num_keys() >= 1 - && self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size + if inner.num_keys() >= 1 + && inner.estimated_size() + addition_size_estimation >= self.target_layer_size { if key != self.last_key_written { let next_delta_writer = DeltaLayerWriter::new( @@ -279,13 +284,13 @@ impl SplitDeltaLayerWriter { ctx, ) .await?; - let prev_delta_writer = std::mem::replace(&mut self.inner, next_delta_writer); + let (start_key, prev_delta_writer) = + std::mem::replace(&mut self.inner, Some((key, next_delta_writer))).unwrap(); let layer_key = PersistentLayerKey { - key_range: self.start_key..key, + key_range: start_key..key, lsn_range: self.lsn_range.clone(), is_delta: true, }; - self.start_key = key; if discard(&layer_key).await { drop(prev_delta_writer); self.generated_layers @@ -296,17 +301,18 @@ impl SplitDeltaLayerWriter { self.generated_layers .push(SplitWriterResult::Produced(delta_layer)); } - } else if self.inner.estimated_size() >= S3_UPLOAD_LIMIT { + } else if inner.estimated_size() >= S3_UPLOAD_LIMIT { // We have to produce a very large file b/c a key is updated too often. 
anyhow::bail!( "a single key is updated too often: key={}, estimated_size={}, and the layer file cannot be produced", key, - self.inner.estimated_size() + inner.estimated_size() ); } } self.last_key_written = key; - self.inner.put_value(key, lsn, val, ctx).await + let (_, inner) = self.inner.as_mut().unwrap(); + inner.put_value(key, lsn, val, ctx).await } pub async fn put_value( @@ -325,7 +331,6 @@ impl SplitDeltaLayerWriter { self, tline: &Arc, ctx: &RequestContext, - end_key: Key, discard: D, ) -> anyhow::Result> where @@ -337,11 +342,15 @@ impl SplitDeltaLayerWriter { inner, .. } = self; + let Some((start_key, inner)) = inner else { + return Ok(generated_layers); + }; if inner.num_keys() == 0 { return Ok(generated_layers); } + let end_key = self.last_key_written.next(); let layer_key = PersistentLayerKey { - key_range: self.start_key..end_key, + key_range: start_key..end_key, lsn_range: self.lsn_range.clone(), is_delta: true, }; @@ -360,15 +369,14 @@ impl SplitDeltaLayerWriter { self, tline: &Arc, ctx: &RequestContext, - end_key: Key, ) -> anyhow::Result> { - self.finish_with_discard_fn(tline, ctx, end_key, |_| async { false }) + self.finish_with_discard_fn(tline, ctx, |_| async { false }) .await } - /// When split writer fails, the caller should call this function and handle partially generated layers. - pub(crate) fn take(self) -> anyhow::Result<(Vec, DeltaLayerWriter)> { - Ok((self.generated_layers, self.inner)) + /// This function will be deprecated with #8841. 
+ pub(crate) fn take(self) -> anyhow::Result<(Vec, Option)> { + Ok((self.generated_layers, self.inner.map(|x| x.1))) } } @@ -432,10 +440,8 @@ mod tests { tenant.conf, tline.timeline_id, tenant.tenant_shard_id, - get_key(0), Lsn(0x18)..Lsn(0x20), 4 * 1024 * 1024, - &ctx, ) .await .unwrap(); @@ -460,11 +466,22 @@ mod tests { ) .await .unwrap(); - let layers = delta_writer - .finish(&tline, &ctx, get_key(10)) - .await - .unwrap(); + let layers = delta_writer.finish(&tline, &ctx).await.unwrap(); assert_eq!(layers.len(), 1); + assert_eq!( + layers + .into_iter() + .next() + .unwrap() + .into_resident_layer() + .layer_desc() + .key(), + PersistentLayerKey { + key_range: get_key(0)..get_key(1), + lsn_range: Lsn(0x18)..Lsn(0x20), + is_delta: true + } + ); } #[tokio::test] @@ -501,10 +518,8 @@ mod tests { tenant.conf, tline.timeline_id, tenant.tenant_shard_id, - get_key(0), Lsn(0x18)..Lsn(0x20), 4 * 1024 * 1024, - &ctx, ) .await .unwrap(); @@ -533,10 +548,7 @@ mod tests { .finish(&tline, &ctx, get_key(N as u32)) .await .unwrap(); - let delta_layers = delta_writer - .finish(&tline, &ctx, get_key(N as u32)) - .await - .unwrap(); + let delta_layers = delta_writer.finish(&tline, &ctx).await.unwrap(); if discard { for layer in image_layers { layer.into_discarded_layer(); @@ -555,6 +567,14 @@ mod tests { .collect_vec(); assert_eq!(image_layers.len(), N / 512 + 1); assert_eq!(delta_layers.len(), N / 512 + 1); + assert_eq!( + delta_layers.first().unwrap().layer_desc().key_range.start, + get_key(0) + ); + assert_eq!( + delta_layers.last().unwrap().layer_desc().key_range.end, + get_key(N as u32) + ); for idx in 0..image_layers.len() { assert_ne!(image_layers[idx].layer_desc().key_range.start, Key::MIN); assert_ne!(image_layers[idx].layer_desc().key_range.end, Key::MAX); @@ -602,10 +622,8 @@ mod tests { tenant.conf, tline.timeline_id, tenant.tenant_shard_id, - get_key(0), Lsn(0x18)..Lsn(0x20), 4 * 1024, - &ctx, ) .await .unwrap(); @@ -644,11 +662,35 @@ mod tests { ) .await .unwrap(); 
- let layers = delta_writer - .finish(&tline, &ctx, get_key(10)) - .await - .unwrap(); + let layers = delta_writer.finish(&tline, &ctx).await.unwrap(); assert_eq!(layers.len(), 2); + let mut layers_iter = layers.into_iter(); + assert_eq!( + layers_iter + .next() + .unwrap() + .into_resident_layer() + .layer_desc() + .key(), + PersistentLayerKey { + key_range: get_key(0)..get_key(1), + lsn_range: Lsn(0x18)..Lsn(0x20), + is_delta: true + } + ); + assert_eq!( + layers_iter + .next() + .unwrap() + .into_resident_layer() + .layer_desc() + .key(), + PersistentLayerKey { + key_range: get_key(1)..get_key(2), + lsn_range: Lsn(0x18)..Lsn(0x20), + is_delta: true + } + ); } #[tokio::test] @@ -668,10 +710,8 @@ mod tests { tenant.conf, tline.timeline_id, tenant.tenant_shard_id, - get_key(0), Lsn(0x10)..Lsn(N as u64 * 16 + 0x10), 4 * 1024 * 1024, - &ctx, ) .await .unwrap(); @@ -689,10 +729,20 @@ mod tests { .await .unwrap(); } - let delta_layers = delta_writer - .finish(&tline, &ctx, get_key(N as u32)) - .await - .unwrap(); + let delta_layers = delta_writer.finish(&tline, &ctx).await.unwrap(); assert_eq!(delta_layers.len(), 1); + let delta_layer = delta_layers + .into_iter() + .next() + .unwrap() + .into_resident_layer(); + assert_eq!( + delta_layer.layer_desc().key(), + PersistentLayerKey { + key_range: get_key(0)..get_key(1), + lsn_range: Lsn(0x10)..Lsn(N as u64 * 16 + 0x10), + is_delta: true + } + ); } } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index a87b502cd607..0b5c520ba7ad 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1809,7 +1809,6 @@ impl Timeline { .unwrap(); // We don't want any of the produced layers to cover the full key range (i.e., MIN..MAX) b/c it will then be recognized // as an L0 layer. 
- let hack_end_key = Key::NON_L0_MAX; let mut delta_layers = Vec::new(); let mut image_layers = Vec::new(); let mut downloaded_layers = Vec::new(); @@ -1855,10 +1854,8 @@ impl Timeline { self.conf, self.timeline_id, self.tenant_shard_id, - Key::MIN, lowest_retain_lsn..end_lsn, self.get_compaction_target_size(), - ctx, ) .await?; @@ -1965,7 +1962,7 @@ impl Timeline { let produced_image_layers = if let Some(writer) = image_layer_writer { if !dry_run { writer - .finish_with_discard_fn(self, ctx, hack_end_key, discard) + .finish_with_discard_fn(self, ctx, Key::MAX, discard) .await? } else { let (layers, _) = writer.take()?; @@ -1978,7 +1975,7 @@ impl Timeline { let produced_delta_layers = if !dry_run { delta_layer_writer - .finish_with_discard_fn(self, ctx, hack_end_key, discard) + .finish_with_discard_fn(self, ctx, discard) .await? } else { let (layers, _) = delta_layer_writer.take()?; From 982b376ea2e42d45f70c625ec91ac513f9f3a661 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 10 Sep 2024 00:04:00 +0300 Subject: [PATCH 07/21] Update parquet crate to a released version (#8961) PR #7782 set the dependency in Cargo.toml to 'master', and locked the version to commit that contained a specific fix, because we needed the fix before it was included in a versioned release. The fix was later included in parquet crate version 52.0.0, so we can now switch back to using a released version. The latest release is 53.0.0, switch straight to that. 
--------- Co-authored-by: Conrad Ludgate --- Cargo.lock | 10 +++++--- Cargo.toml | 8 ++---- proxy/src/context/parquet.rs | 48 ++++++++++++++++++------------------ workspace_hack/Cargo.toml | 4 +-- 4 files changed, 34 insertions(+), 36 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4fb3ac72230c..3ca6acbc3e0a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3907,8 +3907,9 @@ dependencies = [ [[package]] name = "parquet" -version = "51.0.0" -source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829" +version = "53.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0fbf928021131daaa57d334ca8e3904fe9ae22f73c56244fc7db9b04eedc3d8" dependencies = [ "ahash", "bytes", @@ -3927,8 +3928,9 @@ dependencies = [ [[package]] name = "parquet_derive" -version = "51.0.0" -source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829" +version = "53.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86e9fcfae007533a06b580429a3f7e07cb833ec8aa37c041c16563e7918f057e" dependencies = [ "parquet", "proc-macro2", diff --git a/Cargo.toml b/Cargo.toml index 40e399619d18..fd1d4e016cc4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -123,8 +123,8 @@ opentelemetry = "0.20.0" opentelemetry-otlp = { version = "0.13.0", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } opentelemetry-semantic-conventions = "0.12.0" parking_lot = "0.12" -parquet = { version = "51.0.0", default-features = false, features = ["zstd"] } -parquet_derive = "51.0.0" +parquet = { version = "53", default-features = false, features = ["zstd"] } +parquet_derive = "53" pbkdf2 = { version = "0.12.1", features = ["simple", "std"] } pin-project-lite = "0.2" procfs = "0.16" @@ -254,10 +254,6 @@ tonic-build = "0.9" # Needed to get `tokio-postgres-rustls` to depend on our fork. 
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" } -# bug fixes for UUID -parquet = { git = "https://github.com/apache/arrow-rs", branch = "master" } -parquet_derive = { git = "https://github.com/apache/arrow-rs", branch = "master" } - ################# Binary contents sections [profile.release] diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index c6f83fd069c3..fafea2a08fd7 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -598,15 +598,15 @@ mod tests { assert_eq!( file_stats, [ - (1315874, 3, 6000), - (1315867, 3, 6000), - (1315927, 3, 6000), - (1315884, 3, 6000), - (1316014, 3, 6000), - (1315856, 3, 6000), - (1315648, 3, 6000), - (1315884, 3, 6000), - (438913, 1, 2000) + (1312632, 3, 6000), + (1312621, 3, 6000), + (1312680, 3, 6000), + (1312637, 3, 6000), + (1312773, 3, 6000), + (1312610, 3, 6000), + (1312404, 3, 6000), + (1312639, 3, 6000), + (437848, 1, 2000) ] ); @@ -638,11 +638,11 @@ mod tests { assert_eq!( file_stats, [ - (1208861, 5, 10000), - (1208592, 5, 10000), - (1208885, 5, 10000), - (1208873, 5, 10000), - (1209128, 5, 10000) + (1203465, 5, 10000), + (1203189, 5, 10000), + (1203490, 5, 10000), + (1203475, 5, 10000), + (1203729, 5, 10000) ] ); @@ -667,15 +667,15 @@ mod tests { assert_eq!( file_stats, [ - (1315874, 3, 6000), - (1315867, 3, 6000), - (1315927, 3, 6000), - (1315884, 3, 6000), - (1316014, 3, 6000), - (1315856, 3, 6000), - (1315648, 3, 6000), - (1315884, 3, 6000), - (438913, 1, 2000) + (1312632, 3, 6000), + (1312621, 3, 6000), + (1312680, 3, 6000), + (1312637, 3, 6000), + (1312773, 3, 6000), + (1312610, 3, 6000), + (1312404, 3, 6000), + (1312639, 3, 6000), + (437848, 1, 2000) ] ); @@ -712,7 +712,7 @@ mod tests { // files are smaller than the size threshold, but they took too long to fill so were flushed early assert_eq!( file_stats, - [(659836, 2, 3001), (659550, 2, 3000), (659346, 2, 2999)] + [(657696, 2, 
3001), (657410, 2, 3000), (657206, 2, 2999)] ); tmpdir.close().unwrap(); diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 411ca8103254..140c43639e68 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -60,7 +60,7 @@ num-bigint = { version = "0.4" } num-integer = { version = "0.1", features = ["i128"] } num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } -parquet = { git = "https://github.com/apache/arrow-rs", branch = "master", default-features = false, features = ["zstd"] } +parquet = { version = "53", default-features = false, features = ["zstd"] } prost = { version = "0.11" } rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } @@ -116,7 +116,7 @@ num-bigint = { version = "0.4" } num-integer = { version = "0.1", features = ["i128"] } num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } -parquet = { git = "https://github.com/apache/arrow-rs", branch = "master", default-features = false, features = ["zstd"] } +parquet = { version = "53", default-features = false, features = ["zstd"] } proc-macro2 = { version = "1" } prost = { version = "0.11" } quote = { version = "1" } From 842be0ba74c4c6e4245c29c3fffae4401d282c4a Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Tue, 10 Sep 2024 00:01:52 +0200 Subject: [PATCH 08/21] Specialize WalIngest on PostgreSQL version (#8904) The current code assumes that most of this functionality is version-independent, which is only true up to v16 - PostgreSQL 17 has a new field in CheckPoint that we need to keep track of. This basically removes the file-level dependency on v14, and replaces it with switches that load the correct version dependencies where required. 
--- libs/postgres_ffi/build.rs | 1 + libs/postgres_ffi/src/lib.rs | 104 ++++++++++++++++ pageserver/src/walingest.rs | 231 +++++++++++++++++++++-------------- 3 files changed, 247 insertions(+), 89 deletions(-) diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs index d3e3ce648f30..a346390f3ddf 100644 --- a/libs/postgres_ffi/build.rs +++ b/libs/postgres_ffi/build.rs @@ -121,6 +121,7 @@ fn main() -> anyhow::Result<()> { .allowlist_type("XLogPageHeaderData") .allowlist_type("XLogLongPageHeaderData") .allowlist_var("XLOG_PAGE_MAGIC") + .allowlist_var("PG_MAJORVERSION_NUM") .allowlist_var("PG_CONTROL_FILE_SIZE") .allowlist_var("PG_CONTROLFILEDATA_OFFSETOF_CRC") .allowlist_type("PageHeaderData") diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 9acb105e9b53..f18e0c603b20 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -44,6 +44,9 @@ macro_rules! postgres_ffi { // Re-export some symbols from bindings pub use bindings::DBState_DB_SHUTDOWNED; pub use bindings::{CheckPoint, ControlFileData, XLogRecord}; + + pub const ZERO_CHECKPOINT: bytes::Bytes = + bytes::Bytes::from_static(&[0u8; xlog_utils::SIZEOF_CHECKPOINT]); } }; } @@ -106,6 +109,107 @@ macro_rules! dispatch_pgversion { }; } +#[macro_export] +macro_rules! enum_pgversion_dispatch { + ($name:expr, $typ:ident, $bind:ident, $code:block) => { + enum_pgversion_dispatch!( + name = $name, + bind = $bind, + typ = $typ, + code = $code, + pgversions = [ + V14 : v14, + V15 : v15, + V16 : v16, + ] + ) + }; + (name = $name:expr, + bind = $bind:ident, + typ = $typ:ident, + code = $code:block, + pgversions = [$($variant:ident : $md:ident),+ $(,)?]) => { + match $name { + $( + self::$typ::$variant($bind) => { + use $crate::$md as pgv; + $code + } + ),+, + } + }; +} + +#[macro_export] +macro_rules! 
enum_pgversion { + {$name:ident, pgv :: $t:ident} => { + enum_pgversion!{ + name = $name, + typ = $t, + pgversions = [ + V14 : v14, + V15 : v15, + V16 : v16, + ] + } + }; + {$name:ident, pgv :: $p:ident :: $t:ident} => { + enum_pgversion!{ + name = $name, + path = $p, + typ = $t, + pgversions = [ + V14 : v14, + V15 : v15, + V16 : v16, + ] + } + }; + {name = $name:ident, + typ = $t:ident, + pgversions = [$($variant:ident : $md:ident),+ $(,)?]} => { + pub enum $name { + $($variant ( $crate::$md::$t )),+ + } + impl self::$name { + pub fn pg_version(&self) -> u32 { + enum_pgversion_dispatch!(self, $name, _ign, { + pgv::bindings::PG_MAJORVERSION_NUM + }) + } + } + $( + impl Into for $crate::$md::$t { + fn into(self) -> self::$name { + self::$name::$variant (self) + } + } + )+ + }; + {name = $name:ident, + path = $p:ident, + typ = $t:ident, + pgversions = [$($variant:ident : $md:ident),+ $(,)?]} => { + pub enum $name { + $($variant ($crate::$md::$p::$t)),+ + } + impl $name { + pub fn pg_version(&self) -> u32 { + enum_pgversion_dispatch!(self, $name, _ign, { + pgv::bindings::PG_MAJORVERSION_NUM + }) + } + } + $( + impl Into<$name> for $crate::$md::$p::$t { + fn into(self) -> $name { + $name::$variant (self) + } + } + )+ + }; +} + pub mod pg_constants; pub mod relfile_utils; diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 2d3841881bab..39bc9e385f7f 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -25,9 +25,7 @@ use std::time::Duration; use std::time::SystemTime; use pageserver_api::shard::ShardIdentity; -use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes; -use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment; -use postgres_ffi::TimestampTz; +use postgres_ffi::{dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, TimestampTz}; use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn}; use anyhow::{bail, Context, Result}; @@ -48,16 +46,31 @@ use 
pageserver_api::key::rel_block_to_key; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; -use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment; -use postgres_ffi::v14::xlog_utils::*; -use postgres_ffi::v14::CheckPoint; use postgres_ffi::TransactionId; use postgres_ffi::BLCKSZ; +use utils::bin_ser::SerializeError; use utils::lsn::Lsn; +enum_pgversion! {CheckPoint, pgv::CheckPoint} + +impl CheckPoint { + fn encode(&self) -> Result { + enum_pgversion_dispatch!(self, CheckPoint, cp, { cp.encode() }) + } + + fn update_next_xid(&mut self, xid: u32) -> bool { + enum_pgversion_dispatch!(self, CheckPoint, cp, { cp.update_next_xid(xid) }) + } + + pub fn update_next_multixid(&mut self, multi_xid: u32, multi_offset: u32) -> bool { + enum_pgversion_dispatch!(self, CheckPoint, cp, { + cp.update_next_multixid(multi_xid, multi_offset) + }) + } +} + pub struct WalIngest { shard: ShardIdentity, - pg_version: u32, checkpoint: CheckPoint, checkpoint_modified: bool, warn_ingest_lag: WarnIngestLag, @@ -78,12 +91,16 @@ impl WalIngest { // Fetch the latest checkpoint into memory, so that we can compare with it // quickly in `ingest_record` and update it when it changes. 
let checkpoint_bytes = timeline.get_checkpoint(startpoint, ctx).await?; - let checkpoint = CheckPoint::decode(&checkpoint_bytes)?; - trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value); + let pgversion = timeline.pg_version; + + let checkpoint = dispatch_pgversion!(pgversion, { + let checkpoint = pgv::CheckPoint::decode(&checkpoint_bytes)?; + trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value); + >::into(checkpoint) + }); Ok(WalIngest { shard: *timeline.get_shard_identity(), - pg_version: timeline.pg_version, checkpoint, checkpoint_modified: false, warn_ingest_lag: WarnIngestLag { @@ -117,7 +134,7 @@ impl WalIngest { modification.set_lsn(lsn)?; - if decoded.is_dbase_create_copy(self.pg_version) { + if decoded.is_dbase_create_copy(pg_version) { // Records of this type should always be preceded by a commit(), as they // rely on reading data pages back from the Timeline. assert!(!modification.has_dirty_data_pages()); @@ -337,70 +354,67 @@ impl WalIngest { pg_constants::RM_XLOG_ID => { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; - if info == pg_constants::XLOG_NEXTOID { - let next_oid = buf.get_u32_le(); - if self.checkpoint.nextOid != next_oid { - self.checkpoint.nextOid = next_oid; - self.checkpoint_modified = true; - } - } else if info == pg_constants::XLOG_CHECKPOINT_ONLINE - || info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN - { - let mut checkpoint_bytes = [0u8; SIZEOF_CHECKPOINT]; - buf.copy_to_slice(&mut checkpoint_bytes); - let xlog_checkpoint = CheckPoint::decode(&checkpoint_bytes)?; - trace!( - "xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}", - xlog_checkpoint.oldestXid, - self.checkpoint.oldestXid - ); - if (self - .checkpoint - .oldestXid - .wrapping_sub(xlog_checkpoint.oldestXid) as i32) - < 0 - { - self.checkpoint.oldestXid = xlog_checkpoint.oldestXid; - } - trace!( - "xlog_checkpoint.oldestActiveXid={}, checkpoint.oldestActiveXid={}", - xlog_checkpoint.oldestActiveXid, - self.checkpoint.oldestActiveXid - ); - - 
// A shutdown checkpoint has `oldestActiveXid == InvalidTransactionid`, - // because at shutdown, all in-progress transactions will implicitly - // end. Postgres startup code knows that, and allows hot standby to start - // immediately from a shutdown checkpoint. - // - // In Neon, Postgres hot standby startup always behaves as if starting from - // an online checkpoint. It needs a valid `oldestActiveXid` value, so - // instead of overwriting self.checkpoint.oldestActiveXid with - // InvalidTransactionid from the checkpoint WAL record, update it to a - // proper value, knowing that there are no in-progress transactions at this - // point, except for prepared transactions. - // - // See also the neon code changes in the InitWalRecovery() function. - if xlog_checkpoint.oldestActiveXid == pg_constants::INVALID_TRANSACTION_ID - && info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN + enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, { + if info == pg_constants::XLOG_NEXTOID { + let next_oid = buf.get_u32_le(); + if cp.nextOid != next_oid { + cp.nextOid = next_oid; + self.checkpoint_modified = true; + } + } else if info == pg_constants::XLOG_CHECKPOINT_ONLINE + || info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN { - let mut oldest_active_xid = self.checkpoint.nextXid.value as u32; - for xid in modification.tline.list_twophase_files(lsn, ctx).await? 
{ - if (xid.wrapping_sub(oldest_active_xid) as i32) < 0 { - oldest_active_xid = xid; + let mut checkpoint_bytes = [0u8; pgv::xlog_utils::SIZEOF_CHECKPOINT]; + buf.copy_to_slice(&mut checkpoint_bytes); + let xlog_checkpoint = pgv::CheckPoint::decode(&checkpoint_bytes)?; + trace!( + "xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}", + xlog_checkpoint.oldestXid, + cp.oldestXid + ); + if (cp.oldestXid.wrapping_sub(xlog_checkpoint.oldestXid) as i32) < 0 { + cp.oldestXid = xlog_checkpoint.oldestXid; + } + trace!( + "xlog_checkpoint.oldestActiveXid={}, checkpoint.oldestActiveXid={}", + xlog_checkpoint.oldestActiveXid, + cp.oldestActiveXid + ); + + // A shutdown checkpoint has `oldestActiveXid == InvalidTransactionid`, + // because at shutdown, all in-progress transactions will implicitly + // end. Postgres startup code knows that, and allows hot standby to start + // immediately from a shutdown checkpoint. + // + // In Neon, Postgres hot standby startup always behaves as if starting from + // an online checkpoint. It needs a valid `oldestActiveXid` value, so + // instead of overwriting self.checkpoint.oldestActiveXid with + // InvalidTransactionid from the checkpoint WAL record, update it to a + // proper value, knowing that there are no in-progress transactions at this + // point, except for prepared transactions. + // + // See also the neon code changes in the InitWalRecovery() function. + if xlog_checkpoint.oldestActiveXid == pg_constants::INVALID_TRANSACTION_ID + && info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN + { + let mut oldest_active_xid = cp.nextXid.value as u32; + for xid in modification.tline.list_twophase_files(lsn, ctx).await? 
{ + if (xid.wrapping_sub(oldest_active_xid) as i32) < 0 { + oldest_active_xid = xid; + } } + cp.oldestActiveXid = oldest_active_xid; + } else { + cp.oldestActiveXid = xlog_checkpoint.oldestActiveXid; } - self.checkpoint.oldestActiveXid = oldest_active_xid; - } else { - self.checkpoint.oldestActiveXid = xlog_checkpoint.oldestActiveXid; - } - // Write a new checkpoint key-value pair on every checkpoint record, even - // if nothing really changed. Not strictly required, but it seems nice to - // have some trace of the checkpoint records in the layer files at the same - // LSNs. - self.checkpoint_modified = true; - } + // Write a new checkpoint key-value pair on every checkpoint record, even + // if nothing really changed. Not strictly required, but it seems nice to + // have some trace of the checkpoint records in the layer files at the same + // LSNs. + self.checkpoint_modified = true; + } + }); } pg_constants::RM_LOGICALMSG_ID => { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; @@ -424,7 +438,11 @@ impl WalIngest { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; if info == pg_constants::XLOG_RUNNING_XACTS { let xlrec = crate::walrecord::XlRunningXacts::decode(&mut buf); - self.checkpoint.oldestActiveXid = xlrec.oldest_running_xid; + + enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, { + cp.oldestActiveXid = xlrec.oldest_running_xid; + }); + self.checkpoint_modified = true; } } @@ -539,7 +557,7 @@ impl WalIngest { && blk.has_image && decoded.xl_rmid == pg_constants::RM_XLOG_ID && (decoded.xl_info == pg_constants::XLOG_FPI - || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT) + || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT) // compression of WAL is not yet supported: fall back to storing the original WAL record && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, modification.tline.pg_version) // do not materialize null pages because them most likely be soon replaced with real data @@ -1242,12 +1260,17 @@ impl 
WalIngest { fn warn_on_ingest_lag( &mut self, conf: &crate::config::PageServerConf, - wal_timestmap: TimestampTz, + wal_timestamp: TimestampTz, ) { debug_assert_current_span_has_tenant_and_timeline_id(); let now = SystemTime::now(); let rate_limits = &mut self.warn_ingest_lag; - match try_from_pg_timestamp(wal_timestmap) { + + let ts = enum_pgversion_dispatch!(&self.checkpoint, CheckPoint, _cp, { + pgv::xlog_utils::try_from_pg_timestamp(wal_timestamp) + }); + + match ts { Ok(ts) => { match now.duration_since(ts) { Ok(lag) => { @@ -1257,7 +1280,7 @@ impl WalIngest { warn!(%rate_limit_stats, %lag, "ingesting record with timestamp lagging more than wait_lsn_timeout"); }) } - }, + } Err(e) => { let delta_t = e.duration(); // determined by prod victoriametrics query: 1000 * (timestamp(node_time_seconds{neon_service="pageserver"}) - node_time_seconds) @@ -1271,7 +1294,6 @@ impl WalIngest { } } }; - } Err(error) => { rate_limits.timestamp_invalid_msg_ratelimit.call2(|rate_limit_stats| { @@ -1379,14 +1401,17 @@ impl WalIngest { // truncated, but a checkpoint record with the updated values isn't written until // later. In Neon, a server can start at any LSN, not just on a checkpoint record, // so we keep the oldestXid and oldestXidDB up-to-date. - self.checkpoint.oldestXid = xlrec.oldest_xid; - self.checkpoint.oldestXidDB = xlrec.oldest_xid_db; + enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, { + cp.oldestXid = xlrec.oldest_xid; + cp.oldestXidDB = xlrec.oldest_xid_db; + }); self.checkpoint_modified = true; // TODO Treat AdvanceOldestClogXid() or write a comment why we don't need it let latest_page_number = - self.checkpoint.nextXid.value as u32 / pg_constants::CLOG_XACTS_PER_PAGE; + enum_pgversion_dispatch!(self.checkpoint, CheckPoint, cp, { cp.nextXid.value }) as u32 + / pg_constants::CLOG_XACTS_PER_PAGE; // Now delete all segments containing pages between xlrec.pageno // and latest_page_number. 
@@ -1394,7 +1419,9 @@ impl WalIngest { // First, make an important safety check: // the current endpoint page must not be eligible for removal. // See SimpleLruTruncate() in slru.c - if clogpage_precedes(latest_page_number, xlrec.pageno) { + if dispatch_pgversion!(modification.tline.pg_version, { + pgv::nonrelfile_utils::clogpage_precedes(latest_page_number, xlrec.pageno) + }) { info!("could not truncate directory pg_xact apparent wraparound"); return Ok(()); } @@ -1411,7 +1438,12 @@ impl WalIngest { .await? { let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT; - if slru_may_delete_clogsegment(segpage, xlrec.pageno) { + + let may_delete = dispatch_pgversion!(modification.tline.pg_version, { + pgv::nonrelfile_utils::slru_may_delete_clogsegment(segpage, xlrec.pageno) + }); + + if may_delete { modification .drop_slru_segment(SlruKind::Clog, segno, ctx) .await?; @@ -1530,14 +1562,23 @@ impl WalIngest { xlrec: &XlMultiXactTruncate, ctx: &RequestContext, ) -> Result<()> { - self.checkpoint.oldestMulti = xlrec.end_trunc_off; - self.checkpoint.oldestMultiDB = xlrec.oldest_multi_db; + let (maxsegment, startsegment, endsegment) = + enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, { + cp.oldestMulti = xlrec.end_trunc_off; + cp.oldestMultiDB = xlrec.oldest_multi_db; + let maxsegment: i32 = pgv::nonrelfile_utils::mx_offset_to_member_segment( + pg_constants::MAX_MULTIXACT_OFFSET, + ); + let startsegment: i32 = + pgv::nonrelfile_utils::mx_offset_to_member_segment(xlrec.start_trunc_memb); + let endsegment: i32 = + pgv::nonrelfile_utils::mx_offset_to_member_segment(xlrec.end_trunc_memb); + (maxsegment, startsegment, endsegment) + }); + self.checkpoint_modified = true; // PerformMembersTruncation - let maxsegment: i32 = mx_offset_to_member_segment(pg_constants::MAX_MULTIXACT_OFFSET); - let startsegment: i32 = mx_offset_to_member_segment(xlrec.start_trunc_memb); - let endsegment: i32 = mx_offset_to_member_segment(xlrec.end_trunc_memb); let mut segment: i32 = 
startsegment; // Delete all the segments except the last one. The last segment can still @@ -1811,11 +1852,23 @@ mod tests { // TODO } - static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]); + #[tokio::test] + async fn test_zeroed_checkpoint_decodes_correctly() -> Result<()> { + for i in 14..=16 { + dispatch_pgversion!(i, { + pgv::CheckPoint::decode(&pgv::ZERO_CHECKPOINT)?; + }); + } + + Ok(()) + } async fn init_walingest_test(tline: &Timeline, ctx: &RequestContext) -> Result { let mut m = tline.begin_modification(Lsn(0x10)); - m.put_checkpoint(ZERO_CHECKPOINT.clone())?; + m.put_checkpoint(dispatch_pgversion!( + tline.pg_version, + pgv::ZERO_CHECKPOINT.clone() + ))?; m.put_relmap_file(0, 111, Bytes::from(""), ctx).await?; // dummy relmapper file m.commit(ctx).await?; let walingest = WalIngest::new(tline, Lsn(0x10), ctx).await?; From 97582178cb576f8b68acc53535adf8918d7dbd94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 10 Sep 2024 02:40:00 +0200 Subject: [PATCH 09/21] Remove async_trait from the Handler trait (#8958) Newest attempt to remove `async_trait` from the Handler trait. Earlier attempts were in #7301 and #8296 . --- libs/postgres_backend/src/lib.rs | 5 +- libs/postgres_backend/tests/simple_select.rs | 1 - pageserver/src/page_service.rs | 1 - proxy/src/console/mgmt.rs | 2 +- safekeeper/src/handler.rs | 72 ++++++++++---------- 5 files changed, 40 insertions(+), 41 deletions(-) diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 600f1d728c3b..8ea4b93fb12e 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -81,17 +81,16 @@ pub fn is_expected_io_error(e: &io::Error) -> bool { ) } -#[async_trait::async_trait] pub trait Handler { /// Handle single query. /// postgres_backend will issue ReadyForQuery after calling this (this /// might be not what we want after CopyData streaming, but currently we don't /// care). 
It will also flush out the output buffer. - async fn process_query( + fn process_query( &mut self, pgb: &mut PostgresBackend, query_string: &str, - ) -> Result<(), QueryError>; + ) -> impl Future>; /// Called on startup packet receival, allows to process params. /// diff --git a/libs/postgres_backend/tests/simple_select.rs b/libs/postgres_backend/tests/simple_select.rs index 7ec85f0dbe90..900083ea7fc8 100644 --- a/libs/postgres_backend/tests/simple_select.rs +++ b/libs/postgres_backend/tests/simple_select.rs @@ -23,7 +23,6 @@ async fn make_tcp_pair() -> (TcpStream, TcpStream) { struct TestHandler {} -#[async_trait::async_trait] impl Handler for TestHandler { // return single col 'hey' for any query async fn process_query( diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 39c6a6fb7499..9261b7481d56 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1199,7 +1199,6 @@ impl PageServerHandler { } } -#[async_trait::async_trait] impl postgres_backend::Handler for PageServerHandler where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, diff --git a/proxy/src/console/mgmt.rs b/proxy/src/console/mgmt.rs index 2ed4f5f206bb..ee5f83ee76d8 100644 --- a/proxy/src/console/mgmt.rs +++ b/proxy/src/console/mgmt.rs @@ -78,7 +78,7 @@ pub(crate) type ComputeReady = DatabaseInfo; // TODO: replace with an http-based protocol. struct MgmtHandler; -#[async_trait::async_trait] + impl postgres_backend::Handler for MgmtHandler { async fn process_query( &mut self, diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 2c519433ef83..3f00b69cde43 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -2,6 +2,7 @@ //! protocol commands. 
use anyhow::Context; +use std::future::Future; use std::str::{self, FromStr}; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; @@ -95,7 +96,6 @@ fn cmd_to_string(cmd: &SafekeeperPostgresCommand) -> &str { } } -#[async_trait::async_trait] impl postgres_backend::Handler for SafekeeperPostgresHandler { @@ -197,49 +197,51 @@ impl postgres_backend::Handler Ok(()) } - async fn process_query( + fn process_query( &mut self, pgb: &mut PostgresBackend, query_string: &str, - ) -> Result<(), QueryError> { - if query_string - .to_ascii_lowercase() - .starts_with("set datestyle to ") - { - // important for debug because psycopg2 executes "SET datestyle TO 'ISO'" on connect - pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; - return Ok(()); - } + ) -> impl Future> { + Box::pin(async move { + if query_string + .to_ascii_lowercase() + .starts_with("set datestyle to ") + { + // important for debug because psycopg2 executes "SET datestyle TO 'ISO'" on connect + pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + return Ok(()); + } - let cmd = parse_cmd(query_string)?; - let cmd_str = cmd_to_string(&cmd); + let cmd = parse_cmd(query_string)?; + let cmd_str = cmd_to_string(&cmd); - let _guard = PG_QUERIES_GAUGE.with_label_values(&[cmd_str]).guard(); + let _guard = PG_QUERIES_GAUGE.with_label_values(&[cmd_str]).guard(); - info!("got query {:?}", query_string); + info!("got query {:?}", query_string); - let tenant_id = self.tenant_id.context("tenantid is required")?; - let timeline_id = self.timeline_id.context("timelineid is required")?; - self.check_permission(Some(tenant_id))?; - self.ttid = TenantTimelineId::new(tenant_id, timeline_id); + let tenant_id = self.tenant_id.context("tenantid is required")?; + let timeline_id = self.timeline_id.context("timelineid is required")?; + self.check_permission(Some(tenant_id))?; + self.ttid = TenantTimelineId::new(tenant_id, timeline_id); - match cmd { - SafekeeperPostgresCommand::StartWalPush 
=> { - self.handle_start_wal_push(pgb) - .instrument(info_span!("WAL receiver")) - .await - } - SafekeeperPostgresCommand::StartReplication { start_lsn, term } => { - self.handle_start_replication(pgb, start_lsn, term) - .instrument(info_span!("WAL sender")) - .await - } - SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb).await, - SafekeeperPostgresCommand::TimelineStatus => self.handle_timeline_status(pgb).await, - SafekeeperPostgresCommand::JSONCtrl { ref cmd } => { - handle_json_ctrl(self, pgb, cmd).await + match cmd { + SafekeeperPostgresCommand::StartWalPush => { + self.handle_start_wal_push(pgb) + .instrument(info_span!("WAL receiver")) + .await + } + SafekeeperPostgresCommand::StartReplication { start_lsn, term } => { + self.handle_start_replication(pgb, start_lsn, term) + .instrument(info_span!("WAL sender")) + .await + } + SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb).await, + SafekeeperPostgresCommand::TimelineStatus => self.handle_timeline_status(pgb).await, + SafekeeperPostgresCommand::JSONCtrl { ref cmd } => { + handle_json_ctrl(self, pgb, cmd).await + } } - } + }) } } From 26b5fcdc5077e5f4051f27c2e2d8f82ac5038acb Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 10 Sep 2024 12:54:25 +0100 Subject: [PATCH 10/21] reinstate write-path key check (#8973) ## Problem In https://github.com/neondatabase/neon/pull/8621, validation of keys during ingest was removed because the places where we actually store keys are now past the point where we have already converted them to CompactKey (i128) representation. ## Summary of changes Reinstate validation at an earlier stage in ingest. This doesn't cover literally every place we write a key, but it covers most cases where we're trusting postgres to give us a valid key (i.e. one that doesn't try and use a custom spacenode). 
--- pageserver/src/pgdatadir_mapping.rs | 49 ++++++++++++++++++++++++----- pageserver/src/walingest.rs | 8 ++--- 2 files changed, 46 insertions(+), 11 deletions(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 808d4b666e07..6dd8851b13a1 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1205,6 +1205,13 @@ impl<'a> DatadirModification<'a> { img: Bytes, ) -> anyhow::Result<()> { anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode); + let key = rel_block_to_key(rel, blknum); + if !key.is_valid_key_on_write_path() { + anyhow::bail!( + "the request contains data not supported by pageserver at {}", + key + ); + } self.put(rel_block_to_key(rel, blknum), Value::Image(img)); Ok(()) } @@ -1216,14 +1223,34 @@ impl<'a> DatadirModification<'a> { blknum: BlockNumber, img: Bytes, ) -> anyhow::Result<()> { - self.put(slru_block_to_key(kind, segno, blknum), Value::Image(img)); + let key = slru_block_to_key(kind, segno, blknum); + if !key.is_valid_key_on_write_path() { + anyhow::bail!( + "the request contains data not supported by pageserver at {}", + key + ); + } + self.put(key, Value::Image(img)); Ok(()) } - pub(crate) fn put_rel_page_image_zero(&mut self, rel: RelTag, blknum: BlockNumber) { - self.pending_zero_data_pages - .insert(rel_block_to_key(rel, blknum).to_compact()); + pub(crate) fn put_rel_page_image_zero( + &mut self, + rel: RelTag, + blknum: BlockNumber, + ) -> anyhow::Result<()> { + anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode); + let key = rel_block_to_key(rel, blknum); + if !key.is_valid_key_on_write_path() { + anyhow::bail!( + "the request contains data not supported by pageserver: {} @ {}", + key, + self.lsn + ); + } + self.pending_zero_data_pages.insert(key.to_compact()); self.pending_bytes += ZERO_PAGE.len(); + Ok(()) } pub(crate) fn put_slru_page_image_zero( @@ -1231,10 +1258,18 @@ impl<'a> DatadirModification<'a> { kind: SlruKind, segno: 
u32, blknum: BlockNumber, - ) { - self.pending_zero_data_pages - .insert(slru_block_to_key(kind, segno, blknum).to_compact()); + ) -> anyhow::Result<()> { + let key = slru_block_to_key(kind, segno, blknum); + if !key.is_valid_key_on_write_path() { + anyhow::bail!( + "the request contains data not supported by pageserver: {} @ {}", + key, + self.lsn + ); + } + self.pending_zero_data_pages.insert(key.to_compact()); self.pending_bytes += ZERO_PAGE.len(); + Ok(()) } /// Call this at the end of each WAL record. diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 39bc9e385f7f..6e15ad81c30c 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1222,7 +1222,7 @@ impl WalIngest { if rec.blkno % pg_constants::SLOTS_PER_FSM_PAGE != 0 { // Tail of last remaining FSM page has to be zeroed. // We are not precise here and instead of digging in FSM bitmap format just clear the whole page. - modification.put_rel_page_image_zero(rel, fsm_physical_page_no); + modification.put_rel_page_image_zero(rel, fsm_physical_page_no)?; fsm_physical_page_no += 1; } let nblocks = get_relsize(modification, rel, ctx).await?; @@ -1244,7 +1244,7 @@ impl WalIngest { if rec.blkno % pg_constants::VM_HEAPBLOCKS_PER_PAGE != 0 { // Tail of last remaining vm page has to be zeroed. // We are not precise here and instead of digging in VM bitmap format just clear the whole page. 
- modification.put_rel_page_image_zero(rel, vm_page_no); + modification.put_rel_page_image_zero(rel, vm_page_no)?; vm_page_no += 1; } let nblocks = get_relsize(modification, rel, ctx).await?; @@ -1737,7 +1737,7 @@ impl WalIngest { continue; } - modification.put_rel_page_image_zero(rel, gap_blknum); + modification.put_rel_page_image_zero(rel, gap_blknum)?; } } Ok(()) @@ -1803,7 +1803,7 @@ impl WalIngest { // fill the gap with zeros for gap_blknum in old_nblocks..blknum { - modification.put_slru_page_image_zero(kind, segno, gap_blknum); + modification.put_slru_page_image_zero(kind, segno, gap_blknum)?; } } Ok(()) From bae793ffcd90470b26380053fe931a91545798a5 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Tue, 10 Sep 2024 15:36:08 +0200 Subject: [PATCH 11/21] proxy: Handle all let underscore instances (#8898) * Most can be simply replaced * One instance renamed to _rtchk (return-type check) --- proxy/src/cache/endpoints.rs | 2 +- proxy/src/console/messages.rs | 16 ++++++++-------- proxy/src/context.rs | 10 +++++++--- proxy/src/context/parquet.rs | 2 +- proxy/src/lib.rs | 2 +- proxy/src/proxy/tests.rs | 2 +- .../connection_with_credentials_provider.rs | 7 +++++-- proxy/src/stream.rs | 15 +++++++++------ proxy/src/url.rs | 2 +- 9 files changed, 34 insertions(+), 24 deletions(-) diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index f4762232d8ff..27121ce89e84 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -242,6 +242,6 @@ mod tests { #[test] fn test() { let s = "{\"branch_created\":null,\"endpoint_created\":{\"endpoint_id\":\"ep-rapid-thunder-w0qqw2q9\"},\"project_created\":null,\"type\":\"endpoint_created\"}"; - let _: ControlPlaneEventKey = serde_json::from_str(s).unwrap(); + serde_json::from_str::(s).unwrap(); } } diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs index a48c7316f6e7..9b66333cd473 100644 --- a/proxy/src/console/messages.rs +++ b/proxy/src/console/messages.rs @@ -395,7 
+395,7 @@ mod tests { } } }); - let _: KickSession<'_> = serde_json::from_str(&json.to_string())?; + serde_json::from_str::>(&json.to_string())?; Ok(()) } @@ -403,7 +403,7 @@ mod tests { #[test] fn parse_db_info() -> anyhow::Result<()> { // with password - let _: DatabaseInfo = serde_json::from_value(json!({ + serde_json::from_value::(json!({ "host": "localhost", "port": 5432, "dbname": "postgres", @@ -413,7 +413,7 @@ mod tests { }))?; // without password - let _: DatabaseInfo = serde_json::from_value(json!({ + serde_json::from_value::(json!({ "host": "localhost", "port": 5432, "dbname": "postgres", @@ -422,7 +422,7 @@ mod tests { }))?; // new field (forward compatibility) - let _: DatabaseInfo = serde_json::from_value(json!({ + serde_json::from_value::(json!({ "host": "localhost", "port": 5432, "dbname": "postgres", @@ -441,7 +441,7 @@ mod tests { "address": "0.0.0.0", "aux": dummy_aux(), }); - let _: WakeCompute = serde_json::from_str(&json.to_string())?; + serde_json::from_str::(&json.to_string())?; Ok(()) } @@ -451,18 +451,18 @@ mod tests { let json = json!({ "role_secret": "secret", }); - let _: GetRoleSecret = serde_json::from_str(&json.to_string())?; + serde_json::from_str::(&json.to_string())?; let json = json!({ "role_secret": "secret", "allowed_ips": ["8.8.8.8"], }); - let _: GetRoleSecret = serde_json::from_str(&json.to_string())?; + serde_json::from_str::(&json.to_string())?; let json = json!({ "role_secret": "secret", "allowed_ips": ["8.8.8.8"], "project_id": "project", }); - let _: GetRoleSecret = serde_json::from_str(&json.to_string())?; + serde_json::from_str::(&json.to_string())?; Ok(()) } diff --git a/proxy/src/context.rs b/proxy/src/context.rs index 72e1fa1cee0a..c013218ad91f 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -6,7 +6,7 @@ use pq_proto::StartupMessageParams; use smol_str::SmolStr; use std::net::IpAddr; use tokio::sync::mpsc; -use tracing::{field::display, info, info_span, Span}; +use tracing::{debug, field::display, 
info, info_span, Span}; use try_lock::TryLock; use uuid::Uuid; @@ -362,7 +362,9 @@ impl RequestMonitoringInner { }); } if let Some(tx) = self.sender.take() { - let _: Result<(), _> = tx.send(RequestData::from(&*self)); + tx.send(RequestData::from(&*self)) + .inspect_err(|e| debug!("tx send failed: {e}")) + .ok(); } } @@ -371,7 +373,9 @@ impl RequestMonitoringInner { // Here we log the length of the session. self.disconnect_timestamp = Some(Utc::now()); if let Some(tx) = self.disconnect_sender.take() { - let _: Result<(), _> = tx.send(RequestData::from(&*self)); + tx.send(RequestData::from(&*self)) + .inspect_err(|e| debug!("tx send failed: {e}")) + .ok(); } } } diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index fafea2a08fd7..9f6f83022ed6 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -290,7 +290,7 @@ async fn worker_inner( } if !w.flushed_row_groups().is_empty() { - let _: Writer = upload_parquet(w, len, &storage).await?; + let _rtchk: Writer = upload_parquet(w, len, &storage).await?; } Ok(()) diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index 923d6ae288a2..0070839aa8ec 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -3,7 +3,7 @@ #![deny( deprecated, future_incompatible, - // TODO: consider let_underscore + let_underscore, nonstandard_style, rust_2024_compatibility )] diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 4264dbae0f1a..752d98272602 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -268,7 +268,7 @@ async fn keepalive_is_inherited() -> anyhow::Result<()> { anyhow::Ok(keepalive) }); - let _ = TcpStream::connect(("127.0.0.1", port)).await?; + TcpStream::connect(("127.0.0.1", port)).await?; assert!(t.await??, "keepalive should be inherited"); Ok(()) diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs index 7d222e2dec0e..2de66b58b172 100644 --- 
a/proxy/src/redis/connection_with_credentials_provider.rs +++ b/proxy/src/redis/connection_with_credentials_provider.rs @@ -6,7 +6,7 @@ use redis::{ ConnectionInfo, IntoConnectionInfo, RedisConnectionInfo, RedisResult, }; use tokio::task::JoinHandle; -use tracing::{error, info}; +use tracing::{debug, error, info}; use super::elasticache::CredentialsProvider; @@ -109,7 +109,10 @@ impl ConnectionWithCredentialsProvider { let credentials_provider = credentials_provider.clone(); let con2 = con.clone(); let f = tokio::spawn(async move { - let _ = Self::keep_connection(con2, credentials_provider).await; + Self::keep_connection(con2, credentials_provider) + .await + .inspect_err(|e| debug!("keep_connection failed: {e}")) + .ok(); }); self.refresh_token_task = Some(f); } diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index c14dd18afe0e..e2fc73235ee8 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -12,6 +12,7 @@ use std::{io, task}; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; use tokio_rustls::server::TlsStream; +use tracing::debug; /// Stream wrapper which implements libpq's protocol. 
/// @@ -138,9 +139,10 @@ impl PqStream { ); // already error case, ignore client IO error - let _: Result<_, std::io::Error> = self - .write_message(&BeMessage::ErrorResponse(msg, None)) - .await; + self.write_message(&BeMessage::ErrorResponse(msg, None)) + .await + .inspect_err(|e| debug!("write_message failed: {e}")) + .ok(); Err(ReportedError { source: anyhow::anyhow!(msg), @@ -164,9 +166,10 @@ impl PqStream { ); // already error case, ignore client IO error - let _: Result<_, std::io::Error> = self - .write_message(&BeMessage::ErrorResponse(&msg, None)) - .await; + self.write_message(&BeMessage::ErrorResponse(&msg, None)) + .await + .inspect_err(|e| debug!("write_message failed: {e}")) + .ok(); Err(ReportedError { source: anyhow::anyhow!(error), diff --git a/proxy/src/url.rs b/proxy/src/url.rs index 28ac7efdfc90..270cd7c24da0 100644 --- a/proxy/src/url.rs +++ b/proxy/src/url.rs @@ -57,7 +57,7 @@ mod tests { fn bad_url() { let url = "test:foobar"; url.parse::().expect("unexpected parsing failure"); - let _ = url.parse::().expect_err("should not parse"); + url.parse::().expect_err("should not parse"); } #[test] From cb060548fb2115ca6a57a95c6c947c45fc2095a6 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 11 Sep 2024 18:45:34 +0100 Subject: [PATCH 12/21] libs: tweak PageserverUtilization::is_overloaded (#8946) ## Problem Having run in production for a while, we see that nodes are generally safely oversubscribed by about a factor of 2. 
## Summary of changes Tweak the is_overloaded method to check for utilization over 200% rather than over 100% --- libs/pageserver_api/src/models/utilization.rs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/libs/pageserver_api/src/models/utilization.rs b/libs/pageserver_api/src/models/utilization.rs index 844a0cda5db7..641aa51989ed 100644 --- a/libs/pageserver_api/src/models/utilization.rs +++ b/libs/pageserver_api/src/models/utilization.rs @@ -89,8 +89,19 @@ impl PageserverUtilization { /// If a node is currently hosting more work than it can comfortably handle. This does not indicate that /// it will fail, but it is a strong signal that more work should not be added unless there is no alternative. + /// + /// When a node is overloaded, we may override soft affinity preferences and do things like scheduling + /// into a node in a less desirable AZ, if all the nodes in the preferred AZ are overloaded. pub fn is_overloaded(score: RawScore) -> bool { - score >= Self::UTILIZATION_FULL + // Why the factor of two? This is unscientific but reflects behavior of real systems: + // - In terms of shard counts, a node's preferred max count is a soft limit intended to keep + // startup and housekeeping jobs nice and responsive. We can go to double this limit if needed + // until some more nodes are deployed. + // - In terms of disk space, the node's utilization heuristic assumes every tenant needs to + // hold its biggest timeline fully on disk, which is tends to be an over estimate when + // some tenants are very idle and have dropped layers from disk. In practice going up to + // double is generally better than giving up and scheduling in a sub-optimal AZ.
+ score >= 2 * Self::UTILIZATION_FULL } pub fn adjust_shard_count_max(&mut self, shard_count: u32) { From 43846b72fa488b96d37bbc40d691bbf4e4f8fdd3 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sun, 8 Sep 2024 21:40:30 +0300 Subject: [PATCH 13/21] Remove unused "neon_local init --pg-version" arg It has been unused since commit 8712e1899e, when it stopped creating the initial timeline. --- control_plane/src/bin/neon_local.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 1d66532d49bd..af6545f8d29c 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -1570,7 +1570,6 @@ fn cli() -> Command { .value_parser(value_parser!(PathBuf)) .value_name("config") ) - .arg(pg_version_arg.clone()) .arg(force_arg) ) .subcommand( From aeca15008c15b211d74536439ff701e533a412ef Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 11 Sep 2024 10:55:41 +0300 Subject: [PATCH 14/21] Remove obsolete and misleading comment The tenant ID was not actually generated here but in NeonEnvBuilder. And the "neon_local init" command hasn't been able to generate the initial tenant since 8712e1899e anyway. --- test_runner/fixtures/neon_fixtures.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 60887b9aed36..22472559f4d4 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1073,9 +1073,6 @@ def __init__(self, config: NeonEnvBuilder): self.pg_distrib_dir = config.pg_distrib_dir self.endpoint_counter = 0 self.storage_controller_config = config.storage_controller_config - - # generate initial tenant ID here instead of letting 'neon init' generate it, - # so that we don't need to dig it out of the config file afterwards. 
self.initial_tenant = config.initial_tenant self.initial_timeline = config.initial_timeline From 0a363c3dce2fbd76f6483f02c2273e3f7b205b3e Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 11 Sep 2024 12:38:31 +0300 Subject: [PATCH 15/21] Add --timeline-id option to "neon_local timeline branch" command Makes it consistent with the "timeline create" and "timeline import" commands, which allowed you to pass the timeline id as argument. This also makes it unnecessary to parse the timeline ID from the output in the python function that calls it. --- control_plane/src/bin/neon_local.rs | 4 ++- test_runner/fixtures/neon_fixtures.py | 43 ++++++++++----------------- 2 files changed, 18 insertions(+), 29 deletions(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index af6545f8d29c..144cd647c920 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -640,6 +640,8 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local } Some(("branch", branch_match)) => { let tenant_id = get_tenant_id(branch_match, env)?; + let new_timeline_id = + parse_timeline_id(branch_match)?.unwrap_or(TimelineId::generate()); let new_branch_name = branch_match .get_one::("branch-name") .ok_or_else(|| anyhow!("No branch name provided"))?; @@ -658,7 +660,6 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local .map(|lsn_str| Lsn::from_str(lsn_str)) .transpose() .context("Failed to parse ancestor start Lsn from the request")?; - let new_timeline_id = TimelineId::generate(); let storage_controller = StorageController::from_env(env); let create_req = TimelineCreateRequest { new_timeline_id, @@ -1582,6 +1583,7 @@ fn cli() -> Command { .subcommand(Command::new("branch") .about("Create a new timeline, using another timeline as a base, copying its data") .arg(tenant_id_arg.clone()) + .arg(timeline_id_arg.clone()) .arg(branch_name_arg.clone()) 
.arg(Arg::new("ancestor-branch-name").long("ancestor-branch-name") .help("Use last Lsn of another timeline (and its data) as base when creating the new timeline. The timeline gets resolved by its branch name.").required(false)) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 22472559f4d4..1c33d1415498 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1518,14 +1518,6 @@ class PageserverPort: http: int -CREATE_TIMELINE_ID_EXTRACTOR: re.Pattern = re.compile( # type: ignore[type-arg] - r"^Created timeline '(?P[^']+)'", re.MULTILINE -) -TIMELINE_DATA_EXTRACTOR: re.Pattern = re.compile( # type: ignore[type-arg] - r"\s?(?P[^\s]+)\s\[(?P[^\]]+)\]", re.MULTILINE -) - - class AbstractNeonCli(abc.ABC): """ A typed wrapper around an arbitrary Neon CLI tool. @@ -1754,6 +1746,9 @@ def create_timeline( tenant_id: Optional[TenantId] = None, timeline_id: Optional[TimelineId] = None, ) -> TimelineId: + if timeline_id is None: + timeline_id = TimelineId.generate() + cmd = [ "timeline", "create", @@ -1761,23 +1756,16 @@ def create_timeline( new_branch_name, "--tenant-id", str(tenant_id or self.env.initial_tenant), + "--timeline-id", + str(timeline_id), "--pg-version", self.env.pg_version, ] - if timeline_id is not None: - cmd.extend(["--timeline-id", str(timeline_id)]) - res = self.raw_cli(cmd) res.check_returncode() - matches = CREATE_TIMELINE_ID_EXTRACTOR.search(res.stdout) - - created_timeline_id = None - if matches is not None: - created_timeline_id = matches.group("timeline_id") - - return TimelineId(str(created_timeline_id)) + return timeline_id def create_branch( self, @@ -1785,12 +1773,17 @@ def create_branch( ancestor_branch_name: Optional[str] = None, tenant_id: Optional[TenantId] = None, ancestor_start_lsn: Optional[Lsn] = None, + new_timeline_id: Optional[TimelineId] = None, ) -> TimelineId: + if new_timeline_id is None: + new_timeline_id = TimelineId.generate() cmd = [ 
"timeline", "branch", "--branch-name", new_branch_name, + "--timeline-id", + str(new_timeline_id), "--tenant-id", str(tenant_id or self.env.initial_tenant), ] @@ -1802,16 +1795,7 @@ def create_branch( res = self.raw_cli(cmd) res.check_returncode() - matches = CREATE_TIMELINE_ID_EXTRACTOR.search(res.stdout) - - created_timeline_id = None - if matches is not None: - created_timeline_id = matches.group("timeline_id") - - if created_timeline_id is None: - raise Exception("could not find timeline id after `neon timeline create` invocation") - else: - return TimelineId(str(created_timeline_id)) + return TimelineId(str(new_timeline_id)) def list_timelines(self, tenant_id: Optional[TenantId] = None) -> List[Tuple[str, TimelineId]]: """ @@ -1820,6 +1804,9 @@ def list_timelines(self, tenant_id: Optional[TenantId] = None) -> List[Tuple[str # main [b49f7954224a0ad25cc0013ea107b54b] # ┣━ @0/16B5A50: test_cli_branch_list_main [20f98c79111b9015d84452258b7d5540] + TIMELINE_DATA_EXTRACTOR: re.Pattern = re.compile( # type: ignore[type-arg] + r"\s?(?P[^\s]+)\s\[(?P[^\]]+)\]", re.MULTILINE + ) res = self.raw_cli( ["timeline", "list", "--tenant-id", str(tenant_id or self.env.initial_tenant)] ) From 8dc069037b003e63c77683670be4e965384e794b Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 11 Sep 2024 15:01:34 +0300 Subject: [PATCH 16/21] Remove NeonEnvBuilder.start() function It feels wrong to me to start() from the builder object. Surely the thing you start is the environment itself, not its configuration. 
--- test_runner/fixtures/neon_fixtures.py | 6 +----- test_runner/performance/test_storage_controller_scale.py | 2 +- test_runner/regress/test_compatibility.py | 4 ++-- test_runner/regress/test_sharding.py | 6 +++--- 4 files changed, 7 insertions(+), 11 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 1c33d1415498..ee62372871f8 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -553,10 +553,6 @@ def init_configs(self, default_remote_storage_if_missing: bool = True) -> NeonEn self.env = NeonEnv(self) return self.env - def start(self): - assert self.env is not None, "environment is not already initialized, call init() first" - self.env.start() - def init_start( self, initial_tenant_conf: Optional[Dict[str, Any]] = None, @@ -572,7 +568,7 @@ def init_start( Configuring pageserver with remote storage is now the default. There will be a warning if pageserver is created without one. """ env = self.init_configs(default_remote_storage_if_missing=default_remote_storage_if_missing) - self.start() + env.start() # Prepare the default branch to start the postgres on later. # Pageserver itself does not create tenants and timelines, until started first and asked via HTTP API. diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index 297aedfbed2b..a186bbacebb8 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -84,7 +84,7 @@ def test_storage_controller_many_tenants( compute_reconfigure_listener.register_on_notify(lambda body: time.sleep(0.01)) env = neon_env_builder.init_configs() - neon_env_builder.start() + env.start() # We will intentionally stress reconciler concurrrency, which triggers a warning when lots # of shards are hitting the delayed path. 
diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 467e5b17342c..b559be5f18a5 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -178,7 +178,7 @@ def test_backward_compatibility( neon_env_builder.num_safekeepers = 3 env = neon_env_builder.from_repo_dir(compatibility_snapshot_dir / "repo") env.pageserver.allowed_errors.append(ingest_lag_log_line) - neon_env_builder.start() + env.start() check_neon_works( env, @@ -265,7 +265,7 @@ def test_forward_compatibility( # does not include logs from previous runs assert not env.pageserver.log_contains("git-env:" + prev_pageserver_version) - neon_env_builder.start() + env.start() # ensure the specified pageserver is running assert env.pageserver.log_contains("git-env:" + prev_pageserver_version) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index bfd82242e908..4a84dca399a3 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -374,7 +374,7 @@ def test_sharding_split_smoke( non_default_tenant_config = {"gc_horizon": 77 * 1024 * 1024} env = neon_env_builder.init_configs(True) - neon_env_builder.start() + env.start() tenant_id = TenantId.generate() timeline_id = TimelineId.generate() env.neon_cli.create_tenant( @@ -1436,7 +1436,7 @@ def test_sharding_unlogged_relation(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_pageservers = 2 env = neon_env_builder.init_configs() - neon_env_builder.start() + env.start() tenant_id = TenantId.generate() timeline_id = TimelineId.generate() @@ -1475,7 +1475,7 @@ def test_top_tenants(neon_env_builder: NeonEnvBuilder): """ env = neon_env_builder.init_configs() - neon_env_builder.start() + env.start() tenants = [] n_tenants = 8 From 9e3ead3689b012d344afbac4fcbf000372bb9969 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 11 Sep 2024 18:43:42 +0100 Subject: [PATCH 17/21] Collect the last of 
on-demand WAL download in CreateReplicationSlot reverts Signed-off-by: Tristan Partin --- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 49d5e576a56e..6f6d77fb5960 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 49d5e576a56e4cc59cd6a6a0791b2324b9fa675e +Subproject commit 6f6d77fb5960602fcd3fd130aca9f99ecb1619c9 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 6e9a4ff6249a..0baa7346dfd4 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 6e9a4ff6249ac02b8175054b7b3f7dfb198be48b +Subproject commit 0baa7346dfd42d61912eeca554c9bb0a190f0a1e diff --git a/vendor/revisions.json b/vendor/revisions.json index e52576e61fc5..3a65a507f3ed 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,11 +1,11 @@ { "v16": [ "16.4", - "6e9a4ff6249ac02b8175054b7b3f7dfb198be48b" + "0baa7346dfd42d61912eeca554c9bb0a190f0a1e" ], "v15": [ "15.8", - "49d5e576a56e4cc59cd6a6a0791b2324b9fa675e" + "6f6d77fb5960602fcd3fd130aca9f99ecb1619c9" ], "v14": [ "14.13", From fcab61bdcd9e30f2e2f6ce5be59e34bb98068f2f Mon Sep 17 00:00:00 2001 From: Stefan Radig Date: Thu, 12 Sep 2024 15:55:12 +0100 Subject: [PATCH 18/21] Prototype implementation for private access poc (#8976) ## Problem For the Private Access POC we want users to be able to disable access from the public proxy. To limit the number of changes this can be done by configuring an IP allowlist [ "255.255.255.255" ]. For the Private Access proxy a new commandline flag allows to disable IP allowlist completely. 
See https://www.notion.so/neondatabase/Neon-Private-Access-POC-Proposal-8f707754e1ab4190ad5709da7832f020?d=887495c15e884aa4973f973a8a0a582a#7ac6ec249b524a74adbeddc4b84b8f5f for details about the POC. ## Summary of changes - Adding the commandline flag is_private_access_proxy=true will disable IP allowlist --- proxy/src/auth/backend.rs | 5 ++++- proxy/src/auth/credentials.rs | 13 +++++++++++++ proxy/src/bin/local_proxy.rs | 1 + proxy/src/bin/proxy.rs | 5 +++++ proxy/src/config.rs | 1 + proxy/src/serverless/backend.rs | 4 +++- 6 files changed, 27 insertions(+), 2 deletions(-) diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 1d28c6df31d1..5561c9c56db7 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -311,7 +311,9 @@ async fn auth_quirks( let (allowed_ips, maybe_secret) = api.get_allowed_ips_and_secret(ctx, &info).await?; // check allowed list - if !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) { + if config.ip_allowlist_check_enabled + && !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) + { return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr())); } @@ -603,6 +605,7 @@ mod tests { rate_limiter_enabled: true, rate_limiter: AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET), rate_limit_ip_subnet: 64, + ip_allowlist_check_enabled: true, }); async fn read_message(r: &mut (impl AsyncRead + Unpin), b: &mut BytesMut) -> PgMessage { diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 0e91ae570ad5..cba8601d143b 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -538,4 +538,17 @@ mod tests { )); Ok(()) } + + #[test] + fn test_connection_blocker() { + fn check(v: serde_json::Value) -> bool { + let peer_addr = IpAddr::from([127, 0, 0, 1]); + let ip_list: Vec = serde_json::from_value(v).unwrap(); + check_peer_addr_is_in_list(&peer_addr, &ip_list) + } + + assert!(check(json!([]))); + assert!(check(json!(["127.0.0.1"]))); +
assert!(!check(json!(["255.255.255.255"]))); + } } diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs index 08effeff9960..6eba71df1b07 100644 --- a/proxy/src/bin/local_proxy.rs +++ b/proxy/src/bin/local_proxy.rs @@ -224,6 +224,7 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig rate_limiter_enabled: false, rate_limiter: BucketRateLimiter::new(vec![]), rate_limit_ip_subnet: 64, + ip_allowlist_check_enabled: true, }, require_client_ip: false, handshake_timeout: Duration::from_secs(10), diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 7706a1f7cdd9..ca9aeb04d8be 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -224,6 +224,10 @@ struct ProxyCliArgs { /// Whether to retry the wake_compute request #[clap(long, default_value = config::RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)] wake_compute_retry: String, + + /// Configure if this is a private access proxy for the POC: In that case the proxy will ignore the IP allowlist + #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + is_private_access_proxy: bool, } #[derive(clap::Args, Clone, Copy, Debug)] @@ -682,6 +686,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { rate_limiter_enabled: args.auth_rate_limit_enabled, rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()), rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet, + ip_allowlist_check_enabled: !args.is_private_access_proxy, }; let config = Box::leak(Box::new(ProxyConfig { diff --git a/proxy/src/config.rs b/proxy/src/config.rs index d7fc6eee222d..1cda6d200ccd 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -64,6 +64,7 @@ pub struct AuthenticationConfig { pub rate_limiter_enabled: bool, pub rate_limiter: AuthRateLimiter, pub rate_limit_ip_subnet: u8, + pub ip_allowlist_check_enabled: bool, } impl TlsConfig { diff --git 
a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index f24e0478bea3..d163878528b3 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -50,7 +50,9 @@ impl PoolingBackend { .as_ref() .map(|()| user_info.clone()); let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?; - if !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) { + if config.ip_allowlist_check_enabled + && !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) + { return Err(AuthError::ip_address_not_allowed(ctx.peer_addr())); } if !self From 78938d1b591b33d23495a0edb8b123cc5cac6a27 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Thu, 12 Sep 2024 23:18:41 +0100 Subject: [PATCH 19/21] [compute/postgres] feature: PostgreSQL 17 (#8573) This adds preliminary PG17 support to Neon, based on RC1 / 2024-09-04 https://github.com/postgres/postgres/commit/07b828e9d4aa916f1763774787440d914eea69c4 NOTICE: The data produced by the included version of the PostgreSQL fork may not be compatible with the future full release of PostgreSQL 17 due to expected or unexpected future changes in magic numbers and internals. DO NOT EXPECT DATA IN V17-TENANTS TO BE COMPATIBLE WITH THE 17.0 RELEASE! 
Co-authored-by: Anastasia Lubennikova Co-authored-by: Alexander Bayandin Co-authored-by: Konstantin Knizhnik Co-authored-by: Heikki Linnakangas --- .github/workflows/_build-and-test-locally.yml | 19 +- .github/workflows/build_and_test.yml | 27 +- .github/workflows/neon_extra_builds.yml | 15 + .gitmodules | 4 + Dockerfile | 12 +- Dockerfile.compute-node | 289 +++- Makefile | 56 +- compute_tools/src/compute.rs | 31 +- compute_tools/src/extension_server.rs | 1 + control_plane/src/local_env.rs | 2 +- control_plane/src/storage_controller.rs | 40 +- libs/pageserver_api/src/key.rs | 39 +- libs/postgres_ffi/build.rs | 2 +- libs/postgres_ffi/src/lib.rs | 5 + libs/postgres_ffi/src/pg_constants.rs | 39 +- libs/postgres_ffi/src/pg_constants_v14.rs | 27 + libs/postgres_ffi/src/pg_constants_v15.rs | 2 + libs/postgres_ffi/src/pg_constants_v16.rs | 2 + libs/postgres_ffi/src/pg_constants_v17.rs | 55 + libs/postgres_ffi/wal_craft/src/lib.rs | 2 +- libs/walproposer/build.rs | 9 +- pageserver/ctl/src/layer_map_analyzer.rs | 16 +- pageserver/src/basebackup.rs | 16 +- pageserver/src/config.rs | 2 +- pageserver/src/import_datadir.rs | 6 +- pageserver/src/pgdatadir_mapping.rs | 111 +- pageserver/src/walingest.rs | 191 ++- pageserver/src/walrecord.rs | 73 +- pgxn/neon/bitmap.h | 12 + pgxn/neon/file_cache.c | 494 ++++-- pgxn/neon/libpagestore.c | 4 + pgxn/neon/neon_pgversioncompat.h | 14 +- pgxn/neon/pagestore_client.h | 54 +- pgxn/neon/pagestore_smgr.c | 1352 ++++++++++++----- pgxn/neon/walproposer_pg.c | 39 +- pgxn/neon_rmgr/neon_rmgr_decode.c | 399 ++++- pgxn/neon_walredo/inmem_smgr.c | 79 +- pgxn/neon_walredo/inmem_smgr.h | 2 +- pgxn/neon_walredo/walredoproc.c | 14 +- test_runner/fixtures/common_types.py | 2 +- test_runner/fixtures/neon_fixtures.py | 9 +- test_runner/fixtures/pg_version.py | 1 + .../5670669815/v17/ext_index.json | 7 + test_runner/regress/test_compatibility.py | 8 +- .../regress/test_download_extensions.py | 2 + test_runner/regress/test_postgres_version.py | 17 +- 
.../regress/test_timeline_detach_ancestor.py | 3 + test_runner/regress/test_twophase.py | 70 +- vendor/postgres-v17 | 1 + 49 files changed, 2898 insertions(+), 778 deletions(-) create mode 100644 libs/postgres_ffi/src/pg_constants_v17.rs create mode 100644 pgxn/neon/bitmap.h create mode 100644 test_runner/regress/data/extension_test/5670669815/v17/ext_index.json create mode 160000 vendor/postgres-v17 diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index e18e6a12013d..67152b6991a0 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -62,7 +62,7 @@ jobs: # git config --global --add safe.directory ${{ github.workspace }} git config --global --add safe.directory ${GITHUB_WORKSPACE} - for r in 14 15 16; do + for r in 14 15 16 17; do git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" done @@ -83,6 +83,10 @@ jobs: id: pg_v16_rev run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT + - name: Set pg 17 revision for caching + id: pg_v17_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) >> $GITHUB_OUTPUT + # Set some environment variables used by all the steps. # # CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc. 
@@ -136,6 +140,13 @@ jobs: path: pg_install/v16 key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + - name: Cache postgres v17 build + id: cache_pg_17 + uses: actions/cache@v4 + with: + path: pg_install/v17 + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + - name: Build postgres v14 if: steps.cache_pg_14.outputs.cache-hit != 'true' run: mold -run make postgres-v14 -j$(nproc) @@ -148,6 +159,10 @@ jobs: if: steps.cache_pg_16.outputs.cache-hit != 'true' run: mold -run make postgres-v16 -j$(nproc) + - name: Build postgres v17 + if: steps.cache_pg_17.outputs.cache-hit != 'true' + run: mold -run make postgres-v17 -j$(nproc) + - name: Build neon extensions run: mold -run make neon-pg-ext -j$(nproc) @@ -210,7 +225,7 @@ jobs: run: | PQ_LIB_DIR=$(pwd)/pg_install/v16/lib export PQ_LIB_DIR - LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib + LD_LIBRARY_PATH=$(pwd)/pg_install/v17/lib export LD_LIBRARY_PATH #nextest does not yet support running doctests diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 4bb9e5cb6675..7c06fd9ab8fa 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -211,7 +211,7 @@ jobs: build-tag: ${{ needs.tag.outputs.build-tag }} build-type: ${{ matrix.build-type }} # Run tests on all Postgres versions in release builds and only on the latest version in debug builds - pg-versions: ${{ matrix.build-type == 'release' && '["v14", "v15", "v16"]' || '["v16"]' }} + pg-versions: ${{ matrix.build-type == 'release' && '["v14", "v15", "v16", "v17"]' || '["v17"]' }} secrets: inherit # Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking @@ -548,7 +548,7 @@ jobs: strategy: fail-fast: false matrix: - version: [ 
v14, v15, v16 ] + version: [ v14, v15, v16, v17 ] arch: [ x64, arm64 ] runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} @@ -627,7 +627,7 @@ jobs: - name: Build compute-tools image # compute-tools are Postgres independent, so build it only once - if: matrix.version == 'v16' + if: matrix.version == 'v17' uses: docker/build-push-action@v6 with: target: compute-tools-image @@ -649,7 +649,7 @@ jobs: strategy: matrix: - version: [ v14, v15, v16 ] + version: [ v14, v15, v16, v17 ] steps: - uses: docker/login-action@v3 @@ -671,7 +671,7 @@ jobs: neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64 - name: Create multi-arch compute-tools image - if: matrix.version == 'v16' + if: matrix.version == 'v17' run: | docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \ neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-x64 \ @@ -689,7 +689,7 @@ jobs: neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} - name: Push multi-arch compute-tools image to ECR - if: matrix.version == 'v16' + if: matrix.version == 'v17' run: | docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \ neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} @@ -700,7 +700,7 @@ jobs: strategy: fail-fast: false matrix: - version: [ v14, v15, v16 ] + version: [ v14, v15, v16, v17 ] env: VM_BUILDER_VERSION: v0.29.3 @@ -798,7 +798,7 @@ jobs: runs-on: ubuntu-22.04 env: - VERSIONS: v14 v15 v16 + VERSIONS: v14 v15 v16 v17 steps: - uses: docker/login-action@v3 @@ -839,7 +839,7 @@ jobs: done done docker buildx imagetools create -t neondatabase/neon-test-extensions-v16:latest \ - neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }} + neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }} - name: Login to prod ECR 
uses: docker/login-action@v3 @@ -852,7 +852,7 @@ jobs: - name: Copy all images to prod ECR if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' run: | - for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16}; do + for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16,v17}; do docker buildx imagetools create -t 093970136003.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} \ 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} done @@ -864,7 +864,7 @@ jobs: with: client_id: ${{ vars.AZURE_DEV_CLIENT_ID }} image_tag: ${{ needs.tag.outputs.build-tag }} - images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 compute-node-v14 compute-node-v15 compute-node-v16 + images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17 registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} tenant_id: ${{ vars.AZURE_TENANT_ID }} @@ -876,7 +876,7 @@ jobs: with: client_id: ${{ vars.AZURE_PROD_CLIENT_ID }} image_tag: ${{ needs.tag.outputs.build-tag }} - images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 compute-node-v14 compute-node-v15 compute-node-v16 + images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17 registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }} subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }} tenant_id: ${{ vars.AZURE_TENANT_ID }} @@ -971,7 +971,7 @@ jobs: # git config --global --add safe.directory ${{ github.workspace }} git config --global --add safe.directory ${GITHUB_WORKSPACE} - for r in 14 15 16; do + for r in 14 15 16 17; do git config --global --add safe.directory "${{ github.workspace 
}}/vendor/postgres-v$r" git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" done @@ -1117,6 +1117,7 @@ jobs: files_to_promote+=("s3://${BUCKET}/${s3_key}") + # TODO Add v17 for pg_version in v14 v15 v16; do # We run less tests for debug builds, so we don't need to promote them if [ "${build_type}" == "debug" ] && { [ "${arch}" == "ARM64" ] || [ "${pg_version}" != "v16" ] ; }; then diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 7fecdbde8c5c..41c9f5dee5a4 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -72,6 +72,10 @@ jobs: id: pg_v16_rev run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT + - name: Set pg 17 revision for caching + id: pg_v17_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) >> $GITHUB_OUTPUT + - name: Cache postgres v14 build id: cache_pg_14 uses: actions/cache@v4 @@ -93,6 +97,13 @@ jobs: path: pg_install/v16 key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + - name: Cache postgres v17 build + id: cache_pg_17 + uses: actions/cache@v4 + with: + path: pg_install/v17 + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + - name: Set extra env for macOS run: | echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV @@ -120,6 +131,10 @@ jobs: if: steps.cache_pg_16.outputs.cache-hit != 'true' run: make postgres-v16 -j$(sysctl -n hw.ncpu) + - name: Build postgres v17 + if: steps.cache_pg_17.outputs.cache-hit != 'true' + run: make postgres-v17 -j$(sysctl -n hw.ncpu) + - name: Build neon extensions run: make neon-pg-ext -j$(sysctl -n hw.ncpu) diff --git a/.gitmodules b/.gitmodules index 1d925674a1da..d1330bf28c60 100644 --- a/.gitmodules +++ b/.gitmodules @@ -10,3 +10,7 @@ path = vendor/postgres-v16 url = 
https://github.com/neondatabase/postgres.git branch = REL_16_STABLE_neon +[submodule "vendor/postgres-v17"] + path = vendor/postgres-v17 + url = https://github.com/neondatabase/postgres.git + branch = REL_17_STABLE_neon diff --git a/Dockerfile b/Dockerfile index 1efedfa9bc84..bdb76a4f4fa9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,6 +5,8 @@ ARG REPOSITORY=neondatabase ARG IMAGE=build-tools ARG TAG=pinned +ARG DEFAULT_PG_VERSION=17 +ARG STABLE_PG_VERSION=16 # Build Postgres FROM $REPOSITORY/$IMAGE:$TAG AS pg-build @@ -13,6 +15,7 @@ WORKDIR /home/nonroot COPY --chown=nonroot vendor/postgres-v14 vendor/postgres-v14 COPY --chown=nonroot vendor/postgres-v15 vendor/postgres-v15 COPY --chown=nonroot vendor/postgres-v16 vendor/postgres-v16 +COPY --chown=nonroot vendor/postgres-v17 vendor/postgres-v17 COPY --chown=nonroot pgxn pgxn COPY --chown=nonroot Makefile Makefile COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh @@ -28,16 +31,19 @@ FROM $REPOSITORY/$IMAGE:$TAG AS build WORKDIR /home/nonroot ARG GIT_VERSION=local ARG BUILD_TAG +ARG STABLE_PG_VERSION COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server +COPY --from=pg-build /home/nonroot/pg_install/v17/include/postgresql/server pg_install/v17/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v16/lib pg_install/v16/lib +COPY --from=pg-build /home/nonroot/pg_install/v17/lib pg_install/v17/lib COPY --chown=nonroot . . 
ARG ADDITIONAL_RUSTFLAGS RUN set -e \ - && PQ_LIB_DIR=$(pwd)/pg_install/v16/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment ${ADDITIONAL_RUSTFLAGS}" cargo build \ + && PQ_LIB_DIR=$(pwd)/pg_install/v${STABLE_PG_VERSION}/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment ${ADDITIONAL_RUSTFLAGS}" cargo build \ --bin pg_sni_router \ --bin pageserver \ --bin pagectl \ @@ -52,6 +58,7 @@ RUN set -e \ # Build final image # FROM debian:bullseye-slim +ARG DEFAULT_PG_VERSION WORKDIR /data RUN set -e \ @@ -77,6 +84,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_scrubbe COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/ COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/ COPY --from=pg-build /home/nonroot/pg_install/v16 /usr/local/v16/ +COPY --from=pg-build /home/nonroot/pg_install/v17 /usr/local/v17/ COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/ # By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config. @@ -93,7 +101,7 @@ RUN mkdir -p /data/.neon/ && \ # When running a binary that links with libpq, default to using our most recent postgres version. Binaries # that want a particular postgres version will select it explicitly: this is just a default. -ENV LD_LIBRARY_PATH=/usr/local/v16/lib +ENV LD_LIBRARY_PATH=/usr/local/v${DEFAULT_PG_VERSION}/lib VOLUME ["/data"] diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index b6c89cd71f26..fe902eb97817 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -79,15 +79,23 @@ RUN cd postgres && \ # ######################################################################################### FROM build-deps AS postgis-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN apt update && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + esac && \ + apt update && \ apt install -y cmake gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \ libboost-system-dev libboost-iostreams-dev libboost-program-options-dev libboost-timer-dev \ libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \ protobuf-c-compiler xsltproc # SFCGAL > 1.3 requires CGAL > 5.2, Bullseye's libcgal-dev is 5.2 -RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + mkdir -p /sfcgal && \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \ echo "4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 SFCGAL.tar.gz" | sha256sum --check && \ mkdir sfcgal-src && cd sfcgal-src && tar xzf ../SFCGAL.tar.gz --strip-components=1 -C . && \ cmake -DCMAKE_BUILD_TYPE=Release . && make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -96,7 +104,10 @@ RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar ENV PATH="/usr/local/pgsql/bin:$PATH" -RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \ echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \ mkdir postgis-src && cd postgis-src && tar xzf ../postgis.tar.gz --strip-components=1 -C . 
&& \ find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ @@ -122,7 +133,10 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postg cp /usr/local/pgsql/share/extension/address_standardizer.control /extensions/postgis && \ cp /usr/local/pgsql/share/extension/address_standardizer_data_us.control /extensions/postgis -RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \ echo "cac297c07d34460887c4f3b522b35c470138760fe358e351ad1db4edb6ee306e pgrouting.tar.gz" | sha256sum --check && \ mkdir pgrouting-src && cd pgrouting-src && tar xzf ../pgrouting.tar.gz --strip-components=1 -C . && \ mkdir build && cd build && \ @@ -142,12 +156,19 @@ RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouti # ######################################################################################### FROM build-deps AS plv8-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN apt update && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + apt update && \ apt install -y ninja-build python3-dev libncurses5 binutils clang -RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \ echo "7096c3290928561f0d4901b7a52794295dc47f6303102fae3f8e42dd575ad97d plv8.tar.gz" | sha256sum --check && \ mkdir plv8-src && cd plv8-src && tar xzf ../plv8.tar.gz --strip-components=1 -C . 
&& \ # generate and copy upgrade scripts @@ -172,9 +193,13 @@ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.t # ######################################################################################### FROM build-deps AS h3-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "$(uname -m)" in \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + case "$(uname -m)" in \ "x86_64") \ export CMAKE_CHECKSUM=739d372726cb23129d57a539ce1432453448816e345e1545f6127296926b6754 \ ;; \ @@ -192,7 +217,11 @@ RUN case "$(uname -m)" in \ && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \ && rm /tmp/cmake-install.sh -RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + mkdir -p /h3/usr/ && \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \ echo "ec99f1f5974846bde64f4513cf8d2ea1b8d172d2218ab41803bf6a63532272bc h3.tar.gz" | sha256sum --check && \ mkdir h3-src && cd h3-src && tar xzf ../h3.tar.gz --strip-components=1 -C . && \ mkdir build && cd build && \ @@ -202,7 +231,10 @@ RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz cp -R /h3/usr / && \ rm -rf build -RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \ echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \ mkdir h3-pg-src && cd h3-pg-src && tar xzf ../h3-pg.tar.gz --strip-components=1 -C . 
&& \ export PATH="/usr/local/pgsql/bin:$PATH" && \ @@ -218,9 +250,13 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3 # ######################################################################################### FROM build-deps AS unit-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -O postgresql-unit.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -O postgresql-unit.tar.gz && \ echo "411d05beeb97e5a4abf17572bfcfbb5a68d98d1018918feff995f6ee3bb03e79 postgresql-unit.tar.gz" | sha256sum --check && \ mkdir postgresql-unit-src && cd postgresql-unit-src && tar xzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -239,6 +275,7 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz - # ######################################################################################### FROM build-deps AS vector-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY patches/pgvector.patch /pgvector.patch @@ -246,7 +283,10 @@ COPY patches/pgvector.patch /pgvector.patch # By default, pgvector Makefile uses `-march=native`. We don't want that, # because we build the images on different machines than where we run them. # Pass OPTFLAGS="" to remove it. -RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.2.tar.gz -O pgvector.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + esac && \ + wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.2.tar.gz -O pgvector.tar.gz && \ echo "617fba855c9bcb41a2a9bc78a78567fd2e147c72afd5bf9d37b31b9591632b30 pgvector.tar.gz" | sha256sum --check && \ mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \ patch -p1 < /pgvector.patch && \ @@ -261,10 +301,14 @@ RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.2.tar.gz -O # ######################################################################################### FROM build-deps AS pgjwt-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # 9742dab1b2f297ad3811120db7b21451bca2d3c9 made on 13/11/2021 -RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b21451bca2d3c9.tar.gz -O pgjwt.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b21451bca2d3c9.tar.gz -O pgjwt.tar.gz && \ echo "cfdefb15007286f67d3d45510f04a6a7a495004be5b3aecb12cda667e774203f pgjwt.tar.gz" | sha256sum --check && \ mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -277,9 +321,13 @@ RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b214 # ######################################################################################### FROM build-deps AS hypopg-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + esac && \ + wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \ echo "0821011743083226fc9b813c1f2ef5897a91901b57b6bea85a78e466187c6819 hypopg.tar.gz" | sha256sum --check && \ mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -293,9 +341,13 @@ RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypo # ######################################################################################### FROM build-deps AS pg-hashids-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \ echo "74576b992d9277c92196dd8d816baa2cc2d8046fe102f3dcd7f3c3febed6822a pg_hashids.tar.gz" | sha256sum --check && \ mkdir pg_hashids-src && cd pg_hashids-src && tar xzf ../pg_hashids.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ @@ -309,11 +361,15 @@ RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz # ######################################################################################### FROM build-deps AS rum-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY patches/rum.patch /rum.patch -RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + esac && \ + wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \ echo "6ab370532c965568df6210bd844ac6ba649f53055e48243525b0b7e5c4d69a7d rum.tar.gz" | sha256sum --check && \ mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . && \ patch -p1 < /rum.patch && \ @@ -328,9 +384,13 @@ RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O r # ######################################################################################### FROM build-deps AS pgtap-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgtap.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgtap.tar.gz && \ echo "9c7c3de67ea41638e14f06da5da57bac6f5bd03fea05c165a0ec862205a5c052 pgtap.tar.gz" | sha256sum --check && \ mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -344,9 +404,13 @@ RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgta # ######################################################################################### FROM build-deps AS ip4r-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + esac && \ + wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \ echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \ mkdir ip4r-src && cd ip4r-src && tar xzf ../ip4r.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -360,9 +424,13 @@ RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O i # ######################################################################################### FROM build-deps AS prefix-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \ echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \ mkdir prefix-src && cd prefix-src && tar xzf ../prefix.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -376,9 +444,13 @@ RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O p # ######################################################################################### FROM build-deps AS hll-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + esac && \ + wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \ echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \ mkdir hll-src && cd hll-src && tar xzf ../hll.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -392,9 +464,13 @@ RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar # ######################################################################################### FROM build-deps AS plpgsql-check-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.5.3.tar.gz -O plpgsql_check.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.5.3.tar.gz -O plpgsql_check.tar.gz && \ echo "6631ec3e7fb3769eaaf56e3dfedb829aa761abf163d13dba354b4c218508e1c0 plpgsql_check.tar.gz" | sha256sum --check && \ mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ @@ -413,7 +489,10 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION ENV PATH="/usr/local/pgsql/bin:$PATH" -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + esac && \ + case "${PG_VERSION}" in \ "v14" | "v15") \ export TIMESCALEDB_VERSION=2.10.1 \ export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \ @@ -446,7 +525,10 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION ENV PATH="/usr/local/pgsql/bin:$PATH" -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + case "${PG_VERSION}" in \ "v14") \ export PG_HINT_PLAN_VERSION=14_1_4_1 \ export PG_HINT_PLAN_CHECKSUM=c3501becf70ead27f70626bce80ea401ceac6a77e2083ee5f3ff1f1444ec1ad1 \ @@ -459,6 +541,9 @@ RUN case "${PG_VERSION}" in \ export PG_HINT_PLAN_VERSION=16_1_6_0 \ export PG_HINT_PLAN_CHECKSUM=fc85a9212e7d2819d4ae4ac75817481101833c3cfa9f0fe1f980984e12347d00 \ ;; \ + "v17") \ + echo "TODO: PG17 pg_hint_plan support" && exit 0 \ + ;; \ *) \ echo "Export the valid PG_HINT_PLAN_VERSION variable" && exit 1 \ ;; \ @@ -478,10 +563,14 @@ RUN case "${PG_VERSION}" in \ # ######################################################################################### FROM build-deps AS pg-cron-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \ echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \ mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -495,9 +584,13 @@ RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O # ######################################################################################### FROM build-deps AS rdkit-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN apt-get update && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + apt-get update && \ apt-get install -y \ cmake \ libboost-iostreams1.74-dev \ @@ -507,7 +600,10 @@ RUN apt-get update && \ libeigen3-dev ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH" -RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \ echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \ mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . && \ cmake \ @@ -544,10 +640,14 @@ RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar. # ######################################################################################### FROM build-deps AS pg-uuidv7-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + esac && \ + wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \ echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \ mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -561,10 +661,14 @@ RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz # ######################################################################################### FROM build-deps AS pg-roaringbitmap-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \ echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \ mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -578,10 +682,14 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4 # ######################################################################################### FROM build-deps AS pg-semver-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + esac && \ + wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \ echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \ mkdir pg_semver-src && cd pg_semver-src && tar xzf ../pg_semver.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -599,7 +707,10 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + case "${PG_VERSION}" in \ "v14" | "v15") \ export PG_EMBEDDING_VERSION=0.3.5 \ export PG_EMBEDDING_CHECKSUM=0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 \ @@ -620,10 +731,14 @@ RUN case "${PG_VERSION}" in \ # ######################################################################################### FROM build-deps AS pg-anon-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \ echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \ mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . 
&& \ find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ @@ -641,6 +756,7 @@ RUN wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tag # ######################################################################################### FROM build-deps AS rust-extensions-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt-get update && \ @@ -651,9 +767,11 @@ ENV HOME=/home/nonroot ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH" USER nonroot WORKDIR /home/nonroot -ARG PG_VERSION -RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ chmod +x rustup-init && \ ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \ rm rustup-init && \ @@ -672,7 +790,10 @@ USER root FROM rust-extensions-build AS pg-jsonschema-pg-build ARG PG_VERSION -RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.1.tar.gz -O pg_jsonschema.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.1.tar.gz -O pg_jsonschema.tar.gz && \ echo "61df3db1ed83cf24f6aa39c826f8818bfa4f0bd33b587fd6b2b1747985642297 pg_jsonschema.tar.gz" | sha256sum --check && \ mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \ # see commit 252b3685a27a0f4c31a0f91e983c6314838e89e8 @@ -694,7 +815,10 @@ RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.1.tar. 
FROM rust-extensions-build AS pg-graphql-pg-build ARG PG_VERSION -RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.7.tar.gz -O pg_graphql.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.7.tar.gz -O pg_graphql.tar.gz && \ echo "2b3e567a5b31019cb97ae0e33263c1bcc28580be5a444ac4c8ece5c4be2aea41 pg_graphql.tar.gz" | sha256sum --check && \ mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . && \ sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ @@ -714,7 +838,10 @@ FROM rust-extensions-build AS pg-tiktoken-pg-build ARG PG_VERSION # 26806147b17b60763039c6a6878884c41a262318 made on 26/09/2023 -RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \ echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \ mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \ # TODO update pgrx version in the pg_tiktoken repo and remove this line @@ -733,7 +860,10 @@ RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6 FROM rust-extensions-build AS pg-pgx-ulid-build ARG PG_VERSION -RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -O pgx_ulid.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + esac && \ + wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -O pgx_ulid.tar.gz && \ echo "9d1659a2da65af0133d5451c454de31b37364e3502087dadf579f790bc8bef17 pgx_ulid.tar.gz" | sha256sum --check && \ mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \ sed -i 's/pgrx = "^0.11.2"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ @@ -748,10 +878,14 @@ RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz - ######################################################################################### FROM build-deps AS wal2json-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \ echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \ mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -764,10 +898,14 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar. # ######################################################################################### FROM build-deps AS pg-ivm-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + esac && \ + wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \ echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \ mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -781,10 +919,14 @@ RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_iv # ######################################################################################### FROM build-deps AS pg-partman-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \ echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \ mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -835,7 +977,10 @@ COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-partman-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ -RUN make -j $(getconf _NPROCESSORS_ONLN) \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + esac && \ + make -j $(getconf _NPROCESSORS_ONLN) \ PG_CONFIG=/usr/local/pgsql/bin/pg_config \ -C pgxn/neon \ -s install && \ @@ -854,8 +999,8 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \ case "${PG_VERSION}" in \ "v14" | "v15") \ ;; \ - "v16") \ - echo "Skipping HNSW for PostgreSQL 16" && exit 0 \ + "v16" | "v17") \ + echo "Skipping HNSW for PostgreSQL ${PG_VERSION}" && exit 0 \ ;; \ *) \ echo "unexpected PostgreSQL version" && exit 1 \ @@ -878,7 +1023,10 @@ ENV BUILD_TAG=$BUILD_TAG USER nonroot # Copy entire project to get Cargo.* files with proper dependencies for the whole project COPY --chown=nonroot . . -RUN cd compute_tools && mold -run cargo build --locked --profile release-line-debug-size-lto +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + cd compute_tools && mold -run cargo build --locked --profile release-line-debug-size-lto ######################################################################################### # @@ -899,15 +1047,24 @@ FROM neon-pg-ext-build AS postgres-cleanup-layer COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql # Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise) -RUN cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp # Remove headers that we won't need anymore - we've completed installation of all extensions -RUN rm -r /usr/local/pgsql/include +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + esac && \ + rm -r /usr/local/pgsql/include # Remove static postgresql libraries - all compilation is finished, so we # can now remove these files - they must be included in other binaries by now # if they were to be used by other libraries. -RUN rm /usr/local/pgsql/lib/lib*.a +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + rm /usr/local/pgsql/lib/lib*.a ######################################################################################### @@ -918,7 +1075,10 @@ RUN rm /usr/local/pgsql/lib/lib*.a FROM neon-pg-ext-build AS neon-pg-ext-test ARG PG_VERSION -RUN mkdir /ext-src +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + mkdir /ext-src #COPY --from=postgis-build /postgis.tar.gz /ext-src/ #COPY --from=postgis-build /sfcgal/* /usr @@ -956,18 +1116,39 @@ COPY --from=pg-anon-pg-build /pg_anon.tar.gz /ext-src COPY patches/pg_anon.patch /ext-src COPY --from=pg-ivm-build /pg_ivm.tar.gz /ext-src COPY --from=pg-partman-build /pg_partman.tar.gz /ext-src -RUN cd /ext-src/ && for f in *.tar.gz; \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + cd /ext-src/ && for f in *.tar.gz; \ do echo $f; dname=$(echo $f | sed 's/\.tar.*//')-src; \ rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \ || exit 1; rm -f $f; done -RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch -RUN cd /ext-src/rum-src && patch -p1 <../rum.patch +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + cd /ext-src/rum-src && patch -p1 <../rum.patch +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + esac && \ + cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch # cmake is required for the h3 test -RUN apt-get update && apt-get install -y cmake -RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan.patch +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + apt-get update && apt-get install -y cmake +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan.patch COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh -RUN patch -p1 &str { "14" => return "v14", "15" => return "v15", "16" => return "v16", + "17" => return "v17", _ => {} }, _ => {} diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 5dbc3bcbbcfd..d616154af6b1 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -342,7 +342,7 @@ impl LocalEnv { #[allow(clippy::manual_range_patterns)] match pg_version { - 14 | 15 | 16 => Ok(path.join(format!("v{pg_version}"))), + 14 | 15 | 16 | 17 => Ok(path.join(format!("v{pg_version}"))), _ => bail!("Unsupported postgres version: {}", pg_version), } } diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index c715d6b789b4..2b714fbfbf10 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -28,6 +28,7 @@ use utils::{ auth::{encode_from_key_file, Claims, Scope}, id::{NodeId, TenantId}, }; +use whoami::username; pub struct StorageController { env: LocalEnv, @@ -183,7 +184,7 @@ impl StorageController { /// to other versions if that one isn't found. Some automated tests create circumstances /// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`. 
async fn get_pg_dir(&self, dir_name: &str) -> anyhow::Result { - let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 15, 14]; + let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 16, 15, 14]; for v in prefer_versions { let path = Utf8PathBuf::from_path_buf(self.env.pg_dir(v, dir_name)?).unwrap(); @@ -211,7 +212,16 @@ impl StorageController { /// Readiness check for our postgres process async fn pg_isready(&self, pg_bin_dir: &Utf8Path, postgres_port: u16) -> anyhow::Result { let bin_path = pg_bin_dir.join("pg_isready"); - let args = ["-h", "localhost", "-p", &format!("{}", postgres_port)]; + let args = [ + "-h", + "localhost", + "-U", + &username(), + "-d", + DB_NAME, + "-p", + &format!("{}", postgres_port), + ]; let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?; Ok(exitcode.success()) @@ -225,7 +235,11 @@ impl StorageController { /// /// Returns the database url pub async fn setup_database(&self, postgres_port: u16) -> anyhow::Result { - let database_url = format!("postgresql://localhost:{}/{DB_NAME}", postgres_port); + let database_url = format!( + "postgresql://{}@localhost:{}/{DB_NAME}", + &username(), + postgres_port + ); let pg_bin_dir = self.get_pg_bin_dir().await?; let createdb_path = pg_bin_dir.join("createdb"); @@ -235,6 +249,10 @@ impl StorageController { "localhost", "-p", &format!("{}", postgres_port), + "-U", + &username(), + "-O", + &username(), DB_NAME, ]) .output() @@ -271,7 +289,7 @@ impl StorageController { // But tokio-postgres fork doesn't have this upstream commit: // https://github.com/sfackler/rust-postgres/commit/cb609be758f3fb5af537f04b584a2ee0cebd5e79 // => we should rebase our fork => TODO https://github.com/neondatabase/neon/issues/8399 - .user(&whoami::username()) + .user(&username()) .dbname(DB_NAME) .connect(tokio_postgres::NoTls) .await @@ -328,6 +346,12 @@ impl StorageController { let pg_log_path = pg_data_path.join("postgres.log"); if !tokio::fs::try_exists(&pg_data_path).await? 
{ + let initdb_args = ["-D", pg_data_path.as_ref(), "--username", &username()]; + tracing::info!( + "Initializing storage controller database with args: {:?}", + initdb_args + ); + // Initialize empty database let initdb_path = pg_bin_dir.join("initdb"); let mut child = Command::new(&initdb_path) @@ -335,7 +359,7 @@ impl StorageController { ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), ]) - .args(["-D", pg_data_path.as_ref()]) + .args(initdb_args) .spawn() .expect("Failed to spawn initdb"); let status = child.wait().await?; @@ -364,8 +388,14 @@ impl StorageController { pg_data_path.as_ref(), "-l", pg_log_path.as_ref(), + "-U", + &username(), "start", ]; + tracing::info!( + "Starting storage controller database with args: {:?}", + db_start_args + ); background_process::start_process( "storage_controller_db", diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 8929ccb41d6f..4a776709c953 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -1,8 +1,8 @@ use anyhow::{bail, Result}; use byteorder::{ByteOrder, BE}; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; +use postgres_ffi::Oid; use postgres_ffi::RepOriginId; -use postgres_ffi::{Oid, TransactionId}; use serde::{Deserialize, Serialize}; use std::{fmt, ops::Range}; @@ -350,7 +350,17 @@ impl Key { // 02 00000000 00000000 00000000 00 00000000 // // TwoPhaseFile: -// 02 00000000 00000000 00000000 00 XID +// +// 02 00000000 00000000 00XXXXXX XX XXXXXXXX +// +// \______XID_________/ +// +// The 64-bit XID is stored a little awkwardly in field6, field5 and +// field4. PostgreSQL v16 and below only stored a 32-bit XID, which +// fit completely in field6, but starting with PostgreSQL v17, a full +// 64-bit XID is used. Most pageserver code that accesses +// TwoPhaseFiles now deals with 64-bit XIDs even on v16, the high bits +// are just unused. 
// // ControlFile: // 03 00000000 00000000 00000000 00 00000000 @@ -582,35 +592,36 @@ pub const TWOPHASEDIR_KEY: Key = Key { }; #[inline(always)] -pub fn twophase_file_key(xid: TransactionId) -> Key { +pub fn twophase_file_key(xid: u64) -> Key { Key { field1: 0x02, field2: 0, field3: 0, - field4: 0, - field5: 0, - field6: xid, + field4: ((xid & 0xFFFFFF0000000000) >> 40) as u32, + field5: ((xid & 0x000000FF00000000) >> 32) as u8, + field6: (xid & 0x00000000FFFFFFFF) as u32, } } #[inline(always)] -pub fn twophase_key_range(xid: TransactionId) -> Range { +pub fn twophase_key_range(xid: u64) -> Range { + // 64-bit XIDs really should not overflow let (next_xid, overflowed) = xid.overflowing_add(1); Key { field1: 0x02, field2: 0, field3: 0, - field4: 0, - field5: 0, - field6: xid, + field4: ((xid & 0xFFFFFF0000000000) >> 40) as u32, + field5: ((xid & 0x000000FF00000000) >> 32) as u8, + field6: (xid & 0x00000000FFFFFFFF) as u32, }..Key { field1: 0x02, field2: 0, - field3: 0, - field4: 0, - field5: u8::from(overflowed), - field6: next_xid, + field3: u32::from(overflowed), + field4: ((next_xid & 0xFFFFFF0000000000) >> 40) as u32, + field5: ((next_xid & 0x000000FF00000000) >> 32) as u8, + field6: (next_xid & 0x00000000FFFFFFFF) as u32, } } diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs index a346390f3ddf..d3a85f26837b 100644 --- a/libs/postgres_ffi/build.rs +++ b/libs/postgres_ffi/build.rs @@ -56,7 +56,7 @@ fn main() -> anyhow::Result<()> { PathBuf::from("pg_install") }; - for pg_version in &["v14", "v15", "v16"] { + for pg_version in &["v14", "v15", "v16", "v17"] { let mut pg_install_dir_versioned = pg_install_dir.join(pg_version); if pg_install_dir_versioned.is_relative() { let cwd = env::current_dir().context("Failed to get current_dir")?; diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index f18e0c603b20..0d46ed6aacb9 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -57,6 +57,7 @@ macro_rules! 
for_all_postgres_versions { $macro!(v14); $macro!(v15); $macro!(v16); + $macro!(v17); }; } @@ -91,6 +92,7 @@ macro_rules! dispatch_pgversion { 14 : v14, 15 : v15, 16 : v16, + 17 : v17, ] ) }; @@ -121,6 +123,7 @@ macro_rules! enum_pgversion_dispatch { V14 : v14, V15 : v15, V16 : v16, + V17 : v17, ] ) }; @@ -150,6 +153,7 @@ macro_rules! enum_pgversion { V14 : v14, V15 : v15, V16 : v16, + V17 : v17, ] } }; @@ -162,6 +166,7 @@ macro_rules! enum_pgversion { V14 : v14, V15 : v15, V16 : v16, + V17 : v17, ] } }; diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs index 6ce855c78ef5..61b49a634dd4 100644 --- a/libs/postgres_ffi/src/pg_constants.rs +++ b/libs/postgres_ffi/src/pg_constants.rs @@ -152,6 +152,9 @@ pub const XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8; pub const XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED: u8 = (1 << 1) as u8; pub const XLH_DELETE_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8; +// From heapam_xlog.h +pub const XLOG_HEAP2_REWRITE: u8 = 0x00; + // From replication/message.h pub const XLOG_LOGICAL_MESSAGE: u8 = 0x00; @@ -219,15 +222,20 @@ pub const INVALID_TRANSACTION_ID: u32 = 0; pub const FIRST_BOOTSTRAP_OBJECT_ID: u32 = 12000; pub const FIRST_NORMAL_OBJECT_ID: u32 = 16384; +/* pg_control.h */ pub const XLOG_CHECKPOINT_SHUTDOWN: u8 = 0x00; pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10; -pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001; -pub const XLP_LONG_HEADER: u16 = 0x0002; +pub const XLOG_PARAMETER_CHANGE: u8 = 0x60; +pub const XLOG_END_OF_RECOVERY: u8 = 0x90; /* From xlog.h */ pub const XLOG_REPLORIGIN_SET: u8 = 0x00; pub const XLOG_REPLORIGIN_DROP: u8 = 0x10; +/* xlog_internal.h */ +pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001; +pub const XLP_LONG_HEADER: u16 = 0x0002; + /* From replication/slot.h */ pub const REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN: usize = 4*4 /* offset of `slotdata` in ReplicationSlotOnDisk */ + 64 /* NameData */ + 4*4; @@ -245,33 +253,6 @@ pub const VM_HEAPBLOCKS_PER_PAGE: u32 = /* 
From origin.c */ pub const REPLICATION_STATE_MAGIC: u32 = 0x1257DADE; -// List of subdirectories inside pgdata. -// Copied from src/bin/initdb/initdb.c -pub const PGDATA_SUBDIRS: [&str; 22] = [ - "global", - "pg_wal/archive_status", - "pg_commit_ts", - "pg_dynshmem", - "pg_notify", - "pg_serial", - "pg_snapshots", - "pg_subtrans", - "pg_twophase", - "pg_multixact", - "pg_multixact/members", - "pg_multixact/offsets", - "base", - "base/1", - "pg_replslot", - "pg_tblspc", - "pg_stat", - "pg_stat_tmp", - "pg_xact", - "pg_logical", - "pg_logical/snapshots", - "pg_logical/mappings", -]; - // Don't include postgresql.conf as it is inconvenient on node start: // we need postgresql.conf before basebackup to synchronize safekeepers // so no point in overwriting it during backup restore. Rest of the files diff --git a/libs/postgres_ffi/src/pg_constants_v14.rs b/libs/postgres_ffi/src/pg_constants_v14.rs index 32f8f5111436..fe01a5df7c6d 100644 --- a/libs/postgres_ffi/src/pg_constants_v14.rs +++ b/libs/postgres_ffi/src/pg_constants_v14.rs @@ -5,6 +5,33 @@ pub const BKPIMAGE_IS_COMPRESSED: u8 = 0x02; /* page image is compressed */ pub const BKPIMAGE_APPLY: u8 = 0x04; /* page image should be restored during replay */ pub const SIZEOF_RELMAPFILE: usize = 512; /* sizeof(RelMapFile) in relmapper.c */ +// List of subdirectories inside pgdata. 
+// Copied from src/bin/initdb/initdb.c +pub const PGDATA_SUBDIRS: [&str; 22] = [ + "global", + "pg_wal/archive_status", + "pg_commit_ts", + "pg_dynshmem", + "pg_notify", + "pg_serial", + "pg_snapshots", + "pg_subtrans", + "pg_twophase", + "pg_multixact", + "pg_multixact/members", + "pg_multixact/offsets", + "base", + "base/1", + "pg_replslot", + "pg_tblspc", + "pg_stat", + "pg_stat_tmp", + "pg_xact", + "pg_logical", + "pg_logical/snapshots", + "pg_logical/mappings", +]; + pub fn bkpimg_is_compressed(bimg_info: u8) -> bool { (bimg_info & BKPIMAGE_IS_COMPRESSED) != 0 } diff --git a/libs/postgres_ffi/src/pg_constants_v15.rs b/libs/postgres_ffi/src/pg_constants_v15.rs index 626a23c7eaba..3cd1b7aec5fb 100644 --- a/libs/postgres_ffi/src/pg_constants_v15.rs +++ b/libs/postgres_ffi/src/pg_constants_v15.rs @@ -11,6 +11,8 @@ pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */ pub const SIZEOF_RELMAPFILE: usize = 512; /* sizeof(RelMapFile) in relmapper.c */ +pub use super::super::v14::bindings::PGDATA_SUBDIRS; + pub fn bkpimg_is_compressed(bimg_info: u8) -> bool { const ANY_COMPRESS_FLAG: u8 = BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD; diff --git a/libs/postgres_ffi/src/pg_constants_v16.rs b/libs/postgres_ffi/src/pg_constants_v16.rs index 587be71cb3ea..31bd5b68fdf5 100644 --- a/libs/postgres_ffi/src/pg_constants_v16.rs +++ b/libs/postgres_ffi/src/pg_constants_v16.rs @@ -11,6 +11,8 @@ pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */ pub const SIZEOF_RELMAPFILE: usize = 524; /* sizeof(RelMapFile) in relmapper.c */ +pub use super::super::v14::bindings::PGDATA_SUBDIRS; + pub fn bkpimg_is_compressed(bimg_info: u8) -> bool { const ANY_COMPRESS_FLAG: u8 = BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD; diff --git a/libs/postgres_ffi/src/pg_constants_v17.rs b/libs/postgres_ffi/src/pg_constants_v17.rs new file mode 100644 index 000000000000..21329386805b --- /dev/null +++ 
b/libs/postgres_ffi/src/pg_constants_v17.rs @@ -0,0 +1,55 @@ +pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8; + +pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00; +pub const XLOG_DBASE_CREATE_WAL_LOG: u8 = 0x10; +pub const XLOG_DBASE_DROP: u8 = 0x20; + +pub const BKPIMAGE_APPLY: u8 = 0x02; /* page image should be restored during replay */ +pub const BKPIMAGE_COMPRESS_PGLZ: u8 = 0x04; /* page image is compressed */ +pub const BKPIMAGE_COMPRESS_LZ4: u8 = 0x08; /* page image is compressed */ +pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */ + +pub const SIZEOF_RELMAPFILE: usize = 524; /* sizeof(RelMapFile) in relmapper.c */ + +// List of subdirectories inside pgdata. +// Copied from src/bin/initdb/initdb.c +pub const PGDATA_SUBDIRS: [&str; 23] = [ + "global", + "pg_wal/archive_status", + "pg_wal/summaries", + "pg_commit_ts", + "pg_dynshmem", + "pg_notify", + "pg_serial", + "pg_snapshots", + "pg_subtrans", + "pg_twophase", + "pg_multixact", + "pg_multixact/members", + "pg_multixact/offsets", + "base", + "base/1", + "pg_replslot", + "pg_tblspc", + "pg_stat", + "pg_stat_tmp", + "pg_xact", + "pg_logical", + "pg_logical/snapshots", + "pg_logical/mappings", +]; + +pub fn bkpimg_is_compressed(bimg_info: u8) -> bool { + const ANY_COMPRESS_FLAG: u8 = BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD; + + (bimg_info & ANY_COMPRESS_FLAG) != 0 +} + + +pub const XLOG_HEAP2_PRUNE_ON_ACCESS: u8 = 0x10; +pub const XLOG_HEAP2_PRUNE_VACUUM_SCAN: u8 = 0x20; +pub const XLOG_HEAP2_PRUNE_VACUUM_CLEANUP: u8 = 0x30; + + +pub const XLOG_OVERWRITE_CONTRECORD: u8 = 0xD0; +pub const XLOG_CHECKPOINT_REDO: u8 = 0xE0; diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index 6052f04d11ad..949e3f42511b 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -53,7 +53,7 @@ impl Conf { #[allow(clippy::manual_range_patterns)] match self.pg_version { - 14 | 15 | 
16 => Ok(path.join(format!("v{}", self.pg_version))), + 14 | 15 | 16 | 17 => Ok(path.join(format!("v{}", self.pg_version))), _ => bail!("Unsupported postgres version: {}", self.pg_version), } } diff --git a/libs/walproposer/build.rs b/libs/walproposer/build.rs index 28547f52bf2e..3f549889b8a7 100644 --- a/libs/walproposer/build.rs +++ b/libs/walproposer/build.rs @@ -5,6 +5,8 @@ use std::{env, path::PathBuf, process::Command}; use anyhow::{anyhow, Context}; +const WALPROPOSER_PG_VERSION: &str = "v17"; + fn main() -> anyhow::Result<()> { // Tell cargo to invalidate the built crate whenever the wrapper changes println!("cargo:rerun-if-changed=bindgen_deps.h"); @@ -36,7 +38,10 @@ fn main() -> anyhow::Result<()> { // Rebuild crate when libwalproposer.a changes println!("cargo:rerun-if-changed={walproposer_lib_search_str}/libwalproposer.a"); - let pg_config_bin = pg_install_abs.join("v16").join("bin").join("pg_config"); + let pg_config_bin = pg_install_abs + .join(WALPROPOSER_PG_VERSION) + .join("bin") + .join("pg_config"); let inc_server_path: String = if pg_config_bin.exists() { let output = Command::new(pg_config_bin) .arg("--includedir-server") @@ -53,7 +58,7 @@ fn main() -> anyhow::Result<()> { .into() } else { let server_path = pg_install_abs - .join("v16") + .join(WALPROPOSER_PG_VERSION) .join("include") .join("postgresql") .join("server") diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs index a07107753eb8..adc090823d84 100644 --- a/pageserver/ctl/src/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -79,16 +79,24 @@ pub(crate) fn parse_filename(name: &str) -> Option { return None; } let keys: Vec<&str> = split[0].split('-').collect(); - let mut lsns: Vec<&str> = split[1].split('-').collect(); - let is_delta = if lsns.len() == 1 { - lsns.push(lsns[0]); + let lsn_and_opt_generation: Vec<&str> = split[1].split('v').collect(); + let lsns: Vec<&str> = lsn_and_opt_generation[0].split('-').collect(); 
+ let the_lsns: [&str; 2]; + + /* + * Generations add a -vX-XXXXXX postfix, which causes issues when we try to + * parse 'vX' as an LSN. + */ + let is_delta = if lsns.len() == 1 || lsns[1].is_empty() { + the_lsns = [lsns[0], lsns[0]]; false } else { + the_lsns = [lsns[0], lsns[1]]; true }; let key_range = Key::from_hex(keys[0]).unwrap()..Key::from_hex(keys[1]).unwrap(); - let lsn_range = Lsn::from_hex(lsns[0]).unwrap()..Lsn::from_hex(lsns[1]).unwrap(); + let lsn_range = Lsn::from_hex(the_lsns[0]).unwrap()..Lsn::from_hex(the_lsns[1]).unwrap(); let holes = Vec::new(); Some(LayerFile { key_range, diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 207f781e1b27..a32d09f3b3bb 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -30,9 +30,8 @@ use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::dispatch_pgversion; use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID}; -use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PGDATA_SUBDIRS, PG_HBA}; +use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PG_HBA}; use postgres_ffi::relfile_utils::{INIT_FORKNUM, MAIN_FORKNUM}; -use postgres_ffi::TransactionId; use postgres_ffi::XLogFileName; use postgres_ffi::PG_TLI; use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE}; @@ -255,8 +254,11 @@ where let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup; + let pgversion = self.timeline.pg_version; + let subdirs = dispatch_pgversion!(pgversion, &pgv::bindings::PGDATA_SUBDIRS[..]); + // Create pgdata subdirs structure - for dir in PGDATA_SUBDIRS.iter() { + for dir in subdirs.iter() { let header = new_tar_header_dir(dir)?; self.ar .append(&header, &mut io::empty()) @@ -606,7 +608,7 @@ where // // Extract twophase state files // - async fn add_twophase_file(&mut self, xid: TransactionId) -> Result<(), BasebackupError> { + async fn add_twophase_file(&mut self, xid: u64) -> Result<(), BasebackupError> { let 
img = self .timeline .get_twophase_file(xid, self.lsn, self.ctx) @@ -617,7 +619,11 @@ where buf.extend_from_slice(&img[..]); let crc = crc32c::crc32c(&img[..]); buf.put_u32_le(crc); - let path = format!("pg_twophase/{:>08X}", xid); + let path = if self.timeline.pg_version < 17 { + format!("pg_twophase/{:>08X}", xid) + } else { + format!("pg_twophase/{:>016X}", xid) + }; let header = new_tar_header(&path, buf.len() as u64)?; self.ar .append(&header, &buf[..]) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 29a98855d3fe..e9f197ec2dc9 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -281,7 +281,7 @@ impl PageServerConf { #[allow(clippy::manual_range_patterns)] match pg_version { - 14 | 15 | 16 => Ok(path.join(format!("v{pg_version}"))), + 14 | 15 | 16 | 17 => Ok(path.join(format!("v{pg_version}"))), _ => bail!("Unsupported postgres version: {}", pg_version), } } diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 5a0894cd1ba8..ca87f1d08021 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -580,9 +580,11 @@ async fn import_file( import_slru(modification, slru, file_path, reader, len, ctx).await?; debug!("imported multixact members slru"); } else if file_path.starts_with("pg_twophase") { - let xid = u32::from_str_radix(file_name.as_ref(), 16)?; - let bytes = read_all_bytes(reader).await?; + + // In PostgreSQL v17, this is a 64-bit FullTransactionId. In previous versions, + // it's a 32-bit TransactionId, which fits in u64 anyway.
+ let xid = u64::from_str_radix(file_name.as_ref(), 16)?; modification .put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..]), ctx) .await?; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 6dd8851b13a1..5f8766ca2c51 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -633,7 +633,7 @@ impl Timeline { pub(crate) async fn get_twophase_file( &self, - xid: TransactionId, + xid: u64, lsn: Lsn, ctx: &RequestContext, ) -> Result { @@ -646,11 +646,19 @@ impl Timeline { &self, lsn: Lsn, ctx: &RequestContext, - ) -> Result, PageReconstructError> { + ) -> Result, PageReconstructError> { // fetch directory entry let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?; - Ok(TwoPhaseDirectory::des(&buf)?.xids) + if self.pg_version >= 17 { + Ok(TwoPhaseDirectoryV17::des(&buf)?.xids) + } else { + Ok(TwoPhaseDirectory::des(&buf)? + .xids + .iter() + .map(|x| u64::from(*x)) + .collect()) + } } pub(crate) async fn get_control_file( @@ -902,9 +910,13 @@ impl Timeline { // Then pg_twophase result.add_key(TWOPHASEDIR_KEY); - let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?; - let twophase_dir = TwoPhaseDirectory::des(&buf)?; - let mut xids: Vec = twophase_dir.xids.iter().cloned().collect(); + + let mut xids: Vec = self + .list_twophase_files(lsn, ctx) + .await? 
+ .iter() + .cloned() + .collect(); xids.sort_unstable(); for xid in xids { result.add_key(twophase_file_key(xid)); @@ -1127,9 +1139,15 @@ impl<'a> DatadirModification<'a> { // Create AuxFilesDirectory self.init_aux_dir()?; - let buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory { - xids: HashSet::new(), - })?; + let buf = if self.tline.pg_version >= 17 { + TwoPhaseDirectoryV17::ser(&TwoPhaseDirectoryV17 { + xids: HashSet::new(), + }) + } else { + TwoPhaseDirectory::ser(&TwoPhaseDirectory { + xids: HashSet::new(), + }) + }?; self.pending_directory_entries .push((DirectoryKind::TwoPhase, 0)); self.put(TWOPHASEDIR_KEY, Value::Image(buf.into())); @@ -1321,22 +1339,31 @@ impl<'a> DatadirModification<'a> { pub async fn put_twophase_file( &mut self, - xid: TransactionId, + xid: u64, img: Bytes, ctx: &RequestContext, ) -> anyhow::Result<()> { // Add it to the directory entry - let buf = self.get(TWOPHASEDIR_KEY, ctx).await?; - let mut dir = TwoPhaseDirectory::des(&buf)?; - if !dir.xids.insert(xid) { - anyhow::bail!("twophase file for xid {} already exists", xid); - } - self.pending_directory_entries - .push((DirectoryKind::TwoPhase, dir.xids.len())); - self.put( - TWOPHASEDIR_KEY, - Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)), - ); + let dirbuf = self.get(TWOPHASEDIR_KEY, ctx).await?; + let newdirbuf = if self.tline.pg_version >= 17 { + let mut dir = TwoPhaseDirectoryV17::des(&dirbuf)?; + if !dir.xids.insert(xid) { + anyhow::bail!("twophase file for xid {} already exists", xid); + } + self.pending_directory_entries + .push((DirectoryKind::TwoPhase, dir.xids.len())); + Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?) + } else { + let xid = xid as u32; + let mut dir = TwoPhaseDirectory::des(&dirbuf)?; + if !dir.xids.insert(xid) { + anyhow::bail!("twophase file for xid {} already exists", xid); + } + self.pending_directory_entries + .push((DirectoryKind::TwoPhase, dir.xids.len())); + Bytes::from(TwoPhaseDirectory::ser(&dir)?) 
+ }; + self.put(TWOPHASEDIR_KEY, Value::Image(newdirbuf)); self.put(twophase_file_key(xid), Value::Image(img)); Ok(()) @@ -1639,22 +1666,32 @@ impl<'a> DatadirModification<'a> { /// This method is used for marking truncated SLRU files pub async fn drop_twophase_file( &mut self, - xid: TransactionId, + xid: u64, ctx: &RequestContext, ) -> anyhow::Result<()> { // Remove it from the directory entry let buf = self.get(TWOPHASEDIR_KEY, ctx).await?; - let mut dir = TwoPhaseDirectory::des(&buf)?; + let newdirbuf = if self.tline.pg_version >= 17 { + let mut dir = TwoPhaseDirectoryV17::des(&buf)?; - if !dir.xids.remove(&xid) { - warn!("twophase file for xid {} does not exist", xid); - } - self.pending_directory_entries - .push((DirectoryKind::TwoPhase, dir.xids.len())); - self.put( - TWOPHASEDIR_KEY, - Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)), - ); + if !dir.xids.remove(&xid) { + warn!("twophase file for xid {} does not exist", xid); + } + self.pending_directory_entries + .push((DirectoryKind::TwoPhase, dir.xids.len())); + Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?) + } else { + let xid: u32 = u32::try_from(xid)?; + let mut dir = TwoPhaseDirectory::des(&buf)?; + + if !dir.xids.remove(&xid) { + warn!("twophase file for xid {} does not exist", xid); + } + self.pending_directory_entries + .push((DirectoryKind::TwoPhase, dir.xids.len())); + Bytes::from(TwoPhaseDirectory::ser(&dir)?) + }; + self.put(TWOPHASEDIR_KEY, Value::Image(newdirbuf)); // Delete it self.delete(twophase_key_range(xid)); @@ -2124,11 +2161,21 @@ struct DbDirectory { dbdirs: HashMap<(Oid, Oid), bool>, } +// The format of TwoPhaseDirectory changed in PostgreSQL v17, because the filenames of +// pg_twophase files were expanded from 32-bit XIDs to 64-bit XIDs. Previously, the files +// were named like "pg_twophase/000002E5", now they're like +// "pg_twophase/0000000A000002E4".
+ #[derive(Debug, Serialize, Deserialize)] struct TwoPhaseDirectory { xids: HashSet, } +#[derive(Debug, Serialize, Deserialize)] +struct TwoPhaseDirectoryV17 { + xids: HashSet, +} + #[derive(Debug, Serialize, Deserialize, Default)] struct RelDirectory { // Set of relations that exist. (relfilenode, forknum) diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 6e15ad81c30c..229c01a6817e 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -237,6 +237,26 @@ impl WalIngest { .await?; } } + } else if pg_version == 17 { + if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_WAL_LOG { + debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); + } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY { + // The XLOG record was renamed between v14 and v15, + // but the record format is the same. + // So we can reuse XlCreateDatabase here. + debug!("XLOG_DBASE_CREATE_FILE_COPY"); + let createdb = XlCreateDatabase::decode(&mut buf); + self.ingest_xlog_dbase_create(modification, &createdb, ctx) + .await?; + } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_DROP { + let dropdb = XlDropDatabase::decode(&mut buf); + for tablespace_id in dropdb.tablespace_ids { + trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); + modification + .drop_dbdir(tablespace_id, dropdb.db_id, ctx) + .await?; + } + } } } pg_constants::RM_TBLSPC_ID => { @@ -246,7 +266,11 @@ impl WalIngest { let info = decoded.xl_info & !pg_constants::XLR_INFO_MASK; if info == pg_constants::CLOG_ZEROPAGE { - let pageno = buf.get_u32_le(); + let pageno = if pg_version < 17 { + buf.get_u32_le() + } else { + buf.get_u64_le() as u32 + }; let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; self.put_slru_page_image( @@ -260,7 +284,7 @@ impl WalIngest { .await?; } else { assert!(info == pg_constants::CLOG_TRUNCATE); - let xlrec = XlClogTruncate::decode(&mut buf); + let xlrec = 
XlClogTruncate::decode(&mut buf, pg_version); self.ingest_clog_truncate_record(modification, &xlrec, ctx) .await?; } @@ -299,12 +323,21 @@ impl WalIngest { parsed_xact.xid, lsn, ); - modification - .drop_twophase_file(parsed_xact.xid, ctx) - .await?; + + let xid: u64 = if pg_version >= 17 { + self.adjust_to_full_transaction_id(parsed_xact.xid)? + } else { + parsed_xact.xid as u64 + }; + modification.drop_twophase_file(xid, ctx).await?; } else if info == pg_constants::XLOG_XACT_PREPARE { + let xid: u64 = if pg_version >= 17 { + self.adjust_to_full_transaction_id(decoded.xl_xid)? + } else { + decoded.xl_xid as u64 + }; modification - .put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..]), ctx) + .put_twophase_file(xid, Bytes::copy_from_slice(&buf[..]), ctx) .await?; } } @@ -312,7 +345,11 @@ impl WalIngest { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE { - let pageno = buf.get_u32_le(); + let pageno = if pg_version < 17 { + buf.get_u32_le() + } else { + buf.get_u64_le() as u32 + }; let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; self.put_slru_page_image( @@ -325,7 +362,11 @@ impl WalIngest { ) .await?; } else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE { - let pageno = buf.get_u32_le(); + let pageno = if pg_version < 17 { + buf.get_u32_le() + } else { + buf.get_u64_le() as u32 + }; let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; self.put_slru_page_image( @@ -354,6 +395,20 @@ impl WalIngest { pg_constants::RM_XLOG_ID => { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; + if info == pg_constants::XLOG_PARAMETER_CHANGE { + if let CheckPoint::V17(cp) = &mut self.checkpoint { + let rec = v17::XlParameterChange::decode(&mut buf); + cp.wal_level = rec.wal_level; + self.checkpoint_modified = true; + } + } else if info == 
pg_constants::XLOG_END_OF_RECOVERY { + if let CheckPoint::V17(cp) = &mut self.checkpoint { + let rec = v17::XlEndOfRecovery::decode(&mut buf); + cp.wal_level = rec.wal_level; + self.checkpoint_modified = true; + } + } + enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, { if info == pg_constants::XLOG_NEXTOID { let next_oid = buf.get_u32_le(); @@ -397,12 +452,24 @@ impl WalIngest { if xlog_checkpoint.oldestActiveXid == pg_constants::INVALID_TRANSACTION_ID && info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN { - let mut oldest_active_xid = cp.nextXid.value as u32; - for xid in modification.tline.list_twophase_files(lsn, ctx).await? { - if (xid.wrapping_sub(oldest_active_xid) as i32) < 0 { - oldest_active_xid = xid; + let oldest_active_xid = if pg_version >= 17 { + let mut oldest_active_full_xid = cp.nextXid.value; + for xid in modification.tline.list_twophase_files(lsn, ctx).await? { + if xid < oldest_active_full_xid { + oldest_active_full_xid = xid; + } } - } + oldest_active_full_xid as u32 + } else { + let mut oldest_active_xid = cp.nextXid.value as u32; + for xid in modification.tline.list_twophase_files(lsn, ctx).await? { + let narrow_xid = xid as u32; + if (narrow_xid.wrapping_sub(oldest_active_xid) as i32) < 0 { + oldest_active_xid = narrow_xid; + } + } + oldest_active_xid + }; cp.oldestActiveXid = oldest_active_xid; } else { cp.oldestActiveXid = xlog_checkpoint.oldestActiveXid; @@ -515,6 +582,25 @@ impl WalIngest { Ok(modification.len() > prev_len) } + /// This is the same as AdjustToFullTransactionId(xid) in PostgreSQL + fn adjust_to_full_transaction_id(&self, xid: TransactionId) -> Result { + let next_full_xid = + enum_pgversion_dispatch!(&self.checkpoint, CheckPoint, cp, { cp.nextXid.value }); + + let next_xid = (next_full_xid) as u32; + let mut epoch = (next_full_xid >> 32) as u32; + + if xid > next_xid { + // Wraparound occurred, must be from a prev epoch. 
+ if epoch == 0 { + bail!("apparent XID wraparound with prepared transaction XID {xid}, nextXid is {next_full_xid}"); + } + epoch -= 1; + } + + Ok((epoch as u64) << 32 | xid as u64) + } + /// Do not store this block, but observe it for the purposes of updating our relation size state. async fn observe_decoded_block( &mut self, @@ -815,6 +901,73 @@ impl WalIngest { bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid); } } + 17 => { + if decoded.xl_rmid == pg_constants::RM_HEAP_ID { + let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; + + if info == pg_constants::XLOG_HEAP_INSERT { + let xlrec = v17::XlHeapInsert::decode(buf); + assert_eq!(0, buf.remaining()); + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_DELETE { + let xlrec = v17::XlHeapDelete::decode(buf); + if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_UPDATE + || info == pg_constants::XLOG_HEAP_HOT_UPDATE + { + let xlrec = v17::XlHeapUpdate::decode(buf); + // the size of tuple data is inferred from the size of the record. + // we can't validate the remaining number of bytes without parsing + // the tuple data. + if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno); + } + if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { + // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a + // non-HOT update where the new tuple goes to different page than + // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is + // set. 
+ new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_LOCK { + let xlrec = v17::XlHeapLock::decode(buf); + if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks[0].blkno); + flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; + } + } + } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID { + let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; + if info == pg_constants::XLOG_HEAP2_MULTI_INSERT { + let xlrec = v17::XlHeapMultiInsert::decode(buf); + + let offset_array_len = + if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { + // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set + 0 + } else { + size_of::() * xlrec.ntuples as usize + }; + assert_eq!(offset_array_len, buf.remaining()); + + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED { + let xlrec = v17::XlHeapLockUpdated::decode(buf); + if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks[0].blkno); + flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; + } + } + } else { + bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid); + } + } _ => {} } @@ -923,26 +1076,26 @@ impl WalIngest { assert_eq!(decoded.xl_rmid, pg_constants::RM_NEON_ID); match pg_version { - 16 => { + 16 | 17 => { let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; match info { pg_constants::XLOG_NEON_HEAP_INSERT => { - let xlrec = v16::rm_neon::XlNeonHeapInsert::decode(buf); + let xlrec = v17::rm_neon::XlNeonHeapInsert::decode(buf); assert_eq!(0, buf.remaining()); if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { new_heap_blkno = Some(decoded.blocks[0].blkno); } } pg_constants::XLOG_NEON_HEAP_DELETE => { - let xlrec = v16::rm_neon::XlNeonHeapDelete::decode(buf); + let xlrec = 
v17::rm_neon::XlNeonHeapDelete::decode(buf); if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { new_heap_blkno = Some(decoded.blocks[0].blkno); } } pg_constants::XLOG_NEON_HEAP_UPDATE | pg_constants::XLOG_NEON_HEAP_HOT_UPDATE => { - let xlrec = v16::rm_neon::XlNeonHeapUpdate::decode(buf); + let xlrec = v17::rm_neon::XlNeonHeapUpdate::decode(buf); // the size of tuple data is inferred from the size of the record. // we can't validate the remaining number of bytes without parsing // the tuple data. @@ -958,7 +1111,7 @@ impl WalIngest { } } pg_constants::XLOG_NEON_HEAP_MULTI_INSERT => { - let xlrec = v16::rm_neon::XlNeonHeapMultiInsert::decode(buf); + let xlrec = v17::rm_neon::XlNeonHeapMultiInsert::decode(buf); let offset_array_len = if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { @@ -974,7 +1127,7 @@ impl WalIngest { } } pg_constants::XLOG_NEON_HEAP_LOCK => { - let xlrec = v16::rm_neon::XlNeonHeapLock::decode(buf); + let xlrec = v17::rm_neon::XlNeonHeapLock::decode(buf); if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { old_heap_blkno = Some(decoded.blocks[0].blkno); flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index 0c4d575de823..dd199e2c557f 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -174,6 +174,7 @@ impl DecodedWALRecord { } 15 => info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY, 16 => info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY, + 17 => info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY, _ => { panic!("Unsupported postgres version {pg_version}") } @@ -341,16 +342,47 @@ pub mod v14 { } } } + + #[repr(C)] + #[derive(Debug)] + pub struct XlParameterChange { + pub max_connections: i32, + pub max_worker_processes: i32, + pub max_wal_senders: i32, + pub max_prepared_xacts: i32, + pub max_locks_per_xact: i32, + pub wal_level: i32, + pub wal_log_hints: bool, + 
pub track_commit_timestamp: bool, + pub _padding: [u8; 2], + } + + impl XlParameterChange { + pub fn decode(buf: &mut Bytes) -> XlParameterChange { + XlParameterChange { + max_connections: buf.get_i32_le(), + max_worker_processes: buf.get_i32_le(), + max_wal_senders: buf.get_i32_le(), + max_prepared_xacts: buf.get_i32_le(), + max_locks_per_xact: buf.get_i32_le(), + wal_level: buf.get_i32_le(), + wal_log_hints: buf.get_u8() != 0, + track_commit_timestamp: buf.get_u8() != 0, + _padding: [buf.get_u8(), buf.get_u8()], + } + } + } } pub mod v15 { pub use super::v14::{ XlHeapDelete, XlHeapInsert, XlHeapLock, XlHeapLockUpdated, XlHeapMultiInsert, XlHeapUpdate, + XlParameterChange, }; } pub mod v16 { - pub use super::v14::{XlHeapInsert, XlHeapLockUpdated, XlHeapMultiInsert}; + pub use super::v14::{XlHeapInsert, XlHeapLockUpdated, XlHeapMultiInsert, XlParameterChange}; use bytes::{Buf, Bytes}; use postgres_ffi::{OffsetNumber, TransactionId}; @@ -529,6 +561,37 @@ pub mod v16 { } } +pub mod v17 { + pub use super::v14::XlHeapLockUpdated; + use bytes::{Buf, Bytes}; + pub use postgres_ffi::{TimeLineID, TimestampTz}; + + pub use super::v16::rm_neon; + pub use super::v16::{ + XlHeapDelete, XlHeapInsert, XlHeapLock, XlHeapMultiInsert, XlHeapUpdate, XlParameterChange, + }; + + #[repr(C)] + #[derive(Debug)] + pub struct XlEndOfRecovery { + pub end_time: TimestampTz, + pub this_time_line_id: TimeLineID, + pub prev_time_line_id: TimeLineID, + pub wal_level: i32, + } + + impl XlEndOfRecovery { + pub fn decode(buf: &mut Bytes) -> XlEndOfRecovery { + XlEndOfRecovery { + end_time: buf.get_i64_le(), + this_time_line_id: buf.get_u32_le(), + prev_time_line_id: buf.get_u32_le(), + wal_level: buf.get_i32_le(), + } + } + } +} + #[repr(C)] #[derive(Debug)] pub struct XlSmgrCreate { @@ -746,9 +809,13 @@ pub struct XlClogTruncate { } impl XlClogTruncate { - pub fn decode(buf: &mut Bytes) -> XlClogTruncate { + pub fn decode(buf: &mut Bytes, pg_version: u32) -> XlClogTruncate { XlClogTruncate { - 
pageno: buf.get_u32_le(), + pageno: if pg_version < 17 { + buf.get_u32_le() + } else { + buf.get_u64_le() as u32 + }, oldest_xid: buf.get_u32_le(), oldest_xid_db: buf.get_u32_le(), } diff --git a/pgxn/neon/bitmap.h b/pgxn/neon/bitmap.h new file mode 100644 index 000000000000..0a131816efdd --- /dev/null +++ b/pgxn/neon/bitmap.h @@ -0,0 +1,12 @@ +#ifndef NEON_BITMAP_H +#define NEON_BITMAP_H + +/* + * Utilities for manipulating bits8* as bitmaps. + */ + +#define BITMAP_ISSET(bm, bit) ((bm)[(bit) >> 3] & (1 << ((bit) & 7))) +#define BITMAP_SET(bm, bit) (bm)[(bit) >> 3] |= (1 << ((bit) & 7)) +#define BITMAP_CLR(bm, bit) (bm)[(bit) >> 3] &= ~(1 << ((bit) & 7)) + +#endif //NEON_BITMAP_H diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 479209a53747..ab6739465b39 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -27,6 +27,7 @@ #include "pagestore_client.h" #include "common/hashfn.h" #include "pgstat.h" +#include "port/pg_iovec.h" #include "postmaster/bgworker.h" #include RELFILEINFO_HDR #include "storage/buf_internals.h" @@ -40,6 +41,7 @@ #include "utils/guc.h" #include "hll.h" +#include "bitmap.h" #define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "Assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0) @@ -469,6 +471,99 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) return found; } +/* + * Check if page is present in the cache. + * Returns true if page is found in local cache. 
+ */ +int +lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + int nblocks, bits8 *bitmap) +{ + BufferTag tag; + FileCacheEntry *entry; + uint32 chunk_offs; + int found = 0; + uint32 hash; + int i = 0; + + if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ + return 0; + + CopyNRelFileInfoToBufTag(tag, rinfo); + tag.forkNum = forkNum; + + CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); + + tag.blockNum = (blkno + i) & ~(BLOCKS_PER_CHUNK - 1); + hash = get_hash_value(lfc_hash, &tag); + chunk_offs = (blkno + i) & (BLOCKS_PER_CHUNK - 1); + + LWLockAcquire(lfc_lock, LW_SHARED); + + while (true) + { + int this_chunk = Min(nblocks, BLOCKS_PER_CHUNK - chunk_offs); + if (LFC_ENABLED()) + { + entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); + + if (entry != NULL) + { + for (; chunk_offs < BLOCKS_PER_CHUNK && i < nblocks; chunk_offs++, i++) + { + if ((entry->bitmap[chunk_offs >> 5] & + (1 << (chunk_offs & 31))) != 0) + { + BITMAP_SET(bitmap, i); + found++; + } + } + } + else + { + i += this_chunk; + } + } + else + { + return found; + } + + /* + * Break out of the iteration before doing expensive stuff for + * a next iteration + */ + if (i + 1 >= nblocks) + break; + + /* + * Prepare for the next iteration. We don't unlock here, as that'd + * probably be more expensive than the gains it'd get us. + */ + tag.blockNum = (blkno + i) & ~(BLOCKS_PER_CHUNK - 1); + hash = get_hash_value(lfc_hash, &tag); + chunk_offs = (blkno + i) & (BLOCKS_PER_CHUNK - 1); + } + + LWLockRelease(lfc_lock); + +#if USE_ASSERT_CHECKING + do { + int count = 0; + + for (int j = 0; j < nblocks; j++) + { + if (BITMAP_ISSET(bitmap, j)) + count++; + } + + Assert(count == found); + } while (false); +#endif + + return found; +} + /* * Evict a page (if present) from the local file cache */ @@ -548,91 +643,171 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) } /* - * Try to read page from local cache. 
- * Returns true if page is found in local cache. - * In case of error local file cache is disabled (lfc->limit is set to zero). + * Try to read pages from local cache. + * Returns the number of pages read from the local cache, and sets bits in + * 'read' for the pages which were read. This may scribble over buffers not + * marked in 'read', so be careful with operation ordering. + * + * In case of error local file cache is disabled (lfc->limit is set to zero), + * and -1 is returned. Note that 'read' and the buffers may be touched and in + * an otherwise invalid state. + * + * If the mask argument is supplied, bits will be set at the offsets of pages + * that were present and read from the LFC. */ -bool -lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - char *buffer) +int +lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + void **buffers, BlockNumber nblocks, bits8 *mask) { BufferTag tag; FileCacheEntry *entry; ssize_t rc; - int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); bool result = true; uint32 hash; uint64 generation; uint32 entry_offset; + int blocks_read = 0; + int buf_offset = 0; if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ - return false; + return 0; if (!lfc_ensure_opened()) - return false; + return 0; CopyNRelFileInfoToBufTag(tag, rinfo); tag.forkNum = forkNum; - tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); - hash = get_hash_value(lfc_hash, &tag); - LWLockAcquire(lfc_lock, LW_EXCLUSIVE); - - if (!LFC_ENABLED()) + /* + * For every chunk that has blocks we're interested in, we + * 1. get the chunk header + * 2. Check if the chunk actually has the blocks we're interested in + * 3. Read the blocks we're looking for (in one preadv), assuming they exist + * 4. Update the statistics for the read call. + * + * If there is an error, we do an early return. 
+ */ + while (nblocks > 0) { - LWLockRelease(lfc_lock); - return false; - } + struct iovec iov[PG_IOV_MAX]; + int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); + int blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK)); + int iteration_hits = 0; + int iteration_misses = 0; + Assert(blocks_in_chunk > 0); + + for (int i = 0; i < blocks_in_chunk; i++) + { + iov[i].iov_base = buffers[buf_offset + i]; + iov[i].iov_len = BLCKSZ; + } - entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); + tag.blockNum = blkno - chunk_offs; + hash = get_hash_value(lfc_hash, &tag); - /* Approximate working set */ - tag.blockNum = blkno; - addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + + /* We can return the blocks we've read before LFC got disabled; + * assuming we read any. */ + if (!LFC_ENABLED()) + { + LWLockRelease(lfc_lock); + return blocks_read; + } + + entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); + + /* Approximate working set for the blocks assumed in this entry */ + for (int i = 0; i < blocks_in_chunk; i++) + { + tag.blockNum = blkno + i; + addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); + } + + if (entry == NULL) + { + /* Pages are not cached */ + lfc_ctl->misses += blocks_in_chunk; + pgBufferUsage.file_cache.misses += blocks_in_chunk; + LWLockRelease(lfc_lock); + + buf_offset += blocks_in_chunk; + nblocks -= blocks_in_chunk; + blkno += blocks_in_chunk; + + continue; + } + + /* Unlink entry from LRU list to pin it for the duration of IO operation */ + if (entry->access_count++ == 0) + dlist_delete(&entry->list_node); + + generation = lfc_ctl->generation; + entry_offset = entry->offset; - if (entry == NULL || (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) == 0) - { - /* Page is not cached */ - lfc_ctl->misses += 1; - pgBufferUsage.file_cache.misses += 1; LWLockRelease(lfc_lock); - 
return false; - } - /* Unlink entry from LRU list to pin it for the duration of IO operation */ - if (entry->access_count++ == 0) - dlist_delete(&entry->list_node); - generation = lfc_ctl->generation; - entry_offset = entry->offset; - LWLockRelease(lfc_lock); + for (int i = 0; i < blocks_in_chunk; i++) + { + /* + * If the page is valid, we consider it "read". + * All other pages will be fetched separately by the next cache + */ + if (entry->bitmap[(chunk_offs + i) / 32] & (1 << ((chunk_offs + i) % 32))) + { + BITMAP_SET(mask, buf_offset + i); + iteration_hits++; + } + else + iteration_misses++; + } - rc = pread(lfc_desc, buffer, BLCKSZ, ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ); - if (rc != BLCKSZ) - { - lfc_disable("read"); - return false; - } + Assert(iteration_hits + iteration_misses > 0); - /* Place entry to the head of LRU list */ - LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + if (iteration_hits != 0) + { + rc = preadv(lfc_desc, iov, blocks_in_chunk, + ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ); - if (lfc_ctl->generation == generation) - { - CriticalAssert(LFC_ENABLED()); - lfc_ctl->hits += 1; - pgBufferUsage.file_cache.hits += 1; - CriticalAssert(entry->access_count > 0); - if (--entry->access_count == 0) - dlist_push_tail(&lfc_ctl->lru, &entry->list_node); - } - else - result = false; + if (rc != (BLCKSZ * blocks_in_chunk)) + { + lfc_disable("read"); + return -1; + } + } - LWLockRelease(lfc_lock); + /* Place entry to the head of LRU list */ + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); - return result; + if (lfc_ctl->generation == generation) + { + CriticalAssert(LFC_ENABLED()); + lfc_ctl->hits += iteration_hits; + lfc_ctl->misses += iteration_misses; + pgBufferUsage.file_cache.hits += iteration_hits; + pgBufferUsage.file_cache.misses += iteration_misses; + CriticalAssert(entry->access_count > 0); + if (--entry->access_count == 0) + dlist_push_tail(&lfc_ctl->lru, &entry->list_node); + } + else + { + /* generation mismatch, 
assume error condition */ + LWLockRelease(lfc_lock); + return -1; + } + + LWLockRelease(lfc_lock); + + buf_offset += blocks_in_chunk; + nblocks -= blocks_in_chunk; + blkno += blocks_in_chunk; + blocks_read += iteration_hits; + } + + return blocks_read; } /* @@ -640,20 +815,17 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, * If cache is full then evict some other page. */ void -#if PG_MAJORVERSION_NUM < 16 -lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, char *buffer) -#else -lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void *buffer) -#endif +lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + const void *const *buffers, BlockNumber nblocks) { BufferTag tag; FileCacheEntry *entry; ssize_t rc; bool found; - int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); uint32 hash; uint64 generation; uint32 entry_offset; + int buf_offset = 0; if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ return; @@ -661,110 +833,142 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void if (!lfc_ensure_opened()) return; - tag.forkNum = forkNum; - tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); CopyNRelFileInfoToBufTag(tag, rinfo); + tag.forkNum = forkNum; CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); - hash = get_hash_value(lfc_hash, &tag); - LWLockAcquire(lfc_lock, LW_EXCLUSIVE); - - if (!LFC_ENABLED()) + /* + * For every chunk that has blocks we're interested in, we + * 1. get the chunk header + * 2. Check if the chunk actually has the blocks we're interested in + * 3. Read the blocks we're looking for (in one preadv), assuming they exist + * 4. Update the statistics for the read call. + * + * If there is an error, we do an early return. 
+ */ + while (nblocks > 0) { - LWLockRelease(lfc_lock); - return; - } - - entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found); + struct iovec iov[PG_IOV_MAX]; + int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); + int blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK)); + Assert(blocks_in_chunk > 0); - if (found) - { - /* - * Unlink entry from LRU list to pin it for the duration of IO - * operation - */ - if (entry->access_count++ == 0) - dlist_delete(&entry->list_node); - } - else - { - /* - * We have two choices if all cache pages are pinned (i.e. used in IO - * operations): - * - * 1) Wait until some of this operation is completed and pages is - * unpinned. - * - * 2) Allocate one more chunk, so that specified cache size is more - * recommendation than hard limit. - * - * As far as probability of such event (that all pages are pinned) is - * considered to be very very small: there are should be very large - * number of concurrent IO operations and them are limited by - * max_connections, we prefer not to complicate code and use second - * approach. 
- */ - if (lfc_ctl->used >= lfc_ctl->limit && !dlist_is_empty(&lfc_ctl->lru)) + for (int i = 0; i < blocks_in_chunk; i++) { - /* Cache overflow: evict least recently used chunk */ - FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru)); - - CriticalAssert(victim->access_count == 0); - entry->offset = victim->offset; /* grab victim's chunk */ - hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL); - neon_log(DEBUG2, "Swap file cache page"); + iov[i].iov_base = unconstify(void *, buffers[buf_offset + i]); + iov[i].iov_len = BLCKSZ; } - else if (!dlist_is_empty(&lfc_ctl->holes)) + + tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); + hash = get_hash_value(lfc_hash, &tag); + + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + + if (!LFC_ENABLED()) { - /* We can reuse a hole that was left behind when the LFC was shrunk previously */ - FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->holes)); - uint32 offset = hole->offset; - bool found; + LWLockRelease(lfc_lock); + return; + } - hash_search_with_hash_value(lfc_hash, &hole->key, hole->hash, HASH_REMOVE, &found); - CriticalAssert(found); + entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found); - lfc_ctl->used += 1; - entry->offset = offset; /* reuse the hole */ + if (found) + { + /* + * Unlink entry from LRU list to pin it for the duration of IO + * operation + */ + if (entry->access_count++ == 0) + dlist_delete(&entry->list_node); } else { - lfc_ctl->used += 1; - entry->offset = lfc_ctl->size++; /* allocate new chunk at end - * of file */ + /* + * We have two choices if all cache pages are pinned (i.e. used in IO + * operations): + * + * 1) Wait until some of this operation is completed and pages is + * unpinned. + * + * 2) Allocate one more chunk, so that specified cache size is more + * recommendation than hard limit. 
+ * + * As far as probability of such event (that all pages are pinned) is + * considered to be very very small: there are should be very large + * number of concurrent IO operations and them are limited by + * max_connections, we prefer not to complicate code and use second + * approach. + */ + if (lfc_ctl->used >= lfc_ctl->limit && !dlist_is_empty(&lfc_ctl->lru)) + { + /* Cache overflow: evict least recently used chunk */ + FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru)); + + CriticalAssert(victim->access_count == 0); + entry->offset = victim->offset; /* grab victim's chunk */ + hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL); + neon_log(DEBUG2, "Swap file cache page"); + } + else if (!dlist_is_empty(&lfc_ctl->holes)) + { + /* We can reuse a hole that was left behind when the LFC was shrunk previously */ + FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->holes)); + uint32 offset = hole->offset; + bool found; + + hash_search_with_hash_value(lfc_hash, &hole->key, hole->hash, HASH_REMOVE, &found); + CriticalAssert(found); + + lfc_ctl->used += 1; + entry->offset = offset; /* reuse the hole */ + } + else + { + lfc_ctl->used += 1; + entry->offset = lfc_ctl->size++; /* allocate new chunk at end + * of file */ + } + entry->access_count = 1; + entry->hash = hash; + memset(entry->bitmap, 0, sizeof entry->bitmap); } - entry->access_count = 1; - entry->hash = hash; - memset(entry->bitmap, 0, sizeof entry->bitmap); - } - generation = lfc_ctl->generation; - entry_offset = entry->offset; - lfc_ctl->writes += 1; - LWLockRelease(lfc_lock); - - rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ); - if (rc != BLCKSZ) - { - lfc_disable("write"); - } - else - { - LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + generation = lfc_ctl->generation; + entry_offset = entry->offset; + lfc_ctl->writes += 
blocks_in_chunk; + LWLockRelease(lfc_lock); - if (lfc_ctl->generation == generation) + rc = pwritev(lfc_desc, iov, blocks_in_chunk, + ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ); + if (rc != BLCKSZ * blocks_in_chunk) { - CriticalAssert(LFC_ENABLED()); - /* Place entry to the head of LRU list */ - CriticalAssert(entry->access_count > 0); - if (--entry->access_count == 0) - dlist_push_tail(&lfc_ctl->lru, &entry->list_node); - - entry->bitmap[chunk_offs >> 5] |= (1 << (chunk_offs & 31)); + lfc_disable("write"); } + else + { + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); - LWLockRelease(lfc_lock); + if (lfc_ctl->generation == generation) + { + CriticalAssert(LFC_ENABLED()); + /* Place entry to the head of LRU list */ + CriticalAssert(entry->access_count > 0); + if (--entry->access_count == 0) + dlist_push_tail(&lfc_ctl->lru, &entry->list_node); + + for (int i = 0; i < blocks_in_chunk; i++) + { + entry->bitmap[(chunk_offs + i) >> 5] |= + (1 << ((chunk_offs + i) & 31)); + } + } + + LWLockRelease(lfc_lock); + } + blkno += blocks_in_chunk; + buf_offset += blocks_in_chunk; + nblocks -= blocks_in_chunk; } } diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 5126c26c5d13..df7000acc0f7 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -537,7 +537,11 @@ pageserver_connect(shardno_t shard_no, int elevel) /* No more polling needed; connection succeeded */ shard->last_connect_time = GetCurrentTimestamp(); +#if PG_MAJORVERSION_NUM >= 17 + shard->wes_read = CreateWaitEventSet(NULL, 3); +#else shard->wes_read = CreateWaitEventSet(TopMemoryContext, 3); +#endif AddWaitEventToSet(shard->wes_read, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL); AddWaitEventToSet(shard->wes_read, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, diff --git a/pgxn/neon/neon_pgversioncompat.h b/pgxn/neon/neon_pgversioncompat.h index addb6ccce6ec..59b97d64fed5 100644 --- a/pgxn/neon/neon_pgversioncompat.h +++ b/pgxn/neon/neon_pgversioncompat.h @@ -6,7 +6,11 @@ 
#ifndef NEON_PGVERSIONCOMPAT_H #define NEON_PGVERSIONCOMPAT_H +#if PG_MAJORVERSION_NUM < 17 #define NRelFileInfoBackendIsTemp(rinfo) (rinfo.backend != InvalidBackendId) +#else +#define NRelFileInfoBackendIsTemp(rinfo) (rinfo.backend != INVALID_PROC_NUMBER) +#endif #define RelFileInfoEquals(a, b) ( \ NInfoGetSpcOid(a) == NInfoGetSpcOid(b) && \ @@ -50,7 +54,7 @@ #define CopyNRelFileInfoToBufTag(tag, rinfo) \ do { \ (tag).rnode = (rinfo); \ - } while (false); + } while (false) #define BufTagGetNRelFileInfo(tag) tag.rnode @@ -98,7 +102,7 @@ (tag).spcOid = (rinfo).spcOid; \ (tag).dbOid = (rinfo).dbOid; \ (tag).relNumber = (rinfo).relNumber; \ - } while (false); + } while (false) #define BufTagGetNRelFileInfo(tag) \ ((RelFileLocator) { \ @@ -113,4 +117,10 @@ #define DropRelationAllLocalBuffers DropRelationAllLocalBuffers #endif +#if PG_MAJORVERSION_NUM < 17 +#define ProcNumber BackendId +#define INVALID_PROC_NUMBER InvalidBackendId +#define AmAutoVacuumWorkerProcess() (IsAutoVacuumWorkerProcess()) +#endif + #endif /* NEON_PGVERSIONCOMPAT_H */ diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 1f196d016c3f..4c9e40a063ad 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -6,8 +6,6 @@ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * contrib/neon/pagestore_client.h - * *------------------------------------------------------------------------- */ #ifndef pageserver_h @@ -187,7 +185,7 @@ extern char *nm_to_string(NeonMessage *msg); * API */ -typedef unsigned shardno_t; +typedef uint16 shardno_t; typedef struct { @@ -211,7 +209,7 @@ extern int neon_protocol_version; extern shardno_t get_shard_number(BufferTag* tag); -extern const f_smgr *smgr_neon(BackendId backend, NRelFileInfo rinfo); +extern const f_smgr *smgr_neon(ProcNumber backend, NRelFileInfo rinfo); extern void smgr_init_neon(void); extern void 
readahead_buffer_resize(int newsize, void *extra); @@ -233,8 +231,13 @@ extern void neon_zeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nbuffers, bool skipFsync); #endif +#if PG_MAJORVERSION_NUM >=17 +extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks); +#else extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); +#endif /* * LSN values associated with each request to the pageserver @@ -269,19 +272,11 @@ typedef struct } neon_request_lsns; #if PG_MAJORVERSION_NUM < 16 -extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char *buffer); extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno, neon_request_lsns request_lsns, char *buffer); -extern void neon_write(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); #else -extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - void *buffer); extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno, neon_request_lsns request_lsns, void *buffer); -extern void neon_write(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, const void *buffer, bool skipFsync); #endif extern void neon_writeback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks); @@ -299,17 +294,34 @@ extern void update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockN extern void forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum); /* functions for local file cache */ -#if PG_MAJORVERSION_NUM < 16 -extern void lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - char *buffer); -#else -extern void lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - const void *buffer); -#endif -extern bool lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, char 
*buffer); -extern bool lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno); +extern void lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, + BlockNumber blkno, const void *const *buffers, + BlockNumber nblocks); +/* returns number of blocks read, with one bit set in *read for each */ +extern int lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, + BlockNumber blkno, void **buffers, + BlockNumber nblocks, bits8 *mask); + +extern bool lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, + BlockNumber blkno); +extern int lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, + BlockNumber blkno, int nblocks, bits8 *bitmap); extern void lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno); extern void lfc_init(void); +static inline bool +lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + void *buffer) +{ + bits8 rv = 0; + return lfc_readv_select(rinfo, forkNum, blkno, &buffer, 1, &rv) == 1; +} + +static inline void +lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + const void *buffer) +{ + return lfc_writev(rinfo, forkNum, blkno, &buffer, 1); +} #endif diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 7f39c7d02603..36538ea5e20f 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -58,6 +58,7 @@ #include "pgstat.h" #include "postmaster/autovacuum.h" #include "postmaster/interrupt.h" +#include "port/pg_iovec.h" #include "replication/walsender.h" #include "storage/bufmgr.h" #include "storage/buf_internals.h" @@ -66,6 +67,7 @@ #include "storage/smgr.h" #include "pagestore_client.h" +#include "bitmap.h" #if PG_VERSION_NUM >= 150000 #include "access/xlogrecovery.h" @@ -170,16 +172,28 @@ typedef enum PrefetchStatus * valid */ } PrefetchStatus; +/* must fit in uint8; bits 0x1 are used */ +typedef enum { + PRFSF_NONE = 0x0, + PRFSF_SEQ = 0x1, +} PrefetchRequestFlags; + typedef struct PrefetchRequest { BufferTag buftag; /* must be 
first entry in the struct */ + shardno_t shard_no; + uint8 status; /* see PrefetchStatus for valid values */ + uint8 flags; /* see PrefetchRequestFlags */ neon_request_lsns request_lsns; NeonResponse *response; /* may be null */ - PrefetchStatus status; - shardno_t shard_no; uint64 my_ring_index; } PrefetchRequest; +StaticAssertDecl(sizeof(PrefetchRequest) == 64, + "We prefer to have a power-of-2 size for this struct. Please" + " try to find an alternative solution before reaching to" + " increase the expected size here"); + /* prefetch buffer lookup hash table */ typedef struct PrfHashEntry @@ -251,17 +265,17 @@ typedef struct PrefetchState PrefetchRequest prf_buffer[]; /* prefetch buffers */ } PrefetchState; -#define BITMAP_ISSET(bm, bit) ((bm)[(bit) >> 3] & (1 << ((bit) & 7))) -#define BITMAP_SET(bm, bit) (bm)[(bit) >> 3] |= (1 << ((bit) & 7)) -#define BITMAP_CLR(bm, bit) (bm)[(bit) >> 3] &= ~(1 << ((bit) & 7)) - static PrefetchState *MyPState; +#define GetPrfSlotNoCheck(ring_index) ( \ + &MyPState->prf_buffer[((ring_index) % readahead_buffer_size)] \ +) + #define GetPrfSlot(ring_index) ( \ ( \ AssertMacro((ring_index) < MyPState->ring_unused && \ (ring_index) >= MyPState->ring_last), \ - &MyPState->prf_buffer[((ring_index) % readahead_buffer_size)] \ + GetPrfSlotNoCheck(ring_index) \ ) \ ) @@ -281,9 +295,17 @@ static void prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_ static bool prefetch_wait_for(uint64 ring_index); static void prefetch_cleanup_trailing_unused(void); static inline void prefetch_set_unused(uint64 ring_index); +#if PG_MAJORVERSION_NUM < 17 +static void +GetLastWrittenLSNv(NRelFileInfo relfilenode, ForkNumber forknum, + BlockNumber blkno, int nblocks, XLogRecPtr *lsns); +#endif -static neon_request_lsns neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno); -static bool neon_prefetch_response_usable(neon_request_lsns request_lsns, +static void +neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber 
forknum, + BlockNumber blkno, neon_request_lsns *output, + BlockNumber nblocks, const bits8 *mask); +static bool neon_prefetch_response_usable(neon_request_lsns *request_lsns, PrefetchRequest *slot); static bool @@ -729,9 +751,9 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns if (force_request_lsns) slot->request_lsns = *force_request_lsns; else - slot->request_lsns = neon_get_request_lsns(BufTagGetNRelFileInfo(slot->buftag), - slot->buftag.forkNum, - slot->buftag.blockNum); + neon_get_request_lsns(BufTagGetNRelFileInfo(slot->buftag), + slot->buftag.forkNum, slot->buftag.blockNum, + &slot->request_lsns, 1, NULL); request.req.lsn = slot->request_lsns.request_lsn; request.req.not_modified_since = slot->request_lsns.not_modified_since; @@ -771,141 +793,194 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns */ static uint64 -prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns) +prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns, + BlockNumber nblocks, const bits8 *mask) { - uint64 ring_index; + uint64 min_ring_index; PrefetchRequest req; - PrefetchRequest *slot; - PrfHashEntry *entry; +#if USE_ASSERT_CHECKING + bool any_hits = false; +#endif + /* We will never read further ahead than our buffer can store. 
*/ + nblocks = Max(1, Min(nblocks, readahead_buffer_size)); /* use an intermediate PrefetchRequest struct to ensure correct alignment */ req.buftag = tag; + Retry: - entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &req); + min_ring_index = UINT64_MAX; + for (int i = 0; i < nblocks; i++) + { + PrefetchRequest *slot = NULL; + PrfHashEntry *entry = NULL; + uint64 ring_index; + neon_request_lsns *lsns; + if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i)) + continue; - if (entry != NULL) - { - slot = entry->slot; - ring_index = slot->my_ring_index; - Assert(slot == GetPrfSlot(ring_index)); + if (frlsns) + lsns = &frlsns[i]; + else + lsns = NULL; - Assert(slot->status != PRFS_UNUSED); - Assert(MyPState->ring_last <= ring_index && - ring_index < MyPState->ring_unused); - Assert(BUFFERTAGS_EQUAL(slot->buftag, tag)); +#if USE_ASSERT_CHECKING + any_hits = true; +#endif - /* - * If the caller specified a request LSN to use, only accept prefetch - * responses that satisfy that request. - */ - if (force_request_lsns) - { - if (!neon_prefetch_response_usable(*force_request_lsns, slot)) - { - /* Wait for the old request to finish and discard it */ - if (!prefetch_wait_for(ring_index)) - goto Retry; - prefetch_set_unused(ring_index); - entry = NULL; - } - } + slot = NULL; + entry = NULL; + + req.buftag.blockNum = tag.blockNum + i; + entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &req); if (entry != NULL) { + slot = entry->slot; + ring_index = slot->my_ring_index; + Assert(slot == GetPrfSlot(ring_index)); + + Assert(slot->status != PRFS_UNUSED); + Assert(MyPState->ring_last <= ring_index && + ring_index < MyPState->ring_unused); + Assert(BUFFERTAGS_EQUAL(slot->buftag, req.buftag)); + /* - * We received a prefetch for a page that was recently read and - * removed from the buffers. Remove that request from the buffers. + * If the caller specified a request LSN to use, only accept + * prefetch responses that satisfy that request. 
*/ - if (slot->status == PRFS_TAG_REMAINS) + if (lsns) { - prefetch_set_unused(ring_index); - entry = NULL; + if (!neon_prefetch_response_usable(lsns, slot)) + { + /* Wait for the old request to finish and discard it */ + if (!prefetch_wait_for(ring_index)) + goto Retry; + prefetch_set_unused(ring_index); + entry = NULL; + slot = NULL; + } } - else + + if (entry != NULL) { - /* The buffered request is good enough, return that index */ - pgBufferUsage.prefetch.duplicates++; - return ring_index; + /* + * We received a prefetch for a page that was recently read + * and removed from the buffers. Remove that request from the + * buffers. + */ + if (slot->status == PRFS_TAG_REMAINS) + { + prefetch_set_unused(ring_index); + entry = NULL; + slot = NULL; + } + else + { + min_ring_index = Min(min_ring_index, ring_index); + /* The buffered request is good enough, return that index */ + pgBufferUsage.prefetch.duplicates++; + continue; + } } } - } - - /* - * If the prefetch queue is full, we need to make room by clearing the - * oldest slot. If the oldest slot holds a buffer that was already - * received, we can just throw it away; we fetched the page unnecessarily - * in that case. If the oldest slot holds a request that we haven't - * received a response for yet, we have to wait for the response to that - * before we can continue. We might not have even flushed the request to - * the pageserver yet, it might be just sitting in the output buffer. In - * that case, we flush it and wait for the response. 
(We could decide not - * to send it, but it's hard to abort when the request is already in the - * output buffer, and 'not sending' a prefetch request kind of goes - * against the principles of prefetching) - */ - if (MyPState->ring_last + readahead_buffer_size - 1 == MyPState->ring_unused) - { - uint64 cleanup_index = MyPState->ring_last; - - slot = GetPrfSlot(cleanup_index); - Assert(slot->status != PRFS_UNUSED); + /* + * We can only leave the block above by finding that there's + * no entry that can satisfy this request, either because there + * was no entry, or because the entry was invalid or didn't satisfy + * the LSNs provided. + * + * The code should've made sure to clear up the data. + */ + Assert(entry == NULL); + Assert(slot == NULL); /* - * If there is good reason to run compaction on the prefetch buffers, - * try to do that. + * If the prefetch queue is full, we need to make room by clearing the + * oldest slot. If the oldest slot holds a buffer that was already + * received, we can just throw it away; we fetched the page + * unnecessarily in that case. If the oldest slot holds a request that + * we haven't received a response for yet, we have to wait for the + * response to that before we can continue. We might not have even + * flushed the request to the pageserver yet, it might be just sitting + * in the output buffer. In that case, we flush it and wait for the + * response. 
(We could decide not to send it, but it's hard to abort + * when the request is already in the output buffer, and 'not sending' + * a prefetch request kind of goes against the principles of + * prefetching) */ - if (ReceiveBufferNeedsCompaction() && compact_prefetch_buffers()) - { - Assert(slot->status == PRFS_UNUSED); - } - else + if (MyPState->ring_last + readahead_buffer_size - 1 == MyPState->ring_unused) { + uint64 cleanup_index = MyPState->ring_last; + + slot = GetPrfSlot(cleanup_index); + + Assert(slot->status != PRFS_UNUSED); + /* - * We have the slot for ring_last, so that must still be in - * progress + * If there is good reason to run compaction on the prefetch buffers, + * try to do that. */ - switch (slot->status) + if (ReceiveBufferNeedsCompaction() && compact_prefetch_buffers()) { - case PRFS_REQUESTED: - Assert(MyPState->ring_receive == cleanup_index); - if (!prefetch_wait_for(cleanup_index)) - goto Retry; - prefetch_set_unused(cleanup_index); - break; - case PRFS_RECEIVED: - case PRFS_TAG_REMAINS: - prefetch_set_unused(cleanup_index); - break; - default: - pg_unreachable(); + Assert(slot->status == PRFS_UNUSED); + } + else + { + /* + * We have the slot for ring_last, so that must still be in + * progress + */ + switch (slot->status) + { + case PRFS_REQUESTED: + Assert(MyPState->ring_receive == cleanup_index); + if (!prefetch_wait_for(cleanup_index)) + goto Retry; + prefetch_set_unused(cleanup_index); + break; + case PRFS_RECEIVED: + case PRFS_TAG_REMAINS: + prefetch_set_unused(cleanup_index); + break; + default: + pg_unreachable(); + } } } - } - /* - * The next buffer pointed to by `ring_unused` is now definitely empty, so - * we can insert the new request to it. - */ - ring_index = MyPState->ring_unused; - slot = &MyPState->prf_buffer[((ring_index) % readahead_buffer_size)]; + /* + * The next buffer pointed to by `ring_unused` is now definitely empty, so + * we can insert the new request to it. 
+ */ + ring_index = MyPState->ring_unused; - Assert(MyPState->ring_last <= ring_index); + Assert(MyPState->ring_last <= ring_index && + ring_index <= MyPState->ring_unused); - Assert(slot->status == PRFS_UNUSED); + slot = GetPrfSlotNoCheck(ring_index); - /* - * We must update the slot data before insertion, because the hash - * function reads the buffer tag from the slot. - */ - slot->buftag = tag; - slot->shard_no = get_shard_number(&tag); - slot->my_ring_index = ring_index; + Assert(slot->status == PRFS_UNUSED); - prefetch_do_request(slot, force_request_lsns); - Assert(slot->status == PRFS_REQUESTED); - Assert(MyPState->ring_last <= ring_index && - ring_index < MyPState->ring_unused); + /* + * We must update the slot data before insertion, because the hash + * function reads the buffer tag from the slot. + */ + slot->buftag = req.buftag; + slot->shard_no = get_shard_number(&tag); + slot->my_ring_index = ring_index; + + min_ring_index = Min(min_ring_index, ring_index); + + prefetch_do_request(slot, lsns); + } + + Assert(any_hits); + + Assert(GetPrfSlot(min_ring_index)->status == PRFS_REQUESTED || + GetPrfSlot(min_ring_index)->status == PRFS_RECEIVED); + Assert(MyPState->ring_last <= min_ring_index && + min_ring_index < MyPState->ring_unused); if (flush_every_n_requests > 0 && MyPState->ring_unused - MyPState->ring_flush >= flush_every_n_requests) @@ -921,9 +996,17 @@ prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns) MyPState->ring_flush = MyPState->ring_unused; } - return ring_index; + return min_ring_index; +} + + +static uint64 +prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns) +{ + return prefetch_register_bufferv(tag, force_request_lsns, 1, NULL); } + /* * Note: this function can get canceled and use a long jump to the next catch * context. Take care. 
@@ -1348,6 +1431,50 @@ log_newpage_copy(NRelFileInfo * rinfo, ForkNumber forkNum, BlockNumber blkno, return log_newpage(rinfo, forkNum, blkno, copied_buffer.data, page_std); } +#if PG_MAJORVERSION_NUM >= 17 +/* + * Wrapper around log_newpages() that makes a temporary copy of the block and + * WAL-logs that. This makes it safe to use while holding only a shared lock + * on the page, see XLogSaveBufferForHint. We don't use XLogSaveBufferForHint + * directly because it skips the logging if the LSN is new enough. + */ +static XLogRecPtr +log_newpages_copy(NRelFileInfo * rinfo, ForkNumber forkNum, BlockNumber blkno, + BlockNumber nblocks, Page *pages, bool page_std) +{ + PGAlignedBlock copied_buffer[XLR_MAX_BLOCK_ID]; + BlockNumber blknos[XLR_MAX_BLOCK_ID]; + Page pageptrs[XLR_MAX_BLOCK_ID]; + int nregistered = 0; + XLogRecPtr result = 0; + + for (int i = 0; i < nblocks; i++) + { + Page page = copied_buffer[nregistered].data; + memcpy(page, pages[i], BLCKSZ); + pageptrs[nregistered] = page; + blknos[nregistered] = blkno + i; + + ++nregistered; + + if (nregistered >= XLR_MAX_BLOCK_ID) + { + log_newpages(rinfo, forkNum, nregistered, blknos, pageptrs, + page_std); + nregistered = 0; + } + } + + if (nregistered != 0) + { + log_newpages(rinfo, forkNum, nregistered, blknos, pageptrs, + page_std); + } + + return ProcLastRecPtr; +} +#endif /* PG_MAJORVERSION_NUM >= 17 */ + /* * Is 'buffer' identical to a freshly initialized empty heap page? */ @@ -1361,14 +1488,160 @@ PageIsEmptyHeapPage(char *buffer) return memcmp(buffer, empty_page.data, BLCKSZ) == 0; } +#if PG_MAJORVERSION_NUM >= 17 +static void +neon_wallog_pagev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + BlockNumber nblocks, const char **buffers, bool force) +{ +#define BLOCK_BATCH_SIZE 16 + bool log_pages; + BlockNumber batch_blockno = blocknum; + XLogRecPtr lsns[BLOCK_BATCH_SIZE]; + int batch_size = 0; + + /* + * Whenever a VM or FSM page is evicted, WAL-log it. 
FSM and (some) VM + * changes are not WAL-logged when the changes are made, so this is our + * last chance to log them, otherwise they're lost. That's OK for + * correctness, the non-logged updates are not critical. But we want to + * have a reasonably up-to-date VM and FSM in the page server. + */ + log_pages = false; + if (force) + { + Assert(XLogInsertAllowed()); + log_pages = true; + } + else if (XLogInsertAllowed() && + !ShutdownRequestPending && + (forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM)) + { + log_pages = true; + } + + if (log_pages) + { + XLogRecPtr recptr; + recptr = log_newpages_copy(&InfoFromSMgrRel(reln), forknum, blocknum, + nblocks, (Page *) buffers, false); + + for (int i = 0; i < nblocks; i++) + PageSetLSN(unconstify(char *, buffers[i]), recptr); + + ereport(SmgrTrace, + (errmsg(NEON_TAG "Page %u through %u of relation %u/%u/%u.%u " + "were force logged, lsn=%X/%X", + blocknum, blocknum + nblocks, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum, LSN_FORMAT_ARGS(recptr)))); + } + + for (int i = 0; i < nblocks; i++) + { + Page page = (Page) buffers[i]; + BlockNumber blkno = blocknum + i; + XLogRecPtr lsn = PageGetLSN(page); + + if (lsn == InvalidXLogRecPtr) + { + /* + * When PostgreSQL extends a relation, it calls smgrextend() with an + * all-zeros pages, and we can just ignore that in Neon. We do need to + * remember the new size, though, so that smgrnblocks() returns the + * right answer after the rel has been extended. We rely on the + * relsize cache for that. + * + * A completely empty heap page doesn't need to be WAL-logged, either. + * The heapam can leave such a page behind, if e.g. an insert errors + * out after initializing the page, but before it has inserted the + * tuple and WAL-logged the change. When we read the page from the + * page server, it will come back as all-zeros. That's OK, the heapam + * will initialize an all-zeros page on first use. 
+ * + * In other scenarios, evicting a dirty page with no LSN is a bad + * sign: it implies that the page was not WAL-logged, and its contents + * will be lost when it's evicted. + */ + if (PageIsNew(page)) + { + ereport(SmgrTrace, + (errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is all-zeros", + blkno, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum))); + } + else if (PageIsEmptyHeapPage(page)) + { + ereport(SmgrTrace, + (errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is an empty heap page with no LSN", + blkno, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum))); + } + else if (forknum != FSM_FORKNUM && forknum != VISIBILITYMAP_FORKNUM) + { + /* + * Its a bad sign if there is a page with zero LSN in the buffer + * cache in a standby, too. However, PANICing seems like a cure + * worse than the disease, as the damage has likely already been + * done in the primary. So in a standby, make this an assertion, + * and in a release build just LOG the error and soldier on. We + * update the last-written LSN of the page with a conservative + * value in that case, which is the last replayed LSN. + */ + ereport(RecoveryInProgress() ? LOG : PANIC, + (errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is evicted with zero LSN", + blkno, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum))); + Assert(false); + + lsn = GetXLogReplayRecPtr(NULL); /* in standby mode, soldier on */ + } + } + else + { + ereport(SmgrTrace, + (errmsg(NEON_TAG "Evicting page %u of relation %u/%u/%u.%u with lsn=%X/%X", + blkno, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum, LSN_FORMAT_ARGS(lsn)))); + } + + /* + * Remember the LSN on this page. When we read the page again, we must + * read the same or newer version of it. 
+ */ + lsns[batch_size++] = lsn; + + if (batch_size >= BLOCK_BATCH_SIZE) + { + SetLastWrittenLSNForBlockv(lsns, InfoFromSMgrRel(reln), forknum, + batch_blockno, + batch_size); + batch_blockno += batch_size; + batch_size = 0; + } + } + + if (batch_size != 0) + { + SetLastWrittenLSNForBlockv(lsns, InfoFromSMgrRel(reln), forknum, + batch_blockno, + batch_size); + } +} +#endif + /* * A page is being evicted from the shared buffer cache. Update the * last-written LSN of the page, and WAL-log it if needed. */ -static void #if PG_MAJORVERSION_NUM < 16 +static void neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force) #else +static void neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const char *buffer, bool force) #endif { @@ -1548,18 +1821,39 @@ nm_adjust_lsn(XLogRecPtr lsn) return lsn; } + +/* + * Since PG17 we use vetorized version, + * so add compatibility function for older versions + */ +#if PG_MAJORVERSION_NUM < 17 +static void +GetLastWrittenLSNv(NRelFileInfo relfilenode, ForkNumber forknum, + BlockNumber blkno, int nblocks, XLogRecPtr *lsns) +{ + lsns[0] = GetLastWrittenLSN(relfilenode, forknum, blkno); +} +#endif + /* * Return LSN for requesting pages and number of blocks from page server */ -static neon_request_lsns -neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno) +static void +neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, + neon_request_lsns *output, BlockNumber nblocks, + const bits8 *mask) { - XLogRecPtr last_written_lsn; - neon_request_lsns result; + XLogRecPtr last_written_lsns[PG_IOV_MAX]; + + Assert(nblocks <= PG_IOV_MAX); + + GetLastWrittenLSNv(rinfo, forknum, blkno, (int) nblocks, last_written_lsns); - last_written_lsn = GetLastWrittenLSN(rinfo, forknum, blkno); - last_written_lsn = nm_adjust_lsn(last_written_lsn); - Assert(last_written_lsn != InvalidXLogRecPtr); + for (int i = 0; i < nblocks; i++) + { + 
last_written_lsns[i] = nm_adjust_lsn(last_written_lsns[i]); + Assert(last_written_lsns[i] != InvalidXLogRecPtr); + } if (RecoveryInProgress()) { @@ -1630,95 +1924,111 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno) /* Request the page at the end of the last fully replayed LSN. */ XLogRecPtr replay_lsn = GetXLogReplayRecPtr(NULL); - if (last_written_lsn > replay_lsn) + for (int i = 0; i < nblocks; i++) { - /* GetCurrentReplayRecPtr was introduced in v15 */ + neon_request_lsns *result = &output[i]; + XLogRecPtr last_written_lsn = last_written_lsns[i]; + + if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i)) + continue; + + if (last_written_lsn > replay_lsn) + { + /* GetCurrentReplayRecPtr was introduced in v15 */ #if PG_VERSION_NUM >= 150000 - Assert(last_written_lsn == GetCurrentReplayRecPtr(NULL)); + Assert(last_written_lsn == GetCurrentReplayRecPtr(NULL)); #endif - /* - * Cases 2 and 4. If this is a backend (case 4), the - * neon_read_at_lsn() call later will wait for the WAL record to be - * fully replayed. - */ - result.request_lsn = last_written_lsn; - } - else - { - /* cases 1 and 3 */ - result.request_lsn = replay_lsn; - } - result.not_modified_since = last_written_lsn; - result.effective_request_lsn = result.request_lsn; - Assert(last_written_lsn <= result.request_lsn); + /* + * Cases 2 and 4. If this is a backend (case 4), the + * neon_read_at_lsn() call later will wait for the WAL record to be + * fully replayed. 
+ */ + result->request_lsn = last_written_lsn; + } + else + { + /* cases 1 and 3 */ + result->request_lsn = replay_lsn; + } + + result->not_modified_since = last_written_lsn; + result->effective_request_lsn = result->request_lsn; + Assert(last_written_lsn <= result->request_lsn); - neon_log(DEBUG1, "neon_get_request_lsns request lsn %X/%X, not_modified_since %X/%X", - LSN_FORMAT_ARGS(result.request_lsn), LSN_FORMAT_ARGS(result.not_modified_since)); + neon_log(DEBUG1, "neon_get_request_lsns request lsn %X/%X, not_modified_since %X/%X", + LSN_FORMAT_ARGS(result->request_lsn), LSN_FORMAT_ARGS(result->not_modified_since)); + } } else { XLogRecPtr flushlsn; - - /* - * Use the latest LSN that was evicted from the buffer cache as the - * 'not_modified_since' hint. Any pages modified by later WAL records - * must still in the buffer cache, so our request cannot concern - * those. - */ - neon_log(DEBUG1, "neon_get_request_lsns GetLastWrittenLSN lsn %X/%X", - LSN_FORMAT_ARGS(last_written_lsn)); - - /* - * Is it possible that the last-written LSN is ahead of last flush - * LSN? Generally not, we shouldn't evict a page from the buffer cache - * before all its modifications have been safely flushed. That's the - * "WAL before data" rule. However, such case does exist at index - * building, _bt_blwritepage logs the full page without flushing WAL - * before smgrextend (files are fsynced before build ends). - */ #if PG_VERSION_NUM >= 150000 flushlsn = GetFlushRecPtr(NULL); #else flushlsn = GetFlushRecPtr(); #endif - if (last_written_lsn > flushlsn) + + for (int i = 0; i < nblocks; i++) { - neon_log(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X", - LSN_FORMAT_ARGS(last_written_lsn), - LSN_FORMAT_ARGS(flushlsn)); - XLogFlush(last_written_lsn); - flushlsn = last_written_lsn; - } + neon_request_lsns *result = &output[i]; + XLogRecPtr last_written_lsn = last_written_lsns[i]; - /* - * Request the very latest version of the page. 
In principle we - * want to read the page at the current insert LSN, and we could - * use that value in the request. However, there's a corner case - * with pageserver's garbage collection. If the GC horizon is - * set to a very small value, it's possible that by the time - * that the pageserver processes our request, the GC horizon has - * already moved past the LSN we calculate here. Standby servers - * always have that problem as the can always lag behind the - * primary, but for the primary we can avoid it by always - * requesting the latest page, by setting request LSN to - * UINT64_MAX. - * - * Remember the current LSN, however, so that we can later - * correctly determine if the response to the request is still - * valid. The most up-to-date LSN we could use for that purpose - * would be the current insert LSN, but to avoid the overhead of - * looking it up, use 'flushlsn' instead. This relies on the - * assumption that if the page was modified since the last WAL - * flush, it should still be in the buffer cache, and we - * wouldn't be requesting it. - */ - result.request_lsn = UINT64_MAX; - result.not_modified_since = last_written_lsn; - result.effective_request_lsn = flushlsn; - } + if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i)) + continue; + /* + * Use the latest LSN that was evicted from the buffer cache as the + * 'not_modified_since' hint. Any pages modified by later WAL records + * must still in the buffer cache, so our request cannot concern + * those. + */ + neon_log(DEBUG1, "neon_get_request_lsns GetLastWrittenLSN lsn %X/%X", + LSN_FORMAT_ARGS(last_written_lsn)); + + /* + * Is it possible that the last-written LSN is ahead of last flush + * LSN? Generally not, we shouldn't evict a page from the buffer cache + * before all its modifications have been safely flushed. That's the + * "WAL before data" rule. 
However, such case does exist at index + * building, _bt_blwritepage logs the full page without flushing WAL + * before smgrextend (files are fsynced before build ends). + */ + if (last_written_lsn > flushlsn) + { + neon_log(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X", + LSN_FORMAT_ARGS(last_written_lsn), + LSN_FORMAT_ARGS(flushlsn)); + XLogFlush(last_written_lsn); + flushlsn = last_written_lsn; + } - return result; + /* + * Request the very latest version of the page. In principle we + * want to read the page at the current insert LSN, and we could + * use that value in the request. However, there's a corner case + * with pageserver's garbage collection. If the GC horizon is + * set to a very small value, it's possible that by the time + * that the pageserver processes our request, the GC horizon has + * already moved past the LSN we calculate here. Standby servers + * always have that problem as the can always lag behind the + * primary, but for the primary we can avoid it by always + * requesting the latest page, by setting request LSN to + * UINT64_MAX. + * + * Remember the current LSN, however, so that we can later + * correctly determine if the response to the request is still + * valid. The most up-to-date LSN we could use for that purpose + * would be the current insert LSN, but to avoid the overhead of + * looking it up, use 'flushlsn' instead. This relies on the + * assumption that if the page was modified since the last WAL + * flush, it should still be in the buffer cache, and we + * wouldn't be requesting it. + */ + result->request_lsn = UINT64_MAX; + result->not_modified_since = last_written_lsn; + result->effective_request_lsn = flushlsn; + } + } } /* @@ -1728,13 +2038,13 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno) * satisfy a page read now. 
*/ static bool -neon_prefetch_response_usable(neon_request_lsns request_lsns, +neon_prefetch_response_usable(neon_request_lsns *request_lsns, PrefetchRequest *slot) { /* sanity check the LSN's on the old and the new request */ - Assert(request_lsns.request_lsn >= request_lsns.not_modified_since); - Assert(request_lsns.effective_request_lsn >= request_lsns.not_modified_since); - Assert(request_lsns.effective_request_lsn <= request_lsns.request_lsn); + Assert(request_lsns->request_lsn >= request_lsns->not_modified_since); + Assert(request_lsns->effective_request_lsn >= request_lsns->not_modified_since); + Assert(request_lsns->effective_request_lsn <= request_lsns->request_lsn); Assert(slot->request_lsns.request_lsn >= slot->request_lsns.not_modified_since); Assert(slot->request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since); Assert(slot->request_lsns.effective_request_lsn <= slot->request_lsns.request_lsn); @@ -1755,15 +2065,15 @@ neon_prefetch_response_usable(neon_request_lsns request_lsns, * calculate LSNs "out of order" with each other, but the prefetch queue * is backend-private at the moment.) 
*/ - if (request_lsns.effective_request_lsn < slot->request_lsns.effective_request_lsn || - request_lsns.not_modified_since < slot->request_lsns.not_modified_since) + if (request_lsns->effective_request_lsn < slot->request_lsns.effective_request_lsn || + request_lsns->not_modified_since < slot->request_lsns.not_modified_since) { ereport(LOG, (errcode(ERRCODE_IO_ERROR), errmsg(NEON_TAG "request with unexpected LSN after prefetch"), errdetail("Request %X/%X not_modified_since %X/%X, prefetch %X/%X not_modified_since %X/%X)", - LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), - LSN_FORMAT_ARGS(request_lsns.not_modified_since), + LSN_FORMAT_ARGS(request_lsns->effective_request_lsn), + LSN_FORMAT_ARGS(request_lsns->not_modified_since), LSN_FORMAT_ARGS(slot->request_lsns.effective_request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since)))); return false; @@ -1817,9 +2127,9 @@ neon_prefetch_response_usable(neon_request_lsns request_lsns, */ /* this follows from the checks above */ - Assert(request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since); + Assert(request_lsns->effective_request_lsn >= slot->request_lsns.not_modified_since); - return request_lsns.not_modified_since <= slot->request_lsns.effective_request_lsn; + return request_lsns->not_modified_since <= slot->request_lsns.effective_request_lsn; } /* @@ -1886,7 +2196,8 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) return false; } - request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO); + neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, + REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1, NULL); { NeonExistsRequest request = { .req.tag = T_NeonExistsRequest, @@ -2068,7 +2379,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, */ if (max_cluster_size > 0 && reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT && - !IsAutoVacuumWorkerProcess()) + !AmAutoVacuumWorkerProcess()) { uint64 current_size = 
GetNeonCurrentClusterSize(); @@ -2149,7 +2460,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum, if (max_cluster_size > 0 && reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT && - !IsAutoVacuumWorkerProcess()) + !AmAutoVacuumWorkerProcess()) { uint64 current_size = GetNeonCurrentClusterSize(); @@ -2247,14 +2558,17 @@ neon_close(SMgrRelation reln, ForkNumber forknum) } +#if PG_MAJORVERSION_NUM >= 17 /* * neon_prefetch() -- Initiate asynchronous read of the specified block of a relation */ bool -neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) +neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + int nblocks) { uint64 ring_index PG_USED_FOR_ASSERTS_ONLY; BufferTag tag; + bool io_initiated = false; switch (reln->smgr_relpersistence) { @@ -2264,37 +2578,103 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: - return mdprefetch(reln, forknum, blocknum); + return mdprefetch(reln, forknum, blocknum, nblocks); default: neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - if (lfc_cache_contains(InfoFromSMgrRel(reln), forknum, blocknum)) - return false; - + tag.spcOid = reln->smgr_rlocator.locator.spcOid; + tag.dbOid = reln->smgr_rlocator.locator.dbOid; + tag.relNumber = reln->smgr_rlocator.locator.relNumber; tag.forkNum = forknum; - tag.blockNum = blocknum; - CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln)); + while (nblocks > 0) + { + int iterblocks = Min(nblocks, PG_IOV_MAX); + int seqlen = 0; + bits8 lfc_present[PG_IOV_MAX / 8]; + memset(lfc_present, 0, sizeof(lfc_present)); - ring_index = prefetch_register_buffer(tag, NULL); + if (lfc_cache_containsv(InfoFromSMgrRel(reln), forknum, blocknum, + iterblocks, lfc_present) == iterblocks) + { + nblocks -= iterblocks; + blocknum += iterblocks; + continue; + } - Assert(ring_index < MyPState->ring_unused && - MyPState->ring_last <= 
ring_index); + io_initiated = true; + + tag.blockNum = blocknum; + + for (int i = 0; i < PG_IOV_MAX / 8; i++) + lfc_present[i] = ~(lfc_present[i]); + + ring_index = prefetch_register_bufferv(tag, NULL, iterblocks, + lfc_present); + nblocks -= iterblocks; + blocknum += iterblocks; + + Assert(ring_index < MyPState->ring_unused && + MyPState->ring_last <= ring_index); + } return false; } + +#else /* PG_MAJORVERSION_NUM >= 17 */ /* - * neon_writeback() -- Tell the kernel to write pages back to storage. - * - * This accepts a range of blocks because flushing several pages at once is - * considerably more efficient than doing so individually. + * neon_prefetch() -- Initiate asynchronous read of the specified block of a relation */ -void -neon_writeback(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, BlockNumber nblocks) +bool +neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) +{ + uint64 ring_index PG_USED_FOR_ASSERTS_ONLY; + BufferTag tag; + + switch (reln->smgr_relpersistence) + { + case 0: /* probably shouldn't happen, but ignore it */ + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + return mdprefetch(reln, forknum, blocknum); + + default: + neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + if (lfc_cache_contains(InfoFromSMgrRel(reln), forknum, blocknum)) + return false; + + tag.forkNum = forknum; + tag.blockNum = blocknum; + + CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln)); + + ring_index = prefetch_register_buffer(tag, NULL); + + Assert(ring_index < MyPState->ring_unused && + MyPState->ring_last <= ring_index); + + return false; +} +#endif /* PG_MAJORVERSION_NUM < 17 */ + + +/* + * neon_writeback() -- Tell the kernel to write pages back to storage. + * + * This accepts a range of blocks because flushing several pages at once is + * considerably more efficient than doing so individually. 
+ */ +void +neon_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks) { switch (reln->smgr_relpersistence) { @@ -2315,7 +2695,12 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum, neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - /* not implemented */ + /* + * TODO: WAL sync up to lwLsn for the indicated blocks + * Without that sync, writeback doesn't actually guarantee the data is + * persistently written, which does seem to be one of the assumed + * properties of this smgr API call. + */ neon_log(SmgrTrace, "writeback noop"); #ifdef DEBUG_COMPARE_LOCAL @@ -2324,30 +2709,27 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum, #endif } -/* - * While function is defined in the neon extension it's used within neon_test_utils directly. - * To avoid breaking tests in the runtime please keep function signature in sync. - */ -void +static void #if PG_MAJORVERSION_NUM < 16 -neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - neon_request_lsns request_lsns, char *buffer) +neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_blockno, neon_request_lsns *request_lsns, + char **buffers, BlockNumber nblocks, const bits8 *mask) #else -neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - neon_request_lsns request_lsns, void *buffer) +neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_blockno, neon_request_lsns *request_lsns, + void **buffers, BlockNumber nblocks, const bits8 *mask) #endif { NeonResponse *resp; uint64 ring_index; PrfHashEntry *entry; PrefetchRequest *slot; - BufferTag buftag = - { - .forkNum = forkNum, - .blockNum = blkno, - }; + BufferTag buftag = {0}; + + Assert(PointerIsValid(request_lsns)); + Assert(nblocks >= 1); CopyNRelFileInfoToBufTag(buftag, rinfo); + buftag.forkNum = forkNum; + buftag.blockNum = base_blockno; /* * The redo process does not lock pages that it needs to replay but 
are @@ -2365,115 +2747,147 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, * weren't for the behaviour of the LwLsn cache that uses the highest * value of the LwLsn cache when the entry is not found. */ - if (RecoveryInProgress() && !(MyBackendType == B_STARTUP)) - XLogWaitForReplayOf(request_lsns.request_lsn); + prefetch_register_bufferv(buftag, request_lsns, nblocks, mask); - /* - * Try to find prefetched page in the list of received pages. - */ + for (int i = 0; i < nblocks; i++) + { + void *buffer = buffers[i]; + BlockNumber blockno = base_blockno + i; + neon_request_lsns *reqlsns = &request_lsns[i]; + + if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i)) + continue; + + if (RecoveryInProgress() && MyBackendType != B_STARTUP) + XLogWaitForReplayOf(reqlsns[0].request_lsn); + + /* + * Try to find prefetched page in the list of received pages. + */ Retry: - entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag); + buftag.blockNum = blockno; + entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag); - if (entry != NULL) - { - slot = entry->slot; - if (neon_prefetch_response_usable(request_lsns, slot)) - { - ring_index = slot->my_ring_index; - pgBufferUsage.prefetch.hits += 1; - } - else + if (entry != NULL) { - /* - * Cannot use this prefetch, discard it - * - * We can't drop cache for not-yet-received requested items. It is - * unlikely this happens, but it can happen if prefetch distance - * is large enough and a backend didn't consume all prefetch - * requests. - */ - if (slot->status == PRFS_REQUESTED) + slot = entry->slot; + if (neon_prefetch_response_usable(reqlsns, slot)) { - if (!prefetch_wait_for(slot->my_ring_index)) - goto Retry; + ring_index = slot->my_ring_index; + pgBufferUsage.prefetch.hits += 1; + } + else + { + /* + * Cannot use this prefetch, discard it + * + * We can't drop cache for not-yet-received requested items. 
It is + * unlikely this happens, but it can happen if prefetch distance + * is large enough and a backend didn't consume all prefetch + * requests. + */ + if (slot->status == PRFS_REQUESTED) + { + if (!prefetch_wait_for(slot->my_ring_index)) + goto Retry; + } + /* drop caches */ + prefetch_set_unused(slot->my_ring_index); + pgBufferUsage.prefetch.expired += 1; + /* make it look like a prefetch cache miss */ + entry = NULL; } - /* drop caches */ - prefetch_set_unused(slot->my_ring_index); - pgBufferUsage.prefetch.expired += 1; - /* make it look like a prefetch cache miss */ - entry = NULL; } - } - do - { - if (entry == NULL) + do { - pgBufferUsage.prefetch.misses += 1; + if (entry == NULL) + { + pgBufferUsage.prefetch.misses += 1; - ring_index = prefetch_register_buffer(buftag, &request_lsns); - slot = GetPrfSlot(ring_index); - } - else - { - /* - * Empty our reference to the prefetch buffer's hash entry. When - * we wait for prefetches, the entry reference is invalidated by - * potential updates to the hash, and when we reconnect to the - * pageserver the prefetch we're waiting for may be dropped, in - * which case we need to retry and take the branch above. - */ - entry = NULL; - } + ring_index = prefetch_register_bufferv(buftag, reqlsns, 1, NULL); + Assert(ring_index != UINT64_MAX); + slot = GetPrfSlot(ring_index); + } + else + { + /* + * Empty our reference to the prefetch buffer's hash entry. When + * we wait for prefetches, the entry reference is invalidated by + * potential updates to the hash, and when we reconnect to the + * pageserver the prefetch we're waiting for may be dropped, in + * which case we need to retry and take the branch above. 
+ */ + entry = NULL; + } - Assert(slot->my_ring_index == ring_index); - Assert(MyPState->ring_last <= ring_index && - MyPState->ring_unused > ring_index); - Assert(slot->status != PRFS_UNUSED); - Assert(GetPrfSlot(ring_index) == slot); + Assert(slot->my_ring_index == ring_index); + Assert(MyPState->ring_last <= ring_index && + MyPState->ring_unused > ring_index); + Assert(slot->status != PRFS_UNUSED); + Assert(GetPrfSlot(ring_index) == slot); - } while (!prefetch_wait_for(ring_index)); + } while (!prefetch_wait_for(ring_index)); - Assert(slot->status == PRFS_RECEIVED); + Assert(slot->status == PRFS_RECEIVED); + Assert(memcmp(&buftag, &slot->buftag, sizeof(BufferTag)) == 0); + Assert(buftag.blockNum == base_blockno + i); - resp = slot->response; + resp = slot->response; - switch (resp->tag) - { - case T_NeonGetPageResponse: - memcpy(buffer, ((NeonGetPageResponse *) resp)->page, BLCKSZ); - lfc_write(rinfo, forkNum, blkno, buffer); - break; + switch (resp->tag) + { + case T_NeonGetPageResponse: + memcpy(buffer, ((NeonGetPageResponse *) resp)->page, BLCKSZ); + lfc_write(rinfo, forkNum, blockno, buffer); + break; - case T_NeonErrorResponse: - ereport(ERROR, - (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "[shard %d] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", - slot->shard_no, blkno, - RelFileInfoFmt(rinfo), - forkNum, - LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), - errdetail("page server returned error: %s", - ((NeonErrorResponse *) resp)->message))); - break; - default: - NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC, - "Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x", - T_NeonGetPageResponse, T_NeonErrorResponse, resp->tag); + case T_NeonErrorResponse: + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg(NEON_TAG "[shard %d] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", + slot->shard_no, blockno, RelFileInfoFmt(rinfo), + forkNum, 
LSN_FORMAT_ARGS(reqlsns->effective_request_lsn)), + errdetail("page server returned error: %s", + ((NeonErrorResponse *) resp)->message))); + break; + default: + NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC, + "Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x", + T_NeonGetPageResponse, T_NeonErrorResponse, resp->tag); + } + + /* buffer was used, clean up for later reuse */ + prefetch_set_unused(ring_index); + prefetch_cleanup_trailing_unused(); } +} - /* buffer was used, clean up for later reuse */ - prefetch_set_unused(ring_index); - prefetch_cleanup_trailing_unused(); +/* + * While function is defined in the neon extension it's used within neon_test_utils directly. + * To avoid breaking tests in the runtime please keep function signature in sync. + */ +void +#if PG_MAJORVERSION_NUM < 16 +neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + neon_request_lsns request_lsns, char *buffer) +#else +neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + neon_request_lsns request_lsns, void *buffer) +#endif +{ + neon_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL); } +#if PG_MAJORVERSION_NUM < 17 /* * neon_read() -- Read the specified block from a relation. 
*/ -void #if PG_MAJORVERSION_NUM < 16 +void neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer) #else +void neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer) #endif { @@ -2502,7 +2916,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer return; } - request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno); + neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, NULL); neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer); #ifdef DEBUG_COMPARE_LOCAL @@ -2578,6 +2992,148 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer } #endif } +#endif /* PG_MAJORVERSION_NUM <= 16 */ + +#if PG_MAJORVERSION_NUM >= 17 +void +neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + void **buffers, BlockNumber nblocks) +{ + bits8 read[PG_IOV_MAX / 8]; + neon_request_lsns request_lsns[PG_IOV_MAX]; + int lfc_result; + + switch (reln->smgr_relpersistence) + { + case 0: + neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence"); + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdreadv(reln, forknum, blocknum, buffers, nblocks); + return; + + default: + neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + if (nblocks > PG_IOV_MAX) + neon_log(ERROR, "Read request too large: %d is larger than max %d", + nblocks, PG_IOV_MAX); + + memset(read, 0, sizeof(read)); + + /* Try to read from local file cache */ + lfc_result = lfc_readv_select(InfoFromSMgrRel(reln), forknum, blocknum, buffers, + nblocks, read); + + /* Read all blocks from LFC, so we're done */ + if (lfc_result == nblocks) + return; + + if (lfc_result == -1) + { + /* can't use the LFC result, so read all blocks from PS */ + for (int i = 0; i < PG_IOV_MAX / 8; i++) + read[i] = 0xFF; + } + else + { + /* invert the result: 
exclude blocks read from lfc */ + for (int i = 0; i < PG_IOV_MAX / 8; i++) + read[i] = ~(read[i]); + } + + neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum, + request_lsns, nblocks, read); + + neon_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns, + buffers, nblocks, read); + +#ifdef DEBUG_COMPARE_LOCAL + if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) + { + char pageserver_masked[BLCKSZ]; + char mdbuf[BLCKSZ]; + char mdbuf_masked[BLCKSZ]; + + for (int i = 0; i < nblocks; i++) + { +#if PG_MAJORVERSION_NUM >= 17 + mdreadv(reln, forkNum, blkno + i, &mdbuf, 1); +#else + mdread(reln, forkNum, blkno + i, mdbuf); +#endif + + memcpy(pageserver_masked, buffer, BLCKSZ); + memcpy(mdbuf_masked, mdbuf, BLCKSZ); + + if (PageIsNew((Page) mdbuf)) + { + if (!PageIsNew((Page) pageserver_masked)) + { + neon_log(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", + blkno, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(buffer)); + } + } + else if (PageIsNew((Page) buffer)) + { + neon_log(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", + blkno, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(mdbuf)); + } + else if (PageGetSpecialSize(mdbuf) == 0) + { + /* assume heap */ + RmgrTable[RM_HEAP_ID].rm_mask(mdbuf_masked, blkno); + RmgrTable[RM_HEAP_ID].rm_mask(pageserver_masked, blkno); + + if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) + { + neon_log(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", + blkno, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(mdbuf_masked), + hexdump_page(pageserver_masked)); + } + } 
+ else if (PageGetSpecialSize(mdbuf) == MAXALIGN(sizeof(BTPageOpaqueData))) + { + if (((BTPageOpaqueData *) PageGetSpecialPointer(mdbuf))->btpo_cycleid < MAX_BT_CYCLE_ID) + { + /* assume btree */ + RmgrTable[RM_BTREE_ID].rm_mask(mdbuf_masked, blkno); + RmgrTable[RM_BTREE_ID].rm_mask(pageserver_masked, blkno); + + if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) + { + neon_log(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", + blkno, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(mdbuf_masked), + hexdump_page(pageserver_masked)); + } + } + } + } + } +#endif +} +#endif #ifdef DEBUG_COMPARE_LOCAL static char * @@ -2623,8 +3179,11 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo if (mdexists(reln, forknum)) { /* It exists locally. Guess it's unlogged then. */ +#if PG_MAJORVERSION_NUM >= 17 + mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync); +#else mdwrite(reln, forknum, blocknum, buffer, skipFsync); - +#endif /* * We could set relpersistence now that we have determined * that it's local. 
But we don't dare to do it, because that @@ -2641,9 +3200,12 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: + #if PG_MAJORVERSION_NUM >= 17 + mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync); + #else mdwrite(reln, forknum, blocknum, buffer, skipFsync); + #endif return; - default: neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } @@ -2660,10 +3222,64 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) + #if PG_MAJORVERSION_NUM >= 17 + mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync); + #else mdwrite(reln, forknum, blocknum, buffer, skipFsync); + #endif #endif } + + +#if PG_MAJORVERSION_NUM >= 17 +void +neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, + const void **buffers, BlockNumber nblocks, bool skipFsync) +{ + switch (reln->smgr_relpersistence) + { + case 0: + /* This is a bit tricky. Check if the relation exists locally */ + if (mdexists(reln, forknum)) + { + /* It exists locally. Guess it's unlogged then. */ + mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync); + + /* + * We could set relpersistence now that we have determined + * that it's local. But we don't dare to do it, because that + * would immediately allow reads as well, which shouldn't + * happen. We could cache it with a different 'relpersistence' + * value, but this isn't performance critical. 
+ */ + return; + } + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync); + return; + default: + neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + neon_wallog_pagev(reln, forknum, blkno, nblocks, (const char **) buffers, false); + + lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks); + +#ifdef DEBUG_COMPARE_LOCAL + if (IS_LOCAL_REL(reln)) + mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync); +#endif +} + +#endif + /* * neon_nblocks() -- Get the number of blocks stored in a relation. */ @@ -2699,7 +3315,9 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) return n_blocks; } - request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO); + neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, + REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1, NULL); + { NeonNblocksRequest request = { .req.tag = T_NeonNblocksRequest, @@ -2757,7 +3375,9 @@ neon_dbsize(Oid dbNode) neon_request_lsns request_lsns; NRelFileInfo dummy_node = {0}; - request_lsns = neon_get_request_lsns(dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO); + neon_get_request_lsns(dummy_node, MAIN_FORKNUM, + REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1, NULL); + { NeonDbSizeRequest request = { .req.tag = T_NeonDbSizeRequest, @@ -2898,6 +3518,38 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum) #endif } +#if PG_MAJORVERSION_NUM >= 17 +void +neon_regisersync(SMgrRelation reln, ForkNumber forknum) +{ + switch (reln->smgr_relpersistence) + { + case 0: + neon_log(ERROR, "cannot call smgrregistersync() on rel with unknown persistence"); + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdregistersync(reln, forknum); + return; + + default: + neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + 
neon_log(SmgrTrace, "[NEON_SMGR] registersync noop"); + +#ifdef DEBUG_COMPARE_LOCAL + if (IS_LOCAL_REL(reln)) + mdimmedsync(reln, forknum); +#endif +} +#endif + + /* * neon_start_unlogged_build() -- Starting build operation on a rel. * @@ -3047,8 +3699,11 @@ neon_end_unlogged_build(SMgrRelation reln) static int neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buffer) { - XLogRecPtr request_lsn, - not_modified_since; + XLogRecPtr request_lsn, + not_modified_since; + SlruKind kind; + int n_blocks; + shardno_t shard_no = 0; /* All SLRUs are at shard 0 */ /* * Compute a request LSN to use, similar to neon_get_request_lsns() but the @@ -3078,32 +3733,30 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf */ not_modified_since = nm_adjust_lsn(GetRedoStartLsn()); - SlruKind kind; - - if (STRPREFIX(path, "pg_xact")) - kind = SLRU_CLOG; - else if (STRPREFIX(path, "pg_multixact/members")) - kind = SLRU_MULTIXACT_MEMBERS; - else if (STRPREFIX(path, "pg_multixact/offsets")) - kind = SLRU_MULTIXACT_OFFSETS; - else - return -1; + if (STRPREFIX(path, "pg_xact")) + kind = SLRU_CLOG; + else if (STRPREFIX(path, "pg_multixact/members")) + kind = SLRU_MULTIXACT_MEMBERS; + else if (STRPREFIX(path, "pg_multixact/offsets")) + kind = SLRU_MULTIXACT_OFFSETS; + else + return -1; NeonResponse *resp; NeonGetSlruSegmentRequest request = { .req.tag = T_NeonGetSlruSegmentRequest, .req.lsn = request_lsn, .req.not_modified_since = not_modified_since, - .kind = kind, .segno = segno }; - int n_blocks; - shardno_t shard_no = 0; /* All SLRUs are at shard 0 */ + do { while (!page_server->send(shard_no, &request.req) || !page_server->flush(shard_no)); + consume_prefetch_responses(); + resp = page_server->receive(shard_no); } while (resp == NULL); @@ -3182,14 +3835,23 @@ static const struct f_smgr neon_smgr = #if PG_MAJORVERSION_NUM >= 16 .smgr_zeroextend = neon_zeroextend, #endif +#if PG_MAJORVERSION_NUM >= 17 + .smgr_prefetch = neon_prefetch, 
+ .smgr_readv = neon_readv, + .smgr_writev = neon_writev, +#else .smgr_prefetch = neon_prefetch, .smgr_read = neon_read, .smgr_write = neon_write, +#endif + .smgr_writeback = neon_writeback, .smgr_nblocks = neon_nblocks, .smgr_truncate = neon_truncate, .smgr_immedsync = neon_immedsync, - +#if PG_MAJORVERSION_NUM >= 17 + .smgr_registersync = neon_regisersync, +#endif .smgr_start_unlogged_build = neon_start_unlogged_build, .smgr_finish_unlogged_build_phase_1 = neon_finish_unlogged_build_phase_1, .smgr_end_unlogged_build = neon_end_unlogged_build, @@ -3198,11 +3860,11 @@ static const struct f_smgr neon_smgr = }; const f_smgr * -smgr_neon(BackendId backend, NRelFileInfo rinfo) +smgr_neon(ProcNumber backend, NRelFileInfo rinfo) { /* Don't use page server for temp relations */ - if (backend != InvalidBackendId) + if (backend != INVALID_PROC_NUMBER) return smgr_standard(backend, rinfo); else return &neon_smgr; diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 65ef588ba5a3..4d0d06e6de28 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -81,6 +81,7 @@ static void nwp_register_gucs(void); static void assign_neon_safekeepers(const char *newval, void *extra); static void nwp_prepare_shmem(void); static uint64 backpressure_lag_impl(void); +static uint64 startup_backpressure_wrap(void); static bool backpressure_throttling_impl(void); static void walprop_register_bgworker(void); @@ -90,7 +91,7 @@ static void walprop_pg_init_bgworker(void); static TimestampTz walprop_pg_get_current_timestamp(WalProposer *wp); static void walprop_pg_load_libpqwalreceiver(void); -static process_interrupts_callback_t PrevProcessInterruptsCallback; +static process_interrupts_callback_t PrevProcessInterruptsCallback = NULL; static shmem_startup_hook_type prev_shmem_startup_hook_type; #if PG_VERSION_NUM >= 150000 static shmem_request_hook_type prev_shmem_request_hook = NULL; @@ -178,7 +179,7 @@ pg_init_walproposer(void) nwp_prepare_shmem(); - 
delay_backend_us = &backpressure_lag_impl; + delay_backend_us = &startup_backpressure_wrap; PrevProcessInterruptsCallback = ProcessInterruptsCallback; ProcessInterruptsCallback = backpressure_throttling_impl; @@ -352,6 +353,22 @@ backpressure_lag_impl(void) return 0; } +/* + * We don't apply backpressure when we're the postmaster, or the startup + * process, because in postmaster we can't apply backpressure, and in + * the startup process we can't afford to slow down. + */ +static uint64 +startup_backpressure_wrap(void) +{ + if (AmStartupProcess() || !IsUnderPostmaster) + return 0; + + delay_backend_us = &backpressure_lag_impl; + + return backpressure_lag_impl(); +} + /* * WalproposerShmemSize --- report amount of shared memory space needed */ @@ -401,12 +418,13 @@ WalproposerShmemInit_SyncSafekeeper(void) static bool backpressure_throttling_impl(void) { - int64 lag; + uint64 lag; TimestampTz start, stop; - bool retry = PrevProcessInterruptsCallback - ? PrevProcessInterruptsCallback() - : false; + bool retry = false; + + if (PointerIsValid(PrevProcessInterruptsCallback)) + retry = PrevProcessInterruptsCallback(); /* * Don't throttle read only transactions or wal sender. 
Do throttle CREATE @@ -602,7 +620,12 @@ walprop_pg_init_walsender(void) /* Create replication slot for WAL proposer if not exists */ if (SearchNamedReplicationSlot(WAL_PROPOSER_SLOT_NAME, false) == NULL) { +#if PG_MAJORVERSION_NUM >= 17 + ReplicationSlotCreate(WAL_PROPOSER_SLOT_NAME, false, RS_PERSISTENT, + false, false, false); +#else ReplicationSlotCreate(WAL_PROPOSER_SLOT_NAME, false, RS_PERSISTENT, false); +#endif ReplicationSlotReserveWal(); /* Write this slot to disk */ ReplicationSlotMarkDirty(); @@ -1509,7 +1532,11 @@ walprop_pg_init_event_set(WalProposer *wp) wpg_log(FATAL, "double-initialization of event set"); /* for each sk, we have socket plus potentially socket for neon walreader */ +#if PG_MAJORVERSION_NUM >= 17 + waitEvents = CreateWaitEventSet(NULL, 2 + 2 * wp->n_safekeepers); +#else waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + 2 * wp->n_safekeepers); +#endif AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL); AddWaitEventToSet(waitEvents, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, diff --git a/pgxn/neon_rmgr/neon_rmgr_decode.c b/pgxn/neon_rmgr/neon_rmgr_decode.c index f327e132e94b..66032c88f62c 100644 --- a/pgxn/neon_rmgr/neon_rmgr_decode.c +++ b/pgxn/neon_rmgr/neon_rmgr_decode.c @@ -1,6 +1,7 @@ #include "postgres.h" #if PG_MAJORVERSION_NUM >= 16 + #include "access/heapam_xlog.h" #include "access/neon_xlog.h" #include "replication/decode.h" @@ -9,6 +10,10 @@ #include "neon_rmgr.h" +#endif /* PG >= 16 */ + +#if PG_MAJORVERSION_NUM == 16 + /* individual record(group)'s handlers */ static void DecodeNeonInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); static void DecodeNeonUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); @@ -399,6 +404,398 @@ DecodeXLogTuple(char *data, Size len, ReorderBufferTupleBuf *tuple) header->t_infomask2 = xlhdr.t_infomask2; header->t_hoff = xlhdr.t_hoff; } +#endif + +#if PG_MAJORVERSION_NUM == 17 + +/* individual record(group)'s handlers */ +static void 
DecodeNeonInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeNeonUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeNeonDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeNeonMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); + +/* common function to decode tuples */ +static void DecodeXLogTuple(char *data, Size len, HeapTuple tuple); + + +void +neon_rm_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + uint8 info = XLogRecGetInfo(buf->record) & XLOG_NEON_OPMASK; + TransactionId xid = XLogRecGetXid(buf->record); + SnapBuild *builder = ctx->snapshot_builder; + + ReorderBufferProcessXid(ctx->reorder, xid, buf->origptr); + + /* + * If we don't have snapshot or we are just fast-forwarding, there is no + * point in decoding data changes. + */ + if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT || + ctx->fast_forward) + return; + + switch (info) + { + case XLOG_NEON_HEAP_INSERT: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeNeonInsert(ctx, buf); + break; + case XLOG_NEON_HEAP_DELETE: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeNeonDelete(ctx, buf); + break; + case XLOG_NEON_HEAP_UPDATE: + case XLOG_NEON_HEAP_HOT_UPDATE: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeNeonUpdate(ctx, buf); + break; + case XLOG_NEON_HEAP_LOCK: + break; + case XLOG_NEON_HEAP_MULTI_INSERT: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeNeonMultiInsert(ctx, buf); + break; + default: + elog(ERROR, "unexpected RM_HEAP_ID record type: %u", info); + break; + } +} + +static inline bool +FilterByOrigin(LogicalDecodingContext *ctx, RepOriginId origin_id) +{ + if (ctx->callbacks.filter_by_origin_cb == NULL) + return false; + + return filter_by_origin_cb_wrapper(ctx, origin_id); +} + +/* + * Parse XLOG_HEAP_INSERT (not MULTI_INSERT!) records into tuplebufs. + * + * Deletes can contain the new tuple. 
+ */ +static void +DecodeNeonInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + Size datalen; + char *tupledata; + Size tuplelen; + XLogReaderState *r = buf->record; + xl_neon_heap_insert *xlrec; + ReorderBufferChange *change; + RelFileLocator target_locator; + + xlrec = (xl_neon_heap_insert *) XLogRecGetData(r); + + /* + * Ignore insert records without new tuples (this does happen when + * raw_heap_insert marks the TOAST record as HEAP_INSERT_NO_LOGICAL). + */ + if (!(xlrec->flags & XLH_INSERT_CONTAINS_NEW_TUPLE)) + return; + + /* only interested in our database */ + XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL); + if (target_locator.dbOid != ctx->slot->data.database) + return; + + /* output plugin doesn't look for this origin, no need to queue */ + if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) + return; + + change = ReorderBufferGetChange(ctx->reorder); + if (!(xlrec->flags & XLH_INSERT_IS_SPECULATIVE)) + change->action = REORDER_BUFFER_CHANGE_INSERT; + else + change->action = REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT; + change->origin_id = XLogRecGetOrigin(r); + + memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator)); + + tupledata = XLogRecGetBlockData(r, 0, &datalen); + tuplelen = datalen - SizeOfHeapHeader; + + change->data.tp.newtuple = + ReorderBufferGetTupleBuf(ctx->reorder, tuplelen); + + DecodeXLogTuple(tupledata, datalen, change->data.tp.newtuple); + + change->data.tp.clear_toast_afterwards = true; + + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, + change, + xlrec->flags & XLH_INSERT_ON_TOAST_RELATION); +} + +/* + * Parse XLOG_HEAP_DELETE from wal into proper tuplebufs. + * + * Deletes can possibly contain the old primary key. 
+ */ +static void +DecodeNeonDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + XLogReaderState *r = buf->record; + xl_neon_heap_delete *xlrec; + ReorderBufferChange *change; + RelFileLocator target_locator; + + xlrec = (xl_neon_heap_delete *) XLogRecGetData(r); + /* only interested in our database */ + XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL); + if (target_locator.dbOid != ctx->slot->data.database) + return; -#endif \ No newline at end of file + /* output plugin doesn't look for this origin, no need to queue */ + if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) + return; + + change = ReorderBufferGetChange(ctx->reorder); + + if (xlrec->flags & XLH_DELETE_IS_SUPER) + change->action = REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT; + else + change->action = REORDER_BUFFER_CHANGE_DELETE; + + change->origin_id = XLogRecGetOrigin(r); + + memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator)); + + /* old primary key stored */ + if (xlrec->flags & XLH_DELETE_CONTAINS_OLD) + { + Size datalen = XLogRecGetDataLen(r) - SizeOfNeonHeapHeader; + Size tuplelen = datalen - SizeOfNeonHeapHeader; + + Assert(XLogRecGetDataLen(r) > (SizeOfNeonHeapDelete + SizeOfNeonHeapHeader)); + + change->data.tp.oldtuple = + ReorderBufferGetTupleBuf(ctx->reorder, tuplelen); + + DecodeXLogTuple((char *) xlrec + SizeOfNeonHeapDelete, + datalen, change->data.tp.oldtuple); + } + + change->data.tp.clear_toast_afterwards = true; + + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, + change, false); +} + +/* + * Parse XLOG_HEAP_UPDATE and XLOG_HEAP_HOT_UPDATE, which have the same layout + * in the record, from wal into proper tuplebufs. + * + * Updates can possibly contain a new tuple and the old primary key. 
+ */ +static void +DecodeNeonUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + XLogReaderState *r = buf->record; + xl_neon_heap_update *xlrec; + ReorderBufferChange *change; + char *data; + RelFileLocator target_locator; + + xlrec = (xl_neon_heap_update *) XLogRecGetData(r); + + /* only interested in our database */ + XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL); + if (target_locator.dbOid != ctx->slot->data.database) + return; + + /* output plugin doesn't look for this origin, no need to queue */ + if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) + return; + + change = ReorderBufferGetChange(ctx->reorder); + change->action = REORDER_BUFFER_CHANGE_UPDATE; + change->origin_id = XLogRecGetOrigin(r); + memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator)); + + if (xlrec->flags & XLH_UPDATE_CONTAINS_NEW_TUPLE) + { + Size datalen; + Size tuplelen; + + data = XLogRecGetBlockData(r, 0, &datalen); + + tuplelen = datalen - SizeOfNeonHeapHeader; + + change->data.tp.newtuple = + ReorderBufferGetTupleBuf(ctx->reorder, tuplelen); + + DecodeXLogTuple(data, datalen, change->data.tp.newtuple); + } + + if (xlrec->flags & XLH_UPDATE_CONTAINS_OLD) + { + Size datalen; + Size tuplelen; + + /* caution, remaining data in record is not aligned */ + data = XLogRecGetData(r) + SizeOfNeonHeapUpdate; + datalen = XLogRecGetDataLen(r) - SizeOfNeonHeapUpdate; + tuplelen = datalen - SizeOfNeonHeapHeader; + + change->data.tp.oldtuple = + ReorderBufferGetTupleBuf(ctx->reorder, tuplelen); + + DecodeXLogTuple(data, datalen, change->data.tp.oldtuple); + } + + change->data.tp.clear_toast_afterwards = true; + + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, + change, false); +} + +/* + * Decode XLOG_HEAP2_MULTI_INSERT_insert record into multiple tuplebufs. + * + * Currently MULTI_INSERT will always contain the full tuples. 
+ */ +static void +DecodeNeonMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + XLogReaderState *r = buf->record; + xl_neon_heap_multi_insert *xlrec; + int i; + char *data; + char *tupledata; + Size tuplelen; + RelFileLocator rlocator; + + xlrec = (xl_neon_heap_multi_insert *) XLogRecGetData(r); + + /* + * Ignore insert records without new tuples. This happens when a + * multi_insert is done on a catalog or on a non-persistent relation. + */ + if (!(xlrec->flags & XLH_INSERT_CONTAINS_NEW_TUPLE)) + return; + + /* only interested in our database */ + XLogRecGetBlockTag(r, 0, &rlocator, NULL, NULL); + if (rlocator.dbOid != ctx->slot->data.database) + return; + + /* output plugin doesn't look for this origin, no need to queue */ + if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) + return; + + /* + * We know that this multi_insert isn't for a catalog, so the block should + * always have data even if a full-page write of it is taken. + */ + tupledata = XLogRecGetBlockData(r, 0, &tuplelen); + Assert(tupledata != NULL); + + data = tupledata; + for (i = 0; i < xlrec->ntuples; i++) + { + ReorderBufferChange *change; + xl_neon_multi_insert_tuple *xlhdr; + int datalen; + HeapTuple tuple; + HeapTupleHeader header; + + change = ReorderBufferGetChange(ctx->reorder); + change->action = REORDER_BUFFER_CHANGE_INSERT; + change->origin_id = XLogRecGetOrigin(r); + + memcpy(&change->data.tp.rlocator, &rlocator, sizeof(RelFileLocator)); + + xlhdr = (xl_neon_multi_insert_tuple *) SHORTALIGN(data); + data = ((char *) xlhdr) + SizeOfNeonMultiInsertTuple; + datalen = xlhdr->datalen; + + change->data.tp.newtuple = + ReorderBufferGetTupleBuf(ctx->reorder, datalen); + + tuple = change->data.tp.newtuple; + header = tuple->t_data; + + /* not a disk based tuple */ + ItemPointerSetInvalid(&tuple->t_self); + + /* + * We can only figure this out after reassembling the transactions. 
+ */ + tuple->t_tableOid = InvalidOid; + + tuple->t_len = datalen + SizeofHeapTupleHeader; + + memset(header, 0, SizeofHeapTupleHeader); + + memcpy((char *) tuple->t_data + SizeofHeapTupleHeader, + (char *) data, + datalen); + header->t_infomask = xlhdr->t_infomask; + header->t_infomask2 = xlhdr->t_infomask2; + header->t_hoff = xlhdr->t_hoff; + + /* + * Reset toast reassembly state only after the last row in the last + * xl_multi_insert_tuple record emitted by one heap_multi_insert() + * call. + */ + if (xlrec->flags & XLH_INSERT_LAST_IN_MULTI && + (i + 1) == xlrec->ntuples) + change->data.tp.clear_toast_afterwards = true; + else + change->data.tp.clear_toast_afterwards = false; + + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), + buf->origptr, change, false); + + /* move to the next xl_neon_multi_insert_tuple entry */ + data += datalen; + } + Assert(data == tupledata + tuplelen); +} + +/* + * Read a HeapTuple as WAL logged by heap_insert, heap_update and heap_delete + * (but not by heap_multi_insert) into a tuplebuf. + * + * The size 'len' and the pointer 'data' in the record need to be + * computed outside as they are record specific. 
+ */ +static void +DecodeXLogTuple(char *data, Size len, HeapTuple tuple) +{ + xl_neon_heap_header xlhdr; + int datalen = len - SizeOfNeonHeapHeader; + HeapTupleHeader header; + + Assert(datalen >= 0); + + tuple->t_len = datalen + SizeofHeapTupleHeader; + header = tuple->t_data; + + /* not a disk based tuple */ + ItemPointerSetInvalid(&tuple->t_self); + + /* we can only figure this out after reassembling the transactions */ + tuple->t_tableOid = InvalidOid; + + /* data is not stored aligned, copy to aligned storage */ + memcpy((char *) &xlhdr, + data, + SizeOfNeonHeapHeader); + + memset(header, 0, SizeofHeapTupleHeader); + + memcpy(((char *) tuple->t_data) + SizeofHeapTupleHeader, + data + SizeOfNeonHeapHeader, + datalen); + + header->t_infomask = xlhdr.t_infomask; + header->t_infomask2 = xlhdr.t_infomask2; + header->t_hoff = xlhdr.t_hoff; +} +#endif diff --git a/pgxn/neon_walredo/inmem_smgr.c b/pgxn/neon_walredo/inmem_smgr.c index 4e604a710cf7..a45e8f5c4aeb 100644 --- a/pgxn/neon_walredo/inmem_smgr.c +++ b/pgxn/neon_walredo/inmem_smgr.c @@ -68,8 +68,13 @@ static void inmem_close(SMgrRelation reln, ForkNumber forknum); static void inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo); static bool inmem_exists(SMgrRelation reln, ForkNumber forknum); static void inmem_unlink(NRelFileInfoBackend rinfo, ForkNumber forknum, bool isRedo); +#if PG_MAJORVERSION_NUM >= 17 +static bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks); +#else static bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); +#endif #if PG_MAJORVERSION_NUM < 16 static void inmem_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); @@ -93,7 +98,9 @@ static BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum); static void inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks); static void inmem_immedsync(SMgrRelation reln, ForkNumber forknum); 
- +#if PG_MAJORVERSION_NUM >= 17 +static void inmem_registersync(SMgrRelation reln, ForkNumber forknum); +#endif /* * inmem_init() -- Initialize private state @@ -190,6 +197,14 @@ inmem_close(SMgrRelation reln, ForkNumber forknum) { } +#if PG_MAJORVERSION_NUM >= 17 +static bool +inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + int nblocks) +{ + return true; +} +#else /* * inmem_prefetch() -- Initiate asynchronous read of the specified block of a relation */ @@ -198,6 +213,7 @@ inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) { return true; } +#endif /* * inmem_writeback() -- Tell the kernel to write pages back to storage. @@ -211,11 +227,13 @@ inmem_writeback(SMgrRelation reln, ForkNumber forknum, /* * inmem_read() -- Read the specified block from a relation. */ +#if PG_MAJORVERSION_NUM < 16 static void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, -#if PG_MAJORVERSION_NUM < 16 char *buffer) #else +static void +inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, void *buffer) #endif { @@ -228,6 +246,18 @@ inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, memcpy(buffer, page_body[pg], BLCKSZ); } +#if PG_MAJORVERSION_NUM >= 17 +static void +inmem_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, + void **buffers, BlockNumber nblocks) +{ + for (int i = 0; i < nblocks; i++) + { + inmem_read(reln, forknum, blkno, buffers[i]); + } +} +#endif + /* * inmem_write() -- Write the supplied block at the appropriate location. 
* @@ -280,6 +310,18 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, memcpy(page_body[pg], buffer, BLCKSZ); } +#if PG_MAJORVERSION_NUM >= 17 +static void +inmem_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, + const void **buffers, BlockNumber nblocks, bool skipFsync) +{ + for (int i = 0; i < nblocks; i++) + { + inmem_write(reln, forknum, blkno, buffers[i], skipFsync); + } +} +#endif + /* * inmem_nblocks() -- Get the number of blocks stored in a relation. */ @@ -315,6 +357,13 @@ inmem_immedsync(SMgrRelation reln, ForkNumber forknum) { } +#if PG_MAJORVERSION_NUM >= 17 +static void +inmem_registersync(SMgrRelation reln, ForkNumber forknum) +{ +} +#endif + static const struct f_smgr inmem_smgr = { .smgr_init = inmem_init, @@ -328,23 +377,39 @@ static const struct f_smgr inmem_smgr = #if PG_MAJORVERSION_NUM >= 16 .smgr_zeroextend = inmem_zeroextend, #endif +#if PG_MAJORVERSION_NUM >= 17 + .smgr_prefetch = inmem_prefetch, + .smgr_readv = inmem_readv, + .smgr_writev = inmem_writev, +#else .smgr_prefetch = inmem_prefetch, .smgr_read = inmem_read, .smgr_write = inmem_write, +#endif .smgr_writeback = inmem_writeback, .smgr_nblocks = inmem_nblocks, .smgr_truncate = inmem_truncate, .smgr_immedsync = inmem_immedsync, + +#if PG_MAJORVERSION_NUM >= 17 + .smgr_registersync = inmem_registersync, +#endif + + .smgr_start_unlogged_build = NULL, + .smgr_finish_unlogged_build_phase_1 = NULL, + .smgr_end_unlogged_build = NULL, + .smgr_read_slru_segment = NULL, }; const f_smgr * -smgr_inmem(BackendId backend, NRelFileInfo rinfo) +smgr_inmem(ProcNumber backend, NRelFileInfo rinfo) { Assert(InRecovery); - if (backend != InvalidBackendId) - return smgr_standard(backend, rinfo); - else - return &inmem_smgr; + // // What does this code do? 
+ // if (backend != INVALID_PROC_NUMBER) + // return smgr_standard(backend, rinfo); + // else + return &inmem_smgr; } void diff --git a/pgxn/neon_walredo/inmem_smgr.h b/pgxn/neon_walredo/inmem_smgr.h index 58b98b8e6acc..91f1c80965ee 100644 --- a/pgxn/neon_walredo/inmem_smgr.h +++ b/pgxn/neon_walredo/inmem_smgr.h @@ -11,7 +11,7 @@ #ifndef INMEM_SMGR_H #define INMEM_SMGR_H -extern const f_smgr *smgr_inmem(BackendId backend, NRelFileInfo rinfo); +extern const f_smgr *smgr_inmem(ProcNumber backend, NRelFileInfo rinfo); extern void smgr_init_inmem(void); #endif /* INMEM_SMGR_H */ diff --git a/pgxn/neon_walredo/walredoproc.c b/pgxn/neon_walredo/walredoproc.c index cc545393f594..219ca852073d 100644 --- a/pgxn/neon_walredo/walredoproc.c +++ b/pgxn/neon_walredo/walredoproc.c @@ -100,6 +100,9 @@ #include "storage/buf_internals.h" #include "storage/bufmgr.h" #include "storage/dsm.h" +#if PG_MAJORVERSION_NUM >= 17 +#include "storage/dsm_registry.h" +#endif #include "storage/ipc.h" #include "storage/pg_shmem.h" #include "storage/pmsignal.h" @@ -137,7 +140,7 @@ static BufferTag target_redo_tag; static XLogReaderState *reader_state; -#define TRACE DEBUG5 +#define TRACE LOG #ifdef HAVE_LIBSECCOMP @@ -517,6 +520,10 @@ CreateFakeSharedMemoryAndSemaphores() /* * Set up xlog, clog, and buffers */ +#if PG_MAJORVERSION_NUM >= 17 + DSMRegistryShmemInit(); + VarsupShmemInit(); +#endif XLOGShmemInit(); CLOGShmemInit(); CommitTsShmemInit(); @@ -566,7 +573,10 @@ CreateFakeSharedMemoryAndSemaphores() /* * Set up other modules that need some shared memory space */ +#if PG_MAJORVERSION_NUM < 17 + /* "snapshot too old" was removed in PG17, and with it the SnapMgr */ SnapMgrInit(); +#endif BTreeShmemInit(); SyncScanShmemInit(); /* Skip due to the 'pg_notify' directory check */ @@ -742,7 +752,7 @@ BeginRedoForBlock(StringInfo input_message) target_redo_tag.forkNum, target_redo_tag.blockNum); - reln = smgropen(rinfo, InvalidBackendId, RELPERSISTENCE_PERMANENT); + reln = smgropen(rinfo, 
INVALID_PROC_NUMBER, RELPERSISTENCE_PERMANENT); if (reln->smgr_cached_nblocks[forknum] == InvalidBlockNumber || reln->smgr_cached_nblocks[forknum] < blknum + 1) { diff --git a/test_runner/fixtures/common_types.py b/test_runner/fixtures/common_types.py index 064a678c9601..d8390138c9a9 100644 --- a/test_runner/fixtures/common_types.py +++ b/test_runner/fixtures/common_types.py @@ -13,7 +13,7 @@ class Lsn: """ Datatype for an LSN. Internally it is a 64-bit integer, but the string - representation is like "1/123abcd". See also pg_lsn datatype in Postgres + representation is like "1/0123abcd". See also pg_lsn datatype in Postgres """ def __init__(self, x: Union[int, str]): diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index ee62372871f8..50284a3f5a76 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -933,8 +933,11 @@ def cleanup_local_storage(self): for directory_to_clean in reversed(directories_to_clean): if not os.listdir(directory_to_clean): - log.debug(f"Removing empty directory {directory_to_clean}") - directory_to_clean.rmdir() + log.info(f"Removing empty directory {directory_to_clean}") + try: + directory_to_clean.rmdir() + except Exception as e: + log.error(f"Error removing empty directory {directory_to_clean}: {e}") def cleanup_remote_storage(self): for x in [self.pageserver_remote_storage, self.safekeepers_remote_storage]: @@ -3423,6 +3426,7 @@ def configure(self, options: List[str]): assert not self.running with open(os.path.join(self.pgdatadir, "postgresql.conf"), "a") as conf_file: conf_file.write("\n".join(options)) + conf_file.write("\n") def edit_hba(self, hba: List[str]): """Prepend hba lines into pg_hba.conf file.""" @@ -3476,6 +3480,7 @@ def vanilla_pg( pg_bin = PgBin(test_output_dir, pg_distrib_dir, pg_version) port = port_distributor.get_port() with VanillaPostgres(pgdatadir, pg_bin, port) as vanilla_pg: + 
vanilla_pg.configure(["shared_preload_libraries='neon_rmgr'"]) yield vanilla_pg diff --git a/test_runner/fixtures/pg_version.py b/test_runner/fixtures/pg_version.py index e12c8e5f4af0..258935959b59 100644 --- a/test_runner/fixtures/pg_version.py +++ b/test_runner/fixtures/pg_version.py @@ -16,6 +16,7 @@ class PgVersion(str, enum.Enum): V14 = "14" V15 = "15" V16 = "16" + V17 = "17" # Instead of making version an optional parameter in methods, we can use this fake entry # to explicitly rely on the default server version (could be different from pg_version fixture value) NOT_SET = "<-POSTRGRES VERSION IS NOT SET->" diff --git a/test_runner/regress/data/extension_test/5670669815/v17/ext_index.json b/test_runner/regress/data/extension_test/5670669815/v17/ext_index.json new file mode 100644 index 000000000000..7990b2c3a25c --- /dev/null +++ b/test_runner/regress/data/extension_test/5670669815/v17/ext_index.json @@ -0,0 +1,7 @@ +{ + "public_extensions": [], + "library_index": { + "TODO": "We still need PG17 extensions" + }, + "extension_data": {} +} \ No newline at end of file diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index b559be5f18a5..fb5c1d311549 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -21,7 +21,7 @@ from fixtures.pageserver.utils import ( timeline_delete_wait_completed, ) -from fixtures.pg_version import PgVersion +from fixtures.pg_version import PgVersion, skip_on_postgres from fixtures.remote_storage import RemoteStorageKind, S3Storage, s3_storage from fixtures.workload import Workload @@ -156,6 +156,9 @@ def test_create_snapshot( @check_ondisk_data_compatibility_if_enabled @pytest.mark.xdist_group("compatibility") @pytest.mark.order(after="test_create_snapshot") +@skip_on_postgres( + PgVersion.V17, "There are no snapshots yet" +) # TODO: revert this once we have snapshots def test_backward_compatibility( neon_env_builder: NeonEnvBuilder, 
test_output_dir: Path, @@ -203,6 +206,9 @@ def test_backward_compatibility( @check_ondisk_data_compatibility_if_enabled @pytest.mark.xdist_group("compatibility") @pytest.mark.order(after="test_create_snapshot") +@skip_on_postgres( + PgVersion.V17, "There are no snapshots yet" +) # TODO: revert this once we have snapshots def test_forward_compatibility( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index 27eb05ac0912..7370eb145608 100644 --- a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -44,6 +44,8 @@ def test_remote_extensions( ): if pg_version == PgVersion.V16: pytest.skip("TODO: PG16 extension building") + if pg_version == PgVersion.V17: + pytest.skip("TODO: PG17 extension building") # setup mock http server # that expects request for anon.tar.zst diff --git a/test_runner/regress/test_postgres_version.py b/test_runner/regress/test_postgres_version.py index 03e8c7c0dfd0..4145a303c6f8 100644 --- a/test_runner/regress/test_postgres_version.py +++ b/test_runner/regress/test_postgres_version.py @@ -20,16 +20,19 @@ def test_postgres_version(base_dir: Path, pg_bin: PgBin, pg_version: PgVersion): output = f.read().strip() # `postgres --version` prints something like "postgres (PostgreSQL) 15.6 (85d809c124a898847a97d66a211f7d5ef4f8e0cb)". - pattern = r"postgres \(PostgreSQL\) (?P\d+\.\d+) \((?P[0-9a-f]{40})\)" + # beta- and release candidate releases would use '17beta1' and '18rc2' instead of .-separated numbers. 
+ pattern = ( + r"postgres \(PostgreSQL\) (?P\d+(?:beta|rc|\.)\d+) \((?P[0-9a-f]{40})\)" + ) match = re.search(pattern, output, re.IGNORECASE) assert match is not None, f"Can't parse {output} with {pattern}" version = match.group("version") commit = match.group("commit") - assert ( - pg_version.v_prefixed in expected_revisions - ), f"Version `{pg_version.v_prefixed}` doesn't exist in `vendor/revisions.json`, please update it if these changes are intentional" - - msg = f"Unexpected Postgres {pg_version} version: `{output}`, please update `vendor/revisions.json` if these changes are intentional" - assert [version, commit] == expected_revisions[pg_version.v_prefixed], msg + if "." in version: + assert ( + pg_version.v_prefixed in expected_revisions + ), f"Released PostgreSQL version `{pg_version.v_prefixed}` doesn't exist in `vendor/revisions.json`, please update it if these changes are intentional" + msg = f"Unexpected Postgres {pg_version} version: `{output}`, please update `vendor/revisions.json` if these changes are intentional" + assert [version, commit] == expected_revisions[pg_version.v_prefixed], msg diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index d152d0f41f6f..f98b53d966af 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -118,6 +118,9 @@ def test_ancestor_detach_branched_from( truncated_layers = 0 elif branchpoint == Branchpoint.AFTER_L0: branch_at = Lsn(last_lsn + 8) + # make sure the branch point is not on a page header + if 0 < (branch_at.lsn_int % 8192) < 40: + branch_at += 40 rows = 8192 # as there is no 8 byte walrecord, nothing should get copied from the straddling layer truncated_layers = 0 diff --git a/test_runner/regress/test_twophase.py b/test_runner/regress/test_twophase.py index ea900b07b86c..ebe65e7c29dc 100644 --- a/test_runner/regress/test_twophase.py +++ 
b/test_runner/regress/test_twophase.py @@ -1,19 +1,32 @@ import os +from pathlib import Path +from fixtures.common_types import TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, fork_at_current_lsn +from fixtures.neon_fixtures import ( + NeonEnv, + PgBin, + fork_at_current_lsn, + import_timeline_from_vanilla_postgres, +) # # Test branching, when a transaction is in prepared state # -def test_twophase(neon_simple_env: NeonEnv): - env = neon_simple_env - endpoint = env.endpoints.create_start("main", config_lines=["max_prepared_transactions=5"]) +def twophase_test_on_timeline(env: NeonEnv): + endpoint = env.endpoints.create_start( + "test_twophase", config_lines=["max_prepared_transactions=5"] + ) conn = endpoint.connect() cur = conn.cursor() + # FIXME: Switch to the next WAL segment, to work around the bug fixed in + # https://github.com/neondatabase/neon/pull/8914. When that is merged, this can be + # removed. + cur.execute("select pg_switch_wal()") + cur.execute("CREATE TABLE foo (t text)") # Prepare a transaction that will insert a row @@ -53,7 +66,7 @@ def test_twophase(neon_simple_env: NeonEnv): assert len(twophase_files) == 2 # Create a branch with the transaction in prepared state - fork_at_current_lsn(env, endpoint, "test_twophase_prepared", "main") + fork_at_current_lsn(env, endpoint, "test_twophase_prepared", "test_twophase") # Start compute on the new branch endpoint2 = env.endpoints.create_start( @@ -80,3 +93,50 @@ def test_twophase(neon_simple_env: NeonEnv): # Only one committed insert is visible on the original branch cur.execute("SELECT * FROM foo") assert cur.fetchall() == [("three",)] + + +def test_twophase(neon_simple_env: NeonEnv): + """ + Test branching, when a transaction is in prepared state + """ + env = neon_simple_env + env.neon_cli.create_branch("test_twophase") + + twophase_test_on_timeline(env) + + +def test_twophase_nonzero_epoch( + neon_simple_env: NeonEnv, + test_output_dir: Path, + pg_bin: 
PgBin, + vanilla_pg, +): + """ + Same as 'test_twophase' test, but with a non-zero XID epoch, i.e. after 4 billion XIDs + have been consumed. (This is to ensure that we correctly use the full 64-bit XIDs in + pg_twophase filenames with PostgreSQL v17.) + """ + env = neon_simple_env + + # Reset the vanilla Postgres instance with a higher XID epoch + pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, "pg_resetwal") + cmd = [pg_resetwal_path, "--epoch=1000000000", "-D", str(vanilla_pg.pgdatadir)] + pg_bin.run_capture(cmd) + + timeline_id = TimelineId.generate() + + # Import the cluster to Neon + vanilla_pg.start() + vanilla_pg.safe_psql("create user cloud_admin with password 'postgres' superuser") + import_timeline_from_vanilla_postgres( + test_output_dir, + env, + pg_bin, + env.initial_tenant, + timeline_id, + "test_twophase", + vanilla_pg.connstr(), + ) + vanilla_pg.stop() # don't need the original server anymore + + twophase_test_on_timeline(env) diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 new file mode 160000 index 000000000000..9156d63ce253 --- /dev/null +++ b/vendor/postgres-v17 @@ -0,0 +1 @@ +Subproject commit 9156d63ce253bed9d1f76355ceec610e444eaffa From 0a8c5e1214fcd3f59767a6ca4adeb68612977e51 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Fri, 13 Sep 2024 15:10:52 +0100 Subject: [PATCH 20/21] Fix broken image for PG17 (#8998) Most extensions are not required to run Neon-based PostgreSQL, but the Neon extension is _quite_ critical, so let's make sure we include it. 
## Problem Staging doesn't have working compute images for PG17 ## Summary of changes Disable some PG17 filters so that we get the critical components into the PG17 image --- Dockerfile.compute-node | 63 ++++++++++++++--------------------------- 1 file changed, 21 insertions(+), 42 deletions(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index fe902eb97817..6e2510fe60c6 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -81,10 +81,7 @@ RUN cd postgres && \ FROM build-deps AS postgis-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - apt update && \ +RUN apt update && \ apt install -y cmake gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \ libboost-system-dev libboost-iostreams-dev libboost-program-options-dev libboost-timer-dev \ libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \ @@ -92,8 +89,8 @@ RUN case "${PG_VERSION}" in "v17") \ # SFCGAL > 1.3 requires CGAL > 5.2, Bullseye's libcgal-dev is 5.2 RUN case "${PG_VERSION}" in "v17") \ - mkdir -p /sfcgal && \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + mkdir -p /sfcgal && \ + echo "Postgis doensn't yet support PG17 (needs 3.4.3, if not higher)" && exit 0;; \ esac && \ wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \ echo "4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 SFCGAL.tar.gz" | sha256sum --check && \ @@ -105,7 +102,7 @@ RUN case "${PG_VERSION}" in "v17") \ ENV PATH="/usr/local/pgsql/bin:$PATH" RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + echo "Postgis doensn't yet support PG17 (needs 3.4.3, if not higher)" && exit 0;; \ esac && \ wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \ echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \ @@ -666,7 +663,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + echo "v17 extensions is not supported yet by pg_roaringbitmap. Quit" && exit 0;; \ esac && \ wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \ echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \ @@ -687,7 +684,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + echo "v17 is not supported yet by pg_semver. Quit" && exit 0;; \ esac && \ wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \ echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \ @@ -707,10 +704,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ - esac && \ - case "${PG_VERSION}" in \ +RUN case "${PG_VERSION}" in \ "v14" | "v15") \ export PG_EMBEDDING_VERSION=0.3.5 \ export PG_EMBEDDING_CHECKSUM=0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 \ @@ -736,7 +730,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \ esac && \ wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \ echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \ @@ -769,7 +763,7 @@ USER nonroot WORKDIR /home/nonroot RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + echo "v17 is not supported yet by pgrx. Quit" && exit 0;; \ esac && \ curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ chmod +x rustup-init && \ @@ -791,7 +785,7 @@ FROM rust-extensions-build AS pg-jsonschema-pg-build ARG PG_VERSION RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + echo "pg_jsonschema does not yet have a release that supports pg17" && exit 0;; \ esac && \ wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.1.tar.gz -O pg_jsonschema.tar.gz && \ echo "61df3db1ed83cf24f6aa39c826f8818bfa4f0bd33b587fd6b2b1747985642297 pg_jsonschema.tar.gz" | sha256sum --check && \ @@ -816,7 +810,7 @@ FROM rust-extensions-build AS pg-graphql-pg-build ARG PG_VERSION RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + echo "pg_graphql does not yet have a release that supports pg17 as of now" && exit 0;; \ esac && \ wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.7.tar.gz -O pg_graphql.tar.gz && \ echo "2b3e567a5b31019cb97ae0e33263c1bcc28580be5a444ac4c8ece5c4be2aea41 pg_graphql.tar.gz" | sha256sum --check && \ @@ -839,7 +833,7 @@ ARG PG_VERSION # 26806147b17b60763039c6a6878884c41a262318 made on 26/09/2023 RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + echo "pg_tiktoken does not have versions, nor support for pg17" && exit 0;; \ esac && \ wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \ echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \ @@ -861,7 +855,7 @@ FROM rust-extensions-build AS pg-pgx-ulid-build ARG PG_VERSION RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + echo "pgx_ulid does not support pg17 as of the latest version (0.1.5)" && exit 0;; \ esac && \ wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -O pgx_ulid.tar.gz && \ echo "9d1659a2da65af0133d5451c454de31b37364e3502087dadf579f790bc8bef17 pgx_ulid.tar.gz" | sha256sum --check && \ @@ -883,7 +877,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + echo "We'll need to update wal2json to 2.6+ for pg17 support" && exit 0;; \ esac && \ wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \ echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \ @@ -903,7 +897,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + echo "We'll need to update pg_ivm to 1.9+ for pg17 support" && exit 0;; \ esac && \ wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \ echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \ @@ -924,7 +918,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + echo "pg_partman doesn't support PG17 yet" && exit 0;; \ esac && \ wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \ echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \ @@ -977,10 +971,7 @@ COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-partman-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - make -j $(getconf _NPROCESSORS_ONLN) \ +RUN make -j $(getconf _NPROCESSORS_ONLN) \ PG_CONFIG=/usr/local/pgsql/bin/pg_config \ -C pgxn/neon \ -s install && \ @@ -1023,10 +1014,7 @@ ENV BUILD_TAG=$BUILD_TAG USER nonroot # Copy entire project to get Cargo.* files with proper dependencies for the whole project COPY --chown=nonroot . . 
-RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - cd compute_tools && mold -run cargo build --locked --profile release-line-debug-size-lto +RUN cd compute_tools && mold -run cargo build --locked --profile release-line-debug-size-lto ######################################################################################### # @@ -1047,24 +1035,15 @@ FROM neon-pg-ext-build AS postgres-cleanup-layer COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql # Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise) -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp +RUN cd /usr/local/pgsql/bin && rm -f ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp # Remove headers that we won't need anymore - we've completed installation of all extensions -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - rm -r /usr/local/pgsql/include +RUN rm -r /usr/local/pgsql/include # Remove static postgresql libraries - all compilation is finished, so we # can now remove these files - they must be included in other binaries by now # if they were to be used by other libraries. -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ - esac && \ - rm /usr/local/pgsql/lib/lib*.a +RUN rm /usr/local/pgsql/lib/lib*.a ######################################################################################### From b2c83db54d58d46e8ca11d5f1b4a38471322f713 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 16 Sep 2024 12:44:26 +0100 Subject: [PATCH 21/21] CI(gather-rust-build-stats): set PQ_LIB_DIR to Postgres 17 (#9001) ## Problem `gather-rust-build-stats` extra CI job fails with ``` "PQ_LIB_DIR" doesn't exist in the configured path: "/__w/neon/neon/pg_install/v16/lib" ``` ## Summary of changes - Use the path to Postgres 17 for the `gather-rust-build-stats` job. The job uses Postgres built by `make walproposer-lib` --- .github/workflows/neon_extra_builds.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 41c9f5dee5a4..140aac032a99 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -181,7 +181,7 @@ jobs: run: make walproposer-lib -j$(nproc) - name: Produce the build stats - run: PQ_LIB_DIR=$(pwd)/pg_install/v16/lib cargo build --all --release --timings -j$(nproc) + run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release --timings -j$(nproc) - name: Upload the build stats id: upload-stats