datafusion-contrib
diff --git a/‎Cargo.toml‎
Lines changed: 2 additions & 0 deletions b/‎Cargo.toml‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎scripts/generate_orc_with_bloom_filter.py‎
Lines changed: 102 additions & 0 deletions b/‎scripts/generate_orc_with_bloom_filter.py‎
Lines changed: 102 additions & 0 deletions
diff --git a/‎src/bloom_filter.rs‎
Lines changed: 185 additions & 0 deletions b/‎src/bloom_filter.rs‎
Lines changed: 185 additions & 0 deletions
diff --git a/‎src/lib.rs‎
Lines changed: 1 addition & 0 deletions b/‎src/lib.rs‎
Lines changed: 1 addition & 0 deletions
@@ -39,8 +39,10 @@ chrono = { version = "0.4.41", default-features = false, features = ["std"] }
 chrono-tz = "0.10"
 fallible-streaming-iterator = "0.1"
 flate2 = "1"
+log = "0.4"
 lz4_flex = "0.11"
 lzokay-native = "0.1"
+murmur3 = "0.5"
 num = "0.4.1"
 prost = "0.13"
 snafu = "0.8"
 
@@ -0,0 +1,102 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Generate a small ORC file that contains Bloom filters for regression tests.
+
+The generated file is written to:
+  tests/integration/data/bloom_filter.orc
+
+Dependencies:
+  pip install pyorc
+
+Usage:
+  python scripts/generate_orc_with_bloom_filter.py
+"""
+
+from pathlib import Path
+
+import pyorc
+from datetime import date
+from decimal import Decimal
+
+# Destination of the generated fixture, resolved relative to the directory
+# that contains this script's parent directory.
+OUT_PATH = Path(__file__).parent.parent / "tests" / "integration" / "data" / "bloom_filter.orc"
+
+
+# Number of synthetic rows appended after the hand-written base rows.
+EXTRA_ROWS = 200
+
+
+def main() -> None:
+    OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
+
+    schema = (
+        "struct<id:int,name:string,score:double,event_date:date,flag:boolean,data:binary,dec:decimal(10,2)>"
+    )
+    # Non-contiguous values make some predicates fall within min/max but absent
+    # from the data (to exercise Bloom pruning). Include multiple rows to cover
+    # richer value ranges.
+    base_rows = [
+        (1, "alpha", 1.0, (2023, 1, 1), True, b"\x01", Decimal("1.11")),
+        (3, "gamma", 3.0, (2023, 1, 3), False, b"\x03", Decimal("3.33")),
+        (5, "delta", 5.0, (2023, 1, 5), True, b"\x05", Decimal("5.55")),
+        (10, "epsilon", 10.0, (2023, 1, 10), False, b"\x0a", Decimal("10.10")),
+    ]
+
+    # Add many more rows to create multiple row groups and stripes.
+    # We deliberately skip certain even values (id=2, date=2023-01-02, binary 0x02, decimal 2.22)
+    # so predicates for those values must rely on Bloom filters to prune.
+    extra_rows = []
+    day_choices = [1, 3, 4, 5, 6, 7]  # exclude day=2
+    for i in range(EXTRA_ROWS):
+        id_v = 101 + i * 2  # odd ids; still keeps id=2 absent
+        name_v = f"name_{i}"
+        score_v = float(id_v)
+        day = day_choices[i % len(day_choices)]
+        event_date = (2023, 1, day)
+        flag = i % 2 == 0
+        data = bytes([((i * 2) + 1) % 256])  # avoid byte 0x02
+        dec = Decimal(f"{id_v}.01")
+        extra_rows.append((id_v, name_v, score_v, event_date, flag, data, dec))
+
+    rows = base_rows + extra_rows
+
+    # Enable Bloom filters for all columns with a small false positive probability.
+    with OUT_PATH.open("wb") as f:
+        writer = pyorc.Writer(
+            f,
+            schema,
+            bloom_filter_columns=[
+                "id",
+                "name",
+                "score",
+                "event_date",
+                "flag",
+                "data",
+                "dec",
+            ],
+            bloom_filter_fpp=0.01,
+            stripe_size=1024,
+        )
+        for id_v, name_v, score_v, (y, m, d), flag, data, dec in rows:
+            writer.write((id_v, name_v, score_v, date(y, m, d), flag, data, dec))
+        writer.close()
+
+    print(f"Wrote ORC file with bloom filters to {OUT_PATH}")
+
+
+# Generate the fixture only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,185 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! ORC Bloom filter decoding and evaluation.
+//!
+//! This follows the ORC v1 spec (https://orc.apache.org/specification/ORCv1/):
+//! - Stream kinds `BLOOM_FILTER` / `BLOOM_FILTER_UTF8` provide per-row-group filters.
+//! - Per the spec, string and binary values are hashed with ORC's 64-bit
+//!   Murmur3 variant. The 64-bit hash is split into two 32-bit halves
+//!   `h1`/`h2`, and probe `i` (for `i` in `1..=numHashFunctions`) addresses
+//!   bit `(h1 + i*h2) mod m`, with all bits of the 32-bit sum flipped first
+//!   when it is negative.
+//! - A cleared bit means the value is **definitely absent**; set bits mean
+//!   **possible presence** (false positives allowed).
+//!
+//! Bloom filters are attached to row groups and can quickly rule out equality
+//! predicates (e.g. `col = 'abc'`) before any data decoding.
+
+use murmur3::murmur3_x64_128;
+
+use crate::proto;
+
+/// A Bloom filter parsed from the ORC index stream.
+///
+/// Holds the number of probes performed per lookup together with the filter
+/// bits packed into 64-bit words.
+#[derive(Debug, Clone)]
+pub struct BloomFilter {
+    /// Number of bit positions probed for each looked-up value.
+    num_hash_functions: u32,
+    /// Filter bits; bit `i` is stored as bit `i % 64` of word `i / 64`.
+    bitset: Vec<u64>,
+}
+
+impl BloomFilter {
+    /// Seed ORC writers pass to the 64-bit Murmur3 hash of byte values.
+    const MURMUR3_SEED: u64 = 104_729;
+    /// First Murmur3 mixing multiplier.
+    const MURMUR3_C1: u64 = 0x87c3_7b91_1142_53d5;
+    /// Second Murmur3 mixing multiplier.
+    const MURMUR3_C2: u64 = 0x4cf5_ad43_2745_937f;
+
+    /// Create a Bloom filter from a decoded protobuf value.
+    ///
+    /// Returns `None` whenever the message cannot be used safely for pruning:
+    /// - neither `bitset` nor `utf8bitset` is populated,
+    /// - both are populated (ambiguous, malformed input),
+    /// - `numHashFunctions` is missing or zero.
+    ///
+    /// Malformed input never panics: the message comes from file contents,
+    /// and "no filter" is always a correct (if slower) answer.
+    pub fn try_from_proto(proto: &proto::BloomFilter) -> Option<Self> {
+        // The probe count cannot be guessed. Probing more positions than the
+        // writer set would report values as absent that are in fact present.
+        let num_hash_functions = proto.num_hash_functions();
+        if num_hash_functions == 0 {
+            return None;
+        }
+
+        let bitset = match (proto.bitset.is_empty(), proto.utf8bitset.as_deref()) {
+            // `bitset`: the words are stored directly.
+            (false, None) => proto.bitset.clone(),
+            // `utf8bitset`: the words are serialized as little-endian bytes;
+            // a short trailing chunk is zero-padded.
+            (true, Some(bytes)) => bytes.chunks(8).map(Self::le_word).collect(),
+            // Nothing populated, or both populated.
+            (true, None) | (false, Some(_)) => return None,
+        };
+
+        Some(Self {
+            num_hash_functions,
+            bitset,
+        })
+    }
+
+    /// Create a Bloom filter from raw parts (mainly for tests).
+    ///
+    /// A probe count of zero is raised to one.
+    #[cfg(test)]
+    pub fn from_parts(num_hash_functions: u32, bitset: Vec<u64>) -> Self {
+        Self {
+            num_hash_functions: num_hash_functions.max(1),
+            bitset,
+        }
+    }
+
+    /// Returns true if the value *might* be contained. False means *definitely not*.
+    ///
+    /// `value` is the raw byte representation of a string or binary value.
+    /// NOTE(review): per the ORC spec, integer and floating point columns use
+    /// a different base hash (Thomas Wang's 64-bit integer hash), so their
+    /// bytes must not be passed here; confirm at the call sites.
+    ///
+    /// A filter without any bits cannot rule anything out and always returns
+    /// true.
+    pub fn might_contain(&self, value: &[u8]) -> bool {
+        let bit_count = self.bitset.len() as u64 * 64;
+        if bit_count == 0 {
+            // Defensive: no bits means we cannot use the filter
+            return true;
+        }
+
+        let hash64 = Self::hash64(value);
+        (1..=self.num_hash_functions)
+            .all(|probe| self.test_bit(Self::bit_position(hash64, probe, bit_count)))
+    }
+
+    /// Bit index addressed by the `probe`-th hash function (1-based).
+    ///
+    /// Mirrors the ORC reference writers: the 64-bit hash is split into two
+    /// signed 32-bit halves, combined as `h1 + probe * h2` with wrapping
+    /// 32-bit arithmetic, bit-flipped when negative, and reduced modulo the
+    /// number of bits. `bit_count` must be non-zero.
+    fn bit_position(hash64: u64, probe: u32, bit_count: u64) -> usize {
+        // Truncating casts are intended: they select the low / high 32 bits.
+        let hash1 = hash64 as i32;
+        let hash2 = (hash64 >> 32) as i32;
+
+        let combined = hash1.wrapping_add((probe as i32).wrapping_mul(hash2));
+        // Flipping all bits maps a negative value to a non-negative one.
+        let combined = if combined < 0 { !combined } else { combined };
+
+        (combined as u64 % bit_count) as usize
+    }
+
+    /// ORC's 64-bit Murmur3 variant over `value`.
+    ///
+    /// This is not the low half of Murmur3 x64_128: only a single 64-bit lane
+    /// is mixed, consuming the input in little-endian 8-byte blocks.
+    fn hash64(value: &[u8]) -> u64 {
+        let mut hash = Self::MURMUR3_SEED;
+
+        let mut blocks = value.chunks_exact(8);
+        for block in blocks.by_ref() {
+            hash ^= Self::mix(Self::le_word(block));
+            hash = hash
+                .rotate_left(27)
+                .wrapping_mul(5)
+                .wrapping_add(0x52dc_e729);
+        }
+
+        // Up to seven trailing bytes are folded in as one zero-padded word.
+        let tail = blocks.remainder();
+        if !tail.is_empty() {
+            hash ^= Self::mix(Self::le_word(tail));
+        }
+
+        hash ^= value.len() as u64;
+        Self::fmix64(hash)
+    }
+
+    /// Murmur3 per-block mixing step.
+    fn mix(k: u64) -> u64 {
+        k.wrapping_mul(Self::MURMUR3_C1)
+            .rotate_left(31)
+            .wrapping_mul(Self::MURMUR3_C2)
+    }
+
+    /// Murmur3 64-bit finalizer.
+    fn fmix64(mut h: u64) -> u64 {
+        h ^= h >> 33;
+        h = h.wrapping_mul(0xff51_afd7_ed55_8ccd);
+        h ^= h >> 33;
+        h = h.wrapping_mul(0xc4ce_b9fe_1a85_ec53);
+        h ^= h >> 33;
+        h
+    }
+
+    /// Interpret up to eight bytes as a little-endian `u64`, zero-padding the
+    /// missing high bytes. Callers must pass at most eight bytes.
+    fn le_word(bytes: &[u8]) -> u64 {
+        let mut padded = [0u8; 8];
+        padded[..bytes.len()].copy_from_slice(bytes);
+        u64::from_le_bytes(padded)
+    }
+
+    /// Whether the bit at `bit_idx` is set; indexes past the end read as unset.
+    fn test_bit(&self, bit_idx: usize) -> bool {
+        let word = bit_idx / 64;
+        let bit = bit_idx % 64;
+        if let Some(bits) = self.bitset.get(word) {
+            (bits & (1u64 << bit)) != 0
+        } else {
+            false
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Build a filter of `bitset_words` words containing `values`.
+    fn build_filter(values: &[&[u8]], bitset_words: usize, hash_funcs: u32) -> BloomFilter {
+        let mut bitset = vec![0u64; bitset_words];
+        let bit_count = bitset_words as u64 * 64;
+
+        for value in values {
+            let hash64 = BloomFilter::hash64(value);
+            for probe in 1..=hash_funcs {
+                let bit_idx = BloomFilter::bit_position(hash64, probe, bit_count);
+                bitset[bit_idx / 64] |= 1u64 << (bit_idx % 64);
+            }
+        }
+
+        BloomFilter::from_parts(hash_funcs, bitset)
+    }
+
+    #[test]
+    fn test_bloom_filter_hit_and_miss() {
+        let filter = build_filter(&[b"abc", b"def"], 8, 3);
+
+        assert!(filter.might_contain(b"abc"));
+        assert!(filter.might_contain(b"def"));
+        assert!(!filter.might_contain(b"xyz"));
+    }
+
+    #[test]
+    fn test_empty_bitset_never_prunes() {
+        let filter = BloomFilter::from_parts(3, vec![]);
+        assert!(filter.might_contain(b"anything"));
+    }
+
+    #[test]
+    fn test_bit_position_follows_orc_double_hashing() {
+        // h1 = 5, h2 = 3, probe 2 -> 5 + 2 * 3 = 11
+        assert_eq!(BloomFilter::bit_position((3u64 << 32) | 5, 2, 64), 11);
+        // h1 = i32::MIN, h2 = 0 -> negative, flipped to i32::MAX, and 2^31 - 1 mod 64 = 63
+        assert_eq!(BloomFilter::bit_position(0x8000_0000, 1, 64), 63);
+        // h1 = -1, h2 = -1, probe 1 -> -2, flipped to 1
+        assert_eq!(BloomFilter::bit_position(u64::MAX, 1, 64), 1);
+    }
+
+    #[test]
+    fn test_hash64_depends_on_length() {
+        // A zero byte contributes nothing to the mixing step, so only the
+        // length term separates these two inputs before the (bijective) finalizer.
+        assert_ne!(BloomFilter::hash64(b""), BloomFilter::hash64(b"\0"));
+        assert_eq!(BloomFilter::hash64(b"abc"), BloomFilter::hash64(b"abc"));
+    }
+
+    #[test]
+    fn test_try_from_proto_utf8_bitset() {
+        let filter = build_filter(&[b"foo"], 4, 2);
+
+        let proto = proto::BloomFilter {
+            num_hash_functions: Some(filter.num_hash_functions),
+            bitset: vec![],
+            utf8bitset: Some(filter.bitset.iter().flat_map(|w| w.to_le_bytes()).collect()),
+        };
+
+        let decoded = BloomFilter::try_from_proto(&proto).unwrap();
+        assert_eq!(decoded.bitset, filter.bitset);
+        assert!(decoded.might_contain(b"foo"));
+        assert!(!decoded.might_contain(b"bar"));
+    }
+
+    #[test]
+    fn test_try_from_proto_legacy_bitset() {
+        let filter = build_filter(&[b"foo"], 4, 2);
+
+        let proto = proto::BloomFilter {
+            num_hash_functions: Some(filter.num_hash_functions),
+            bitset: filter.bitset.clone(),
+            utf8bitset: None,
+        };
+
+        let decoded = BloomFilter::try_from_proto(&proto).unwrap();
+        assert_eq!(decoded.bitset, filter.bitset);
+        assert!(decoded.might_contain(b"foo"));
+    }
+
+    #[test]
+    fn test_try_from_proto_rejects_unusable_input() {
+        // Nothing populated
+        let empty = proto::BloomFilter {
+            num_hash_functions: Some(2),
+            bitset: vec![],
+            utf8bitset: None,
+        };
+        assert!(BloomFilter::try_from_proto(&empty).is_none());
+
+        // Both representations populated: rejected instead of panicking
+        let both = proto::BloomFilter {
+            num_hash_functions: Some(2),
+            bitset: vec![1],
+            utf8bitset: Some(vec![1u8; 8].into()),
+        };
+        assert!(BloomFilter::try_from_proto(&both).is_none());
+
+        // Missing probe count: cannot be guessed safely
+        let no_probe_count = proto::BloomFilter {
+            num_hash_functions: None,
+            bitset: vec![1],
+            utf8bitset: None,
+        };
+        assert!(BloomFilter::try_from_proto(&no_probe_count).is_none());
+    }
+}
@@ -51,6 +51,7 @@ pub mod arrow_reader;
 pub mod arrow_writer;
 #[cfg(feature = "async")]
 pub mod async_arrow_reader;
+mod bloom_filter;
 mod column;
 pub mod compression;
 #[allow(dead_code)]