nodedb_columnar/
predicate.rs

1// SPDX-License-Identifier: Apache-2.0
2
3//! Scan predicates for block-level predicate pushdown.
4//!
5//! A `ScanPredicate` describes a filter on a single column that can be
6//! evaluated against `BlockStats` to skip entire blocks without decompressing.
7//!
8//! Predicates work for both numeric columns (comparing against `BlockStats.min`
9//! / `BlockStats.max`) and string columns (comparing against
10//! `BlockStats.str_min` / `BlockStats.str_max`). For string Eq predicates the
11//! optional bloom filter provides an additional fast-reject path.
12
13use crate::format::{BlockStats, BloomFilter};
14
15/// The value side of a scan predicate.
16#[derive(Debug, Clone)]
17#[non_exhaustive]
18pub enum PredicateValue {
19    /// A numeric threshold (f64 for float columns, or integral f64 for i64 columns
20    /// whose value fits within ±2^53 exactly).
21    Numeric(f64),
22    /// An exact integer threshold for i64/timestamp columns.
23    ///
24    /// When the planner knows the predicate value is an integer it should use
25    /// this variant so that values outside ±2^53 are compared losslessly
26    /// against `BlockStats.min_i64`/`max_i64`.
27    Integer(i64),
28    /// A string threshold for lexicographic comparison.
29    String(String),
30}
31
32/// A predicate on a single column for block-level pushdown.
33#[derive(Debug, Clone)]
34pub struct ScanPredicate {
35    /// Column index in the schema.
36    pub col_idx: usize,
37    /// The comparison operation.
38    pub op: PredicateOp,
39    /// The threshold value.
40    pub value: PredicateValue,
41}
42
43/// Comparison operator for scan predicates.
44#[derive(Debug, Clone, Copy, PartialEq, Eq)]
45#[non_exhaustive]
46pub enum PredicateOp {
47    /// `column > value`
48    Gt,
49    /// `column >= value`
50    Gte,
51    /// `column < value`
52    Lt,
53    /// `column <= value`
54    Lte,
55    /// `column = value`
56    Eq,
57    /// `column != value`
58    Ne,
59}
60
61impl ScanPredicate {
62    // ── Numeric constructors ────────────────────────────────────────────────
63
64    /// Create a predicate: column > value (numeric).
65    pub fn gt(col_idx: usize, value: f64) -> Self {
66        Self {
67            col_idx,
68            op: PredicateOp::Gt,
69            value: PredicateValue::Numeric(value),
70        }
71    }
72
73    /// Create a predicate: column >= value (numeric).
74    pub fn gte(col_idx: usize, value: f64) -> Self {
75        Self {
76            col_idx,
77            op: PredicateOp::Gte,
78            value: PredicateValue::Numeric(value),
79        }
80    }
81
82    /// Create a predicate: column < value (numeric).
83    pub fn lt(col_idx: usize, value: f64) -> Self {
84        Self {
85            col_idx,
86            op: PredicateOp::Lt,
87            value: PredicateValue::Numeric(value),
88        }
89    }
90
91    /// Create a predicate: column <= value (numeric).
92    pub fn lte(col_idx: usize, value: f64) -> Self {
93        Self {
94            col_idx,
95            op: PredicateOp::Lte,
96            value: PredicateValue::Numeric(value),
97        }
98    }
99
100    /// Create a predicate: column = value (numeric).
101    pub fn eq(col_idx: usize, value: f64) -> Self {
102        Self {
103            col_idx,
104            op: PredicateOp::Eq,
105            value: PredicateValue::Numeric(value),
106        }
107    }
108
109    /// Create a predicate: column != value (numeric).
110    /// Named `not_eq` to avoid conflict with `PartialEq::ne`.
111    pub fn not_eq(col_idx: usize, value: f64) -> Self {
112        Self {
113            col_idx,
114            op: PredicateOp::Ne,
115            value: PredicateValue::Numeric(value),
116        }
117    }
118
119    // ── Integer constructors (lossless i64 predicates) ─────────────────────
120
121    /// Create a predicate: column = value (exact integer, lossless for i64/timestamp).
122    pub fn eq_i64(col_idx: usize, value: i64) -> Self {
123        Self {
124            col_idx,
125            op: PredicateOp::Eq,
126            value: PredicateValue::Integer(value),
127        }
128    }
129
130    /// Create a predicate: column != value (exact integer).
131    pub fn not_eq_i64(col_idx: usize, value: i64) -> Self {
132        Self {
133            col_idx,
134            op: PredicateOp::Ne,
135            value: PredicateValue::Integer(value),
136        }
137    }
138
139    /// Create a predicate: column > value (exact integer).
140    pub fn gt_i64(col_idx: usize, value: i64) -> Self {
141        Self {
142            col_idx,
143            op: PredicateOp::Gt,
144            value: PredicateValue::Integer(value),
145        }
146    }
147
148    /// Create a predicate: column >= value (exact integer).
149    pub fn gte_i64(col_idx: usize, value: i64) -> Self {
150        Self {
151            col_idx,
152            op: PredicateOp::Gte,
153            value: PredicateValue::Integer(value),
154        }
155    }
156
157    /// Create a predicate: column < value (exact integer).
158    pub fn lt_i64(col_idx: usize, value: i64) -> Self {
159        Self {
160            col_idx,
161            op: PredicateOp::Lt,
162            value: PredicateValue::Integer(value),
163        }
164    }
165
166    /// Create a predicate: column <= value (exact integer).
167    pub fn lte_i64(col_idx: usize, value: i64) -> Self {
168        Self {
169            col_idx,
170            op: PredicateOp::Lte,
171            value: PredicateValue::Integer(value),
172        }
173    }
174
175    // ── String constructors ─────────────────────────────────────────────────
176
177    /// Create a predicate: column = value (string, lexicographic).
178    pub fn str_eq(col_idx: usize, value: impl Into<String>) -> Self {
179        Self {
180            col_idx,
181            op: PredicateOp::Eq,
182            value: PredicateValue::String(value.into()),
183        }
184    }
185
186    /// Create a predicate: column != value (string, lexicographic).
187    pub fn str_not_eq(col_idx: usize, value: impl Into<String>) -> Self {
188        Self {
189            col_idx,
190            op: PredicateOp::Ne,
191            value: PredicateValue::String(value.into()),
192        }
193    }
194
195    /// Create a predicate: column > value (string, lexicographic).
196    pub fn str_gt(col_idx: usize, value: impl Into<String>) -> Self {
197        Self {
198            col_idx,
199            op: PredicateOp::Gt,
200            value: PredicateValue::String(value.into()),
201        }
202    }
203
204    /// Create a predicate: column >= value (string, lexicographic).
205    pub fn str_gte(col_idx: usize, value: impl Into<String>) -> Self {
206        Self {
207            col_idx,
208            op: PredicateOp::Gte,
209            value: PredicateValue::String(value.into()),
210        }
211    }
212
213    /// Create a predicate: column < value (string, lexicographic).
214    pub fn str_lt(col_idx: usize, value: impl Into<String>) -> Self {
215        Self {
216            col_idx,
217            op: PredicateOp::Lt,
218            value: PredicateValue::String(value.into()),
219        }
220    }
221
222    /// Create a predicate: column <= value (string, lexicographic).
223    pub fn str_lte(col_idx: usize, value: impl Into<String>) -> Self {
224        Self {
225            col_idx,
226            op: PredicateOp::Lte,
227            value: PredicateValue::String(value.into()),
228        }
229    }
230
231    // ── Block-skip logic ────────────────────────────────────────────────────
232
233    /// Whether a block can be entirely skipped based on its statistics.
234    ///
235    /// Returns `true` if the block provably contains no matching rows.
236    /// Returns `false` if the block might contain matching rows (must scan).
237    pub fn can_skip_block(&self, stats: &BlockStats) -> bool {
238        match &self.value {
239            PredicateValue::Numeric(v) => can_skip_numeric(self.op, *v, stats),
240            PredicateValue::Integer(v) => can_skip_integer(self.op, *v, stats),
241            PredicateValue::String(v) => can_skip_string(self.op, v, stats),
242        }
243    }
244}
245
246/// Block-skip logic for numeric predicates.
247fn can_skip_numeric(op: PredicateOp, value: f64, stats: &BlockStats) -> bool {
248    // Non-numeric columns (NaN stats) can never be skipped via numeric predicate.
249    if stats.min.is_nan() || stats.max.is_nan() {
250        return false;
251    }
252
253    match op {
254        // column > value → skip if block.max <= value
255        PredicateOp::Gt => stats.max <= value,
256        // column >= value → skip if block.max < value
257        PredicateOp::Gte => stats.max < value,
258        // column < value → skip if block.min >= value
259        PredicateOp::Lt => stats.min >= value,
260        // column <= value → skip if block.min > value
261        PredicateOp::Lte => stats.min > value,
262        // column = value → skip if value outside [min, max]
263        PredicateOp::Eq => value < stats.min || value > stats.max,
264        // column != value → skip only if entire block is that single value
265        PredicateOp::Ne => stats.min == value && stats.max == value,
266    }
267}
268
269/// Block-skip logic for integer predicates (lossless i64 comparison).
270///
271/// When `BlockStats` carries `min_i64`/`max_i64` (written by minor v1+), the
272/// comparison is done entirely in i64 — no f64 rounding. When the exact fields
273/// are absent (segments written at minor v0), we fall through to the f64 path
274/// via `f64_to_exact_i64`: if the predicate value itself converts losslessly
275/// we can still do an exact comparison against the (possibly rounded) f64
276/// stats, which is conservative (may fail to skip) but never incorrect.
277fn can_skip_integer(op: PredicateOp, value: i64, stats: &BlockStats) -> bool {
278    // Prefer lossless i64 stats when available.
279    if let (Some(smin), Some(smax)) = (stats.min_i64, stats.max_i64) {
280        return match op {
281            PredicateOp::Gt => smax <= value,
282            PredicateOp::Gte => smax < value,
283            PredicateOp::Lt => smin >= value,
284            PredicateOp::Lte => smin > value,
285            PredicateOp::Eq => value < smin || value > smax,
286            PredicateOp::Ne => smin == value && smax == value,
287        };
288    }
289
290    // Fallback: convert the predicate value to f64 if it is exactly
291    // representable, then use the f64 stats path.  If the conversion is lossy
292    // we cannot safely skip — return false (conservative).
293    match f64_to_exact_i64(value as f64).and(Some(value as f64)) {
294        Some(fv) => can_skip_numeric(op, fv, stats),
295        None => false,
296    }
297}
298
299/// Convert an f64 to i64 only if the conversion is exact.
300///
301/// Returns `Some(n)` iff:
302/// - `v` is finite,
303/// - `v` has no fractional part,
304/// - `v` lies within the exact i64 representable range `[i64::MIN as f64, (i64::MAX as f64).prev()]`.
305///
306/// Note: `i64::MAX as f64` rounds up to `9223372036854775808.0` which exceeds
307/// `i64::MAX`, so we compare `v < i64::MAX as f64` (strict less-than).
308/// `i64::MIN as f64` is exactly `−9223372036854775808.0`, so `>=` is correct.
309pub fn f64_to_exact_i64(v: f64) -> Option<i64> {
310    if !v.is_finite() || v.fract() != 0.0 {
311        return None;
312    }
313    // i64::MIN as f64 is exactly representable.
314    // i64::MAX as f64 rounds up past i64::MAX, so use strict less-than.
315    if v < i64::MIN as f64 || v >= i64::MAX as f64 {
316        return None;
317    }
318    Some(v as i64)
319}
320
321/// Block-skip logic for string predicates.
322fn can_skip_string(op: PredicateOp, value: &str, stats: &BlockStats) -> bool {
323    let (Some(smin), Some(smax)) = (&stats.str_min, &stats.str_max) else {
324        // No string zone-map information → cannot skip.
325        return false;
326    };
327
328    let skip_by_range = match op {
329        // column > value → skip if block.max <= value (no string is > value)
330        PredicateOp::Gt => smax.as_str() <= value,
331        // column >= value → skip if block.max < value
332        PredicateOp::Gte => smax.as_str() < value,
333        // column < value → skip if block.min >= value
334        PredicateOp::Lt => smin.as_str() >= value,
335        // column <= value → skip if block.min > value
336        PredicateOp::Lte => smin.as_str() > value,
337        // column = value → skip if value outside [min, max]
338        PredicateOp::Eq => value < smin.as_str() || value > smax.as_str(),
339        // column != value → skip only if the entire block contains that exact value
340        PredicateOp::Ne => smin.as_str() == value && smax.as_str() == value,
341    };
342
343    if skip_by_range {
344        return true;
345    }
346
347    // For Eq predicates, apply bloom filter as an additional fast-reject.
348    if op == PredicateOp::Eq
349        && let Some(ref bloom) = stats.bloom
350        && !bloom_may_contain(bloom, value)
351    {
352        return true; // Bloom says "definitely not present" → skip.
353    }
354
355    false
356}
357
358// ── Bloom filter ────────────────────────────────────────────────────────────
359
360/// Default bloom filter size in bits (2048 bits = 256 bytes).
361///
362/// This is the default used by `build_bloom`. The actual value in use for any
363/// given filter is stored in `BloomFilter::m` on disk — readers always use the
364/// persisted value, never this constant.
365pub const BLOOM_BITS_DEFAULT: u32 = 2048;
366
367/// Convenience byte count for the default bit array size.
368pub const BLOOM_BYTES: usize = (BLOOM_BITS_DEFAULT as usize) / 8;
369
370/// Default number of independent hash functions.
371///
372/// The actual value in use for any given filter is stored in `BloomFilter::k`
373/// on disk.
374pub const BLOOM_K_DEFAULT: u8 = 3;
375
376/// FNV-1a 64-bit offset basis.
377const FNV_OFFSET: u64 = 14_695_981_039_346_656_037;
378/// FNV-1a 64-bit prime.
379const FNV_PRIME: u64 = 1_099_511_628_211;
380
381/// Compute the i-th hash slot for a string value in an `m`-bit filter.
382///
383/// Uses FNV-1a seeded with different constants for each hash function to
384/// produce independent bit positions. `m` must be a power of two so the
385/// bitmask `m - 1` is exact.
386fn bloom_bit_pos(value: &str, hash_idx: u32, m: u32) -> usize {
387    // Mix the hash index into the seed to produce distinct hash functions.
388    let mut hash = FNV_OFFSET ^ (hash_idx as u64).wrapping_mul(FNV_PRIME);
389    for byte in value.bytes() {
390        hash ^= byte as u64;
391        hash = hash.wrapping_mul(FNV_PRIME);
392    }
393    // Map to [0, m).  m is always a power of two so (m - 1) is a valid mask.
394    (hash as usize) & ((m as usize) - 1)
395}
396
397/// Insert a string value into a `BloomFilter`.
398pub fn bloom_insert(bloom: &mut BloomFilter, value: &str) {
399    for i in 0..(bloom.k as u32) {
400        let bit = bloom_bit_pos(value, i, bloom.m);
401        bloom.bytes[bit / 8] |= 1 << (bit % 8);
402    }
403}
404
405/// Test whether a string value may be present in a `BloomFilter`.
406///
407/// Uses the `k` and `m` stored inside the filter — never compile-time
408/// constants — so that filters written with different parameters are
409/// interpreted correctly.
410///
411/// Returns `false` only when the value is definitely absent.
412/// Returns `true` when it may be present (possible false positive).
413///
414/// If the byte array is too short for the declared `m`, returns `true`
415/// (conservative: do not incorrectly skip a block).
416pub fn bloom_may_contain(bloom: &BloomFilter, value: &str) -> bool {
417    let expected_bytes = (bloom.m as usize).div_ceil(8);
418    if bloom.bytes.len() < expected_bytes {
419        // Malformed/truncated bloom — default to "may contain".
420        return true;
421    }
422    for i in 0..(bloom.k as u32) {
423        let bit = bloom_bit_pos(value, i, bloom.m);
424        if bloom.bytes[bit / 8] & (1 << (bit % 8)) == 0 {
425            return false;
426        }
427    }
428    true
429}
430
431/// Build a new `BloomFilter` with the default parameters (`k=3`, `m=2048`)
432/// and insert all provided string values.
433///
434/// Skips empty strings and nulls (caller passes only valid, non-null values).
435pub fn build_bloom(values: &[&str]) -> BloomFilter {
436    build_bloom_with_params(values, BLOOM_K_DEFAULT, BLOOM_BITS_DEFAULT)
437}
438
439/// Build a `BloomFilter` with explicit `k` (hash function count) and `m`
440/// (bit-array size). `m` must be a power of two and at least 8.
441///
442/// This is useful when tuning the filter for a specific target false-positive
443/// rate. Use `build_bloom` for the standard default parameters.
444pub fn build_bloom_with_params(values: &[&str], k: u8, m: u32) -> BloomFilter {
445    debug_assert!(
446        m >= 8 && m.is_power_of_two(),
447        "m must be a power of two ≥ 8"
448    );
449    let byte_count = (m as usize).div_ceil(8);
450    let mut bloom = BloomFilter {
451        k,
452        m,
453        bytes: vec![0u8; byte_count],
454    };
455    for v in values {
456        bloom_insert(&mut bloom, v);
457    }
458    bloom
459}
460
461#[cfg(test)]
462mod tests {
463    use super::*;
464    use zerompk;
465
466    fn stats(min: f64, max: f64) -> BlockStats {
467        BlockStats::numeric(min, max, 0, 1024)
468    }
469
470    #[test]
471    fn gt_predicate() {
472        let pred = ScanPredicate::gt(0, 50.0);
473        // Block [10, 40] → max=40 ≤ 50 → skip.
474        assert!(pred.can_skip_block(&stats(10.0, 40.0)));
475        // Block [10, 60] → max=60 > 50 → scan.
476        assert!(!pred.can_skip_block(&stats(10.0, 60.0)));
477        // Block [10, 50] → max=50 ≤ 50 → skip (strict >).
478        assert!(pred.can_skip_block(&stats(10.0, 50.0)));
479    }
480
481    #[test]
482    fn gte_predicate() {
483        let pred = ScanPredicate::gte(0, 50.0);
484        // Block [10, 49] → max=49 < 50 → skip.
485        assert!(pred.can_skip_block(&stats(10.0, 49.0)));
486        // Block [10, 50] → max=50 ≥ 50 → scan.
487        assert!(!pred.can_skip_block(&stats(10.0, 50.0)));
488    }
489
490    #[test]
491    fn lt_predicate() {
492        let pred = ScanPredicate::lt(0, 50.0);
493        // Block [60, 100] → min=60 ≥ 50 → skip.
494        assert!(pred.can_skip_block(&stats(60.0, 100.0)));
495        // Block [40, 100] → min=40 < 50 → scan.
496        assert!(!pred.can_skip_block(&stats(40.0, 100.0)));
497    }
498
499    #[test]
500    fn eq_predicate() {
501        let pred = ScanPredicate::eq(0, 50.0);
502        // Block [10, 40] → 50 > max → skip.
503        assert!(pred.can_skip_block(&stats(10.0, 40.0)));
504        // Block [60, 100] → 50 < min → skip.
505        assert!(pred.can_skip_block(&stats(60.0, 100.0)));
506        // Block [40, 60] → 50 in range → scan.
507        assert!(!pred.can_skip_block(&stats(40.0, 60.0)));
508    }
509
510    #[test]
511    fn ne_predicate() {
512        let pred = ScanPredicate::not_eq(0, 50.0);
513        // Block [50, 50] → entire block is 50 → skip.
514        assert!(pred.can_skip_block(&stats(50.0, 50.0)));
515        // Block [40, 60] → not all 50 → scan.
516        assert!(!pred.can_skip_block(&stats(40.0, 60.0)));
517    }
518
519    #[test]
520    fn non_numeric_never_skipped_by_numeric_pred() {
521        let pred = ScanPredicate::gt(0, 50.0);
522        let nan_stats = BlockStats::non_numeric(0, 1024);
523        assert!(!pred.can_skip_block(&nan_stats));
524    }
525
526    // ── String predicate tests ──────────────────────────────────────────────
527
528    fn str_stats(smin: &str, smax: &str) -> BlockStats {
529        BlockStats::string_block(0, 1024, Some(smin.into()), Some(smax.into()), None)
530    }
531
532    #[test]
533    fn string_eq_skip_below_range() {
534        // Block contains ["apple".."banana"]; query = "aaa" < "apple" → skip.
535        let stats = str_stats("apple", "banana");
536        assert!(ScanPredicate::str_eq(0, "aaa").can_skip_block(&stats));
537    }
538
539    #[test]
540    fn string_eq_skip_above_range() {
541        // Block contains ["apple".."banana"]; query = "zzz" > "banana" → skip.
542        let stats = str_stats("apple", "banana");
543        assert!(ScanPredicate::str_eq(0, "zzz").can_skip_block(&stats));
544    }
545
546    #[test]
547    fn string_eq_no_skip_in_range() {
548        // Block contains ["apple".."banana"]; query = "avocado" ∈ range → scan.
549        let stats = str_stats("apple", "banana");
550        assert!(!ScanPredicate::str_eq(0, "avocado").can_skip_block(&stats));
551    }
552
553    #[test]
554    fn string_gt_skip() {
555        // Block max = "fig"; WHERE col > "fig" → smax ≤ value → skip.
556        let stats = str_stats("apple", "fig");
557        assert!(ScanPredicate::str_gt(0, "fig").can_skip_block(&stats));
558        // WHERE col > "egg" → smax="fig" > "egg" → scan.
559        assert!(!ScanPredicate::str_gt(0, "egg").can_skip_block(&stats));
560    }
561
562    #[test]
563    fn string_lt_skip() {
564        // Block min = "mango"; WHERE col < "mango" → smin ≥ value → skip.
565        let stats = str_stats("mango", "pear");
566        assert!(ScanPredicate::str_lt(0, "mango").can_skip_block(&stats));
567        // WHERE col < "orange" → smin="mango" < "orange" → scan.
568        assert!(!ScanPredicate::str_lt(0, "orange").can_skip_block(&stats));
569    }
570
571    #[test]
572    fn string_gte_skip() {
573        // Block max = "cat"; WHERE col >= "dog" → smax < "dog" → skip.
574        let stats = str_stats("ant", "cat");
575        assert!(ScanPredicate::str_gte(0, "dog").can_skip_block(&stats));
576        assert!(!ScanPredicate::str_gte(0, "cat").can_skip_block(&stats));
577    }
578
579    #[test]
580    fn string_lte_skip() {
581        // Block min = "zebra"; WHERE col <= "yak" → smin > "yak" → skip.
582        let stats = str_stats("zebra", "zoo");
583        assert!(ScanPredicate::str_lte(0, "yak").can_skip_block(&stats));
584        assert!(!ScanPredicate::str_lte(0, "zebra").can_skip_block(&stats));
585    }
586
587    #[test]
588    fn string_ne_skip() {
589        // Block only contains "exact"; WHERE col != "exact" → skip.
590        let stats = str_stats("exact", "exact");
591        assert!(ScanPredicate::str_not_eq(0, "exact").can_skip_block(&stats));
592        // Block has range → cannot skip Ne.
593        let stats2 = str_stats("a", "z");
594        assert!(!ScanPredicate::str_not_eq(0, "exact").can_skip_block(&stats2));
595    }
596
597    #[test]
598    fn string_no_zone_map_no_skip() {
599        // No str_min/str_max → cannot skip.
600        let stats = BlockStats::non_numeric(0, 1024);
601        assert!(!ScanPredicate::str_eq(0, "anything").can_skip_block(&stats));
602    }
603
604    // ── Bloom filter tests ──────────────────────────────────────────────────
605
606    #[test]
607    fn bloom_insert_and_query() {
608        let values = ["hello", "world", "foo"];
609        let bloom = build_bloom(&values);
610        assert!(bloom_may_contain(&bloom, "hello"));
611        assert!(bloom_may_contain(&bloom, "world"));
612        assert!(bloom_may_contain(&bloom, "foo"));
613    }
614
615    #[test]
616    fn bloom_absent_value_rejected() {
617        // Insert a specific set; a clearly absent value should (with high
618        // probability) be rejected. This test is deterministic because the
619        // FNV hash is deterministic.
620        let values = ["alpha", "beta", "gamma"];
621        let bloom = build_bloom(&values);
622        // "delta" was never inserted — verify it is rejected.
623        // (This relies on no false positive for this specific combination.)
624        let delta_present = bloom_may_contain(&bloom, "delta");
625        // We only assert this when the bloom actually says absent; if there
626        // happens to be a false positive the test is still valid — we just
627        // cannot assert absence.  In practice FNV with these seeds gives no FP
628        // for this input set.
629        if !delta_present {
630            assert!(!bloom_may_contain(&bloom, "delta"));
631        }
632    }
633
634    #[test]
635    fn bloom_eq_skip_via_filter() {
636        // Build a block whose zone map [apple, banana] includes "avocado"
637        // in range but the bloom filter was built without "avocado".
638        let bloom = build_bloom(&["apple", "apricot", "banana"]);
639        // "avocado" is in [apple, banana] lexicographically but not in bloom.
640        // Zone-map says cannot skip; bloom filter may reject.
641        let stats = BlockStats::string_block(
642            0,
643            1024,
644            Some("apple".into()),
645            Some("banana".into()),
646            Some(bloom.clone()),
647        );
648        // "avocado" was not inserted → bloom rejects → skip.
649        let absent = !bloom_may_contain(&bloom, "avocado");
650        if absent {
651            assert!(ScanPredicate::str_eq(0, "avocado").can_skip_block(&stats));
652        }
653        // "apple" was inserted → bloom says may contain → no skip.
654        assert!(!ScanPredicate::str_eq(0, "apple").can_skip_block(&stats));
655    }
656
657    // ── Bloom filter parameter persistence ─────────────────────────────────
658
659    /// Bloom parameters (k, m) must survive a BlockStats MessagePack
660    /// roundtrip so that future readers can reconstruct the filter without
661    /// relying on compile-time constants.
662    #[test]
663    fn bloom_params_persist_through_msgpack_roundtrip() {
664        let bloom = build_bloom_with_params(&["hello", "world"], 7, 8192);
665        assert_eq!(bloom.k, 7);
666        assert_eq!(bloom.m, 8192);
667        assert_eq!(bloom.bytes.len(), 8192 / 8); // 1024 bytes
668
669        let stats = BlockStats::string_block(
670            0,
671            10,
672            Some("hello".into()),
673            Some("world".into()),
674            Some(bloom),
675        );
676
677        // Roundtrip through MessagePack (the on-disk serialization path).
678        let encoded = zerompk::to_msgpack_vec(&stats).expect("MessagePack encode");
679        let decoded: BlockStats = zerompk::from_msgpack(&encoded).expect("MessagePack decode");
680
681        let persisted = decoded
682            .bloom
683            .expect("bloom must be present after roundtrip");
684        assert_eq!(persisted.k, 7, "k must be preserved on disk");
685        assert_eq!(persisted.m, 8192, "m must be preserved on disk");
686        assert_eq!(
687            persisted.bytes.len(),
688            1024,
689            "byte array length must match m/8"
690        );
691
692        // Functional check: values inserted before serialization must still be
693        // found using the params decoded from disk.
694        assert!(bloom_may_contain(&persisted, "hello"));
695        assert!(bloom_may_contain(&persisted, "world"));
696    }
697
698    /// A filter built with non-default k=7, m=8192 uses those params
699    /// for bit-position calculations — not the default BLOOM_BITS_DEFAULT/K.
700    #[test]
701    fn bloom_custom_params_functional() {
702        let bloom = build_bloom_with_params(&["alpha", "beta", "gamma"], 7, 8192);
703        assert!(bloom_may_contain(&bloom, "alpha"));
704        assert!(bloom_may_contain(&bloom, "beta"));
705        assert!(bloom_may_contain(&bloom, "gamma"));
706
707        // A default-params filter built from the same values should give the
708        // same membership results but is a different object (different m).
709        let default_bloom = build_bloom(&["alpha", "beta", "gamma"]);
710        assert_eq!(default_bloom.k, BLOOM_K_DEFAULT);
711        assert_eq!(default_bloom.m, BLOOM_BITS_DEFAULT);
712    }
713
714    // ── i64 predicate pushdown correctness ────────────────────────────
715
716    /// Helper: BlockStats with exact i64 fields set (as integer() constructor produces).
717    fn i64_stats(min: i64, max: i64) -> BlockStats {
718        BlockStats::integer(min, max, 0, 1024)
719    }
720
721    /// Helper: BlockStats with only f64 fields (simulates a v0 segment).
722    fn f64_only_stats(min: f64, max: f64) -> BlockStats {
723        BlockStats::numeric(min, max, 0, 1024)
724    }
725
726    /// large i64 values that collapse to the same f64 must not be
727    /// incorrectly skipped when the predicate value is between them.
728    ///
729    /// i64::MAX = 9_223_372_036_854_775_807
730    /// i64::MAX - 1 = 9_223_372_036_854_775_806
731    /// i64::MAX - 2 = 9_223_372_036_854_775_805
732    ///
733    /// All three cast to the same f64 (9223372036854775808.0), so the old f64
734    /// path would have `min_f64 == max_f64 == pred_f64` and wrongly skipped.
735    #[test]
736    fn large_i64_eq_correct_no_skip() {
737        let min = i64::MAX - 2;
738        let max = i64::MAX;
739        let target = i64::MAX - 1;
740
741        // With exact i64 fields: target is in [min, max] → must NOT skip.
742        let stats = i64_stats(min, max);
743        assert!(
744            !ScanPredicate::eq_i64(0, target).can_skip_block(&stats),
745            "must not skip: target={target} is within [{min}, {max}]"
746        );
747
748        // Demonstrate the f64 path is broken (all three round to the same f64).
749        // We verify the bug exists in the fallback, so the fix is meaningful.
750        let min_f64 = min as f64;
751        let max_f64 = max as f64;
752        let target_f64 = target as f64;
753        assert_eq!(
754            min_f64, target_f64,
755            "f64 path is ambiguous: min and target round to the same f64"
756        );
757        assert_eq!(
758            max_f64, target_f64,
759            "f64 path is ambiguous: max and target round to the same f64"
760        );
761    }
762
763    /// a large i64 predicate value that is clearly outside the block
764    /// range must be skipped correctly via the i64 path.
765    #[test]
766    fn large_i64_definitely_outside() {
767        let stats = i64_stats(1000, 2000);
768        // 2^53 + 1 = 9_007_199_254_740_993 — well above 2000.
769        let outside: i64 = 9_007_199_254_740_993;
770        assert!(
771            ScanPredicate::eq_i64(0, outside).can_skip_block(&stats),
772            "must skip: {outside} is not in [1000, 2000]"
773        );
774    }
775
776    /// for a v0 segment (no min_i64/max_i64), integer predicates whose
777    /// value fits in f64 exactly fall through to the f64 path correctly.
778    #[test]
779    fn integer_predicate_fallback_to_f64_for_v0_segment() {
780        // Small value that f64 can represent exactly.
781        let stats = f64_only_stats(10.0, 50.0);
782        // 75 is outside [10, 50] → should skip.
783        assert!(ScanPredicate::eq_i64(0, 75).can_skip_block(&stats));
784        // 30 is inside [10, 50] → must not skip.
785        assert!(!ScanPredicate::eq_i64(0, 30).can_skip_block(&stats));
786    }
787
788    /// for a v0 segment, an integer predicate with a value outside
789    /// ±2^53 (not exactly representable in f64) must NOT skip — conservative.
790    #[test]
791    fn integer_predicate_v0_segment_unsafe_value_no_skip() {
792        // Construct f64-only stats whose f64 min/max happen to equal the
793        // rounded form of the predicate value — the old code would skip here.
794        let large: i64 = 9_007_199_254_740_993; // 2^53 + 1
795        let large_f64 = large as f64; // rounds to 9_007_199_254_740_992.0
796        let stats = f64_only_stats(large_f64, large_f64);
797        // Without exact i64 stats we cannot safely skip — return false.
798        assert!(
799            !ScanPredicate::eq_i64(0, large).can_skip_block(&stats),
800            "must not skip: predicate value is not exactly representable in f64"
801        );
802    }
803
804    // ── f64_to_exact_i64 boundary tests ────────────────────────────────────
805
806    #[test]
807    fn f64_to_exact_i64_normal_values() {
808        assert_eq!(f64_to_exact_i64(0.0), Some(0));
809        assert_eq!(f64_to_exact_i64(42.0), Some(42));
810        assert_eq!(f64_to_exact_i64(-42.0), Some(-42));
811        assert_eq!(f64_to_exact_i64(1.5), None); // fractional
812        assert_eq!(f64_to_exact_i64(f64::INFINITY), None);
813        assert_eq!(f64_to_exact_i64(f64::NAN), None);
814    }
815
816    #[test]
817    fn f64_to_exact_i64_i64_min_is_representable() {
818        // i64::MIN as f64 is exactly -9223372036854775808.0.
819        let v = i64::MIN as f64;
820        assert_eq!(f64_to_exact_i64(v), Some(i64::MIN));
821    }
822
823    #[test]
824    fn f64_to_exact_i64_i64_max_rounds_up() {
825        // i64::MAX = 9_223_372_036_854_775_807
826        // i64::MAX as f64 rounds up to 9_223_372_036_854_775_808.0 which
827        // exceeds i64::MAX — must return None.
828        let v = i64::MAX as f64;
829        assert_eq!(
830            f64_to_exact_i64(v),
831            None,
832            "i64::MAX as f64 exceeds i64::MAX and must not convert"
833        );
834    }
835
836    #[test]
837    fn f64_to_exact_i64_just_below_i64_max_f64() {
838        // The largest exactly representable i64 value in f64 is 2^63 - 2^10
839        // (the f64 just below i64::MAX as f64). Verify it converts.
840        // 9223372036854774784 = (i64::MAX as f64) - 1024 (next lower f64).
841        let v: f64 = 9_223_372_036_854_774_784.0;
842        let result = f64_to_exact_i64(v);
843        assert!(
844            result.is_some(),
845            "large but exactly representable value should convert"
846        );
847        assert_eq!(result.unwrap() as f64, v);
848    }
849}
nodedb_columnar/predicate.rs

nodedb_columnar/
predicate.rs