nodedb_columnar/predicate.rs
1// SPDX-License-Identifier: Apache-2.0
2
3//! Scan predicates for block-level predicate pushdown.
4//!
5//! A `ScanPredicate` describes a filter on a single column that can be
6//! evaluated against `BlockStats` to skip entire blocks without decompressing.
7//!
8//! Predicates work for both numeric columns (comparing against `BlockStats.min`
9//! / `BlockStats.max`) and string columns (comparing against
10//! `BlockStats.str_min` / `BlockStats.str_max`). For string Eq predicates the
11//! optional bloom filter provides an additional fast-reject path.
12
13use crate::format::{BlockStats, BloomFilter};
14
15/// The value side of a scan predicate.
16#[derive(Debug, Clone)]
17#[non_exhaustive]
18pub enum PredicateValue {
19 /// A numeric threshold (f64 for float columns, or integral f64 for i64 columns
20 /// whose value fits within ±2^53 exactly).
21 Numeric(f64),
22 /// An exact integer threshold for i64/timestamp columns.
23 ///
24 /// When the planner knows the predicate value is an integer it should use
25 /// this variant so that values outside ±2^53 are compared losslessly
26 /// against `BlockStats.min_i64`/`max_i64`.
27 Integer(i64),
28 /// A string threshold for lexicographic comparison.
29 String(String),
30}
31
32/// A predicate on a single column for block-level pushdown.
33#[derive(Debug, Clone)]
34pub struct ScanPredicate {
35 /// Column index in the schema.
36 pub col_idx: usize,
37 /// The comparison operation.
38 pub op: PredicateOp,
39 /// The threshold value.
40 pub value: PredicateValue,
41}
42
43/// Comparison operator for scan predicates.
44#[derive(Debug, Clone, Copy, PartialEq, Eq)]
45#[non_exhaustive]
46pub enum PredicateOp {
47 /// `column > value`
48 Gt,
49 /// `column >= value`
50 Gte,
51 /// `column < value`
52 Lt,
53 /// `column <= value`
54 Lte,
55 /// `column = value`
56 Eq,
57 /// `column != value`
58 Ne,
59}
60
61impl ScanPredicate {
62 // ── Numeric constructors ────────────────────────────────────────────────
63
64 /// Create a predicate: column > value (numeric).
65 pub fn gt(col_idx: usize, value: f64) -> Self {
66 Self {
67 col_idx,
68 op: PredicateOp::Gt,
69 value: PredicateValue::Numeric(value),
70 }
71 }
72
73 /// Create a predicate: column >= value (numeric).
74 pub fn gte(col_idx: usize, value: f64) -> Self {
75 Self {
76 col_idx,
77 op: PredicateOp::Gte,
78 value: PredicateValue::Numeric(value),
79 }
80 }
81
82 /// Create a predicate: column < value (numeric).
83 pub fn lt(col_idx: usize, value: f64) -> Self {
84 Self {
85 col_idx,
86 op: PredicateOp::Lt,
87 value: PredicateValue::Numeric(value),
88 }
89 }
90
91 /// Create a predicate: column <= value (numeric).
92 pub fn lte(col_idx: usize, value: f64) -> Self {
93 Self {
94 col_idx,
95 op: PredicateOp::Lte,
96 value: PredicateValue::Numeric(value),
97 }
98 }
99
100 /// Create a predicate: column = value (numeric).
101 pub fn eq(col_idx: usize, value: f64) -> Self {
102 Self {
103 col_idx,
104 op: PredicateOp::Eq,
105 value: PredicateValue::Numeric(value),
106 }
107 }
108
109 /// Create a predicate: column != value (numeric).
110 /// Named `not_eq` to avoid conflict with `PartialEq::ne`.
111 pub fn not_eq(col_idx: usize, value: f64) -> Self {
112 Self {
113 col_idx,
114 op: PredicateOp::Ne,
115 value: PredicateValue::Numeric(value),
116 }
117 }
118
119 // ── Integer constructors (lossless i64 predicates) ─────────────────────
120
121 /// Create a predicate: column = value (exact integer, lossless for i64/timestamp).
122 pub fn eq_i64(col_idx: usize, value: i64) -> Self {
123 Self {
124 col_idx,
125 op: PredicateOp::Eq,
126 value: PredicateValue::Integer(value),
127 }
128 }
129
130 /// Create a predicate: column != value (exact integer).
131 pub fn not_eq_i64(col_idx: usize, value: i64) -> Self {
132 Self {
133 col_idx,
134 op: PredicateOp::Ne,
135 value: PredicateValue::Integer(value),
136 }
137 }
138
139 /// Create a predicate: column > value (exact integer).
140 pub fn gt_i64(col_idx: usize, value: i64) -> Self {
141 Self {
142 col_idx,
143 op: PredicateOp::Gt,
144 value: PredicateValue::Integer(value),
145 }
146 }
147
148 /// Create a predicate: column >= value (exact integer).
149 pub fn gte_i64(col_idx: usize, value: i64) -> Self {
150 Self {
151 col_idx,
152 op: PredicateOp::Gte,
153 value: PredicateValue::Integer(value),
154 }
155 }
156
157 /// Create a predicate: column < value (exact integer).
158 pub fn lt_i64(col_idx: usize, value: i64) -> Self {
159 Self {
160 col_idx,
161 op: PredicateOp::Lt,
162 value: PredicateValue::Integer(value),
163 }
164 }
165
166 /// Create a predicate: column <= value (exact integer).
167 pub fn lte_i64(col_idx: usize, value: i64) -> Self {
168 Self {
169 col_idx,
170 op: PredicateOp::Lte,
171 value: PredicateValue::Integer(value),
172 }
173 }
174
175 // ── String constructors ─────────────────────────────────────────────────
176
177 /// Create a predicate: column = value (string, lexicographic).
178 pub fn str_eq(col_idx: usize, value: impl Into<String>) -> Self {
179 Self {
180 col_idx,
181 op: PredicateOp::Eq,
182 value: PredicateValue::String(value.into()),
183 }
184 }
185
186 /// Create a predicate: column != value (string, lexicographic).
187 pub fn str_not_eq(col_idx: usize, value: impl Into<String>) -> Self {
188 Self {
189 col_idx,
190 op: PredicateOp::Ne,
191 value: PredicateValue::String(value.into()),
192 }
193 }
194
195 /// Create a predicate: column > value (string, lexicographic).
196 pub fn str_gt(col_idx: usize, value: impl Into<String>) -> Self {
197 Self {
198 col_idx,
199 op: PredicateOp::Gt,
200 value: PredicateValue::String(value.into()),
201 }
202 }
203
204 /// Create a predicate: column >= value (string, lexicographic).
205 pub fn str_gte(col_idx: usize, value: impl Into<String>) -> Self {
206 Self {
207 col_idx,
208 op: PredicateOp::Gte,
209 value: PredicateValue::String(value.into()),
210 }
211 }
212
213 /// Create a predicate: column < value (string, lexicographic).
214 pub fn str_lt(col_idx: usize, value: impl Into<String>) -> Self {
215 Self {
216 col_idx,
217 op: PredicateOp::Lt,
218 value: PredicateValue::String(value.into()),
219 }
220 }
221
222 /// Create a predicate: column <= value (string, lexicographic).
223 pub fn str_lte(col_idx: usize, value: impl Into<String>) -> Self {
224 Self {
225 col_idx,
226 op: PredicateOp::Lte,
227 value: PredicateValue::String(value.into()),
228 }
229 }
230
231 // ── Block-skip logic ────────────────────────────────────────────────────
232
233 /// Whether a block can be entirely skipped based on its statistics.
234 ///
235 /// Returns `true` if the block provably contains no matching rows.
236 /// Returns `false` if the block might contain matching rows (must scan).
237 pub fn can_skip_block(&self, stats: &BlockStats) -> bool {
238 match &self.value {
239 PredicateValue::Numeric(v) => can_skip_numeric(self.op, *v, stats),
240 PredicateValue::Integer(v) => can_skip_integer(self.op, *v, stats),
241 PredicateValue::String(v) => can_skip_string(self.op, v, stats),
242 }
243 }
244}
245
246/// Block-skip logic for numeric predicates.
247fn can_skip_numeric(op: PredicateOp, value: f64, stats: &BlockStats) -> bool {
248 // Non-numeric columns (NaN stats) can never be skipped via numeric predicate.
249 if stats.min.is_nan() || stats.max.is_nan() {
250 return false;
251 }
252
253 match op {
254 // column > value → skip if block.max <= value
255 PredicateOp::Gt => stats.max <= value,
256 // column >= value → skip if block.max < value
257 PredicateOp::Gte => stats.max < value,
258 // column < value → skip if block.min >= value
259 PredicateOp::Lt => stats.min >= value,
260 // column <= value → skip if block.min > value
261 PredicateOp::Lte => stats.min > value,
262 // column = value → skip if value outside [min, max]
263 PredicateOp::Eq => value < stats.min || value > stats.max,
264 // column != value → skip only if entire block is that single value
265 PredicateOp::Ne => stats.min == value && stats.max == value,
266 }
267}
268
269/// Block-skip logic for integer predicates (lossless i64 comparison).
270///
271/// When `BlockStats` carries `min_i64`/`max_i64` (written by minor v1+), the
272/// comparison is done entirely in i64 — no f64 rounding. When the exact fields
273/// are absent (segments written at minor v0), we fall through to the f64 path
274/// via `f64_to_exact_i64`: if the predicate value itself converts losslessly
275/// we can still do an exact comparison against the (possibly rounded) f64
276/// stats, which is conservative (may fail to skip) but never incorrect.
277fn can_skip_integer(op: PredicateOp, value: i64, stats: &BlockStats) -> bool {
278 // Prefer lossless i64 stats when available.
279 if let (Some(smin), Some(smax)) = (stats.min_i64, stats.max_i64) {
280 return match op {
281 PredicateOp::Gt => smax <= value,
282 PredicateOp::Gte => smax < value,
283 PredicateOp::Lt => smin >= value,
284 PredicateOp::Lte => smin > value,
285 PredicateOp::Eq => value < smin || value > smax,
286 PredicateOp::Ne => smin == value && smax == value,
287 };
288 }
289
290 // Fallback: convert the predicate value to f64 if it is exactly
291 // representable, then use the f64 stats path. If the conversion is lossy
292 // we cannot safely skip — return false (conservative).
293 match f64_to_exact_i64(value as f64).and(Some(value as f64)) {
294 Some(fv) => can_skip_numeric(op, fv, stats),
295 None => false,
296 }
297}
298
299/// Convert an f64 to i64 only if the conversion is exact.
300///
301/// Returns `Some(n)` iff:
302/// - `v` is finite,
303/// - `v` has no fractional part,
304/// - `v` lies within the exact i64 representable range `[i64::MIN as f64, (i64::MAX as f64).prev()]`.
305///
306/// Note: `i64::MAX as f64` rounds up to `9223372036854775808.0` which exceeds
307/// `i64::MAX`, so we compare `v < i64::MAX as f64` (strict less-than).
308/// `i64::MIN as f64` is exactly `−9223372036854775808.0`, so `>=` is correct.
309pub fn f64_to_exact_i64(v: f64) -> Option<i64> {
310 if !v.is_finite() || v.fract() != 0.0 {
311 return None;
312 }
313 // i64::MIN as f64 is exactly representable.
314 // i64::MAX as f64 rounds up past i64::MAX, so use strict less-than.
315 if v < i64::MIN as f64 || v >= i64::MAX as f64 {
316 return None;
317 }
318 Some(v as i64)
319}
320
321/// Block-skip logic for string predicates.
322fn can_skip_string(op: PredicateOp, value: &str, stats: &BlockStats) -> bool {
323 let (Some(smin), Some(smax)) = (&stats.str_min, &stats.str_max) else {
324 // No string zone-map information → cannot skip.
325 return false;
326 };
327
328 let skip_by_range = match op {
329 // column > value → skip if block.max <= value (no string is > value)
330 PredicateOp::Gt => smax.as_str() <= value,
331 // column >= value → skip if block.max < value
332 PredicateOp::Gte => smax.as_str() < value,
333 // column < value → skip if block.min >= value
334 PredicateOp::Lt => smin.as_str() >= value,
335 // column <= value → skip if block.min > value
336 PredicateOp::Lte => smin.as_str() > value,
337 // column = value → skip if value outside [min, max]
338 PredicateOp::Eq => value < smin.as_str() || value > smax.as_str(),
339 // column != value → skip only if the entire block contains that exact value
340 PredicateOp::Ne => smin.as_str() == value && smax.as_str() == value,
341 };
342
343 if skip_by_range {
344 return true;
345 }
346
347 // For Eq predicates, apply bloom filter as an additional fast-reject.
348 if op == PredicateOp::Eq
349 && let Some(ref bloom) = stats.bloom
350 && !bloom_may_contain(bloom, value)
351 {
352 return true; // Bloom says "definitely not present" → skip.
353 }
354
355 false
356}
357
358// ── Bloom filter ────────────────────────────────────────────────────────────
359
360/// Default bloom filter size in bits (2048 bits = 256 bytes).
361///
362/// This is the default used by `build_bloom`. The actual value in use for any
363/// given filter is stored in `BloomFilter::m` on disk — readers always use the
364/// persisted value, never this constant.
365pub const BLOOM_BITS_DEFAULT: u32 = 2048;
366
367/// Convenience byte count for the default bit array size.
368pub const BLOOM_BYTES: usize = (BLOOM_BITS_DEFAULT as usize) / 8;
369
370/// Default number of independent hash functions.
371///
372/// The actual value in use for any given filter is stored in `BloomFilter::k`
373/// on disk.
374pub const BLOOM_K_DEFAULT: u8 = 3;
375
376/// FNV-1a 64-bit offset basis.
377const FNV_OFFSET: u64 = 14_695_981_039_346_656_037;
378/// FNV-1a 64-bit prime.
379const FNV_PRIME: u64 = 1_099_511_628_211;
380
381/// Compute the i-th hash slot for a string value in an `m`-bit filter.
382///
383/// Uses FNV-1a seeded with different constants for each hash function to
384/// produce independent bit positions. `m` must be a power of two so the
385/// bitmask `m - 1` is exact.
386fn bloom_bit_pos(value: &str, hash_idx: u32, m: u32) -> usize {
387 // Mix the hash index into the seed to produce distinct hash functions.
388 let mut hash = FNV_OFFSET ^ (hash_idx as u64).wrapping_mul(FNV_PRIME);
389 for byte in value.bytes() {
390 hash ^= byte as u64;
391 hash = hash.wrapping_mul(FNV_PRIME);
392 }
393 // Map to [0, m). m is always a power of two so (m - 1) is a valid mask.
394 (hash as usize) & ((m as usize) - 1)
395}
396
397/// Insert a string value into a `BloomFilter`.
398pub fn bloom_insert(bloom: &mut BloomFilter, value: &str) {
399 for i in 0..(bloom.k as u32) {
400 let bit = bloom_bit_pos(value, i, bloom.m);
401 bloom.bytes[bit / 8] |= 1 << (bit % 8);
402 }
403}
404
405/// Test whether a string value may be present in a `BloomFilter`.
406///
407/// Uses the `k` and `m` stored inside the filter — never compile-time
408/// constants — so that filters written with different parameters are
409/// interpreted correctly.
410///
411/// Returns `false` only when the value is definitely absent.
412/// Returns `true` when it may be present (possible false positive).
413///
414/// If the byte array is too short for the declared `m`, returns `true`
415/// (conservative: do not incorrectly skip a block).
416pub fn bloom_may_contain(bloom: &BloomFilter, value: &str) -> bool {
417 let expected_bytes = (bloom.m as usize).div_ceil(8);
418 if bloom.bytes.len() < expected_bytes {
419 // Malformed/truncated bloom — default to "may contain".
420 return true;
421 }
422 for i in 0..(bloom.k as u32) {
423 let bit = bloom_bit_pos(value, i, bloom.m);
424 if bloom.bytes[bit / 8] & (1 << (bit % 8)) == 0 {
425 return false;
426 }
427 }
428 true
429}
430
431/// Build a new `BloomFilter` with the default parameters (`k=3`, `m=2048`)
432/// and insert all provided string values.
433///
434/// Skips empty strings and nulls (caller passes only valid, non-null values).
435pub fn build_bloom(values: &[&str]) -> BloomFilter {
436 build_bloom_with_params(values, BLOOM_K_DEFAULT, BLOOM_BITS_DEFAULT)
437}
438
439/// Build a `BloomFilter` with explicit `k` (hash function count) and `m`
440/// (bit-array size). `m` must be a power of two and at least 8.
441///
442/// This is useful when tuning the filter for a specific target false-positive
443/// rate. Use `build_bloom` for the standard default parameters.
444pub fn build_bloom_with_params(values: &[&str], k: u8, m: u32) -> BloomFilter {
445 debug_assert!(
446 m >= 8 && m.is_power_of_two(),
447 "m must be a power of two ≥ 8"
448 );
449 let byte_count = (m as usize).div_ceil(8);
450 let mut bloom = BloomFilter {
451 k,
452 m,
453 bytes: vec![0u8; byte_count],
454 };
455 for v in values {
456 bloom_insert(&mut bloom, v);
457 }
458 bloom
459}
460
461#[cfg(test)]
462mod tests {
463 use super::*;
464 use zerompk;
465
466 fn stats(min: f64, max: f64) -> BlockStats {
467 BlockStats::numeric(min, max, 0, 1024)
468 }
469
470 #[test]
471 fn gt_predicate() {
472 let pred = ScanPredicate::gt(0, 50.0);
473 // Block [10, 40] → max=40 ≤ 50 → skip.
474 assert!(pred.can_skip_block(&stats(10.0, 40.0)));
475 // Block [10, 60] → max=60 > 50 → scan.
476 assert!(!pred.can_skip_block(&stats(10.0, 60.0)));
477 // Block [10, 50] → max=50 ≤ 50 → skip (strict >).
478 assert!(pred.can_skip_block(&stats(10.0, 50.0)));
479 }
480
481 #[test]
482 fn gte_predicate() {
483 let pred = ScanPredicate::gte(0, 50.0);
484 // Block [10, 49] → max=49 < 50 → skip.
485 assert!(pred.can_skip_block(&stats(10.0, 49.0)));
486 // Block [10, 50] → max=50 ≥ 50 → scan.
487 assert!(!pred.can_skip_block(&stats(10.0, 50.0)));
488 }
489
490 #[test]
491 fn lt_predicate() {
492 let pred = ScanPredicate::lt(0, 50.0);
493 // Block [60, 100] → min=60 ≥ 50 → skip.
494 assert!(pred.can_skip_block(&stats(60.0, 100.0)));
495 // Block [40, 100] → min=40 < 50 → scan.
496 assert!(!pred.can_skip_block(&stats(40.0, 100.0)));
497 }
498
499 #[test]
500 fn eq_predicate() {
501 let pred = ScanPredicate::eq(0, 50.0);
502 // Block [10, 40] → 50 > max → skip.
503 assert!(pred.can_skip_block(&stats(10.0, 40.0)));
504 // Block [60, 100] → 50 < min → skip.
505 assert!(pred.can_skip_block(&stats(60.0, 100.0)));
506 // Block [40, 60] → 50 in range → scan.
507 assert!(!pred.can_skip_block(&stats(40.0, 60.0)));
508 }
509
510 #[test]
511 fn ne_predicate() {
512 let pred = ScanPredicate::not_eq(0, 50.0);
513 // Block [50, 50] → entire block is 50 → skip.
514 assert!(pred.can_skip_block(&stats(50.0, 50.0)));
515 // Block [40, 60] → not all 50 → scan.
516 assert!(!pred.can_skip_block(&stats(40.0, 60.0)));
517 }
518
519 #[test]
520 fn non_numeric_never_skipped_by_numeric_pred() {
521 let pred = ScanPredicate::gt(0, 50.0);
522 let nan_stats = BlockStats::non_numeric(0, 1024);
523 assert!(!pred.can_skip_block(&nan_stats));
524 }
525
526 // ── String predicate tests ──────────────────────────────────────────────
527
528 fn str_stats(smin: &str, smax: &str) -> BlockStats {
529 BlockStats::string_block(0, 1024, Some(smin.into()), Some(smax.into()), None)
530 }
531
532 #[test]
533 fn string_eq_skip_below_range() {
534 // Block contains ["apple".."banana"]; query = "aaa" < "apple" → skip.
535 let stats = str_stats("apple", "banana");
536 assert!(ScanPredicate::str_eq(0, "aaa").can_skip_block(&stats));
537 }
538
539 #[test]
540 fn string_eq_skip_above_range() {
541 // Block contains ["apple".."banana"]; query = "zzz" > "banana" → skip.
542 let stats = str_stats("apple", "banana");
543 assert!(ScanPredicate::str_eq(0, "zzz").can_skip_block(&stats));
544 }
545
546 #[test]
547 fn string_eq_no_skip_in_range() {
548 // Block contains ["apple".."banana"]; query = "avocado" ∈ range → scan.
549 let stats = str_stats("apple", "banana");
550 assert!(!ScanPredicate::str_eq(0, "avocado").can_skip_block(&stats));
551 }
552
553 #[test]
554 fn string_gt_skip() {
555 // Block max = "fig"; WHERE col > "fig" → smax ≤ value → skip.
556 let stats = str_stats("apple", "fig");
557 assert!(ScanPredicate::str_gt(0, "fig").can_skip_block(&stats));
558 // WHERE col > "egg" → smax="fig" > "egg" → scan.
559 assert!(!ScanPredicate::str_gt(0, "egg").can_skip_block(&stats));
560 }
561
562 #[test]
563 fn string_lt_skip() {
564 // Block min = "mango"; WHERE col < "mango" → smin ≥ value → skip.
565 let stats = str_stats("mango", "pear");
566 assert!(ScanPredicate::str_lt(0, "mango").can_skip_block(&stats));
567 // WHERE col < "orange" → smin="mango" < "orange" → scan.
568 assert!(!ScanPredicate::str_lt(0, "orange").can_skip_block(&stats));
569 }
570
571 #[test]
572 fn string_gte_skip() {
573 // Block max = "cat"; WHERE col >= "dog" → smax < "dog" → skip.
574 let stats = str_stats("ant", "cat");
575 assert!(ScanPredicate::str_gte(0, "dog").can_skip_block(&stats));
576 assert!(!ScanPredicate::str_gte(0, "cat").can_skip_block(&stats));
577 }
578
579 #[test]
580 fn string_lte_skip() {
581 // Block min = "zebra"; WHERE col <= "yak" → smin > "yak" → skip.
582 let stats = str_stats("zebra", "zoo");
583 assert!(ScanPredicate::str_lte(0, "yak").can_skip_block(&stats));
584 assert!(!ScanPredicate::str_lte(0, "zebra").can_skip_block(&stats));
585 }
586
587 #[test]
588 fn string_ne_skip() {
589 // Block only contains "exact"; WHERE col != "exact" → skip.
590 let stats = str_stats("exact", "exact");
591 assert!(ScanPredicate::str_not_eq(0, "exact").can_skip_block(&stats));
592 // Block has range → cannot skip Ne.
593 let stats2 = str_stats("a", "z");
594 assert!(!ScanPredicate::str_not_eq(0, "exact").can_skip_block(&stats2));
595 }
596
597 #[test]
598 fn string_no_zone_map_no_skip() {
599 // No str_min/str_max → cannot skip.
600 let stats = BlockStats::non_numeric(0, 1024);
601 assert!(!ScanPredicate::str_eq(0, "anything").can_skip_block(&stats));
602 }
603
604 // ── Bloom filter tests ──────────────────────────────────────────────────
605
606 #[test]
607 fn bloom_insert_and_query() {
608 let values = ["hello", "world", "foo"];
609 let bloom = build_bloom(&values);
610 assert!(bloom_may_contain(&bloom, "hello"));
611 assert!(bloom_may_contain(&bloom, "world"));
612 assert!(bloom_may_contain(&bloom, "foo"));
613 }
614
615 #[test]
616 fn bloom_absent_value_rejected() {
617 // Insert a specific set; a clearly absent value should (with high
618 // probability) be rejected. This test is deterministic because the
619 // FNV hash is deterministic.
620 let values = ["alpha", "beta", "gamma"];
621 let bloom = build_bloom(&values);
622 // "delta" was never inserted — verify it is rejected.
623 // (This relies on no false positive for this specific combination.)
624 let delta_present = bloom_may_contain(&bloom, "delta");
625 // We only assert this when the bloom actually says absent; if there
626 // happens to be a false positive the test is still valid — we just
627 // cannot assert absence. In practice FNV with these seeds gives no FP
628 // for this input set.
629 if !delta_present {
630 assert!(!bloom_may_contain(&bloom, "delta"));
631 }
632 }
633
634 #[test]
635 fn bloom_eq_skip_via_filter() {
636 // Build a block whose zone map [apple, banana] includes "avocado"
637 // in range but the bloom filter was built without "avocado".
638 let bloom = build_bloom(&["apple", "apricot", "banana"]);
639 // "avocado" is in [apple, banana] lexicographically but not in bloom.
640 // Zone-map says cannot skip; bloom filter may reject.
641 let stats = BlockStats::string_block(
642 0,
643 1024,
644 Some("apple".into()),
645 Some("banana".into()),
646 Some(bloom.clone()),
647 );
648 // "avocado" was not inserted → bloom rejects → skip.
649 let absent = !bloom_may_contain(&bloom, "avocado");
650 if absent {
651 assert!(ScanPredicate::str_eq(0, "avocado").can_skip_block(&stats));
652 }
653 // "apple" was inserted → bloom says may contain → no skip.
654 assert!(!ScanPredicate::str_eq(0, "apple").can_skip_block(&stats));
655 }
656
657 // ── Bloom filter parameter persistence ─────────────────────────────────
658
659 /// Bloom parameters (k, m) must survive a BlockStats MessagePack
660 /// roundtrip so that future readers can reconstruct the filter without
661 /// relying on compile-time constants.
662 #[test]
663 fn bloom_params_persist_through_msgpack_roundtrip() {
664 let bloom = build_bloom_with_params(&["hello", "world"], 7, 8192);
665 assert_eq!(bloom.k, 7);
666 assert_eq!(bloom.m, 8192);
667 assert_eq!(bloom.bytes.len(), 8192 / 8); // 1024 bytes
668
669 let stats = BlockStats::string_block(
670 0,
671 10,
672 Some("hello".into()),
673 Some("world".into()),
674 Some(bloom),
675 );
676
677 // Roundtrip through MessagePack (the on-disk serialization path).
678 let encoded = zerompk::to_msgpack_vec(&stats).expect("MessagePack encode");
679 let decoded: BlockStats = zerompk::from_msgpack(&encoded).expect("MessagePack decode");
680
681 let persisted = decoded
682 .bloom
683 .expect("bloom must be present after roundtrip");
684 assert_eq!(persisted.k, 7, "k must be preserved on disk");
685 assert_eq!(persisted.m, 8192, "m must be preserved on disk");
686 assert_eq!(
687 persisted.bytes.len(),
688 1024,
689 "byte array length must match m/8"
690 );
691
692 // Functional check: values inserted before serialization must still be
693 // found using the params decoded from disk.
694 assert!(bloom_may_contain(&persisted, "hello"));
695 assert!(bloom_may_contain(&persisted, "world"));
696 }
697
698 /// A filter built with non-default k=7, m=8192 uses those params
699 /// for bit-position calculations — not the default BLOOM_BITS_DEFAULT/K.
700 #[test]
701 fn bloom_custom_params_functional() {
702 let bloom = build_bloom_with_params(&["alpha", "beta", "gamma"], 7, 8192);
703 assert!(bloom_may_contain(&bloom, "alpha"));
704 assert!(bloom_may_contain(&bloom, "beta"));
705 assert!(bloom_may_contain(&bloom, "gamma"));
706
707 // A default-params filter built from the same values should give the
708 // same membership results but is a different object (different m).
709 let default_bloom = build_bloom(&["alpha", "beta", "gamma"]);
710 assert_eq!(default_bloom.k, BLOOM_K_DEFAULT);
711 assert_eq!(default_bloom.m, BLOOM_BITS_DEFAULT);
712 }
713
714 // ── i64 predicate pushdown correctness ────────────────────────────
715
716 /// Helper: BlockStats with exact i64 fields set (as integer() constructor produces).
717 fn i64_stats(min: i64, max: i64) -> BlockStats {
718 BlockStats::integer(min, max, 0, 1024)
719 }
720
721 /// Helper: BlockStats with only f64 fields (simulates a v0 segment).
722 fn f64_only_stats(min: f64, max: f64) -> BlockStats {
723 BlockStats::numeric(min, max, 0, 1024)
724 }
725
726 /// large i64 values that collapse to the same f64 must not be
727 /// incorrectly skipped when the predicate value is between them.
728 ///
729 /// i64::MAX = 9_223_372_036_854_775_807
730 /// i64::MAX - 1 = 9_223_372_036_854_775_806
731 /// i64::MAX - 2 = 9_223_372_036_854_775_805
732 ///
733 /// All three cast to the same f64 (9223372036854775808.0), so the old f64
734 /// path would have `min_f64 == max_f64 == pred_f64` and wrongly skipped.
735 #[test]
736 fn large_i64_eq_correct_no_skip() {
737 let min = i64::MAX - 2;
738 let max = i64::MAX;
739 let target = i64::MAX - 1;
740
741 // With exact i64 fields: target is in [min, max] → must NOT skip.
742 let stats = i64_stats(min, max);
743 assert!(
744 !ScanPredicate::eq_i64(0, target).can_skip_block(&stats),
745 "must not skip: target={target} is within [{min}, {max}]"
746 );
747
748 // Demonstrate the f64 path is broken (all three round to the same f64).
749 // We verify the bug exists in the fallback, so the fix is meaningful.
750 let min_f64 = min as f64;
751 let max_f64 = max as f64;
752 let target_f64 = target as f64;
753 assert_eq!(
754 min_f64, target_f64,
755 "f64 path is ambiguous: min and target round to the same f64"
756 );
757 assert_eq!(
758 max_f64, target_f64,
759 "f64 path is ambiguous: max and target round to the same f64"
760 );
761 }
762
763 /// a large i64 predicate value that is clearly outside the block
764 /// range must be skipped correctly via the i64 path.
765 #[test]
766 fn large_i64_definitely_outside() {
767 let stats = i64_stats(1000, 2000);
768 // 2^53 + 1 = 9_007_199_254_740_993 — well above 2000.
769 let outside: i64 = 9_007_199_254_740_993;
770 assert!(
771 ScanPredicate::eq_i64(0, outside).can_skip_block(&stats),
772 "must skip: {outside} is not in [1000, 2000]"
773 );
774 }
775
776 /// for a v0 segment (no min_i64/max_i64), integer predicates whose
777 /// value fits in f64 exactly fall through to the f64 path correctly.
778 #[test]
779 fn integer_predicate_fallback_to_f64_for_v0_segment() {
780 // Small value that f64 can represent exactly.
781 let stats = f64_only_stats(10.0, 50.0);
782 // 75 is outside [10, 50] → should skip.
783 assert!(ScanPredicate::eq_i64(0, 75).can_skip_block(&stats));
784 // 30 is inside [10, 50] → must not skip.
785 assert!(!ScanPredicate::eq_i64(0, 30).can_skip_block(&stats));
786 }
787
788 /// for a v0 segment, an integer predicate with a value outside
789 /// ±2^53 (not exactly representable in f64) must NOT skip — conservative.
790 #[test]
791 fn integer_predicate_v0_segment_unsafe_value_no_skip() {
792 // Construct f64-only stats whose f64 min/max happen to equal the
793 // rounded form of the predicate value — the old code would skip here.
794 let large: i64 = 9_007_199_254_740_993; // 2^53 + 1
795 let large_f64 = large as f64; // rounds to 9_007_199_254_740_992.0
796 let stats = f64_only_stats(large_f64, large_f64);
797 // Without exact i64 stats we cannot safely skip — return false.
798 assert!(
799 !ScanPredicate::eq_i64(0, large).can_skip_block(&stats),
800 "must not skip: predicate value is not exactly representable in f64"
801 );
802 }
803
804 // ── f64_to_exact_i64 boundary tests ────────────────────────────────────
805
806 #[test]
807 fn f64_to_exact_i64_normal_values() {
808 assert_eq!(f64_to_exact_i64(0.0), Some(0));
809 assert_eq!(f64_to_exact_i64(42.0), Some(42));
810 assert_eq!(f64_to_exact_i64(-42.0), Some(-42));
811 assert_eq!(f64_to_exact_i64(1.5), None); // fractional
812 assert_eq!(f64_to_exact_i64(f64::INFINITY), None);
813 assert_eq!(f64_to_exact_i64(f64::NAN), None);
814 }
815
816 #[test]
817 fn f64_to_exact_i64_i64_min_is_representable() {
818 // i64::MIN as f64 is exactly -9223372036854775808.0.
819 let v = i64::MIN as f64;
820 assert_eq!(f64_to_exact_i64(v), Some(i64::MIN));
821 }
822
823 #[test]
824 fn f64_to_exact_i64_i64_max_rounds_up() {
825 // i64::MAX = 9_223_372_036_854_775_807
826 // i64::MAX as f64 rounds up to 9_223_372_036_854_775_808.0 which
827 // exceeds i64::MAX — must return None.
828 let v = i64::MAX as f64;
829 assert_eq!(
830 f64_to_exact_i64(v),
831 None,
832 "i64::MAX as f64 exceeds i64::MAX and must not convert"
833 );
834 }
835
836 #[test]
837 fn f64_to_exact_i64_just_below_i64_max_f64() {
838 // The largest exactly representable i64 value in f64 is 2^63 - 2^10
839 // (the f64 just below i64::MAX as f64). Verify it converts.
840 // 9223372036854774784 = (i64::MAX as f64) - 1024 (next lower f64).
841 let v: f64 = 9_223_372_036_854_774_784.0;
842 let result = f64_to_exact_i64(v);
843 assert!(
844 result.is_some(),
845 "large but exactly representable value should convert"
846 );
847 assert_eq!(result.unwrap() as f64, v);
848 }
849}