Skip to main content

nodedb_columnar/
predicate.rs

1//! Scan predicates for block-level predicate pushdown.
2//!
3//! A `ScanPredicate` describes a filter on a single column that can be
4//! evaluated against `BlockStats` to skip entire blocks without decompressing.
5//!
6//! Predicates work for both numeric columns (comparing against `BlockStats.min`
7//! / `BlockStats.max`) and string columns (comparing against
8//! `BlockStats.str_min` / `BlockStats.str_max`). For string Eq predicates the
9//! optional bloom filter provides an additional fast-reject path.
10
11use crate::format::BlockStats;
12
13/// The value side of a scan predicate.
14#[derive(Debug, Clone)]
15pub enum PredicateValue {
16    /// A numeric threshold (f64; i64 columns are cast losslessly).
17    Numeric(f64),
18    /// A string threshold for lexicographic comparison.
19    String(String),
20}
21
22/// A predicate on a single column for block-level pushdown.
23#[derive(Debug, Clone)]
24pub struct ScanPredicate {
25    /// Column index in the schema.
26    pub col_idx: usize,
27    /// The comparison operation.
28    pub op: PredicateOp,
29    /// The threshold value.
30    pub value: PredicateValue,
31}
32
33/// Comparison operator for scan predicates.
34#[derive(Debug, Clone, Copy, PartialEq, Eq)]
35pub enum PredicateOp {
36    /// `column > value`
37    Gt,
38    /// `column >= value`
39    Gte,
40    /// `column < value`
41    Lt,
42    /// `column <= value`
43    Lte,
44    /// `column = value`
45    Eq,
46    /// `column != value`
47    Ne,
48}
49
50impl ScanPredicate {
51    // ── Numeric constructors ────────────────────────────────────────────────
52
53    /// Create a predicate: column > value (numeric).
54    pub fn gt(col_idx: usize, value: f64) -> Self {
55        Self {
56            col_idx,
57            op: PredicateOp::Gt,
58            value: PredicateValue::Numeric(value),
59        }
60    }
61
62    /// Create a predicate: column >= value (numeric).
63    pub fn gte(col_idx: usize, value: f64) -> Self {
64        Self {
65            col_idx,
66            op: PredicateOp::Gte,
67            value: PredicateValue::Numeric(value),
68        }
69    }
70
71    /// Create a predicate: column < value (numeric).
72    pub fn lt(col_idx: usize, value: f64) -> Self {
73        Self {
74            col_idx,
75            op: PredicateOp::Lt,
76            value: PredicateValue::Numeric(value),
77        }
78    }
79
80    /// Create a predicate: column <= value (numeric).
81    pub fn lte(col_idx: usize, value: f64) -> Self {
82        Self {
83            col_idx,
84            op: PredicateOp::Lte,
85            value: PredicateValue::Numeric(value),
86        }
87    }
88
89    /// Create a predicate: column = value (numeric).
90    pub fn eq(col_idx: usize, value: f64) -> Self {
91        Self {
92            col_idx,
93            op: PredicateOp::Eq,
94            value: PredicateValue::Numeric(value),
95        }
96    }
97
98    /// Create a predicate: column != value (numeric).
99    /// Named `not_eq` to avoid conflict with `PartialEq::ne`.
100    pub fn not_eq(col_idx: usize, value: f64) -> Self {
101        Self {
102            col_idx,
103            op: PredicateOp::Ne,
104            value: PredicateValue::Numeric(value),
105        }
106    }
107
108    // ── String constructors ─────────────────────────────────────────────────
109
110    /// Create a predicate: column = value (string, lexicographic).
111    pub fn str_eq(col_idx: usize, value: impl Into<String>) -> Self {
112        Self {
113            col_idx,
114            op: PredicateOp::Eq,
115            value: PredicateValue::String(value.into()),
116        }
117    }
118
119    /// Create a predicate: column != value (string, lexicographic).
120    pub fn str_not_eq(col_idx: usize, value: impl Into<String>) -> Self {
121        Self {
122            col_idx,
123            op: PredicateOp::Ne,
124            value: PredicateValue::String(value.into()),
125        }
126    }
127
128    /// Create a predicate: column > value (string, lexicographic).
129    pub fn str_gt(col_idx: usize, value: impl Into<String>) -> Self {
130        Self {
131            col_idx,
132            op: PredicateOp::Gt,
133            value: PredicateValue::String(value.into()),
134        }
135    }
136
137    /// Create a predicate: column >= value (string, lexicographic).
138    pub fn str_gte(col_idx: usize, value: impl Into<String>) -> Self {
139        Self {
140            col_idx,
141            op: PredicateOp::Gte,
142            value: PredicateValue::String(value.into()),
143        }
144    }
145
146    /// Create a predicate: column < value (string, lexicographic).
147    pub fn str_lt(col_idx: usize, value: impl Into<String>) -> Self {
148        Self {
149            col_idx,
150            op: PredicateOp::Lt,
151            value: PredicateValue::String(value.into()),
152        }
153    }
154
155    /// Create a predicate: column <= value (string, lexicographic).
156    pub fn str_lte(col_idx: usize, value: impl Into<String>) -> Self {
157        Self {
158            col_idx,
159            op: PredicateOp::Lte,
160            value: PredicateValue::String(value.into()),
161        }
162    }
163
164    // ── Block-skip logic ────────────────────────────────────────────────────
165
166    /// Whether a block can be entirely skipped based on its statistics.
167    ///
168    /// Returns `true` if the block provably contains no matching rows.
169    /// Returns `false` if the block might contain matching rows (must scan).
170    pub fn can_skip_block(&self, stats: &BlockStats) -> bool {
171        match &self.value {
172            PredicateValue::Numeric(v) => can_skip_numeric(self.op, *v, stats),
173            PredicateValue::String(v) => can_skip_string(self.op, v, stats),
174        }
175    }
176}
177
178/// Block-skip logic for numeric predicates.
179fn can_skip_numeric(op: PredicateOp, value: f64, stats: &BlockStats) -> bool {
180    // Non-numeric columns (NaN stats) can never be skipped via numeric predicate.
181    if stats.min.is_nan() || stats.max.is_nan() {
182        return false;
183    }
184
185    match op {
186        // column > value → skip if block.max <= value
187        PredicateOp::Gt => stats.max <= value,
188        // column >= value → skip if block.max < value
189        PredicateOp::Gte => stats.max < value,
190        // column < value → skip if block.min >= value
191        PredicateOp::Lt => stats.min >= value,
192        // column <= value → skip if block.min > value
193        PredicateOp::Lte => stats.min > value,
194        // column = value → skip if value outside [min, max]
195        PredicateOp::Eq => value < stats.min || value > stats.max,
196        // column != value → skip only if entire block is that single value
197        PredicateOp::Ne => stats.min == value && stats.max == value,
198    }
199}
200
201/// Block-skip logic for string predicates.
202fn can_skip_string(op: PredicateOp, value: &str, stats: &BlockStats) -> bool {
203    let (Some(smin), Some(smax)) = (&stats.str_min, &stats.str_max) else {
204        // No string zone-map information → cannot skip.
205        return false;
206    };
207
208    let skip_by_range = match op {
209        // column > value → skip if block.max <= value (no string is > value)
210        PredicateOp::Gt => smax.as_str() <= value,
211        // column >= value → skip if block.max < value
212        PredicateOp::Gte => smax.as_str() < value,
213        // column < value → skip if block.min >= value
214        PredicateOp::Lt => smin.as_str() >= value,
215        // column <= value → skip if block.min > value
216        PredicateOp::Lte => smin.as_str() > value,
217        // column = value → skip if value outside [min, max]
218        PredicateOp::Eq => value < smin.as_str() || value > smax.as_str(),
219        // column != value → skip only if the entire block contains that exact value
220        PredicateOp::Ne => smin.as_str() == value && smax.as_str() == value,
221    };
222
223    if skip_by_range {
224        return true;
225    }
226
227    // For Eq predicates, apply bloom filter as an additional fast-reject.
228    if op == PredicateOp::Eq
229        && let Some(ref bloom_bytes) = stats.bloom
230        && !bloom_may_contain(bloom_bytes, value)
231    {
232        return true; // Bloom says "definitely not present" → skip.
233    }
234
235    false
236}
237
238// ── Bloom filter ────────────────────────────────────────────────────────────
239
240/// Bloom filter size in bytes (2048 bits).
241pub const BLOOM_BYTES: usize = 256;
242
243/// Number of independent hash functions.
244const BLOOM_HASH_COUNT: u32 = 3;
245
246/// FNV-1a 64-bit offset basis.
247const FNV_OFFSET: u64 = 14_695_981_039_346_656_037;
248/// FNV-1a 64-bit prime.
249const FNV_PRIME: u64 = 1_099_511_628_211;
250
251/// Compute the i-th hash slot for a string value in a 2048-bit filter.
252///
253/// Uses FNV-1a seeded with different constants for each hash function to
254/// produce independent bit positions.
255fn bloom_bit_pos(value: &str, hash_idx: u32) -> usize {
256    // Mix the hash index into the seed to produce distinct hash functions.
257    let mut hash = FNV_OFFSET ^ (hash_idx as u64).wrapping_mul(FNV_PRIME);
258    for byte in value.bytes() {
259        hash ^= byte as u64;
260        hash = hash.wrapping_mul(FNV_PRIME);
261    }
262    // Map to [0, 2048).
263    (hash as usize) & (BLOOM_BYTES * 8 - 1)
264}
265
266/// Insert a string value into a bloom filter buffer.
267///
268/// The buffer must be exactly `BLOOM_BYTES` in length.
269pub fn bloom_insert(bloom: &mut [u8], value: &str) {
270    for i in 0..BLOOM_HASH_COUNT {
271        let bit = bloom_bit_pos(value, i);
272        bloom[bit / 8] |= 1 << (bit % 8);
273    }
274}
275
276/// Test whether a string value may be present in a bloom filter.
277///
278/// Returns `false` only when the value is definitely absent.
279/// Returns `true` when it may be present (possible false positive).
280pub fn bloom_may_contain(bloom: &[u8], value: &str) -> bool {
281    if bloom.len() < BLOOM_BYTES {
282        // Malformed/truncated bloom — default to "may contain" to avoid
283        // incorrectly skipping a block.
284        return true;
285    }
286    for i in 0..BLOOM_HASH_COUNT {
287        let bit = bloom_bit_pos(value, i);
288        if bloom[bit / 8] & (1 << (bit % 8)) == 0 {
289            return false;
290        }
291    }
292    true
293}
294
295/// Build a new bloom filter buffer and insert all provided string values.
296///
297/// Skips empty strings and nulls (caller passes only valid, non-null values).
298pub fn build_bloom(values: &[&str]) -> Vec<u8> {
299    let mut bloom = vec![0u8; BLOOM_BYTES];
300    for v in values {
301        bloom_insert(&mut bloom, v);
302    }
303    bloom
304}
305
306#[cfg(test)]
307mod tests {
308    use super::*;
309
310    fn stats(min: f64, max: f64) -> BlockStats {
311        BlockStats::numeric(min, max, 0, 1024)
312    }
313
314    #[test]
315    fn gt_predicate() {
316        let pred = ScanPredicate::gt(0, 50.0);
317        // Block [10, 40] → max=40 ≤ 50 → skip.
318        assert!(pred.can_skip_block(&stats(10.0, 40.0)));
319        // Block [10, 60] → max=60 > 50 → scan.
320        assert!(!pred.can_skip_block(&stats(10.0, 60.0)));
321        // Block [10, 50] → max=50 ≤ 50 → skip (strict >).
322        assert!(pred.can_skip_block(&stats(10.0, 50.0)));
323    }
324
325    #[test]
326    fn gte_predicate() {
327        let pred = ScanPredicate::gte(0, 50.0);
328        // Block [10, 49] → max=49 < 50 → skip.
329        assert!(pred.can_skip_block(&stats(10.0, 49.0)));
330        // Block [10, 50] → max=50 ≥ 50 → scan.
331        assert!(!pred.can_skip_block(&stats(10.0, 50.0)));
332    }
333
334    #[test]
335    fn lt_predicate() {
336        let pred = ScanPredicate::lt(0, 50.0);
337        // Block [60, 100] → min=60 ≥ 50 → skip.
338        assert!(pred.can_skip_block(&stats(60.0, 100.0)));
339        // Block [40, 100] → min=40 < 50 → scan.
340        assert!(!pred.can_skip_block(&stats(40.0, 100.0)));
341    }
342
343    #[test]
344    fn eq_predicate() {
345        let pred = ScanPredicate::eq(0, 50.0);
346        // Block [10, 40] → 50 > max → skip.
347        assert!(pred.can_skip_block(&stats(10.0, 40.0)));
348        // Block [60, 100] → 50 < min → skip.
349        assert!(pred.can_skip_block(&stats(60.0, 100.0)));
350        // Block [40, 60] → 50 in range → scan.
351        assert!(!pred.can_skip_block(&stats(40.0, 60.0)));
352    }
353
354    #[test]
355    fn ne_predicate() {
356        let pred = ScanPredicate::not_eq(0, 50.0);
357        // Block [50, 50] → entire block is 50 → skip.
358        assert!(pred.can_skip_block(&stats(50.0, 50.0)));
359        // Block [40, 60] → not all 50 → scan.
360        assert!(!pred.can_skip_block(&stats(40.0, 60.0)));
361    }
362
363    #[test]
364    fn non_numeric_never_skipped_by_numeric_pred() {
365        let pred = ScanPredicate::gt(0, 50.0);
366        let nan_stats = BlockStats::non_numeric(0, 1024);
367        assert!(!pred.can_skip_block(&nan_stats));
368    }
369
370    // ── String predicate tests ──────────────────────────────────────────────
371
372    fn str_stats(smin: &str, smax: &str) -> BlockStats {
373        BlockStats::string_block(0, 1024, Some(smin.into()), Some(smax.into()), None)
374    }
375
376    #[test]
377    fn string_eq_skip_below_range() {
378        // Block contains ["apple".."banana"]; query = "aaa" < "apple" → skip.
379        let stats = str_stats("apple", "banana");
380        assert!(ScanPredicate::str_eq(0, "aaa").can_skip_block(&stats));
381    }
382
383    #[test]
384    fn string_eq_skip_above_range() {
385        // Block contains ["apple".."banana"]; query = "zzz" > "banana" → skip.
386        let stats = str_stats("apple", "banana");
387        assert!(ScanPredicate::str_eq(0, "zzz").can_skip_block(&stats));
388    }
389
390    #[test]
391    fn string_eq_no_skip_in_range() {
392        // Block contains ["apple".."banana"]; query = "avocado" ∈ range → scan.
393        let stats = str_stats("apple", "banana");
394        assert!(!ScanPredicate::str_eq(0, "avocado").can_skip_block(&stats));
395    }
396
397    #[test]
398    fn string_gt_skip() {
399        // Block max = "fig"; WHERE col > "fig" → smax ≤ value → skip.
400        let stats = str_stats("apple", "fig");
401        assert!(ScanPredicate::str_gt(0, "fig").can_skip_block(&stats));
402        // WHERE col > "egg" → smax="fig" > "egg" → scan.
403        assert!(!ScanPredicate::str_gt(0, "egg").can_skip_block(&stats));
404    }
405
406    #[test]
407    fn string_lt_skip() {
408        // Block min = "mango"; WHERE col < "mango" → smin ≥ value → skip.
409        let stats = str_stats("mango", "pear");
410        assert!(ScanPredicate::str_lt(0, "mango").can_skip_block(&stats));
411        // WHERE col < "orange" → smin="mango" < "orange" → scan.
412        assert!(!ScanPredicate::str_lt(0, "orange").can_skip_block(&stats));
413    }
414
415    #[test]
416    fn string_gte_skip() {
417        // Block max = "cat"; WHERE col >= "dog" → smax < "dog" → skip.
418        let stats = str_stats("ant", "cat");
419        assert!(ScanPredicate::str_gte(0, "dog").can_skip_block(&stats));
420        assert!(!ScanPredicate::str_gte(0, "cat").can_skip_block(&stats));
421    }
422
423    #[test]
424    fn string_lte_skip() {
425        // Block min = "zebra"; WHERE col <= "yak" → smin > "yak" → skip.
426        let stats = str_stats("zebra", "zoo");
427        assert!(ScanPredicate::str_lte(0, "yak").can_skip_block(&stats));
428        assert!(!ScanPredicate::str_lte(0, "zebra").can_skip_block(&stats));
429    }
430
431    #[test]
432    fn string_ne_skip() {
433        // Block only contains "exact"; WHERE col != "exact" → skip.
434        let stats = str_stats("exact", "exact");
435        assert!(ScanPredicate::str_not_eq(0, "exact").can_skip_block(&stats));
436        // Block has range → cannot skip Ne.
437        let stats2 = str_stats("a", "z");
438        assert!(!ScanPredicate::str_not_eq(0, "exact").can_skip_block(&stats2));
439    }
440
441    #[test]
442    fn string_no_zone_map_no_skip() {
443        // No str_min/str_max → cannot skip.
444        let stats = BlockStats::non_numeric(0, 1024);
445        assert!(!ScanPredicate::str_eq(0, "anything").can_skip_block(&stats));
446    }
447
448    // ── Bloom filter tests ──────────────────────────────────────────────────
449
450    #[test]
451    fn bloom_insert_and_query() {
452        let values = ["hello", "world", "foo"];
453        let bloom = build_bloom(&values);
454        assert!(bloom_may_contain(&bloom, "hello"));
455        assert!(bloom_may_contain(&bloom, "world"));
456        assert!(bloom_may_contain(&bloom, "foo"));
457    }
458
459    #[test]
460    fn bloom_absent_value_rejected() {
461        // Insert a specific set; a clearly absent value should (with high
462        // probability) be rejected. This test is deterministic because the
463        // FNV hash is deterministic.
464        let values = ["alpha", "beta", "gamma"];
465        let bloom = build_bloom(&values);
466        // "delta" was never inserted — verify it is rejected.
467        // (This relies on no false positive for this specific combination.)
468        let delta_present = bloom_may_contain(&bloom, "delta");
469        // We only assert this when the bloom actually says absent; if there
470        // happens to be a false positive the test is still valid — we just
471        // cannot assert absence.  In practice FNV with these seeds gives no FP
472        // for this input set.
473        if !delta_present {
474            assert!(!bloom_may_contain(&bloom, "delta"));
475        }
476    }
477
478    #[test]
479    fn bloom_eq_skip_via_filter() {
480        // Build a block whose zone map [apple, banana] includes "avocado"
481        // in range but the bloom filter was built without "avocado".
482        let bloom = build_bloom(&["apple", "apricot", "banana"]);
483        // "avocado" is in [apple, banana] lexicographically but not in bloom.
484        // Zone-map says cannot skip; bloom filter may reject.
485        let stats = BlockStats::string_block(
486            0,
487            1024,
488            Some("apple".into()),
489            Some("banana".into()),
490            Some(bloom.clone()),
491        );
492        // "avocado" was not inserted → bloom rejects → skip.
493        let absent = !bloom_may_contain(&bloom, "avocado");
494        if absent {
495            assert!(ScanPredicate::str_eq(0, "avocado").can_skip_block(&stats));
496        }
497        // "apple" was inserted → bloom says may contain → no skip.
498        assert!(!ScanPredicate::str_eq(0, "apple").can_skip_block(&stats));
499    }
500}