Skip to main content

nodedb_columnar/
filter.rs

1//! Vectorized predicate evaluation on dictionary-encoded columns.
2//!
3//! Implements late materialization for `DictEncoded` columns: predicates are
4//! evaluated on compact integer IDs rather than decompressed strings. This
5//! turns `O(N * string_len)` evaluation into `O(dict_size * string_len + N)`
6//! — a large win when cardinality is low (the common case for dict-encoded
7//! columns).
8//!
9//! # Short-circuit paths
10//!
11//! - **eq, not-in-dict**: return all-zero mask immediately (O(1) reject)
12//! - **ne, not-in-dict**: return all-ones mask immediately (O(1) accept)
13//! - **contains, empty-match-set**: return all-zero mask immediately
14//!
15//! # Bitmask encoding
16//!
17//! Results are returned as `Vec<u64>` packed bitmasks: bit *i* is set if row
18//! *i* passes the predicate. Use `words_for(row_count)` to size the output.
19
20use std::collections::{HashMap, HashSet};
21
22use crate::memtable::ColumnData;
23use crate::reader::DecodedColumn;
24
25/// Unpacked view of a memtable `DictEncoded` column.
26///
27/// The validity slice is returned via `Cow` — `Borrowed` when the column is
28/// nullable (has an explicit bitmap), `Owned(all-true)` when non-nullable.
29type MemtableDict<'a> = (
30    &'a [u32],
31    &'a [String],
32    &'a HashMap<String, u32>,
33    std::borrow::Cow<'a, [bool]>,
34);
35
36// ---------------------------------------------------------------------------
37// Public API
38// ---------------------------------------------------------------------------
39
40/// Evaluate an equality predicate on a memtable `DictEncoded` column.
41///
42/// Returns `Some(mask)` where bit *i* is set if row *i* equals `value` and
43/// `valid[i]` is true. Returns `None` if `col` is not `DictEncoded`.
44pub fn dict_eval_eq(col: &ColumnData, value: &str, row_count: usize) -> Option<Vec<u64>> {
45    let (ids, _, reverse, valid) = unpack_memtable(col)?;
46    match reverse.get(value) {
47        None => Some(zero_mask(row_count)),
48        Some(&target_id) => Some(build_eq_mask(ids, &valid, target_id, row_count)),
49    }
50}
51
52/// Evaluate a not-equal predicate on a memtable `DictEncoded` column.
53///
54/// Returns `Some(mask)` where bit *i* is set if row *i* is not equal to
55/// `value` (or is NULL, which is treated as not matching the value).
56/// Returns `None` if `col` is not `DictEncoded`.
57pub fn dict_eval_ne(col: &ColumnData, value: &str, row_count: usize) -> Option<Vec<u64>> {
58    let (ids, _, reverse, valid) = unpack_memtable(col)?;
59    match reverse.get(value) {
60        None => Some(all_valid_mask(&valid, row_count)),
61        Some(&target_id) => Some(build_ne_mask(ids, &valid, target_id, row_count)),
62    }
63}
64
65/// Evaluate a substring-contains predicate on a memtable `DictEncoded` column.
66///
67/// All dictionary entries that contain `substr` are collected into a set of
68/// matching IDs (O(dict_size * string_len)), then rows are filtered by ID
69/// membership (O(N)).
70///
71/// Returns `None` if `col` is not `DictEncoded`.
72pub fn dict_eval_contains(col: &ColumnData, substr: &str, row_count: usize) -> Option<Vec<u64>> {
73    let (ids, dictionary, _, valid) = unpack_memtable(col)?;
74    let matching = matching_ids_contains(dictionary, substr);
75    if matching.is_empty() {
76        return Some(zero_mask(row_count));
77    }
78    Some(build_set_mask(ids, &valid, &matching, row_count))
79}
80
81/// Evaluate a LIKE predicate on a memtable `DictEncoded` column.
82///
83/// Supports `%` as wildcard (leading, trailing, both, none). Only simple
84/// patterns are supported; complex patterns with mid-string `%` fall back
85/// to `None` (caller should decompress and evaluate).
86///
87/// Returns `None` if `col` is not `DictEncoded` or the pattern is unsupported.
88pub fn dict_eval_like(col: &ColumnData, pattern: &str, row_count: usize) -> Option<Vec<u64>> {
89    let (ids, dictionary, _, valid) = unpack_memtable(col)?;
90    let matching = matching_ids_like(dictionary, pattern)?;
91    if matching.is_empty() {
92        return Some(zero_mask(row_count));
93    }
94    Some(build_set_mask(ids, &valid, &matching, row_count))
95}
96
97// ---------------------------------------------------------------------------
98// Decoded-column variants (from SegmentReader)
99// ---------------------------------------------------------------------------
100
101/// Evaluate an equality predicate on a `DecodedColumn::DictEncoded`.
102///
103/// The `DecodedColumn` variant carries no reverse map, so lookup is a linear
104/// scan of the dictionary — acceptable because dictionaries are small.
105pub fn decoded_dict_eval_eq(
106    col: &DecodedColumn,
107    value: &str,
108    row_count: usize,
109) -> Option<Vec<u64>> {
110    let (ids, dictionary, valid) = unpack_decoded(col)?;
111    match find_dict_id(dictionary, value) {
112        None => Some(zero_mask(row_count)),
113        Some(target_id) => Some(build_eq_mask(ids, valid, target_id, row_count)),
114    }
115}
116
117/// Evaluate a not-equal predicate on a `DecodedColumn::DictEncoded`.
118pub fn decoded_dict_eval_ne(
119    col: &DecodedColumn,
120    value: &str,
121    row_count: usize,
122) -> Option<Vec<u64>> {
123    let (ids, dictionary, valid) = unpack_decoded(col)?;
124    match find_dict_id(dictionary, value) {
125        None => Some(all_valid_mask(valid, row_count)),
126        Some(target_id) => Some(build_ne_mask(ids, valid, target_id, row_count)),
127    }
128}
129
130/// Evaluate a substring-contains predicate on a `DecodedColumn::DictEncoded`.
131pub fn decoded_dict_eval_contains(
132    col: &DecodedColumn,
133    substr: &str,
134    row_count: usize,
135) -> Option<Vec<u64>> {
136    let (ids, dictionary, valid) = unpack_decoded(col)?;
137    let matching = matching_ids_contains(dictionary, substr);
138    if matching.is_empty() {
139        return Some(zero_mask(row_count));
140    }
141    Some(build_set_mask(ids, valid, &matching, row_count))
142}
143
144/// Evaluate a LIKE predicate on a `DecodedColumn::DictEncoded`.
145pub fn decoded_dict_eval_like(
146    col: &DecodedColumn,
147    pattern: &str,
148    row_count: usize,
149) -> Option<Vec<u64>> {
150    let (ids, dictionary, valid) = unpack_decoded(col)?;
151    let matching = matching_ids_like(dictionary, pattern)?;
152    if matching.is_empty() {
153        return Some(zero_mask(row_count));
154    }
155    Some(build_set_mask(ids, valid, &matching, row_count))
156}
157
158// ---------------------------------------------------------------------------
159// Bitmask helpers (public, mirrors nodedb-query simd_filter API surface)
160// ---------------------------------------------------------------------------
161
162/// Number of `u64` words needed to hold `row_count` bits.
163#[inline]
164pub fn words_for(row_count: usize) -> usize {
165    row_count.div_ceil(64)
166}
167
168/// Bitwise AND of two equal-length bitmasks.
169pub fn bitmask_and(a: &[u64], b: &[u64]) -> Vec<u64> {
170    let len = a.len().min(b.len());
171    let mut out = vec![0u64; len];
172    for i in 0..len {
173        out[i] = a[i] & b[i];
174    }
175    out
176}
177
178/// All-ones bitmask for `row_count` rows.
179pub fn bitmask_all(row_count: usize) -> Vec<u64> {
180    let words = words_for(row_count);
181    let mut out = vec![u64::MAX; words];
182    let tail = row_count % 64;
183    if tail > 0 && !out.is_empty() {
184        *out.last_mut().expect("non-empty") = (1u64 << tail) - 1;
185    }
186    out
187}
188
189// ---------------------------------------------------------------------------
190// Internal helpers
191// ---------------------------------------------------------------------------
192
193/// Destructure a memtable `ColumnData::DictEncoded`.
194fn unpack_memtable(col: &ColumnData) -> Option<MemtableDict<'_>> {
195    if let ColumnData::DictEncoded {
196        ids,
197        dictionary,
198        reverse,
199        valid,
200    } = col
201    {
202        let validity = match valid {
203            Some(v) => std::borrow::Cow::Borrowed(v.as_slice()),
204            None => std::borrow::Cow::Owned(vec![true; ids.len()]),
205        };
206        Some((ids.as_slice(), dictionary.as_slice(), reverse, validity))
207    } else {
208        None
209    }
210}
211
212/// Destructure a `DecodedColumn::DictEncoded`.
213fn unpack_decoded(col: &DecodedColumn) -> Option<(&[u32], &[String], &[bool])> {
214    if let DecodedColumn::DictEncoded {
215        ids,
216        dictionary,
217        valid,
218    } = col
219    {
220        Some((ids.as_slice(), dictionary.as_slice(), valid.as_slice()))
221    } else {
222        None
223    }
224}
225
226/// Linear scan to find the ID of `value` in a dictionary.
227fn find_dict_id(dictionary: &[String], value: &str) -> Option<u32> {
228    dictionary.iter().position(|s| s == value).map(|i| i as u32)
229}
230
231/// Collect IDs of all dictionary entries that contain `substr`.
232fn matching_ids_contains(dictionary: &[String], substr: &str) -> HashSet<u32> {
233    dictionary
234        .iter()
235        .enumerate()
236        .filter(|(_, s)| s.contains(substr))
237        .map(|(i, _)| i as u32)
238        .collect()
239}
240
241/// Collect IDs matching a simple LIKE pattern (leading/trailing `%` only).
242///
243/// Returns `None` for unsupported patterns (mid-string `%`).
244fn matching_ids_like(dictionary: &[String], pattern: &str) -> Option<HashSet<u32>> {
245    let matching = match (pattern.starts_with('%'), pattern.ends_with('%')) {
246        (true, true) => {
247            // %substr%
248            let inner = pattern.trim_matches('%');
249            if inner.contains('%') {
250                return None; // mid-string wildcard
251            }
252            dictionary
253                .iter()
254                .enumerate()
255                .filter(|(_, s)| s.contains(inner))
256                .map(|(i, _)| i as u32)
257                .collect()
258        }
259        (true, false) => {
260            // %suffix
261            let suffix = &pattern[1..];
262            if suffix.contains('%') {
263                return None;
264            }
265            dictionary
266                .iter()
267                .enumerate()
268                .filter(|(_, s)| s.ends_with(suffix))
269                .map(|(i, _)| i as u32)
270                .collect()
271        }
272        (false, true) => {
273            // prefix%
274            let prefix = &pattern[..pattern.len() - 1];
275            if prefix.contains('%') {
276                return None;
277            }
278            dictionary
279                .iter()
280                .enumerate()
281                .filter(|(_, s)| s.starts_with(prefix))
282                .map(|(i, _)| i as u32)
283                .collect()
284        }
285        (false, false) => {
286            // exact match — no wildcards
287            if pattern.contains('%') {
288                return None;
289            }
290            dictionary
291                .iter()
292                .enumerate()
293                .filter(|(_, s)| s.as_str() == pattern)
294                .map(|(i, _)| i as u32)
295                .collect()
296        }
297    };
298    Some(matching)
299}
300
301/// Build an equality bitmask: bit i set iff `valid[i] && ids[i] == target_id`.
302fn build_eq_mask(ids: &[u32], valid: &[bool], target_id: u32, row_count: usize) -> Vec<u64> {
303    let words = words_for(row_count);
304    let mut mask = vec![0u64; words];
305    let n = row_count.min(ids.len()).min(valid.len());
306    for i in 0..n {
307        if valid[i] && ids[i] == target_id {
308            mask[i / 64] |= 1u64 << (i % 64);
309        }
310    }
311    mask
312}
313
314/// Build a not-equal bitmask: bit i set iff `valid[i] && ids[i] != target_id`.
315fn build_ne_mask(ids: &[u32], valid: &[bool], target_id: u32, row_count: usize) -> Vec<u64> {
316    let words = words_for(row_count);
317    let mut mask = vec![0u64; words];
318    let n = row_count.min(ids.len()).min(valid.len());
319    for i in 0..n {
320        if valid[i] && ids[i] != target_id {
321            mask[i / 64] |= 1u64 << (i % 64);
322        }
323    }
324    mask
325}
326
327/// Build a set-membership bitmask: bit i set iff `valid[i] && matching.contains(&ids[i])`.
328fn build_set_mask(
329    ids: &[u32],
330    valid: &[bool],
331    matching: &HashSet<u32>,
332    row_count: usize,
333) -> Vec<u64> {
334    let words = words_for(row_count);
335    let mut mask = vec![0u64; words];
336    let n = row_count.min(ids.len()).min(valid.len());
337    for i in 0..n {
338        if valid[i] && matching.contains(&ids[i]) {
339            mask[i / 64] |= 1u64 << (i % 64);
340        }
341    }
342    mask
343}
344
345/// All-zeros bitmask of the correct size.
346#[inline]
347fn zero_mask(row_count: usize) -> Vec<u64> {
348    vec![0u64; words_for(row_count)]
349}
350
351/// Bitmask where bit i is set for all valid (non-null) rows.
352fn all_valid_mask(valid: &[bool], row_count: usize) -> Vec<u64> {
353    let words = words_for(row_count);
354    let mut mask = vec![0u64; words];
355    let n = row_count.min(valid.len());
356    for i in 0..n {
357        if valid[i] {
358            mask[i / 64] |= 1u64 << (i % 64);
359        }
360    }
361    mask
362}
363
364// ---------------------------------------------------------------------------
365// Tests
366// ---------------------------------------------------------------------------
367
368#[cfg(test)]
369mod tests {
370    use super::*;
371    use crate::memtable::ColumnData;
372    use crate::reader::DecodedColumn;
373    use std::collections::HashMap;
374
375    fn make_dict_col(values: &[Option<&str>]) -> ColumnData {
376        let mut dictionary: Vec<String> = Vec::new();
377        let mut reverse: HashMap<String, u32> = HashMap::new();
378        let mut ids: Vec<u32> = Vec::new();
379        let mut valid: Vec<bool> = Vec::new();
380
381        for opt in values {
382            match opt {
383                None => {
384                    ids.push(0);
385                    valid.push(false);
386                }
387                Some(s) => {
388                    let id = if let Some(&existing) = reverse.get(*s) {
389                        existing
390                    } else {
391                        let new_id = dictionary.len() as u32;
392                        dictionary.push(s.to_string());
393                        reverse.insert(s.to_string(), new_id);
394                        new_id
395                    };
396                    ids.push(id);
397                    valid.push(true);
398                }
399            }
400        }
401
402        ColumnData::DictEncoded {
403            ids,
404            dictionary,
405            reverse,
406            valid: Some(valid),
407        }
408    }
409
410    fn make_decoded_col(values: &[Option<&str>]) -> DecodedColumn {
411        let mut dictionary: Vec<String> = Vec::new();
412        let mut id_map: HashMap<String, u32> = HashMap::new();
413        let mut ids: Vec<u32> = Vec::new();
414        let mut valid: Vec<bool> = Vec::new();
415
416        for opt in values {
417            match opt {
418                None => {
419                    ids.push(0);
420                    valid.push(false);
421                }
422                Some(s) => {
423                    let id = if let Some(&existing) = id_map.get(*s) {
424                        existing
425                    } else {
426                        let new_id = dictionary.len() as u32;
427                        dictionary.push(s.to_string());
428                        id_map.insert(s.to_string(), new_id);
429                        new_id
430                    };
431                    ids.push(id);
432                    valid.push(true);
433                }
434            }
435        }
436
437        DecodedColumn::DictEncoded {
438            ids,
439            dictionary,
440            valid,
441        }
442    }
443
444    fn bits(mask: &[u64], row_count: usize) -> Vec<bool> {
445        (0..row_count)
446            .map(|i| (mask[i / 64] >> (i % 64)) & 1 == 1)
447            .collect()
448    }
449
450    // ---- eq ----------------------------------------------------------------
451
452    #[test]
453    fn dict_eq_match() {
454        let col = make_dict_col(&[Some("web"), Some("db"), Some("web"), Some("cache")]);
455        let mask = dict_eval_eq(&col, "web", 4).unwrap();
456        assert_eq!(bits(&mask, 4), vec![true, false, true, false]);
457    }
458
459    #[test]
460    fn dict_eq_value_not_in_dict_returns_zero_mask() {
461        let col = make_dict_col(&[Some("web"), Some("db")]);
462        let mask = dict_eval_eq(&col, "missing", 2).unwrap();
463        assert_eq!(bits(&mask, 2), vec![false, false]);
464    }
465
466    #[test]
467    fn dict_eq_null_rows_excluded() {
468        let col = make_dict_col(&[Some("web"), None, Some("web")]);
469        let mask = dict_eval_eq(&col, "web", 3).unwrap();
470        assert_eq!(bits(&mask, 3), vec![true, false, true]);
471    }
472
473    // ---- ne ----------------------------------------------------------------
474
475    #[test]
476    fn dict_ne_basic() {
477        let col = make_dict_col(&[Some("web"), Some("db"), Some("web")]);
478        let mask = dict_eval_ne(&col, "web", 3).unwrap();
479        assert_eq!(bits(&mask, 3), vec![false, true, false]);
480    }
481
482    #[test]
483    fn dict_ne_value_not_in_dict_all_valid_rows_pass() {
484        let col = make_dict_col(&[Some("web"), None, Some("db")]);
485        let mask = dict_eval_ne(&col, "missing", 3).unwrap();
486        // NULL row (index 1) does NOT pass.
487        assert_eq!(bits(&mask, 3), vec![true, false, true]);
488    }
489
490    // ---- contains ----------------------------------------------------------
491
492    #[test]
493    fn dict_contains_basic() {
494        let col = make_dict_col(&[Some("web-1"), Some("db-1"), Some("web-2"), Some("cache")]);
495        let mask = dict_eval_contains(&col, "web", 4).unwrap();
496        assert_eq!(bits(&mask, 4), vec![true, false, true, false]);
497    }
498
499    #[test]
500    fn dict_contains_no_match_zero_mask() {
501        let col = make_dict_col(&[Some("alpha"), Some("beta")]);
502        let mask = dict_eval_contains(&col, "gamma", 2).unwrap();
503        assert_eq!(bits(&mask, 2), vec![false, false]);
504    }
505
506    // ---- like --------------------------------------------------------------
507
508    #[test]
509    fn dict_like_prefix_wildcard() {
510        let col = make_dict_col(&[Some("web-1"), Some("db-1"), Some("web-2")]);
511        let mask = dict_eval_like(&col, "web%", 3).unwrap();
512        assert_eq!(bits(&mask, 3), vec![true, false, true]);
513    }
514
515    #[test]
516    fn dict_like_suffix_wildcard() {
517        let col = make_dict_col(&[Some("alpha-web"), Some("beta-db"), Some("gamma-web")]);
518        let mask = dict_eval_like(&col, "%web", 3).unwrap();
519        assert_eq!(bits(&mask, 3), vec![true, false, true]);
520    }
521
522    #[test]
523    fn dict_like_both_wildcards() {
524        let col = make_dict_col(&[Some("alpha-web-1"), Some("beta-db"), Some("gamma-web-2")]);
525        let mask = dict_eval_like(&col, "%web%", 3).unwrap();
526        assert_eq!(bits(&mask, 3), vec![true, false, true]);
527    }
528
529    #[test]
530    fn dict_like_exact_no_wildcards() {
531        let col = make_dict_col(&[Some("exact"), Some("other")]);
532        let mask = dict_eval_like(&col, "exact", 2).unwrap();
533        assert_eq!(bits(&mask, 2), vec![true, false]);
534    }
535
536    #[test]
537    fn dict_like_unsupported_mid_wildcard_returns_none() {
538        let col = make_dict_col(&[Some("abc")]);
539        assert!(dict_eval_like(&col, "a%c", 1).is_none());
540    }
541
542    // ---- decoded column ----------------------------------------------------
543
544    #[test]
545    fn decoded_dict_eq_match() {
546        let col = make_decoded_col(&[Some("web"), Some("db"), Some("web")]);
547        let mask = decoded_dict_eval_eq(&col, "web", 3).unwrap();
548        assert_eq!(bits(&mask, 3), vec![true, false, true]);
549    }
550
551    #[test]
552    fn decoded_dict_eq_not_in_dict() {
553        let col = make_decoded_col(&[Some("web"), Some("db")]);
554        let mask = decoded_dict_eval_eq(&col, "missing", 2).unwrap();
555        assert_eq!(bits(&mask, 2), vec![false, false]);
556    }
557
558    #[test]
559    fn decoded_dict_ne_not_in_dict_all_valid_pass() {
560        let col = make_decoded_col(&[Some("a"), None, Some("b")]);
561        let mask = decoded_dict_eval_ne(&col, "missing", 3).unwrap();
562        assert_eq!(bits(&mask, 3), vec![true, false, true]);
563    }
564
565    #[test]
566    fn decoded_dict_contains() {
567        let col = make_decoded_col(&[Some("web-1"), Some("db"), Some("web-2")]);
568        let mask = decoded_dict_eval_contains(&col, "web", 3).unwrap();
569        assert_eq!(bits(&mask, 3), vec![true, false, true]);
570    }
571
572    #[test]
573    fn decoded_dict_like() {
574        let col = make_decoded_col(&[Some("web-1"), Some("db"), Some("web-2")]);
575        let mask = decoded_dict_eval_like(&col, "web%", 3).unwrap();
576        assert_eq!(bits(&mask, 3), vec![true, false, true]);
577    }
578
579    // ---- short-circuit and bitmask helpers ---------------------------------
580
581    #[test]
582    fn bitmask_all_correct_tail_bits() {
583        // row_count=65: word0 covers rows 0..63 (all set), word1 covers row 64 only (bit 0).
584        let mask = bitmask_all(65);
585        assert_eq!(mask.len(), 2);
586        assert_eq!(mask[0], u64::MAX);
587        assert_eq!(mask[1], 1u64); // only bit 0 set in last word (row 64)
588
589        // row_count=66: word1 covers rows 64 and 65 → bits 0 and 1.
590        let mask66 = bitmask_all(66);
591        assert_eq!(mask66[1], 0b11u64);
592    }
593
594    #[test]
595    fn words_for_alignment() {
596        assert_eq!(words_for(0), 0);
597        assert_eq!(words_for(1), 1);
598        assert_eq!(words_for(64), 1);
599        assert_eq!(words_for(65), 2);
600    }
601
602    #[test]
603    fn non_dict_encoded_col_returns_none() {
604        let col = ColumnData::Int64 {
605            values: vec![1, 2, 3],
606            valid: Some(vec![true, true, true]),
607        };
608        assert!(dict_eval_eq(&col, "x", 3).is_none());
609        assert!(dict_eval_ne(&col, "x", 3).is_none());
610        assert!(dict_eval_contains(&col, "x", 3).is_none());
611        assert!(dict_eval_like(&col, "x%", 3).is_none());
612    }
613}