Skip to main content

nodedb_columnar/
filter.rs

1// SPDX-License-Identifier: Apache-2.0
2
3//! Vectorized predicate evaluation on dictionary-encoded columns.
4//!
5//! Implements late materialization for `DictEncoded` columns: predicates are
6//! evaluated on compact integer IDs rather than decompressed strings. This
7//! turns `O(N * string_len)` evaluation into `O(dict_size * string_len + N)`
8//! — a large win when cardinality is low (the common case for dict-encoded
9//! columns).
10//!
11//! # Short-circuit paths
12//!
13//! - **eq, not-in-dict**: return all-zero mask immediately (O(1) reject)
14//! - **ne, not-in-dict**: return all-ones mask immediately (O(1) accept)
15//! - **contains, empty-match-set**: return all-zero mask immediately
16//!
17//! # Bitmask encoding
18//!
19//! Results are returned as `Vec<u64>` packed bitmasks: bit *i* is set if row
20//! *i* passes the predicate. Use `words_for(row_count)` to size the output.
21
22use std::collections::{HashMap, HashSet};
23
24use crate::memtable::ColumnData;
25use crate::reader::DecodedColumn;
26
27/// Unpacked view of a memtable `DictEncoded` column.
28///
29/// The validity slice is returned via `Cow` — `Borrowed` when the column is
30/// nullable (has an explicit bitmap), `Owned(all-true)` when non-nullable.
31type MemtableDict<'a> = (
32    &'a [u32],
33    &'a [String],
34    &'a HashMap<String, u32>,
35    std::borrow::Cow<'a, [bool]>,
36);
37
38// ---------------------------------------------------------------------------
39// Public API
40// ---------------------------------------------------------------------------
41
42/// Evaluate an equality predicate on a memtable `DictEncoded` column.
43///
44/// Returns `Some(mask)` where bit *i* is set if row *i* equals `value` and
45/// `valid[i]` is true. Returns `None` if `col` is not `DictEncoded`.
46pub fn dict_eval_eq(col: &ColumnData, value: &str, row_count: usize) -> Option<Vec<u64>> {
47    let (ids, _, reverse, valid) = unpack_memtable(col)?;
48    match reverse.get(value) {
49        None => Some(zero_mask(row_count)),
50        Some(&target_id) => Some(build_eq_mask(ids, &valid, target_id, row_count)),
51    }
52}
53
54/// Evaluate a not-equal predicate on a memtable `DictEncoded` column.
55///
56/// Returns `Some(mask)` where bit *i* is set if row *i* is not equal to
57/// `value` (or is NULL, which is treated as not matching the value).
58/// Returns `None` if `col` is not `DictEncoded`.
59pub fn dict_eval_ne(col: &ColumnData, value: &str, row_count: usize) -> Option<Vec<u64>> {
60    let (ids, _, reverse, valid) = unpack_memtable(col)?;
61    match reverse.get(value) {
62        None => Some(all_valid_mask(&valid, row_count)),
63        Some(&target_id) => Some(build_ne_mask(ids, &valid, target_id, row_count)),
64    }
65}
66
67/// Evaluate a substring-contains predicate on a memtable `DictEncoded` column.
68///
69/// All dictionary entries that contain `substr` are collected into a set of
70/// matching IDs (O(dict_size * string_len)), then rows are filtered by ID
71/// membership (O(N)).
72///
73/// Returns `None` if `col` is not `DictEncoded`.
74pub fn dict_eval_contains(col: &ColumnData, substr: &str, row_count: usize) -> Option<Vec<u64>> {
75    let (ids, dictionary, _, valid) = unpack_memtable(col)?;
76    let matching = matching_ids_contains(dictionary, substr);
77    if matching.is_empty() {
78        return Some(zero_mask(row_count));
79    }
80    Some(build_set_mask(ids, &valid, &matching, row_count))
81}
82
83/// Evaluate a LIKE predicate on a memtable `DictEncoded` column.
84///
85/// Supports `%` as wildcard (leading, trailing, both, none). Only simple
86/// patterns are supported; complex patterns with mid-string `%` fall back
87/// to `None` (caller should decompress and evaluate).
88///
89/// Returns `None` if `col` is not `DictEncoded` or the pattern is unsupported.
90pub fn dict_eval_like(col: &ColumnData, pattern: &str, row_count: usize) -> Option<Vec<u64>> {
91    let (ids, dictionary, _, valid) = unpack_memtable(col)?;
92    let matching = matching_ids_like(dictionary, pattern)?;
93    if matching.is_empty() {
94        return Some(zero_mask(row_count));
95    }
96    Some(build_set_mask(ids, &valid, &matching, row_count))
97}
98
99// ---------------------------------------------------------------------------
100// Decoded-column variants (from SegmentReader)
101// ---------------------------------------------------------------------------
102
103/// Evaluate an equality predicate on a `DecodedColumn::DictEncoded`.
104///
105/// The `DecodedColumn` variant carries no reverse map, so lookup is a linear
106/// scan of the dictionary — acceptable because dictionaries are small.
107pub fn decoded_dict_eval_eq(
108    col: &DecodedColumn,
109    value: &str,
110    row_count: usize,
111) -> Option<Vec<u64>> {
112    let (ids, dictionary, valid) = unpack_decoded(col)?;
113    match find_dict_id(dictionary, value) {
114        None => Some(zero_mask(row_count)),
115        Some(target_id) => Some(build_eq_mask(ids, valid, target_id, row_count)),
116    }
117}
118
119/// Evaluate a not-equal predicate on a `DecodedColumn::DictEncoded`.
120pub fn decoded_dict_eval_ne(
121    col: &DecodedColumn,
122    value: &str,
123    row_count: usize,
124) -> Option<Vec<u64>> {
125    let (ids, dictionary, valid) = unpack_decoded(col)?;
126    match find_dict_id(dictionary, value) {
127        None => Some(all_valid_mask(valid, row_count)),
128        Some(target_id) => Some(build_ne_mask(ids, valid, target_id, row_count)),
129    }
130}
131
132/// Evaluate a substring-contains predicate on a `DecodedColumn::DictEncoded`.
133pub fn decoded_dict_eval_contains(
134    col: &DecodedColumn,
135    substr: &str,
136    row_count: usize,
137) -> Option<Vec<u64>> {
138    let (ids, dictionary, valid) = unpack_decoded(col)?;
139    let matching = matching_ids_contains(dictionary, substr);
140    if matching.is_empty() {
141        return Some(zero_mask(row_count));
142    }
143    Some(build_set_mask(ids, valid, &matching, row_count))
144}
145
146/// Evaluate a LIKE predicate on a `DecodedColumn::DictEncoded`.
147pub fn decoded_dict_eval_like(
148    col: &DecodedColumn,
149    pattern: &str,
150    row_count: usize,
151) -> Option<Vec<u64>> {
152    let (ids, dictionary, valid) = unpack_decoded(col)?;
153    let matching = matching_ids_like(dictionary, pattern)?;
154    if matching.is_empty() {
155        return Some(zero_mask(row_count));
156    }
157    Some(build_set_mask(ids, valid, &matching, row_count))
158}
159
160// ---------------------------------------------------------------------------
161// Bitmask helpers (public, mirrors nodedb-query simd_filter API surface)
162// ---------------------------------------------------------------------------
163
164/// Number of `u64` words needed to hold `row_count` bits.
165#[inline]
166pub fn words_for(row_count: usize) -> usize {
167    row_count.div_ceil(64)
168}
169
170/// Bitwise AND of two equal-length bitmasks.
171pub fn bitmask_and(a: &[u64], b: &[u64]) -> Vec<u64> {
172    let len = a.len().min(b.len());
173    let mut out = vec![0u64; len];
174    for i in 0..len {
175        out[i] = a[i] & b[i];
176    }
177    out
178}
179
180/// All-ones bitmask for `row_count` rows.
181pub fn bitmask_all(row_count: usize) -> Vec<u64> {
182    let words = words_for(row_count);
183    let mut out = vec![u64::MAX; words];
184    let tail = row_count % 64;
185    if tail > 0 && !out.is_empty() {
186        *out.last_mut().expect("non-empty") = (1u64 << tail) - 1;
187    }
188    out
189}
190
191// ---------------------------------------------------------------------------
192// Internal helpers
193// ---------------------------------------------------------------------------
194
195/// Destructure a memtable `ColumnData::DictEncoded`.
196fn unpack_memtable(col: &ColumnData) -> Option<MemtableDict<'_>> {
197    if let ColumnData::DictEncoded {
198        ids,
199        dictionary,
200        reverse,
201        valid,
202    } = col
203    {
204        let validity = match valid {
205            Some(v) => std::borrow::Cow::Borrowed(v.as_slice()),
206            None => std::borrow::Cow::Owned(vec![true; ids.len()]),
207        };
208        Some((ids.as_slice(), dictionary.as_slice(), reverse, validity))
209    } else {
210        None
211    }
212}
213
214/// Destructure a `DecodedColumn::DictEncoded`.
215fn unpack_decoded(col: &DecodedColumn) -> Option<(&[u32], &[String], &[bool])> {
216    if let DecodedColumn::DictEncoded {
217        ids,
218        dictionary,
219        valid,
220    } = col
221    {
222        Some((ids.as_slice(), dictionary.as_slice(), valid.as_slice()))
223    } else {
224        None
225    }
226}
227
228/// Linear scan to find the ID of `value` in a dictionary.
229fn find_dict_id(dictionary: &[String], value: &str) -> Option<u32> {
230    dictionary.iter().position(|s| s == value).map(|i| i as u32)
231}
232
233/// Collect IDs of all dictionary entries that contain `substr`.
234fn matching_ids_contains(dictionary: &[String], substr: &str) -> HashSet<u32> {
235    dictionary
236        .iter()
237        .enumerate()
238        .filter(|(_, s)| s.contains(substr))
239        .map(|(i, _)| i as u32)
240        .collect()
241}
242
243/// Collect IDs matching a simple LIKE pattern (leading/trailing `%` only).
244///
245/// Returns `None` for unsupported patterns (mid-string `%`).
246fn matching_ids_like(dictionary: &[String], pattern: &str) -> Option<HashSet<u32>> {
247    let matching = match (pattern.starts_with('%'), pattern.ends_with('%')) {
248        (true, true) => {
249            // %substr%
250            let inner = pattern.trim_matches('%');
251            if inner.contains('%') {
252                return None; // mid-string wildcard
253            }
254            dictionary
255                .iter()
256                .enumerate()
257                .filter(|(_, s)| s.contains(inner))
258                .map(|(i, _)| i as u32)
259                .collect()
260        }
261        (true, false) => {
262            // %suffix
263            let suffix = &pattern[1..];
264            if suffix.contains('%') {
265                return None;
266            }
267            dictionary
268                .iter()
269                .enumerate()
270                .filter(|(_, s)| s.ends_with(suffix))
271                .map(|(i, _)| i as u32)
272                .collect()
273        }
274        (false, true) => {
275            // prefix%
276            let prefix = &pattern[..pattern.len() - 1];
277            if prefix.contains('%') {
278                return None;
279            }
280            dictionary
281                .iter()
282                .enumerate()
283                .filter(|(_, s)| s.starts_with(prefix))
284                .map(|(i, _)| i as u32)
285                .collect()
286        }
287        (false, false) => {
288            // exact match — no wildcards
289            if pattern.contains('%') {
290                return None;
291            }
292            dictionary
293                .iter()
294                .enumerate()
295                .filter(|(_, s)| s.as_str() == pattern)
296                .map(|(i, _)| i as u32)
297                .collect()
298        }
299    };
300    Some(matching)
301}
302
303/// Build an equality bitmask: bit i set iff `valid[i] && ids[i] == target_id`.
304fn build_eq_mask(ids: &[u32], valid: &[bool], target_id: u32, row_count: usize) -> Vec<u64> {
305    let words = words_for(row_count);
306    let mut mask = vec![0u64; words];
307    let n = row_count.min(ids.len()).min(valid.len());
308    for i in 0..n {
309        if valid[i] && ids[i] == target_id {
310            mask[i / 64] |= 1u64 << (i % 64);
311        }
312    }
313    mask
314}
315
316/// Build a not-equal bitmask: bit i set iff `valid[i] && ids[i] != target_id`.
317fn build_ne_mask(ids: &[u32], valid: &[bool], target_id: u32, row_count: usize) -> Vec<u64> {
318    let words = words_for(row_count);
319    let mut mask = vec![0u64; words];
320    let n = row_count.min(ids.len()).min(valid.len());
321    for i in 0..n {
322        if valid[i] && ids[i] != target_id {
323            mask[i / 64] |= 1u64 << (i % 64);
324        }
325    }
326    mask
327}
328
329/// Build a set-membership bitmask: bit i set iff `valid[i] && matching.contains(&ids[i])`.
330fn build_set_mask(
331    ids: &[u32],
332    valid: &[bool],
333    matching: &HashSet<u32>,
334    row_count: usize,
335) -> Vec<u64> {
336    let words = words_for(row_count);
337    let mut mask = vec![0u64; words];
338    let n = row_count.min(ids.len()).min(valid.len());
339    for i in 0..n {
340        if valid[i] && matching.contains(&ids[i]) {
341            mask[i / 64] |= 1u64 << (i % 64);
342        }
343    }
344    mask
345}
346
347/// All-zeros bitmask of the correct size.
348#[inline]
349fn zero_mask(row_count: usize) -> Vec<u64> {
350    vec![0u64; words_for(row_count)]
351}
352
353/// Bitmask where bit i is set for all valid (non-null) rows.
354fn all_valid_mask(valid: &[bool], row_count: usize) -> Vec<u64> {
355    let words = words_for(row_count);
356    let mut mask = vec![0u64; words];
357    let n = row_count.min(valid.len());
358    for i in 0..n {
359        if valid[i] {
360            mask[i / 64] |= 1u64 << (i % 64);
361        }
362    }
363    mask
364}
365
366// ---------------------------------------------------------------------------
367// Tests
368// ---------------------------------------------------------------------------
369
370#[cfg(test)]
371mod tests {
372    use super::*;
373    use crate::memtable::ColumnData;
374    use crate::reader::DecodedColumn;
375    use std::collections::HashMap;
376
377    fn make_dict_col(values: &[Option<&str>]) -> ColumnData {
378        let mut dictionary: Vec<String> = Vec::new();
379        let mut reverse: HashMap<String, u32> = HashMap::new();
380        let mut ids: Vec<u32> = Vec::new();
381        let mut valid: Vec<bool> = Vec::new();
382
383        for opt in values {
384            match opt {
385                None => {
386                    ids.push(0);
387                    valid.push(false);
388                }
389                Some(s) => {
390                    let id = if let Some(&existing) = reverse.get(*s) {
391                        existing
392                    } else {
393                        let new_id = dictionary.len() as u32;
394                        dictionary.push(s.to_string());
395                        reverse.insert(s.to_string(), new_id);
396                        new_id
397                    };
398                    ids.push(id);
399                    valid.push(true);
400                }
401            }
402        }
403
404        ColumnData::DictEncoded {
405            ids,
406            dictionary,
407            reverse,
408            valid: Some(valid),
409        }
410    }
411
412    fn make_decoded_col(values: &[Option<&str>]) -> DecodedColumn {
413        let mut dictionary: Vec<String> = Vec::new();
414        let mut id_map: HashMap<String, u32> = HashMap::new();
415        let mut ids: Vec<u32> = Vec::new();
416        let mut valid: Vec<bool> = Vec::new();
417
418        for opt in values {
419            match opt {
420                None => {
421                    ids.push(0);
422                    valid.push(false);
423                }
424                Some(s) => {
425                    let id = if let Some(&existing) = id_map.get(*s) {
426                        existing
427                    } else {
428                        let new_id = dictionary.len() as u32;
429                        dictionary.push(s.to_string());
430                        id_map.insert(s.to_string(), new_id);
431                        new_id
432                    };
433                    ids.push(id);
434                    valid.push(true);
435                }
436            }
437        }
438
439        DecodedColumn::DictEncoded {
440            ids,
441            dictionary,
442            valid,
443        }
444    }
445
446    fn bits(mask: &[u64], row_count: usize) -> Vec<bool> {
447        (0..row_count)
448            .map(|i| (mask[i / 64] >> (i % 64)) & 1 == 1)
449            .collect()
450    }
451
452    // ---- eq ----------------------------------------------------------------
453
454    #[test]
455    fn dict_eq_match() {
456        let col = make_dict_col(&[Some("web"), Some("db"), Some("web"), Some("cache")]);
457        let mask = dict_eval_eq(&col, "web", 4).unwrap();
458        assert_eq!(bits(&mask, 4), vec![true, false, true, false]);
459    }
460
461    #[test]
462    fn dict_eq_value_not_in_dict_returns_zero_mask() {
463        let col = make_dict_col(&[Some("web"), Some("db")]);
464        let mask = dict_eval_eq(&col, "missing", 2).unwrap();
465        assert_eq!(bits(&mask, 2), vec![false, false]);
466    }
467
468    #[test]
469    fn dict_eq_null_rows_excluded() {
470        let col = make_dict_col(&[Some("web"), None, Some("web")]);
471        let mask = dict_eval_eq(&col, "web", 3).unwrap();
472        assert_eq!(bits(&mask, 3), vec![true, false, true]);
473    }
474
475    // ---- ne ----------------------------------------------------------------
476
477    #[test]
478    fn dict_ne_basic() {
479        let col = make_dict_col(&[Some("web"), Some("db"), Some("web")]);
480        let mask = dict_eval_ne(&col, "web", 3).unwrap();
481        assert_eq!(bits(&mask, 3), vec![false, true, false]);
482    }
483
484    #[test]
485    fn dict_ne_value_not_in_dict_all_valid_rows_pass() {
486        let col = make_dict_col(&[Some("web"), None, Some("db")]);
487        let mask = dict_eval_ne(&col, "missing", 3).unwrap();
488        // NULL row (index 1) does NOT pass.
489        assert_eq!(bits(&mask, 3), vec![true, false, true]);
490    }
491
492    // ---- contains ----------------------------------------------------------
493
494    #[test]
495    fn dict_contains_basic() {
496        let col = make_dict_col(&[Some("web-1"), Some("db-1"), Some("web-2"), Some("cache")]);
497        let mask = dict_eval_contains(&col, "web", 4).unwrap();
498        assert_eq!(bits(&mask, 4), vec![true, false, true, false]);
499    }
500
501    #[test]
502    fn dict_contains_no_match_zero_mask() {
503        let col = make_dict_col(&[Some("alpha"), Some("beta")]);
504        let mask = dict_eval_contains(&col, "gamma", 2).unwrap();
505        assert_eq!(bits(&mask, 2), vec![false, false]);
506    }
507
508    // ---- like --------------------------------------------------------------
509
510    #[test]
511    fn dict_like_prefix_wildcard() {
512        let col = make_dict_col(&[Some("web-1"), Some("db-1"), Some("web-2")]);
513        let mask = dict_eval_like(&col, "web%", 3).unwrap();
514        assert_eq!(bits(&mask, 3), vec![true, false, true]);
515    }
516
517    #[test]
518    fn dict_like_suffix_wildcard() {
519        let col = make_dict_col(&[Some("alpha-web"), Some("beta-db"), Some("gamma-web")]);
520        let mask = dict_eval_like(&col, "%web", 3).unwrap();
521        assert_eq!(bits(&mask, 3), vec![true, false, true]);
522    }
523
524    #[test]
525    fn dict_like_both_wildcards() {
526        let col = make_dict_col(&[Some("alpha-web-1"), Some("beta-db"), Some("gamma-web-2")]);
527        let mask = dict_eval_like(&col, "%web%", 3).unwrap();
528        assert_eq!(bits(&mask, 3), vec![true, false, true]);
529    }
530
531    #[test]
532    fn dict_like_exact_no_wildcards() {
533        let col = make_dict_col(&[Some("exact"), Some("other")]);
534        let mask = dict_eval_like(&col, "exact", 2).unwrap();
535        assert_eq!(bits(&mask, 2), vec![true, false]);
536    }
537
538    #[test]
539    fn dict_like_unsupported_mid_wildcard_returns_none() {
540        let col = make_dict_col(&[Some("abc")]);
541        assert!(dict_eval_like(&col, "a%c", 1).is_none());
542    }
543
544    // ---- decoded column ----------------------------------------------------
545
546    #[test]
547    fn decoded_dict_eq_match() {
548        let col = make_decoded_col(&[Some("web"), Some("db"), Some("web")]);
549        let mask = decoded_dict_eval_eq(&col, "web", 3).unwrap();
550        assert_eq!(bits(&mask, 3), vec![true, false, true]);
551    }
552
553    #[test]
554    fn decoded_dict_eq_not_in_dict() {
555        let col = make_decoded_col(&[Some("web"), Some("db")]);
556        let mask = decoded_dict_eval_eq(&col, "missing", 2).unwrap();
557        assert_eq!(bits(&mask, 2), vec![false, false]);
558    }
559
560    #[test]
561    fn decoded_dict_ne_not_in_dict_all_valid_pass() {
562        let col = make_decoded_col(&[Some("a"), None, Some("b")]);
563        let mask = decoded_dict_eval_ne(&col, "missing", 3).unwrap();
564        assert_eq!(bits(&mask, 3), vec![true, false, true]);
565    }
566
567    #[test]
568    fn decoded_dict_contains() {
569        let col = make_decoded_col(&[Some("web-1"), Some("db"), Some("web-2")]);
570        let mask = decoded_dict_eval_contains(&col, "web", 3).unwrap();
571        assert_eq!(bits(&mask, 3), vec![true, false, true]);
572    }
573
574    #[test]
575    fn decoded_dict_like() {
576        let col = make_decoded_col(&[Some("web-1"), Some("db"), Some("web-2")]);
577        let mask = decoded_dict_eval_like(&col, "web%", 3).unwrap();
578        assert_eq!(bits(&mask, 3), vec![true, false, true]);
579    }
580
581    // ---- short-circuit and bitmask helpers ---------------------------------
582
583    #[test]
584    fn bitmask_all_correct_tail_bits() {
585        // row_count=65: word0 covers rows 0..63 (all set), word1 covers row 64 only (bit 0).
586        let mask = bitmask_all(65);
587        assert_eq!(mask.len(), 2);
588        assert_eq!(mask[0], u64::MAX);
589        assert_eq!(mask[1], 1u64); // only bit 0 set in last word (row 64)
590
591        // row_count=66: word1 covers rows 64 and 65 → bits 0 and 1.
592        let mask66 = bitmask_all(66);
593        assert_eq!(mask66[1], 0b11u64);
594    }
595
596    #[test]
597    fn words_for_alignment() {
598        assert_eq!(words_for(0), 0);
599        assert_eq!(words_for(1), 1);
600        assert_eq!(words_for(64), 1);
601        assert_eq!(words_for(65), 2);
602    }
603
604    #[test]
605    fn non_dict_encoded_col_returns_none() {
606        let col = ColumnData::Int64 {
607            values: vec![1, 2, 3],
608            valid: Some(vec![true, true, true]),
609        };
610        assert!(dict_eval_eq(&col, "x", 3).is_none());
611        assert!(dict_eval_ne(&col, "x", 3).is_none());
612        assert!(dict_eval_contains(&col, "x", 3).is_none());
613        assert!(dict_eval_like(&col, "x%", 3).is_none());
614    }
615}