Skip to main content

grafeo_core/codec/
dictionary.rs

1//! Dictionary encoding for repeated strings.
2//!
3//! If your data has lots of repeated strings (like node labels or edge types),
4//! dictionary encoding stores each unique string once and references it by a
5//! small integer code. A million "Person" labels becomes one string + a million
6//! 4-byte codes instead of a million strings.
7//!
8//! # Example
9//!
10//! ```no_run
11//! # use grafeo_core::codec::dictionary::DictionaryBuilder;
12//! let mut builder = DictionaryBuilder::new();
13//! builder.add("Person");
14//! builder.add("Company");
15//! builder.add("Person");  // same as first - reuses code 0
16//! builder.add("Person");  // reuses code 0 again
17//!
18//! let dict = builder.build();
19//! // Dictionary: ["Person", "Company"]
20//! // Codes:      [0, 1, 0, 0]
21//! assert_eq!(dict.dictionary_size(), 2);  // Only 2 unique strings stored
22//! ```
23
24use std::collections::HashMap;
25use std::sync::Arc;
26
27use bytes::{Bytes, BytesMut};
28
29/// Reads an LE u32 at byte offset `idx * 4`. Returns `None` if out of range.
30#[inline]
31fn read_code_at(bytes: &Bytes, idx: usize) -> Option<u32> {
32    let start = idx.checked_mul(4)?;
33    let end = start.checked_add(4)?;
34    let chunk: [u8; 4] = bytes.get(start..end)?.try_into().ok()?;
35    Some(u32::from_le_bytes(chunk))
36}
37
38/// Encodes `codes` as LE u32 bytes wrapped in a refcounted `Bytes`.
39fn codes_to_bytes(codes: &[u32]) -> Bytes {
40    let mut buf = BytesMut::with_capacity(codes.len() * 4);
41    for &c in codes {
42        buf.extend_from_slice(&c.to_le_bytes());
43    }
44    buf.freeze()
45}
46
47/// Reads a single bit from a u64-word bitmap encoded as LE bytes.
48#[inline]
49fn null_bit_at(bitmap: &Bytes, index: usize) -> bool {
50    let word_idx = index / 64;
51    let bit_idx = index % 64;
52    let start = word_idx * 8;
53    let Some(chunk) = bitmap.get(start..start + 8) else {
54        return false;
55    };
56    let Ok(arr): Result<[u8; 8], _> = chunk.try_into() else {
57        return false;
58    };
59    (u64::from_le_bytes(arr) & (1 << bit_idx)) != 0
60}
61
62/// Two-variant backing for u32 dictionary codes.
63#[derive(Debug, Clone)]
64enum CodeStore {
65    Inline(Vec<u32>),
66    Mapped(Bytes),
67}
68
69impl CodeStore {
70    #[inline]
71    fn as_slice(&self) -> Option<&[u32]> {
72        match self {
73            Self::Inline(v) => Some(v.as_slice()),
74            Self::Mapped(_) => None,
75        }
76    }
77
78    #[inline]
79    fn code_at(&self, idx: usize) -> Option<u32> {
80        match self {
81            Self::Inline(v) => v.get(idx).copied(),
82            Self::Mapped(b) => read_code_at(b, idx),
83        }
84    }
85
86    fn to_bytes(&self) -> Bytes {
87        match self {
88            Self::Inline(v) => codes_to_bytes(v),
89            Self::Mapped(b) => b.clone(),
90        }
91    }
92
93    fn byte_len(&self) -> usize {
94        match self {
95            Self::Inline(v) => v.len() * 4,
96            Self::Mapped(b) => b.len(),
97        }
98    }
99
100    #[inline]
101    fn len_codes(&self, code_count: usize) -> usize {
102        match self {
103            Self::Inline(v) => v.len(),
104            Self::Mapped(_) => code_count,
105        }
106    }
107}
108
109/// Two-variant backing for the LE-u64 null bitmap.
110#[derive(Debug, Clone)]
111enum NullBitmap {
112    Inline(Vec<u64>),
113    Mapped(Bytes),
114}
115
116impl NullBitmap {
117    #[inline]
118    fn as_slice(&self) -> Option<&[u64]> {
119        match self {
120            Self::Inline(v) => Some(v.as_slice()),
121            Self::Mapped(_) => None,
122        }
123    }
124
125    #[inline]
126    fn is_null(&self, index: usize) -> bool {
127        match self {
128            Self::Inline(words) => {
129                let word_idx = index / 64;
130                let bit_idx = index % 64;
131                words
132                    .get(word_idx)
133                    .is_some_and(|w| (*w & (1u64 << bit_idx)) != 0)
134            }
135            Self::Mapped(b) => null_bit_at(b, index),
136        }
137    }
138}
139
140/// Stores repeated strings efficiently by referencing them with integer codes.
141///
142/// Each unique string appears once in the dictionary. Values are stored as
143/// LE u32 indices pointing into that dictionary, refcounted as
144/// [`bytes::Bytes`] so heap-owned and mmap-backed columns share the same
145/// type (revised D7).
146#[derive(Debug, Clone)]
147pub struct DictionaryEncoding {
148    /// The dictionary of unique strings.
149    dictionary: Arc<[Arc<str>]>,
150    /// Encoded values: `Inline(Vec<u32>)` or `Mapped(Bytes)`.
151    codes: CodeStore,
152    /// Number of code values.
153    code_count: usize,
154    /// Optional null bitmap: `Inline(Vec<u64>)` or `Mapped(Bytes)`.
155    null_bitmap: Option<NullBitmap>,
156}
157
158impl DictionaryEncoding {
159    /// Creates a new dictionary encoding from a dictionary and codes (legacy
160    /// `Vec<u32>` input).
161    pub fn new(dictionary: Arc<[Arc<str>]>, codes: Vec<u32>) -> Self {
162        let code_count = codes.len();
163        Self {
164            dictionary,
165            codes: CodeStore::Inline(codes),
166            code_count,
167            null_bitmap: None,
168        }
169    }
170
171    /// Constructs a dictionary encoding from pre-encoded bytes (Phase 3c
172    /// entry point).
173    ///
174    /// `codes_bytes` must be `code_count * 4` bytes of LE u32 values.
175    pub fn from_bytes_storage(
176        dictionary: Arc<[Arc<str>]>,
177        codes_bytes: Bytes,
178        code_count: usize,
179    ) -> Self {
180        Self {
181            dictionary,
182            codes: CodeStore::Mapped(codes_bytes),
183            code_count,
184            null_bitmap: None,
185        }
186    }
187
188    /// Adds a null bitmap to this encoding (legacy `Vec<u64>` input).
189    pub fn with_nulls(mut self, null_bitmap: Vec<u64>) -> Self {
190        self.null_bitmap = Some(NullBitmap::Inline(null_bitmap));
191        self
192    }
193
194    /// Adds a pre-encoded null bitmap (Phase 3c entry point).
195    pub fn with_null_bytes(mut self, null_bitmap: Bytes) -> Self {
196        self.null_bitmap = Some(NullBitmap::Mapped(null_bitmap));
197        self
198    }
199
200    /// Returns the number of values.
201    pub fn len(&self) -> usize {
202        self.codes.len_codes(self.code_count)
203    }
204
205    /// Returns whether the encoding is empty.
206    pub fn is_empty(&self) -> bool {
207        self.codes.len_codes(self.code_count) == 0
208    }
209
210    /// Returns the number of unique strings in the dictionary.
211    pub fn dictionary_size(&self) -> usize {
212        self.dictionary.len()
213    }
214
215    /// Returns the dictionary.
216    pub fn dictionary(&self) -> &Arc<[Arc<str>]> {
217        &self.dictionary
218    }
219
220    /// Returns the encoded codes as raw LE u32 bytes (always materialised).
221    ///
222    /// Phase 3b: codes storage is `bytes::Bytes`. Use [`Self::code_at`] for
223    /// indexed access; this returns the raw byte storage for serializers
224    /// that write the storage out directly.
225    #[must_use]
226    pub fn codes_bytes(&self) -> Bytes {
227        self.codes.to_bytes()
228    }
229
230    /// Returns a direct `&[u32]` slice when the codes live in RAM.
231    #[must_use]
232    #[inline]
233    pub fn as_codes_slice(&self) -> Option<&[u32]> {
234        self.codes.as_slice()
235    }
236
237    /// Returns a direct `&[u64]` view of the null bitmap when it lives in
238    /// RAM. `None` when there is no null bitmap or the bitmap is mmap-backed.
239    #[must_use]
240    #[inline]
241    pub fn as_null_words_slice(&self) -> Option<&[u64]> {
242        self.null_bitmap.as_ref().and_then(NullBitmap::as_slice)
243    }
244
245    /// Number of u32 codes stored.
246    pub fn code_count(&self) -> usize {
247        self.code_count
248    }
249
250    /// Returns the code at `idx`, or `None` if out of range.
251    #[must_use]
252    #[inline]
253    pub fn code_at(&self, idx: usize) -> Option<u32> {
254        self.codes.code_at(idx)
255    }
256
257    /// Returns the codes as a materialized `Vec<u32>` (allocates).
258    ///
259    /// Prefer [`Self::code_at`] or [`Self::code_count`] for reads. This exists
260    /// for callers that need a contiguous slice and accept the allocation
261    /// (e.g., legacy serialization paths).
262    pub fn codes(&self) -> Vec<u32> {
263        (0..self.code_count)
264            .map(|i| self.codes.code_at(i).unwrap_or(0))
265            .collect()
266    }
267
268    /// Returns whether the value at index is null.
269    #[must_use]
270    #[inline]
271    pub fn is_null(&self, index: usize) -> bool {
272        match &self.null_bitmap {
273            Some(b) => b.is_null(index),
274            None => false,
275        }
276    }
277
278    /// Returns the string value at the given index.
279    ///
280    /// Returns `None` if the value is null.
281    pub fn get(&self, index: usize) -> Option<&str> {
282        if self.is_null(index) {
283            return None;
284        }
285        let code = self.code_at(index)?;
286        self.dictionary.get(code as usize).map(|s| s.as_ref())
287    }
288
289    /// Returns the code at the given index.
290    pub fn get_code(&self, index: usize) -> Option<u32> {
291        if self.is_null(index) {
292            return None;
293        }
294        self.code_at(index)
295    }
296
297    /// Iterates over all values, yielding `Option<&str>`.
298    pub fn iter(&self) -> impl Iterator<Item = Option<&str>> {
299        (0..self.len()).map(move |i| self.get(i))
300    }
301
302    /// Returns the compression ratio (original size / compressed size).
303    pub fn compression_ratio(&self) -> f64 {
304        if self.is_empty() {
305            return 1.0;
306        }
307
308        // Estimate original size: sum of string lengths
309        let original_size: usize = (0..self.code_count)
310            .map(|i| {
311                let code = self.codes.code_at(i).unwrap_or(0) as usize;
312                self.dictionary.get(code).map_or(0, |s| s.len())
313            })
314            .sum();
315
316        // Compressed size: dictionary + codes
317        let dict_size: usize = self.dictionary.iter().map(|s| s.len()).sum();
318        let codes_size = self.codes.byte_len();
319        let compressed_size = dict_size + codes_size;
320
321        if compressed_size == 0 {
322            return 1.0;
323        }
324
325        original_size as f64 / compressed_size as f64
326    }
327
328    /// Encodes a lookup value into a code, if it exists in the dictionary.
329    pub fn encode(&self, value: &str) -> Option<u32> {
330        self.dictionary
331            .iter()
332            .position(|s| s.as_ref() == value)
333            .and_then(|i| u32::try_from(i).ok())
334    }
335
336    /// Returns the row offsets where the code matches `predicate` and the
337    /// row is not null.
338    ///
339    /// Branches once on the storage variants so the per-row body is a tight
340    /// loop over native slices in the common in-memory case.
341    pub fn filter_by_code(&self, predicate: impl Fn(u32) -> bool) -> Vec<usize> {
342        if self.code_count == 0 {
343            return Vec::new();
344        }
345
346        let null_words: Option<&[u64]> = self.as_null_words_slice();
347
348        match (self.codes.as_slice(), null_words, &self.null_bitmap) {
349            // Hot path: codes inline, no nulls.
350            (Some(codes), _, None) => codes
351                .iter()
352                .enumerate()
353                .filter_map(|(i, &c)| predicate(c).then_some(i))
354                .collect(),
355
356            // Hot path: codes inline, null bitmap inline.
357            (Some(codes), Some(nulls), Some(_)) => codes
358                .iter()
359                .enumerate()
360                .filter_map(|(i, &c)| {
361                    let word = nulls.get(i / 64).copied().unwrap_or(0);
362                    let is_null = (word & (1u64 << (i % 64))) != 0;
363                    (!is_null && predicate(c)).then_some(i)
364                })
365                .collect(),
366
367            // Mixed / mmap path: fall back to per-element decode.
368            _ => {
369                let mut out = Vec::new();
370                for i in 0..self.code_count {
371                    if self.is_null(i) {
372                        continue;
373                    }
374                    if let Some(c) = self.codes.code_at(i)
375                        && predicate(c)
376                    {
377                        out.push(i);
378                    }
379                }
380                out
381            }
382        }
383    }
384}
385
386/// Builds a dictionary encoding by streaming values through.
387///
388/// Call [`add()`](Self::add) for each value - we'll automatically assign codes
389/// and build the dictionary. Then [`build()`](Self::build) to get the final encoding.
390#[derive(Debug)]
391pub struct DictionaryBuilder {
392    /// Map from string to code.
393    string_to_code: HashMap<Arc<str>, u32>,
394    /// Dictionary (code -> string).
395    dictionary: Vec<Arc<str>>,
396    /// Encoded values.
397    codes: Vec<u32>,
398    /// Null positions (for marking nulls).
399    null_positions: Vec<usize>,
400}
401
402impl DictionaryBuilder {
403    /// Creates a new dictionary builder.
404    pub fn new() -> Self {
405        Self {
406            string_to_code: HashMap::new(),
407            dictionary: Vec::new(),
408            codes: Vec::new(),
409            null_positions: Vec::new(),
410        }
411    }
412
413    /// Creates a new dictionary builder with estimated capacity.
414    pub fn with_capacity(value_capacity: usize, dictionary_capacity: usize) -> Self {
415        Self {
416            string_to_code: HashMap::with_capacity(dictionary_capacity),
417            dictionary: Vec::with_capacity(dictionary_capacity),
418            codes: Vec::with_capacity(value_capacity),
419            null_positions: Vec::new(),
420        }
421    }
422
423    /// Adds a string value to the encoding.
424    ///
425    /// Returns the code assigned to this value.
426    pub fn add(&mut self, value: &str) -> u32 {
427        if let Some(&code) = self.string_to_code.get(value) {
428            self.codes.push(code);
429            code
430        } else {
431            // reason: dictionary size is bounded by u32 (codes are u32)
432            #[allow(clippy::cast_possible_truncation)]
433            let code = self.dictionary.len() as u32;
434            let arc_value: Arc<str> = value.into();
435            self.string_to_code.insert(arc_value.clone(), code);
436            self.dictionary.push(arc_value);
437            self.codes.push(code);
438            code
439        }
440    }
441
442    /// Adds a null value.
443    pub fn add_null(&mut self) {
444        let idx = self.codes.len();
445        self.null_positions.push(idx);
446        self.codes.push(0); // Placeholder code
447    }
448
449    /// Adds an optional value.
450    pub fn add_optional(&mut self, value: Option<&str>) -> Option<u32> {
451        match value {
452            Some(v) => Some(self.add(v)),
453            None => {
454                self.add_null();
455                None
456            }
457        }
458    }
459
460    /// Returns the current number of values.
461    pub fn len(&self) -> usize {
462        self.codes.len()
463    }
464
465    /// Returns whether the builder is empty.
466    pub fn is_empty(&self) -> bool {
467        self.codes.is_empty()
468    }
469
470    /// Returns the current dictionary size.
471    pub fn dictionary_size(&self) -> usize {
472        self.dictionary.len()
473    }
474
475    /// Builds the dictionary encoding.
476    pub fn build(self) -> DictionaryEncoding {
477        let null_bitmap = if self.null_positions.is_empty() {
478            None
479        } else {
480            let num_words = (self.codes.len() + 63) / 64;
481            let mut bitmap = vec![0u64; num_words];
482            for &pos in &self.null_positions {
483                let word_idx = pos / 64;
484                let bit_idx = pos % 64;
485                bitmap[word_idx] |= 1 << bit_idx;
486            }
487            Some(bitmap)
488        };
489
490        let dict: Arc<[Arc<str>]> = self.dictionary.into();
491
492        let mut encoding = DictionaryEncoding::new(dict, self.codes);
493        if let Some(bitmap) = null_bitmap {
494            encoding = encoding.with_nulls(bitmap);
495        }
496        encoding
497    }
498
499    /// Clears the builder for reuse.
500    pub fn clear(&mut self) {
501        self.string_to_code.clear();
502        self.dictionary.clear();
503        self.codes.clear();
504        self.null_positions.clear();
505    }
506}
507
508impl Default for DictionaryBuilder {
509    fn default() -> Self {
510        Self::new()
511    }
512}
513
514/// Extension trait for building dictionary encodings from iterators.
515pub trait IntoDictionaryEncoding {
516    /// Creates a dictionary encoding from an iterator of strings.
517    fn into_dictionary_encoding(self) -> DictionaryEncoding;
518}
519
520impl<'a, I> IntoDictionaryEncoding for I
521where
522    I: IntoIterator<Item = &'a str>,
523{
524    fn into_dictionary_encoding(self) -> DictionaryEncoding {
525        let mut builder = DictionaryBuilder::new();
526        for s in self {
527            builder.add(s);
528        }
529        builder.build()
530    }
531}
532
533#[cfg(test)]
534mod tests {
535    use super::*;
536
537    #[test]
538    fn test_dictionary_builder_basic() {
539        let mut builder = DictionaryBuilder::new();
540        builder.add("apple");
541        builder.add("banana");
542        builder.add("apple");
543        builder.add("cherry");
544        builder.add("apple");
545
546        let dict = builder.build();
547
548        assert_eq!(dict.len(), 5);
549        assert_eq!(dict.dictionary_size(), 3);
550
551        assert_eq!(dict.get(0), Some("apple"));
552        assert_eq!(dict.get(1), Some("banana"));
553        assert_eq!(dict.get(2), Some("apple"));
554        assert_eq!(dict.get(3), Some("cherry"));
555        assert_eq!(dict.get(4), Some("apple"));
556    }
557
558    #[test]
559    fn test_dictionary_codes() {
560        let mut builder = DictionaryBuilder::new();
561        let code_apple = builder.add("apple");
562        let code_banana = builder.add("banana");
563        let code_apple2 = builder.add("apple");
564
565        assert_eq!(code_apple, code_apple2);
566        assert_ne!(code_apple, code_banana);
567
568        let dict = builder.build();
569        assert_eq!(dict.codes(), vec![0, 1, 0]);
570    }
571
572    #[test]
573    fn test_dictionary_with_nulls() {
574        let mut builder = DictionaryBuilder::new();
575        builder.add("apple");
576        builder.add_null();
577        builder.add("banana");
578        builder.add_null();
579
580        let dict = builder.build();
581
582        assert_eq!(dict.len(), 4);
583        assert_eq!(dict.get(0), Some("apple"));
584        assert_eq!(dict.get(1), None);
585        assert!(dict.is_null(1));
586        assert_eq!(dict.get(2), Some("banana"));
587        assert_eq!(dict.get(3), None);
588        assert!(dict.is_null(3));
589    }
590
591    #[test]
592    fn test_dictionary_encode_lookup() {
593        let mut builder = DictionaryBuilder::new();
594        builder.add("apple");
595        builder.add("banana");
596        builder.add("cherry");
597
598        let dict = builder.build();
599
600        assert_eq!(dict.encode("apple"), Some(0));
601        assert_eq!(dict.encode("banana"), Some(1));
602        assert_eq!(dict.encode("cherry"), Some(2));
603        assert_eq!(dict.encode("date"), None);
604    }
605
606    #[test]
607    fn test_dictionary_filter_by_code() {
608        let mut builder = DictionaryBuilder::new();
609        builder.add("apple");
610        builder.add("banana");
611        builder.add("apple");
612        builder.add("cherry");
613        builder.add("apple");
614
615        let dict = builder.build();
616        let apple_code = dict.encode("apple").unwrap();
617
618        let indices = dict.filter_by_code(|code| code == apple_code);
619        assert_eq!(indices, vec![0, 2, 4]);
620    }
621
622    #[test]
623    fn test_compression_ratio() {
624        let mut builder = DictionaryBuilder::new();
625
626        // Add many repeated long strings
627        for _ in 0..100 {
628            builder.add("this_is_a_very_long_string_that_repeats_many_times");
629        }
630
631        let dict = builder.build();
632
633        // Compression ratio should be > 1 for highly repetitive data
634        let ratio = dict.compression_ratio();
635        assert!(ratio > 1.0, "Expected compression ratio > 1, got {}", ratio);
636    }
637
638    #[test]
639    fn test_into_dictionary_encoding() {
640        let strings = vec!["apple", "banana", "apple", "cherry"];
641        let dict: DictionaryEncoding = strings.into_iter().into_dictionary_encoding();
642
643        assert_eq!(dict.len(), 4);
644        assert_eq!(dict.dictionary_size(), 3);
645    }
646
647    #[test]
648    fn test_empty_dictionary() {
649        let builder = DictionaryBuilder::new();
650        let dict = builder.build();
651
652        assert!(dict.is_empty());
653        assert_eq!(dict.dictionary_size(), 0);
654        assert_eq!(dict.get(0), None);
655    }
656
657    #[test]
658    fn test_single_value() {
659        let mut builder = DictionaryBuilder::new();
660        builder.add("only_value");
661
662        let dict = builder.build();
663
664        assert_eq!(dict.len(), 1);
665        assert_eq!(dict.dictionary_size(), 1);
666        assert_eq!(dict.get(0), Some("only_value"));
667    }
668
669    #[test]
670    fn test_all_unique() {
671        let mut builder = DictionaryBuilder::new();
672        builder.add("a");
673        builder.add("b");
674        builder.add("c");
675        builder.add("d");
676
677        let dict = builder.build();
678
679        assert_eq!(dict.len(), 4);
680        assert_eq!(dict.dictionary_size(), 4);
681        assert_eq!(dict.codes(), vec![0, 1, 2, 3]);
682    }
683
684    #[test]
685    fn test_all_same() {
686        let mut builder = DictionaryBuilder::new();
687        for _ in 0..10 {
688            builder.add("same");
689        }
690
691        let dict = builder.build();
692
693        assert_eq!(dict.len(), 10);
694        assert_eq!(dict.dictionary_size(), 1);
695        assert!(dict.codes().iter().all(|&c| c == 0));
696    }
697
698    #[test]
699    fn test_dict_inline_codes_slice_via_builder() {
700        let mut b = DictionaryBuilder::new();
701        for s in ["alpha", "beta", "alpha", "gamma", "beta"] {
702            b.add(s);
703        }
704        let dict = b.build();
705
706        let slice = dict.as_codes_slice();
707        assert!(
708            slice.is_some(),
709            "DictionaryBuilder must produce Inline codes for fast scan"
710        );
711        let codes = slice.unwrap();
712        assert_eq!(codes.len(), 5);
713        assert_eq!(codes[0], codes[2]); // both "alpha"
714        assert_eq!(codes[1], codes[4]); // both "beta"
715    }
716
717    #[test]
718    fn test_dict_mapped_codes_yield_no_slice() {
719        let dictionary: Arc<[Arc<str>]> = Arc::from(vec![Arc::from("a"), Arc::from("b")]);
720        let codes_le: Vec<u8> = [0u32, 1, 0, 1]
721            .iter()
722            .flat_map(|c| c.to_le_bytes())
723            .collect();
724        let dict =
725            DictionaryEncoding::from_bytes_storage(dictionary, bytes::Bytes::from(codes_le), 4);
726        assert!(dict.as_codes_slice().is_none());
727        assert_eq!(dict.code_at(0), Some(0));
728        assert_eq!(dict.code_at(3), Some(1));
729    }
730
731    #[test]
732    fn test_dict_inline_null_words_slice_via_with_nulls() {
733        let mut b = DictionaryBuilder::new();
734        for s in ["x", "y", "z"] {
735            b.add(s);
736        }
737        let dict = b.build().with_nulls(vec![0b0000_0010_u64]); // index 1 is null
738
739        assert!(
740            dict.as_null_words_slice().is_some(),
741            "with_nulls(Vec<u64>) must produce Inline null bitmap"
742        );
743        assert!(!dict.is_null(0));
744        assert!(dict.is_null(1));
745        assert!(!dict.is_null(2));
746    }
747
748    #[test]
749    fn test_dict_filter_by_code_matches_between_inline_and_mapped() {
750        let strings = ["a", "b", "c", "a", "a", "b", "c", "a", "b", "a"];
751        let mut b = DictionaryBuilder::new();
752        for s in strings {
753            b.add(s);
754        }
755        let inline = b.build();
756
757        let codes_b = inline.codes_bytes();
758        let dict_arc = inline.dictionary().clone();
759        let mapped = DictionaryEncoding::from_bytes_storage(dict_arc, codes_b, strings.len());
760
761        let target = inline.encode("a").unwrap();
762        let from_inline = inline.filter_by_code(|c| c == target);
763        let from_mapped = mapped.filter_by_code(|c| c == target);
764        assert_eq!(from_inline, from_mapped);
765        assert_eq!(from_inline, vec![0, 3, 4, 7, 9]);
766    }
767
768    #[test]
769    fn test_dict_filter_by_code_respects_inline_null_bitmap() {
770        let mut b = DictionaryBuilder::new();
771        for s in ["x", "y", "x", "y"] {
772            b.add(s);
773        }
774        // Mark index 2 ("x") as null.
775        let dict = b.build().with_nulls(vec![0b0000_0100_u64]);
776        let target = dict.encode("x").unwrap();
777        let hits = dict.filter_by_code(|c| c == target);
778        assert_eq!(hits, vec![0]); // index 2 excluded by null bit
779    }
780
781    #[test]
782    fn test_dict_filter_by_code_fallback_with_mapped_codes_and_mapped_nulls() {
783        let dictionary: Arc<[Arc<str>]> = Arc::from(vec![Arc::from("a"), Arc::from("b")]);
784
785        // Codes: a, b, a, b, a -> 0, 1, 0, 1, 0
786        let codes_le: Vec<u8> = [0u32, 1, 0, 1, 0]
787            .iter()
788            .flat_map(|c| c.to_le_bytes())
789            .collect();
790        let codes_bytes = bytes::Bytes::from(codes_le);
791
792        // Null bitmap as LE u64 bytes: mark index 2 (the third "a") as null.
793        // 0b0000_0100 -> bit 2 set
794        let null_le: Vec<u8> = 0b0000_0100_u64.to_le_bytes().to_vec();
795        let null_bytes = bytes::Bytes::from(null_le);
796
797        let dict = DictionaryEncoding::from_bytes_storage(dictionary, codes_bytes, 5)
798            .with_null_bytes(null_bytes);
799
800        let target_a = dict.encode("a").expect("'a' is in dictionary");
801        // "a" is at indices 0, 2, 4. Index 2 is null. So result should be [0, 4].
802        let hits = dict.filter_by_code(|c| c == target_a);
803        assert_eq!(hits, vec![0, 4]);
804    }
805}