1use std::collections::HashMap;
25use std::sync::Arc;
26
27use bytes::{Bytes, BytesMut};
28
29#[inline]
31fn read_code_at(bytes: &Bytes, idx: usize) -> Option<u32> {
32 let start = idx.checked_mul(4)?;
33 let end = start.checked_add(4)?;
34 let chunk: [u8; 4] = bytes.get(start..end)?.try_into().ok()?;
35 Some(u32::from_le_bytes(chunk))
36}
37
38fn codes_to_bytes(codes: &[u32]) -> Bytes {
40 let mut buf = BytesMut::with_capacity(codes.len() * 4);
41 for &c in codes {
42 buf.extend_from_slice(&c.to_le_bytes());
43 }
44 buf.freeze()
45}
46
47#[inline]
49fn null_bit_at(bitmap: &Bytes, index: usize) -> bool {
50 let word_idx = index / 64;
51 let bit_idx = index % 64;
52 let start = word_idx * 8;
53 let Some(chunk) = bitmap.get(start..start + 8) else {
54 return false;
55 };
56 let Ok(arr): Result<[u8; 8], _> = chunk.try_into() else {
57 return false;
58 };
59 (u64::from_le_bytes(arr) & (1 << bit_idx)) != 0
60}
61
62#[derive(Debug, Clone)]
64enum CodeStore {
65 Inline(Vec<u32>),
66 Mapped(Bytes),
67}
68
69impl CodeStore {
70 #[inline]
71 fn as_slice(&self) -> Option<&[u32]> {
72 match self {
73 Self::Inline(v) => Some(v.as_slice()),
74 Self::Mapped(_) => None,
75 }
76 }
77
78 #[inline]
79 fn code_at(&self, idx: usize) -> Option<u32> {
80 match self {
81 Self::Inline(v) => v.get(idx).copied(),
82 Self::Mapped(b) => read_code_at(b, idx),
83 }
84 }
85
86 fn to_bytes(&self) -> Bytes {
87 match self {
88 Self::Inline(v) => codes_to_bytes(v),
89 Self::Mapped(b) => b.clone(),
90 }
91 }
92
93 fn byte_len(&self) -> usize {
94 match self {
95 Self::Inline(v) => v.len() * 4,
96 Self::Mapped(b) => b.len(),
97 }
98 }
99
100 #[inline]
101 fn len_codes(&self, code_count: usize) -> usize {
102 match self {
103 Self::Inline(v) => v.len(),
104 Self::Mapped(_) => code_count,
105 }
106 }
107}
108
109#[derive(Debug, Clone)]
111enum NullBitmap {
112 Inline(Vec<u64>),
113 Mapped(Bytes),
114}
115
116impl NullBitmap {
117 #[inline]
118 fn as_slice(&self) -> Option<&[u64]> {
119 match self {
120 Self::Inline(v) => Some(v.as_slice()),
121 Self::Mapped(_) => None,
122 }
123 }
124
125 #[inline]
126 fn is_null(&self, index: usize) -> bool {
127 match self {
128 Self::Inline(words) => {
129 let word_idx = index / 64;
130 let bit_idx = index % 64;
131 words
132 .get(word_idx)
133 .is_some_and(|w| (*w & (1u64 << bit_idx)) != 0)
134 }
135 Self::Mapped(b) => null_bit_at(b, index),
136 }
137 }
138}
139
140#[derive(Debug, Clone)]
147pub struct DictionaryEncoding {
148 dictionary: Arc<[Arc<str>]>,
150 codes: CodeStore,
152 code_count: usize,
154 null_bitmap: Option<NullBitmap>,
156}
157
158impl DictionaryEncoding {
159 pub fn new(dictionary: Arc<[Arc<str>]>, codes: Vec<u32>) -> Self {
162 let code_count = codes.len();
163 Self {
164 dictionary,
165 codes: CodeStore::Inline(codes),
166 code_count,
167 null_bitmap: None,
168 }
169 }
170
171 pub fn from_bytes_storage(
176 dictionary: Arc<[Arc<str>]>,
177 codes_bytes: Bytes,
178 code_count: usize,
179 ) -> Self {
180 Self {
181 dictionary,
182 codes: CodeStore::Mapped(codes_bytes),
183 code_count,
184 null_bitmap: None,
185 }
186 }
187
188 pub fn with_nulls(mut self, null_bitmap: Vec<u64>) -> Self {
190 self.null_bitmap = Some(NullBitmap::Inline(null_bitmap));
191 self
192 }
193
194 pub fn with_null_bytes(mut self, null_bitmap: Bytes) -> Self {
196 self.null_bitmap = Some(NullBitmap::Mapped(null_bitmap));
197 self
198 }
199
200 pub fn len(&self) -> usize {
202 self.codes.len_codes(self.code_count)
203 }
204
205 pub fn is_empty(&self) -> bool {
207 self.codes.len_codes(self.code_count) == 0
208 }
209
210 pub fn dictionary_size(&self) -> usize {
212 self.dictionary.len()
213 }
214
215 pub fn dictionary(&self) -> &Arc<[Arc<str>]> {
217 &self.dictionary
218 }
219
220 #[must_use]
226 pub fn codes_bytes(&self) -> Bytes {
227 self.codes.to_bytes()
228 }
229
230 #[must_use]
232 #[inline]
233 pub fn as_codes_slice(&self) -> Option<&[u32]> {
234 self.codes.as_slice()
235 }
236
237 #[must_use]
240 #[inline]
241 pub fn as_null_words_slice(&self) -> Option<&[u64]> {
242 self.null_bitmap.as_ref().and_then(NullBitmap::as_slice)
243 }
244
245 pub fn code_count(&self) -> usize {
247 self.code_count
248 }
249
250 #[must_use]
252 #[inline]
253 pub fn code_at(&self, idx: usize) -> Option<u32> {
254 self.codes.code_at(idx)
255 }
256
257 pub fn codes(&self) -> Vec<u32> {
263 (0..self.code_count)
264 .map(|i| self.codes.code_at(i).unwrap_or(0))
265 .collect()
266 }
267
268 #[must_use]
270 #[inline]
271 pub fn is_null(&self, index: usize) -> bool {
272 match &self.null_bitmap {
273 Some(b) => b.is_null(index),
274 None => false,
275 }
276 }
277
278 pub fn get(&self, index: usize) -> Option<&str> {
282 if self.is_null(index) {
283 return None;
284 }
285 let code = self.code_at(index)?;
286 self.dictionary.get(code as usize).map(|s| s.as_ref())
287 }
288
289 pub fn get_code(&self, index: usize) -> Option<u32> {
291 if self.is_null(index) {
292 return None;
293 }
294 self.code_at(index)
295 }
296
297 pub fn iter(&self) -> impl Iterator<Item = Option<&str>> {
299 (0..self.len()).map(move |i| self.get(i))
300 }
301
302 pub fn compression_ratio(&self) -> f64 {
304 if self.is_empty() {
305 return 1.0;
306 }
307
308 let original_size: usize = (0..self.code_count)
310 .map(|i| {
311 let code = self.codes.code_at(i).unwrap_or(0) as usize;
312 self.dictionary.get(code).map_or(0, |s| s.len())
313 })
314 .sum();
315
316 let dict_size: usize = self.dictionary.iter().map(|s| s.len()).sum();
318 let codes_size = self.codes.byte_len();
319 let compressed_size = dict_size + codes_size;
320
321 if compressed_size == 0 {
322 return 1.0;
323 }
324
325 original_size as f64 / compressed_size as f64
326 }
327
328 pub fn encode(&self, value: &str) -> Option<u32> {
330 self.dictionary
331 .iter()
332 .position(|s| s.as_ref() == value)
333 .and_then(|i| u32::try_from(i).ok())
334 }
335
336 pub fn filter_by_code(&self, predicate: impl Fn(u32) -> bool) -> Vec<usize> {
342 if self.code_count == 0 {
343 return Vec::new();
344 }
345
346 let null_words: Option<&[u64]> = self.as_null_words_slice();
347
348 match (self.codes.as_slice(), null_words, &self.null_bitmap) {
349 (Some(codes), _, None) => codes
351 .iter()
352 .enumerate()
353 .filter_map(|(i, &c)| predicate(c).then_some(i))
354 .collect(),
355
356 (Some(codes), Some(nulls), Some(_)) => codes
358 .iter()
359 .enumerate()
360 .filter_map(|(i, &c)| {
361 let word = nulls.get(i / 64).copied().unwrap_or(0);
362 let is_null = (word & (1u64 << (i % 64))) != 0;
363 (!is_null && predicate(c)).then_some(i)
364 })
365 .collect(),
366
367 _ => {
369 let mut out = Vec::new();
370 for i in 0..self.code_count {
371 if self.is_null(i) {
372 continue;
373 }
374 if let Some(c) = self.codes.code_at(i)
375 && predicate(c)
376 {
377 out.push(i);
378 }
379 }
380 out
381 }
382 }
383 }
384}
385
386#[derive(Debug)]
391pub struct DictionaryBuilder {
392 string_to_code: HashMap<Arc<str>, u32>,
394 dictionary: Vec<Arc<str>>,
396 codes: Vec<u32>,
398 null_positions: Vec<usize>,
400}
401
402impl DictionaryBuilder {
403 pub fn new() -> Self {
405 Self {
406 string_to_code: HashMap::new(),
407 dictionary: Vec::new(),
408 codes: Vec::new(),
409 null_positions: Vec::new(),
410 }
411 }
412
413 pub fn with_capacity(value_capacity: usize, dictionary_capacity: usize) -> Self {
415 Self {
416 string_to_code: HashMap::with_capacity(dictionary_capacity),
417 dictionary: Vec::with_capacity(dictionary_capacity),
418 codes: Vec::with_capacity(value_capacity),
419 null_positions: Vec::new(),
420 }
421 }
422
423 pub fn add(&mut self, value: &str) -> u32 {
427 if let Some(&code) = self.string_to_code.get(value) {
428 self.codes.push(code);
429 code
430 } else {
431 #[allow(clippy::cast_possible_truncation)]
433 let code = self.dictionary.len() as u32;
434 let arc_value: Arc<str> = value.into();
435 self.string_to_code.insert(arc_value.clone(), code);
436 self.dictionary.push(arc_value);
437 self.codes.push(code);
438 code
439 }
440 }
441
442 pub fn add_null(&mut self) {
444 let idx = self.codes.len();
445 self.null_positions.push(idx);
446 self.codes.push(0); }
448
449 pub fn add_optional(&mut self, value: Option<&str>) -> Option<u32> {
451 match value {
452 Some(v) => Some(self.add(v)),
453 None => {
454 self.add_null();
455 None
456 }
457 }
458 }
459
460 pub fn len(&self) -> usize {
462 self.codes.len()
463 }
464
465 pub fn is_empty(&self) -> bool {
467 self.codes.is_empty()
468 }
469
470 pub fn dictionary_size(&self) -> usize {
472 self.dictionary.len()
473 }
474
475 pub fn build(self) -> DictionaryEncoding {
477 let null_bitmap = if self.null_positions.is_empty() {
478 None
479 } else {
480 let num_words = (self.codes.len() + 63) / 64;
481 let mut bitmap = vec![0u64; num_words];
482 for &pos in &self.null_positions {
483 let word_idx = pos / 64;
484 let bit_idx = pos % 64;
485 bitmap[word_idx] |= 1 << bit_idx;
486 }
487 Some(bitmap)
488 };
489
490 let dict: Arc<[Arc<str>]> = self.dictionary.into();
491
492 let mut encoding = DictionaryEncoding::new(dict, self.codes);
493 if let Some(bitmap) = null_bitmap {
494 encoding = encoding.with_nulls(bitmap);
495 }
496 encoding
497 }
498
499 pub fn clear(&mut self) {
501 self.string_to_code.clear();
502 self.dictionary.clear();
503 self.codes.clear();
504 self.null_positions.clear();
505 }
506}
507
508impl Default for DictionaryBuilder {
509 fn default() -> Self {
510 Self::new()
511 }
512}
513
514pub trait IntoDictionaryEncoding {
516 fn into_dictionary_encoding(self) -> DictionaryEncoding;
518}
519
520impl<'a, I> IntoDictionaryEncoding for I
521where
522 I: IntoIterator<Item = &'a str>,
523{
524 fn into_dictionary_encoding(self) -> DictionaryEncoding {
525 let mut builder = DictionaryBuilder::new();
526 for s in self {
527 builder.add(s);
528 }
529 builder.build()
530 }
531}
532
533#[cfg(test)]
534mod tests {
535 use super::*;
536
537 #[test]
538 fn test_dictionary_builder_basic() {
539 let mut builder = DictionaryBuilder::new();
540 builder.add("apple");
541 builder.add("banana");
542 builder.add("apple");
543 builder.add("cherry");
544 builder.add("apple");
545
546 let dict = builder.build();
547
548 assert_eq!(dict.len(), 5);
549 assert_eq!(dict.dictionary_size(), 3);
550
551 assert_eq!(dict.get(0), Some("apple"));
552 assert_eq!(dict.get(1), Some("banana"));
553 assert_eq!(dict.get(2), Some("apple"));
554 assert_eq!(dict.get(3), Some("cherry"));
555 assert_eq!(dict.get(4), Some("apple"));
556 }
557
558 #[test]
559 fn test_dictionary_codes() {
560 let mut builder = DictionaryBuilder::new();
561 let code_apple = builder.add("apple");
562 let code_banana = builder.add("banana");
563 let code_apple2 = builder.add("apple");
564
565 assert_eq!(code_apple, code_apple2);
566 assert_ne!(code_apple, code_banana);
567
568 let dict = builder.build();
569 assert_eq!(dict.codes(), vec![0, 1, 0]);
570 }
571
572 #[test]
573 fn test_dictionary_with_nulls() {
574 let mut builder = DictionaryBuilder::new();
575 builder.add("apple");
576 builder.add_null();
577 builder.add("banana");
578 builder.add_null();
579
580 let dict = builder.build();
581
582 assert_eq!(dict.len(), 4);
583 assert_eq!(dict.get(0), Some("apple"));
584 assert_eq!(dict.get(1), None);
585 assert!(dict.is_null(1));
586 assert_eq!(dict.get(2), Some("banana"));
587 assert_eq!(dict.get(3), None);
588 assert!(dict.is_null(3));
589 }
590
591 #[test]
592 fn test_dictionary_encode_lookup() {
593 let mut builder = DictionaryBuilder::new();
594 builder.add("apple");
595 builder.add("banana");
596 builder.add("cherry");
597
598 let dict = builder.build();
599
600 assert_eq!(dict.encode("apple"), Some(0));
601 assert_eq!(dict.encode("banana"), Some(1));
602 assert_eq!(dict.encode("cherry"), Some(2));
603 assert_eq!(dict.encode("date"), None);
604 }
605
606 #[test]
607 fn test_dictionary_filter_by_code() {
608 let mut builder = DictionaryBuilder::new();
609 builder.add("apple");
610 builder.add("banana");
611 builder.add("apple");
612 builder.add("cherry");
613 builder.add("apple");
614
615 let dict = builder.build();
616 let apple_code = dict.encode("apple").unwrap();
617
618 let indices = dict.filter_by_code(|code| code == apple_code);
619 assert_eq!(indices, vec![0, 2, 4]);
620 }
621
622 #[test]
623 fn test_compression_ratio() {
624 let mut builder = DictionaryBuilder::new();
625
626 for _ in 0..100 {
628 builder.add("this_is_a_very_long_string_that_repeats_many_times");
629 }
630
631 let dict = builder.build();
632
633 let ratio = dict.compression_ratio();
635 assert!(ratio > 1.0, "Expected compression ratio > 1, got {}", ratio);
636 }
637
638 #[test]
639 fn test_into_dictionary_encoding() {
640 let strings = vec!["apple", "banana", "apple", "cherry"];
641 let dict: DictionaryEncoding = strings.into_iter().into_dictionary_encoding();
642
643 assert_eq!(dict.len(), 4);
644 assert_eq!(dict.dictionary_size(), 3);
645 }
646
647 #[test]
648 fn test_empty_dictionary() {
649 let builder = DictionaryBuilder::new();
650 let dict = builder.build();
651
652 assert!(dict.is_empty());
653 assert_eq!(dict.dictionary_size(), 0);
654 assert_eq!(dict.get(0), None);
655 }
656
657 #[test]
658 fn test_single_value() {
659 let mut builder = DictionaryBuilder::new();
660 builder.add("only_value");
661
662 let dict = builder.build();
663
664 assert_eq!(dict.len(), 1);
665 assert_eq!(dict.dictionary_size(), 1);
666 assert_eq!(dict.get(0), Some("only_value"));
667 }
668
669 #[test]
670 fn test_all_unique() {
671 let mut builder = DictionaryBuilder::new();
672 builder.add("a");
673 builder.add("b");
674 builder.add("c");
675 builder.add("d");
676
677 let dict = builder.build();
678
679 assert_eq!(dict.len(), 4);
680 assert_eq!(dict.dictionary_size(), 4);
681 assert_eq!(dict.codes(), vec![0, 1, 2, 3]);
682 }
683
684 #[test]
685 fn test_all_same() {
686 let mut builder = DictionaryBuilder::new();
687 for _ in 0..10 {
688 builder.add("same");
689 }
690
691 let dict = builder.build();
692
693 assert_eq!(dict.len(), 10);
694 assert_eq!(dict.dictionary_size(), 1);
695 assert!(dict.codes().iter().all(|&c| c == 0));
696 }
697
698 #[test]
699 fn test_dict_inline_codes_slice_via_builder() {
700 let mut b = DictionaryBuilder::new();
701 for s in ["alpha", "beta", "alpha", "gamma", "beta"] {
702 b.add(s);
703 }
704 let dict = b.build();
705
706 let slice = dict.as_codes_slice();
707 assert!(
708 slice.is_some(),
709 "DictionaryBuilder must produce Inline codes for fast scan"
710 );
711 let codes = slice.unwrap();
712 assert_eq!(codes.len(), 5);
713 assert_eq!(codes[0], codes[2]); assert_eq!(codes[1], codes[4]); }
716
717 #[test]
718 fn test_dict_mapped_codes_yield_no_slice() {
719 let dictionary: Arc<[Arc<str>]> = Arc::from(vec![Arc::from("a"), Arc::from("b")]);
720 let codes_le: Vec<u8> = [0u32, 1, 0, 1]
721 .iter()
722 .flat_map(|c| c.to_le_bytes())
723 .collect();
724 let dict =
725 DictionaryEncoding::from_bytes_storage(dictionary, bytes::Bytes::from(codes_le), 4);
726 assert!(dict.as_codes_slice().is_none());
727 assert_eq!(dict.code_at(0), Some(0));
728 assert_eq!(dict.code_at(3), Some(1));
729 }
730
731 #[test]
732 fn test_dict_inline_null_words_slice_via_with_nulls() {
733 let mut b = DictionaryBuilder::new();
734 for s in ["x", "y", "z"] {
735 b.add(s);
736 }
737 let dict = b.build().with_nulls(vec![0b0000_0010_u64]); assert!(
740 dict.as_null_words_slice().is_some(),
741 "with_nulls(Vec<u64>) must produce Inline null bitmap"
742 );
743 assert!(!dict.is_null(0));
744 assert!(dict.is_null(1));
745 assert!(!dict.is_null(2));
746 }
747
748 #[test]
749 fn test_dict_filter_by_code_matches_between_inline_and_mapped() {
750 let strings = ["a", "b", "c", "a", "a", "b", "c", "a", "b", "a"];
751 let mut b = DictionaryBuilder::new();
752 for s in strings {
753 b.add(s);
754 }
755 let inline = b.build();
756
757 let codes_b = inline.codes_bytes();
758 let dict_arc = inline.dictionary().clone();
759 let mapped = DictionaryEncoding::from_bytes_storage(dict_arc, codes_b, strings.len());
760
761 let target = inline.encode("a").unwrap();
762 let from_inline = inline.filter_by_code(|c| c == target);
763 let from_mapped = mapped.filter_by_code(|c| c == target);
764 assert_eq!(from_inline, from_mapped);
765 assert_eq!(from_inline, vec![0, 3, 4, 7, 9]);
766 }
767
768 #[test]
769 fn test_dict_filter_by_code_respects_inline_null_bitmap() {
770 let mut b = DictionaryBuilder::new();
771 for s in ["x", "y", "x", "y"] {
772 b.add(s);
773 }
774 let dict = b.build().with_nulls(vec![0b0000_0100_u64]);
776 let target = dict.encode("x").unwrap();
777 let hits = dict.filter_by_code(|c| c == target);
778 assert_eq!(hits, vec![0]); }
780
781 #[test]
782 fn test_dict_filter_by_code_fallback_with_mapped_codes_and_mapped_nulls() {
783 let dictionary: Arc<[Arc<str>]> = Arc::from(vec![Arc::from("a"), Arc::from("b")]);
784
785 let codes_le: Vec<u8> = [0u32, 1, 0, 1, 0]
787 .iter()
788 .flat_map(|c| c.to_le_bytes())
789 .collect();
790 let codes_bytes = bytes::Bytes::from(codes_le);
791
792 let null_le: Vec<u8> = 0b0000_0100_u64.to_le_bytes().to_vec();
795 let null_bytes = bytes::Bytes::from(null_le);
796
797 let dict = DictionaryEncoding::from_bytes_storage(dictionary, codes_bytes, 5)
798 .with_null_bytes(null_bytes);
799
800 let target_a = dict.encode("a").expect("'a' is in dictionary");
801 let hits = dict.filter_by_code(|c| c == target_a);
803 assert_eq!(hits, vec![0, 4]);
804 }
805}