1use crate::BitMask;
47use std::collections::BTreeMap;
48
49#[derive(Debug, Clone, Default, PartialEq, Eq)]
67pub struct ByteStringPool {
68 bytes: Vec<u8>,
69 offsets: Vec<u32>,
70 lens: Vec<u32>,
71}
72
73impl ByteStringPool {
74 pub fn new() -> Self {
76 Self::default()
77 }
78
79 pub fn len(&self) -> usize {
81 self.offsets.len()
82 }
83
84 pub fn is_empty(&self) -> bool {
86 self.offsets.is_empty()
87 }
88
89 pub fn byte_size(&self) -> usize {
91 self.bytes.len()
92 }
93
94 pub fn push(&mut self, bytes: &[u8]) -> Result<ByteStrView, ByteDictError> {
101 let len = u32::try_from(bytes.len()).map_err(|_| ByteDictError::EntryTooLong {
102 got: bytes.len(),
103 })?;
104 let new_total = self.bytes.len().checked_add(bytes.len()).ok_or(
105 ByteDictError::PoolOverflow {
106 attempted: bytes.len(),
107 current: self.bytes.len(),
108 },
109 )?;
110 let offset = u32::try_from(self.bytes.len()).map_err(|_| ByteDictError::PoolOverflow {
111 attempted: bytes.len(),
112 current: self.bytes.len(),
113 })?;
114 if new_total > u32::MAX as usize {
115 return Err(ByteDictError::PoolOverflow {
116 attempted: bytes.len(),
117 current: self.bytes.len(),
118 });
119 }
120 self.bytes.extend_from_slice(bytes);
121 self.offsets.push(offset);
122 self.lens.push(len);
123 Ok(ByteStrView { offset, len })
124 }
125
126 pub fn get(&self, view: ByteStrView) -> &[u8] {
131 let start = view.offset as usize;
132 let end = start.saturating_add(view.len as usize);
133 if end > self.bytes.len() {
134 debug_assert!(false, "ByteStrView out of bounds for pool");
135 return &[];
136 }
137 &self.bytes[start..end]
138 }
139
140 pub fn get_by_index(&self, idx: usize) -> Option<&[u8]> {
144 let offset = *self.offsets.get(idx)? as usize;
145 let len = *self.lens.get(idx)? as usize;
146 Some(&self.bytes[offset..offset + len])
147 }
148
149 pub fn iter(&self) -> impl Iterator<Item = (usize, &[u8])> + '_ {
151 self.offsets.iter().zip(self.lens.iter()).enumerate().map(
152 move |(i, (&off, &len))| {
153 let start = off as usize;
154 let end = start + len as usize;
155 (i, &self.bytes[start..end])
156 },
157 )
158 }
159
160 pub fn iter_views(&self) -> impl Iterator<Item = ByteStrView> + '_ {
164 self.offsets
165 .iter()
166 .zip(self.lens.iter())
167 .map(|(&offset, &len)| ByteStrView { offset, len })
168 }
169}
170
171#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
177pub struct ByteStrView {
178 pub offset: u32,
179 pub len: u32,
180}
181
182impl ByteStrView {
183 pub fn byte_len(self) -> usize {
185 self.len as usize
186 }
187
188 pub fn is_empty(self) -> bool {
190 self.len == 0
191 }
192}
193
194#[derive(Debug, Clone, PartialEq, Eq)]
209pub enum AdaptiveCodes {
210 U8(Vec<u8>),
211 U16(Vec<u16>),
212 U32(Vec<u32>),
213 U64(Vec<u64>),
214}
215
216impl AdaptiveCodes {
217 pub fn new() -> Self {
219 AdaptiveCodes::U8(Vec::new())
220 }
221
222 pub fn with_capacity(cap: usize) -> Self {
224 AdaptiveCodes::U8(Vec::with_capacity(cap))
225 }
226
227 pub fn len(&self) -> usize {
229 match self {
230 AdaptiveCodes::U8(v) => v.len(),
231 AdaptiveCodes::U16(v) => v.len(),
232 AdaptiveCodes::U32(v) => v.len(),
233 AdaptiveCodes::U64(v) => v.len(),
234 }
235 }
236
237 pub fn is_empty(&self) -> bool {
239 self.len() == 0
240 }
241
242 pub fn push(&mut self, code: u64) {
248 if self.needs_promotion_for(code) {
250 self.promote_to_fit(code);
251 }
252 match self {
253 AdaptiveCodes::U8(v) => v.push(code as u8),
254 AdaptiveCodes::U16(v) => v.push(code as u16),
255 AdaptiveCodes::U32(v) => v.push(code as u32),
256 AdaptiveCodes::U64(v) => v.push(code),
257 }
258 }
259
260 pub fn get(&self, i: usize) -> u64 {
262 match self {
263 AdaptiveCodes::U8(v) => v[i] as u64,
264 AdaptiveCodes::U16(v) => v[i] as u64,
265 AdaptiveCodes::U32(v) => v[i] as u64,
266 AdaptiveCodes::U64(v) => v[i],
267 }
268 }
269
270 pub fn iter(&self) -> Box<dyn Iterator<Item = u64> + '_> {
273 match self {
274 AdaptiveCodes::U8(v) => Box::new(v.iter().map(|&x| x as u64)),
275 AdaptiveCodes::U16(v) => Box::new(v.iter().map(|&x| x as u64)),
276 AdaptiveCodes::U32(v) => Box::new(v.iter().map(|&x| x as u64)),
277 AdaptiveCodes::U64(v) => Box::new(v.iter().copied()),
278 }
279 }
280
281 pub fn width_bytes(&self) -> usize {
284 match self {
285 AdaptiveCodes::U8(_) => 1,
286 AdaptiveCodes::U16(_) => 2,
287 AdaptiveCodes::U32(_) => 4,
288 AdaptiveCodes::U64(_) => 8,
289 }
290 }
291
292 fn needs_promotion_for(&self, code: u64) -> bool {
294 match self {
295 AdaptiveCodes::U8(_) => code > u8::MAX as u64,
296 AdaptiveCodes::U16(_) => code > u16::MAX as u64,
297 AdaptiveCodes::U32(_) => code > u32::MAX as u64,
298 AdaptiveCodes::U64(_) => false,
299 }
300 }
301
302 fn promote_to_fit(&mut self, code: u64) {
304 let old = std::mem::replace(self, AdaptiveCodes::U64(Vec::new()));
306 let target = if code <= u16::MAX as u64 {
307 let v: Vec<u16> = match old {
309 AdaptiveCodes::U8(v) => v.into_iter().map(|x| x as u16).collect(),
310 AdaptiveCodes::U16(v) => v,
311 AdaptiveCodes::U32(_) | AdaptiveCodes::U64(_) => {
312 debug_assert!(false, "promote_to_fit called with shrinking target");
313 Vec::new()
314 }
315 };
316 AdaptiveCodes::U16(v)
317 } else if code <= u32::MAX as u64 {
318 let v: Vec<u32> = match old {
319 AdaptiveCodes::U8(v) => v.into_iter().map(|x| x as u32).collect(),
320 AdaptiveCodes::U16(v) => v.into_iter().map(|x| x as u32).collect(),
321 AdaptiveCodes::U32(v) => v,
322 AdaptiveCodes::U64(_) => {
323 debug_assert!(false, "promote_to_fit called with shrinking target");
324 Vec::new()
325 }
326 };
327 AdaptiveCodes::U32(v)
328 } else {
329 let v: Vec<u64> = match old {
330 AdaptiveCodes::U8(v) => v.into_iter().map(|x| x as u64).collect(),
331 AdaptiveCodes::U16(v) => v.into_iter().map(|x| x as u64).collect(),
332 AdaptiveCodes::U32(v) => v.into_iter().map(|x| x as u64).collect(),
333 AdaptiveCodes::U64(v) => v,
334 };
335 AdaptiveCodes::U64(v)
336 };
337 *self = target;
338 }
339}
340
341impl Default for AdaptiveCodes {
342 fn default() -> Self {
343 Self::new()
344 }
345}
346
347#[derive(Debug, Clone, PartialEq, Eq)]
366pub enum CategoryOrdering {
367 FirstSeen,
368 Lexical,
369 Explicit(Vec<Vec<u8>>),
370}
371
372#[derive(Debug, Clone, PartialEq, Eq)]
390pub enum UnknownCategoryPolicy {
391 Error,
392 MapToOther { other_code: u64 },
393 MapToNull,
394 ExtendDictionary,
395}
396
397#[derive(Debug, Clone, Copy, PartialEq, Eq)]
399pub enum InternedCode {
400 Code(u64),
402 Null,
405}
406
407#[derive(Debug, Clone, PartialEq, Eq)]
412pub enum ByteDictError {
413 EntryTooLong { got: usize },
415 PoolOverflow { attempted: usize, current: usize },
417 UnknownCategory,
419 Frozen,
421 ExplicitOrderingHasDuplicates,
423 OtherCodeNotInDictionary { code: u64 },
426}
427
428impl std::fmt::Display for ByteDictError {
429 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
430 match self {
431 ByteDictError::EntryTooLong { got } => {
432 write!(f, "ByteStringPool entry too long: {got} bytes (max {})", u32::MAX)
433 }
434 ByteDictError::PoolOverflow { attempted, current } => write!(
435 f,
436 "ByteStringPool overflow: cannot append {attempted} bytes (current {current})"
437 ),
438 ByteDictError::UnknownCategory => {
439 write!(f, "unknown category in frozen dictionary (Error policy)")
440 }
441 ByteDictError::Frozen => write!(f, "dictionary is frozen"),
442 ByteDictError::ExplicitOrderingHasDuplicates => {
443 write!(f, "explicit ordering contains duplicate values")
444 }
445 ByteDictError::OtherCodeNotInDictionary { code } => {
446 write!(f, "MapToOther policy: other_code {code} not in dictionary")
447 }
448 }
449 }
450}
451
452impl std::error::Error for ByteDictError {}
453
454#[derive(Debug, Clone)]
474pub struct ByteDictionary {
475 pool: ByteStringPool,
476 lookup: BTreeMap<Vec<u8>, u64>,
481 code_to_view: Vec<ByteStrView>,
484 frozen: bool,
485 ordering: CategoryOrdering,
486 dharht: Option<crate::detcoll::DHarht<u64>>,
492 hash_index: Option<crate::detcoll::SealedU64Map<u64>>,
499}
500
501impl ByteDictionary {
502 pub fn new() -> Self {
504 Self::with_ordering(CategoryOrdering::FirstSeen)
505 }
506
507 pub fn with_ordering(ordering: CategoryOrdering) -> Self {
510 let mut dict = ByteDictionary {
511 pool: ByteStringPool::new(),
512 lookup: BTreeMap::new(),
513 code_to_view: Vec::new(),
514 frozen: false,
515 ordering: ordering.clone(),
516 dharht: None,
517 hash_index: None,
518 };
519 if let CategoryOrdering::Explicit(values) = ordering {
520 for v in values {
521 let _ = dict.intern_internal(&v, false);
522 }
523 }
524 dict
525 }
526
527 pub fn from_explicit(values: Vec<Vec<u8>>) -> Result<Self, ByteDictError> {
531 let mut sorted = values.clone();
533 sorted.sort();
534 for w in sorted.windows(2) {
535 if w[0] == w[1] {
536 return Err(ByteDictError::ExplicitOrderingHasDuplicates);
537 }
538 }
539 Ok(Self::with_ordering(CategoryOrdering::Explicit(values)))
540 }
541
542 pub fn len(&self) -> usize {
544 self.code_to_view.len()
545 }
546
547 pub fn is_empty(&self) -> bool {
549 self.code_to_view.is_empty()
550 }
551
552 pub fn is_frozen(&self) -> bool {
554 self.frozen
555 }
556
557 pub fn ordering(&self) -> &CategoryOrdering {
559 &self.ordering
560 }
561
562 pub fn freeze(&mut self) {
566 self.frozen = true;
567 }
568
569 pub fn pool(&self) -> &ByteStringPool {
571 &self.pool
572 }
573
574 pub fn get(&self, code: u64) -> Option<&[u8]> {
577 let idx = usize::try_from(code).ok()?;
578 let view = *self.code_to_view.get(idx)?;
579 Some(self.pool.get(view))
580 }
581
582 pub fn lookup(&self, bytes: &[u8]) -> Option<u64> {
590 if let Some(d) = &self.dharht {
591 return d.get_bytes(bytes).copied();
592 }
593 self.lookup.get(bytes).copied()
594 }
595
596 pub fn seal_for_lookup(&mut self) {
613 let mut d = crate::detcoll::DHarht::new();
614 for (k, &code) in &self.lookup {
615 d.insert_bytes(k.as_slice(), code);
616 }
617 d.seal_for_lookup();
618 self.dharht = Some(d);
619 }
620
621 pub fn is_lookup_sealed(&self) -> bool {
625 self.dharht.is_some()
626 }
627
628 pub fn dharht_overflow_count(&self) -> u64 {
631 self.dharht.as_ref().map(|d| d.overflow_count()).unwrap_or(0)
632 }
633
634 pub fn seal_with_u64_hash_index(&mut self) {
649 let mut idx = crate::detcoll::SealedU64Map::new();
650 for (k, &code) in &self.lookup {
651 let h = crate::detcoll::hash_bytes(k.as_slice());
652 idx.insert(h, code);
653 }
654 idx.seal();
655 self.hash_index = Some(idx);
656 }
657
658 pub fn is_hash_indexed(&self) -> bool {
660 self.hash_index.is_some()
661 }
662
663 pub fn lookup_by_hash(&self, hash: u64) -> Option<u64> {
674 self.hash_index.as_ref()?.get(hash).copied()
675 }
676
677 pub fn lookup_by_hash_verify(&self, hash: u64, bytes: &[u8]) -> Option<u64> {
682 let code = self.lookup_by_hash(hash)?;
683 let stored = self.get(code)?;
684 if stored == bytes { Some(code) } else { None }
685 }
686
687 pub fn intern(&mut self, bytes: &[u8]) -> Result<u64, ByteDictError> {
695 self.intern_internal(bytes, true)
696 }
697
698 pub fn intern_with_policy(
701 &mut self,
702 bytes: &[u8],
703 policy: &UnknownCategoryPolicy,
704 ) -> Result<InternedCode, ByteDictError> {
705 if let Some(c) = self.lookup.get(bytes) {
706 return Ok(InternedCode::Code(*c));
707 }
708 if !self.frozen {
709 return self.intern_internal(bytes, true).map(InternedCode::Code);
711 }
712 match policy {
713 UnknownCategoryPolicy::Error => Err(ByteDictError::UnknownCategory),
714 UnknownCategoryPolicy::MapToNull => Ok(InternedCode::Null),
715 UnknownCategoryPolicy::MapToOther { other_code } => {
716 if (*other_code as usize) < self.code_to_view.len() {
717 Ok(InternedCode::Code(*other_code))
718 } else {
719 Err(ByteDictError::OtherCodeNotInDictionary { code: *other_code })
720 }
721 }
722 UnknownCategoryPolicy::ExtendDictionary => {
723 self.intern_internal(bytes, false).map(InternedCode::Code)
725 }
726 }
727 }
728
729 fn intern_internal(
732 &mut self,
733 bytes: &[u8],
734 respect_frozen: bool,
735 ) -> Result<u64, ByteDictError> {
736 if let Some(&c) = self.lookup.get(bytes) {
737 return Ok(c);
738 }
739 if respect_frozen && self.frozen {
740 return Err(ByteDictError::Frozen);
741 }
742 if let CategoryOrdering::Explicit(_) = &self.ordering {
744 if respect_frozen {
745 return Err(ByteDictError::UnknownCategory);
746 }
747 }
748 let view = self.pool.push(bytes)?;
749 let code = self.code_to_view.len() as u64;
750 self.code_to_view.push(view);
751 self.lookup.insert(bytes.to_vec(), code);
752 Ok(code)
753 }
754
755 pub fn seal_lexical(&mut self) -> Vec<u64> {
762 if !matches!(self.ordering, CategoryOrdering::Lexical) {
763 return (0..self.code_to_view.len() as u64).collect();
764 }
765 let mut entries: Vec<(Vec<u8>, u64)> = self
767 .lookup
768 .iter()
769 .map(|(k, &v)| (k.clone(), v))
770 .collect();
771 entries.sort_by(|a, b| a.0.cmp(&b.0));
772 let mut perm = vec![0u64; self.code_to_view.len()];
773 let mut new_pool = ByteStringPool::new();
775 let mut new_code_to_view: Vec<ByteStrView> = Vec::with_capacity(entries.len());
776 let mut new_lookup: BTreeMap<Vec<u8>, u64> = BTreeMap::new();
777 for (new_code, (bytes, old_code)) in entries.into_iter().enumerate() {
778 let view = new_pool
779 .push(&bytes)
780 .expect("seal_lexical: re-push cannot exceed original pool size");
781 new_code_to_view.push(view);
782 new_lookup.insert(bytes, new_code as u64);
783 perm[old_code as usize] = new_code as u64;
784 }
785 self.pool = new_pool;
786 self.code_to_view = new_code_to_view;
787 self.lookup = new_lookup;
788 perm
789 }
790
791 pub fn iter(&self) -> impl Iterator<Item = (u64, &[u8])> + '_ {
793 self.code_to_view.iter().enumerate().map(move |(i, &v)| {
794 (i as u64, self.pool.get(v))
795 })
796 }
797}
798
799impl Default for ByteDictionary {
800 fn default() -> Self {
801 Self::new()
802 }
803}
804
805#[derive(Debug, Clone)]
824pub struct CategoricalColumn {
825 codes: AdaptiveCodes,
826 dictionary: ByteDictionary,
827 nulls: Option<BitMask>,
828}
829
830impl CategoricalColumn {
831 pub fn new() -> Self {
833 Self {
834 codes: AdaptiveCodes::new(),
835 dictionary: ByteDictionary::new(),
836 nulls: None,
837 }
838 }
839
840 pub fn with_dictionary(dictionary: ByteDictionary) -> Self {
843 Self {
844 codes: AdaptiveCodes::new(),
845 dictionary,
846 nulls: None,
847 }
848 }
849
850 pub fn len(&self) -> usize {
852 self.codes.len()
853 }
854
855 pub fn is_empty(&self) -> bool {
857 self.codes.is_empty()
858 }
859
860 pub fn dictionary(&self) -> &ByteDictionary {
862 &self.dictionary
863 }
864
865 pub fn codes(&self) -> &AdaptiveCodes {
867 &self.codes
868 }
869
870 pub fn nulls(&self) -> Option<&BitMask> {
872 self.nulls.as_ref()
873 }
874
875 pub fn is_null(&self, i: usize) -> bool {
877 match &self.nulls {
878 None => false,
879 Some(b) => !b.get(i),
880 }
881 }
882
883 pub fn push(&mut self, bytes: &[u8]) -> Result<u64, ByteDictError> {
887 let code = self.dictionary.intern(bytes)?;
888 self.codes.push(code);
889 if let Some(b) = &mut self.nulls {
890 *b = bitmask_with_extra_valid(b);
894 }
895 Ok(code)
896 }
897
898 pub fn push_null(&mut self) {
900 self.codes.push(0);
905 match &mut self.nulls {
906 None => {
907 let n = self.codes.len();
911 let mut words = vec![0u64; (n + 63) / 64];
912 for i in 0..(n - 1) {
915 words[i / 64] |= 1u64 << (i % 64);
916 }
917 self.nulls =
918 Some(BitMask::from_words_for_test(words, n));
919 }
920 Some(b) => {
921 *b = bitmask_with_extra_invalid(b);
922 }
923 }
924 }
925
926 pub fn push_with_policy(
929 &mut self,
930 bytes: &[u8],
931 policy: &UnknownCategoryPolicy,
932 ) -> Result<(), ByteDictError> {
933 match self.dictionary.intern_with_policy(bytes, policy)? {
934 InternedCode::Code(c) => {
935 self.codes.push(c);
936 if let Some(b) = &mut self.nulls {
937 *b = bitmask_with_extra_valid(b);
938 }
939 }
940 InternedCode::Null => {
941 self.push_null();
942 }
943 }
944 Ok(())
945 }
946
947 pub fn get(&self, i: usize) -> Option<&[u8]> {
949 if self.is_null(i) {
950 return None;
951 }
952 let code = self.codes.get(i);
953 self.dictionary.get(code)
954 }
955
956 pub fn iter(&self) -> impl Iterator<Item = (usize, Option<&[u8]>)> + '_ {
958 (0..self.len()).map(move |i| (i, self.get(i)))
959 }
960
961 pub fn seal_lexical(&mut self) {
966 if !matches!(self.dictionary.ordering(), CategoryOrdering::Lexical) {
967 return;
968 }
969 let perm = self.dictionary.seal_lexical();
970 let mut new_codes = AdaptiveCodes::with_capacity(self.codes.len());
971 for c in self.codes.iter() {
972 new_codes.push(perm[c as usize]);
973 }
974 self.codes = new_codes;
975 }
976
977 pub fn profile(&self) -> CategoricalProfile {
980 let cardinality = self.dictionary.len();
981 let nrows = self.len();
982 let missing = match &self.nulls {
983 None => 0,
984 Some(b) => nrows - b.count_ones(),
985 };
986 let bytes_used = self.dictionary.pool().byte_size();
987 let avg_byte_len = if cardinality == 0 {
988 0
989 } else {
990 bytes_used / cardinality
991 };
992 let mut max_byte_len = 0usize;
993 for (_, b) in self.dictionary.iter() {
994 if b.len() > max_byte_len {
995 max_byte_len = b.len();
996 }
997 }
998 let unique_ratio_thousandths = if nrows == 0 {
999 0
1000 } else {
1001 (cardinality as u64).saturating_mul(1000) / nrows as u64
1002 };
1003 CategoricalProfile {
1004 nrows,
1005 cardinality,
1006 missing,
1007 bytes_used,
1008 avg_byte_len,
1009 max_byte_len,
1010 code_width_bytes: self.codes.width_bytes(),
1011 unique_ratio_thousandths,
1012 }
1013 }
1014}
1015
1016impl Default for CategoricalColumn {
1017 fn default() -> Self {
1018 Self::new()
1019 }
1020}
1021
1022#[derive(Debug, Clone, Copy, PartialEq, Eq)]
1025pub struct CategoricalProfile {
1026 pub nrows: usize,
1027 pub cardinality: usize,
1028 pub missing: usize,
1029 pub bytes_used: usize,
1030 pub avg_byte_len: usize,
1031 pub max_byte_len: usize,
1032 pub code_width_bytes: usize,
1033 pub unique_ratio_thousandths: u64,
1036}
1037
1038fn bitmask_with_extra_valid(b: &BitMask) -> BitMask {
1045 let n = b.nrows() + 1;
1046 let mut words: Vec<u64> = b.words_slice().to_vec();
1047 let needed = (n + 63) / 64;
1048 while words.len() < needed {
1049 words.push(0);
1050 }
1051 let i = n - 1;
1052 words[i / 64] |= 1u64 << (i % 64);
1053 BitMask::from_words_for_test(words, n)
1054}
1055
1056fn bitmask_with_extra_invalid(b: &BitMask) -> BitMask {
1059 let n = b.nrows() + 1;
1060 let mut words: Vec<u64> = b.words_slice().to_vec();
1061 let needed = (n + 63) / 64;
1062 while words.len() < needed {
1063 words.push(0);
1064 }
1065 BitMask::from_words_for_test(words, n)
1067}
1068
1069#[cfg(test)]
1074mod tests {
1075 use super::*;
1076
1077 #[test]
1080 fn pool_round_trip_simple() {
1081 let mut p = ByteStringPool::new();
1082 let v_hello = p.push(b"hello").unwrap();
1083 let v_world = p.push(b"world").unwrap();
1084 assert_eq!(p.get(v_hello), b"hello");
1085 assert_eq!(p.get(v_world), b"world");
1086 assert_eq!(p.len(), 2);
1087 assert_eq!(p.byte_size(), 10);
1088 }
1089
1090 #[test]
1091 fn pool_round_trip_empty_strings() {
1092 let mut p = ByteStringPool::new();
1094 let v_empty = p.push(b"").unwrap();
1095 let v_x = p.push(b"x").unwrap();
1096 let v_empty2 = p.push(b"").unwrap();
1097 assert!(p.get(v_empty).is_empty());
1098 assert_eq!(p.get(v_x), b"x");
1099 assert!(p.get(v_empty2).is_empty());
1100 assert_eq!(p.len(), 3);
1101 }
1102
1103 #[test]
1104 fn pool_round_trip_embedded_nul_and_high_bytes() {
1105 let mut p = ByteStringPool::new();
1106 let payload: &[u8] = &[0u8, 1, 2, 0, 0xff, 0xfe, b'a'];
1107 let v = p.push(payload).unwrap();
1108 assert_eq!(p.get(v), payload);
1109 }
1110
1111 #[test]
1112 fn pool_get_by_index_matches_views() {
1113 let mut p = ByteStringPool::new();
1114 p.push(b"a").unwrap();
1115 p.push(b"bb").unwrap();
1116 p.push(b"ccc").unwrap();
1117 assert_eq!(p.get_by_index(0).unwrap(), b"a");
1118 assert_eq!(p.get_by_index(1).unwrap(), b"bb");
1119 assert_eq!(p.get_by_index(2).unwrap(), b"ccc");
1120 assert!(p.get_by_index(3).is_none());
1121 }
1122
1123 #[test]
1124 fn pool_iter_is_insertion_order() {
1125 let mut p = ByteStringPool::new();
1126 p.push(b"z").unwrap();
1127 p.push(b"a").unwrap();
1128 p.push(b"m").unwrap();
1129 let collected: Vec<&[u8]> = p.iter().map(|(_, b)| b).collect();
1130 assert_eq!(collected, vec![b"z" as &[u8], b"a", b"m"]);
1131 }
1132
1133 #[test]
1136 fn adaptive_codes_starts_u8() {
1137 let c = AdaptiveCodes::new();
1138 assert!(matches!(c, AdaptiveCodes::U8(_)));
1139 assert_eq!(c.width_bytes(), 1);
1140 }
1141
1142 #[test]
1143 fn adaptive_codes_stays_u8_at_255() {
1144 let mut c = AdaptiveCodes::new();
1145 for i in 0u64..=255 {
1146 c.push(i);
1147 }
1148 assert!(matches!(c, AdaptiveCodes::U8(_)));
1149 assert_eq!(c.len(), 256);
1150 for i in 0u64..=255 {
1152 assert_eq!(c.get(i as usize), i);
1153 }
1154 }
1155
1156 #[test]
1157 fn adaptive_codes_promotes_u8_to_u16_at_256() {
1158 let mut c = AdaptiveCodes::new();
1159 for i in 0u64..=255 {
1160 c.push(i);
1161 }
1162 assert!(matches!(c, AdaptiveCodes::U8(_)));
1163 c.push(256);
1164 assert!(matches!(c, AdaptiveCodes::U16(_)));
1165 for i in 0u64..=255 {
1167 assert_eq!(c.get(i as usize), i);
1168 }
1169 assert_eq!(c.get(256), 256);
1170 }
1171
1172 #[test]
1173 fn adaptive_codes_promotes_u16_to_u32_at_65536() {
1174 let mut c = AdaptiveCodes::U16(vec![0u16, 1, 2, 65_535]);
1175 c.push(65_536);
1176 assert!(matches!(c, AdaptiveCodes::U32(_)));
1177 assert_eq!(c.get(0), 0);
1178 assert_eq!(c.get(3), 65_535);
1179 assert_eq!(c.get(4), 65_536);
1180 }
1181
1182 #[test]
1183 fn adaptive_codes_promotes_u32_to_u64_above_4b() {
1184 let mut c = AdaptiveCodes::U32(vec![0u32, 1, u32::MAX]);
1185 let huge = (u32::MAX as u64) + 1;
1186 c.push(huge);
1187 assert!(matches!(c, AdaptiveCodes::U64(_)));
1188 assert_eq!(c.get(0), 0);
1189 assert_eq!(c.get(2), u32::MAX as u64);
1190 assert_eq!(c.get(3), huge);
1191 }
1192
1193 #[test]
1194 fn adaptive_codes_iter_matches_get() {
1195 let mut c = AdaptiveCodes::new();
1196 for i in 0u64..10 {
1197 c.push(i * 7);
1198 }
1199 let via_iter: Vec<u64> = c.iter().collect();
1200 let via_get: Vec<u64> = (0..c.len()).map(|i| c.get(i)).collect();
1201 assert_eq!(via_iter, via_get);
1202 }
1203
1204 #[test]
1207 fn dict_intern_assigns_sequential_codes() {
1208 let mut d = ByteDictionary::new();
1209 assert_eq!(d.intern(b"red").unwrap(), 0);
1210 assert_eq!(d.intern(b"green").unwrap(), 1);
1211 assert_eq!(d.intern(b"blue").unwrap(), 2);
1212 assert_eq!(d.intern(b"red").unwrap(), 0);
1214 assert_eq!(d.intern(b"green").unwrap(), 1);
1215 assert_eq!(d.len(), 3);
1216 }
1217
1218 #[test]
1219 fn dict_get_round_trips_bytes() {
1220 let mut d = ByteDictionary::new();
1221 d.intern(b"red").unwrap();
1222 d.intern(b"green").unwrap();
1223 d.intern(b"blue").unwrap();
1224 assert_eq!(d.get(0).unwrap(), b"red");
1225 assert_eq!(d.get(1).unwrap(), b"green");
1226 assert_eq!(d.get(2).unwrap(), b"blue");
1227 assert!(d.get(99).is_none());
1228 }
1229
1230 #[test]
1231 fn dict_lookup_does_not_extend() {
1232 let mut d = ByteDictionary::new();
1233 d.intern(b"red").unwrap();
1234 assert_eq!(d.lookup(b"red"), Some(0));
1235 assert_eq!(d.lookup(b"green"), None);
1236 assert_eq!(d.len(), 1);
1237 }
1238
1239 #[test]
1240 fn dict_first_seen_is_insertion_order() {
1241 let mut d = ByteDictionary::new();
1243 assert_eq!(d.intern(b"B").unwrap(), 0);
1244 assert_eq!(d.intern(b"A").unwrap(), 1);
1245 assert_eq!(d.intern(b"C").unwrap(), 2);
1246 assert_eq!(d.get(0).unwrap(), b"B");
1247 assert_eq!(d.get(1).unwrap(), b"A");
1248 assert_eq!(d.get(2).unwrap(), b"C");
1249 }
1250
1251 #[test]
1252 fn dict_lexical_seal_reorders_by_bytes() {
1253 let mut d = ByteDictionary::with_ordering(CategoryOrdering::Lexical);
1254 d.intern(b"banana").unwrap();
1255 d.intern(b"apple").unwrap();
1256 d.intern(b"cherry").unwrap();
1257 let perm = d.seal_lexical();
1258 assert_eq!(d.get(0).unwrap(), b"apple");
1260 assert_eq!(d.get(1).unwrap(), b"banana");
1261 assert_eq!(d.get(2).unwrap(), b"cherry");
1262 assert_eq!(perm, vec![1, 0, 2]);
1267 }
1268
1269 #[test]
1270 fn dict_lexical_two_insertion_orders_seal_to_same_codes() {
1271 let mut d1 = ByteDictionary::with_ordering(CategoryOrdering::Lexical);
1272 let mut d2 = ByteDictionary::with_ordering(CategoryOrdering::Lexical);
1273 d1.intern(b"banana").unwrap();
1274 d1.intern(b"apple").unwrap();
1275 d1.intern(b"cherry").unwrap();
1276 d2.intern(b"cherry").unwrap();
1277 d2.intern(b"banana").unwrap();
1278 d2.intern(b"apple").unwrap();
1279 d1.seal_lexical();
1280 d2.seal_lexical();
1281 for code in 0u64..3 {
1282 assert_eq!(d1.get(code), d2.get(code), "lex ordering diverges at {code}");
1283 }
1284 }
1285
1286 #[test]
1287 fn dict_explicit_pre_populates_codes() {
1288 let d = ByteDictionary::from_explicit(vec![
1289 b"low".to_vec(),
1290 b"med".to_vec(),
1291 b"high".to_vec(),
1292 ])
1293 .unwrap();
1294 assert_eq!(d.lookup(b"low"), Some(0));
1295 assert_eq!(d.lookup(b"med"), Some(1));
1296 assert_eq!(d.lookup(b"high"), Some(2));
1297 }
1298
1299 #[test]
1300 fn dict_explicit_rejects_duplicates() {
1301 let err = ByteDictionary::from_explicit(vec![
1302 b"a".to_vec(),
1303 b"b".to_vec(),
1304 b"a".to_vec(),
1305 ]);
1306 assert!(matches!(err, Err(ByteDictError::ExplicitOrderingHasDuplicates)));
1307 }
1308
1309 #[test]
1310 fn dict_explicit_rejects_unknown_intern_when_frozen() {
1311 let mut d = ByteDictionary::from_explicit(vec![b"a".to_vec(), b"b".to_vec()]).unwrap();
1312 d.freeze();
1313 let res = d.intern(b"c");
1314 assert!(matches!(res, Err(ByteDictError::Frozen)));
1315 }
1316
1317 #[test]
1320 fn dict_frozen_intern_errors() {
1321 let mut d = ByteDictionary::new();
1322 d.intern(b"a").unwrap();
1323 d.freeze();
1324 let err = d.intern(b"b");
1325 assert!(matches!(err, Err(ByteDictError::Frozen)));
1326 }
1327
1328 #[test]
1329 fn policy_error_returns_unknown_category() {
1330 let mut d = ByteDictionary::new();
1331 d.intern(b"a").unwrap();
1332 d.freeze();
1333 let res = d.intern_with_policy(b"b", &UnknownCategoryPolicy::Error);
1334 assert!(matches!(res, Err(ByteDictError::UnknownCategory)));
1335 }
1336
1337 #[test]
1338 fn policy_map_to_null_returns_null() {
1339 let mut d = ByteDictionary::new();
1340 d.intern(b"a").unwrap();
1341 d.freeze();
1342 let res = d.intern_with_policy(b"b", &UnknownCategoryPolicy::MapToNull);
1343 assert_eq!(res, Ok(InternedCode::Null));
1344 }
1345
1346 #[test]
1347 fn policy_map_to_other_returns_other_code() {
1348 let mut d = ByteDictionary::new();
1349 let _a = d.intern(b"a").unwrap();
1350 let other = d.intern(b"Other").unwrap();
1351 d.freeze();
1352 let res = d.intern_with_policy(
1353 b"unseen",
1354 &UnknownCategoryPolicy::MapToOther { other_code: other },
1355 );
1356 assert_eq!(res, Ok(InternedCode::Code(other)));
1357 }
1358
1359 #[test]
1360 fn policy_map_to_other_rejects_invalid_other_code() {
1361 let mut d = ByteDictionary::new();
1362 d.intern(b"a").unwrap();
1363 d.freeze();
1364 let res = d.intern_with_policy(
1365 b"x",
1366 &UnknownCategoryPolicy::MapToOther { other_code: 99 },
1367 );
1368 assert!(matches!(
1369 res,
1370 Err(ByteDictError::OtherCodeNotInDictionary { code: 99 })
1371 ));
1372 }
1373
1374 #[test]
1375 fn policy_extend_dictionary_bypasses_frozen() {
1376 let mut d = ByteDictionary::new();
1377 d.intern(b"a").unwrap();
1378 d.freeze();
1379 let res = d
1380 .intern_with_policy(b"b", &UnknownCategoryPolicy::ExtendDictionary)
1381 .unwrap();
1382 assert_eq!(res, InternedCode::Code(1));
1383 assert_eq!(d.len(), 2);
1384 assert!(d.is_frozen());
1386 let strict = d.intern_with_policy(b"c", &UnknownCategoryPolicy::Error);
1387 assert!(matches!(strict, Err(ByteDictError::UnknownCategory)));
1388 }
1389
1390 #[test]
1393 fn col_push_round_trip() {
1394 let mut c = CategoricalColumn::new();
1395 c.push(b"red").unwrap();
1396 c.push(b"blue").unwrap();
1397 c.push(b"red").unwrap();
1398 assert_eq!(c.len(), 3);
1399 assert_eq!(c.get(0).unwrap(), b"red");
1400 assert_eq!(c.get(1).unwrap(), b"blue");
1401 assert_eq!(c.get(2).unwrap(), b"red");
1402 assert_eq!(c.dictionary().len(), 2);
1404 }
1405
1406 #[test]
1407 fn col_push_null_is_observed() {
1408 let mut c = CategoricalColumn::new();
1409 c.push(b"red").unwrap();
1410 c.push_null();
1411 c.push(b"blue").unwrap();
1412 assert_eq!(c.len(), 3);
1413 assert!(!c.is_null(0));
1414 assert!(c.is_null(1));
1415 assert!(!c.is_null(2));
1416 assert_eq!(c.get(0).unwrap(), b"red");
1417 assert!(c.get(1).is_none());
1418 assert_eq!(c.get(2).unwrap(), b"blue");
1419 }
1420
1421 #[test]
1422 fn col_seal_lexical_remaps_codes() {
1423 let mut c =
1424 CategoricalColumn::with_dictionary(ByteDictionary::with_ordering(CategoryOrdering::Lexical));
1425 c.push(b"banana").unwrap();
1426 c.push(b"apple").unwrap();
1427 c.push(b"banana").unwrap();
1428 c.push(b"cherry").unwrap();
1429 c.seal_lexical();
1430 assert_eq!(c.codes().get(0), 1, "row0 was banana → seal to 1");
1432 assert_eq!(c.codes().get(1), 0, "row1 was apple → seal to 0");
1433 assert_eq!(c.codes().get(2), 1, "row2 was banana → seal to 1");
1434 assert_eq!(c.codes().get(3), 2, "row3 was cherry → seal to 2");
1435 assert_eq!(c.get(0).unwrap(), b"banana");
1437 assert_eq!(c.get(1).unwrap(), b"apple");
1438 assert_eq!(c.get(2).unwrap(), b"banana");
1439 assert_eq!(c.get(3).unwrap(), b"cherry");
1440 }
1441
1442 #[test]
1443 fn col_promotion_at_256_distinct() {
1444 let mut c = CategoricalColumn::new();
1446 for i in 0u32..257 {
1447 let s = format!("v{}", i);
1449 c.push(s.as_bytes()).unwrap();
1450 }
1451 assert_eq!(c.dictionary().len(), 257);
1452 assert!(matches!(
1454 c.codes(),
1455 AdaptiveCodes::U16(_) | AdaptiveCodes::U32(_) | AdaptiveCodes::U64(_)
1456 ));
1457 for i in 0u32..257 {
1459 let s = format!("v{}", i);
1460 assert_eq!(c.get(i as usize).unwrap(), s.as_bytes());
1461 }
1462 }
1463
1464 #[test]
1465 fn col_push_with_policy_null_records_null() {
1466 let mut c = CategoricalColumn::new();
1467 c.push(b"a").unwrap();
1468 c.push(b"b").unwrap();
1469 let mut dict = ByteDictionary::new();
1473 dict.intern(b"a").unwrap();
1474 dict.intern(b"b").unwrap();
1475 dict.freeze();
1476 let mut c2 = CategoricalColumn::with_dictionary(dict);
1477 c2.push_with_policy(b"a", &UnknownCategoryPolicy::Error).unwrap();
1478 c2.push_with_policy(b"unseen", &UnknownCategoryPolicy::MapToNull)
1479 .unwrap();
1480 c2.push_with_policy(b"b", &UnknownCategoryPolicy::Error).unwrap();
1481 assert_eq!(c2.len(), 3);
1482 assert!(!c2.is_null(0));
1483 assert!(c2.is_null(1));
1484 assert!(!c2.is_null(2));
1485 }
1486
1487 #[test]
1490 fn profile_reports_basic_stats() {
1491 let mut c = CategoricalColumn::new();
1492 c.push(b"red").unwrap();
1493 c.push(b"blue").unwrap();
1494 c.push_null();
1495 c.push(b"red").unwrap();
1496 let p = c.profile();
1497 assert_eq!(p.nrows, 4);
1498 assert_eq!(p.cardinality, 2);
1499 assert_eq!(p.missing, 1);
1500 assert_eq!(p.bytes_used, 7); assert_eq!(p.unique_ratio_thousandths, 500);
1503 assert_eq!(p.code_width_bytes, 1);
1504 }
1505
1506 #[test]
1509 fn determinism_same_input_first_seen_two_dicts() {
1510 let inputs: &[&[u8]] = &[b"alpha", b"beta", b"alpha", b"gamma", b"beta", b"delta"];
1511 let mut d1 = ByteDictionary::new();
1512 let mut d2 = ByteDictionary::new();
1513 let codes1: Vec<u64> = inputs.iter().map(|b| d1.intern(b).unwrap()).collect();
1514 let codes2: Vec<u64> = inputs.iter().map(|b| d2.intern(b).unwrap()).collect();
1515 assert_eq!(codes1, codes2, "FirstSeen must be deterministic");
1516 assert_eq!(codes1, vec![0, 1, 0, 2, 1, 3]);
1518 }
1519
1520 #[test]
1521 fn determinism_lexical_two_permutations_seal_identical() {
1522 let perm1: &[&[u8]] = &[b"alpha", b"beta", b"gamma", b"delta"];
1523 let perm2: &[&[u8]] = &[b"delta", b"gamma", b"beta", b"alpha"];
1524
1525 let mut d1 = ByteDictionary::with_ordering(CategoryOrdering::Lexical);
1526 let mut d2 = ByteDictionary::with_ordering(CategoryOrdering::Lexical);
1527 for s in perm1 {
1528 d1.intern(s).unwrap();
1529 }
1530 for s in perm2 {
1531 d2.intern(s).unwrap();
1532 }
1533 d1.seal_lexical();
1534 d2.seal_lexical();
1535 for code in 0u64..4 {
1538 assert_eq!(d1.get(code), d2.get(code), "code {code}");
1539 }
1540 }
1541}