vortex_btrblocks/compressor/integer/
dictionary.rs1use vortex_array::IntoArray;
9use vortex_array::arrays::DictArray;
10use vortex_array::arrays::PrimitiveArray;
11use vortex_array::validity::Validity;
12use vortex_array::vtable::ValidityHelper;
13use vortex_buffer::Buffer;
14
15use super::IntegerStats;
16use super::stats::ErasedStats;
17
/// Expands to an expression that dictionary-encodes `$stats.src` into a `DictArray` for the
/// concrete integer type `$typ`.
///
/// * `$stats` - binding whose `src` field is the source primitive array.
/// * `$typed` - binding whose `distinct_values` map is keyed by wrappers around the distinct
///   `$typ` values (the raw value is read from field `.0` of each key).
/// * `$validity` - binding holding the source array's validity.
/// * `$typ` - native integer type of the source values.
macro_rules! typed_encode {
    ($stats:ident, $typed:ident, $validity:ident, $typ:ty) => {{
        // The dictionary is the set of distinct values, in the map's key iteration order.
        let dictionary: Buffer<$typ> = $typed.distinct_values.keys().map(|key| key.0).collect();
        let source = $stats.src.as_slice::<$typ>();

        // Choose the narrowest unsigned code type based on the dictionary size. The codes
        // reuse the source validity.
        let codes = match dictionary.len() {
            len if len <= u8::MAX as usize => PrimitiveArray::new(
                <DictEncoder as Encode<$typ, u8>>::encode(&dictionary, source),
                $validity.clone(),
            )
            .into_array(),
            len if len <= u16::MAX as usize => PrimitiveArray::new(
                <DictEncoder as Encode<$typ, u16>>::encode(&dictionary, source),
                $validity.clone(),
            )
            .into_array(),
            _ => PrimitiveArray::new(
                <DictEncoder as Encode<$typ, u32>>::encode(&dictionary, source),
                $validity.clone(),
            )
            .into_array(),
        };

        // The dictionary values carry no nulls of their own: they only mirror whether the
        // source type is nullable at all.
        let dictionary_validity = if matches!($validity, Validity::NonNullable) {
            Validity::NonNullable
        } else {
            Validity::AllValid
        };
        let dictionary = PrimitiveArray::new(dictionary, dictionary_validity).into_array();

        // SAFETY: every code written by `DictEncoder` is either a position within the
        // dictionary it was handed or the default code 0.
        // NOTE(review): the exact contract of `DictArray::new_unchecked` and
        // `set_all_values_referenced` is not visible in this file; confirm that code 0 is
        // acceptable when the dictionary is empty (e.g. an all-null source).
        unsafe { DictArray::new_unchecked(codes, dictionary).set_all_values_referenced(true) }
    }};
}
47
48#[expect(
50 clippy::cognitive_complexity,
51 reason = "complexity from match on all integer types"
52)]
53pub fn dictionary_encode(stats: &IntegerStats) -> DictArray {
54 let src_validity = stats.src.validity();
56
57 match &stats.typed {
58 ErasedStats::U8(typed) => typed_encode!(stats, typed, src_validity, u8),
59 ErasedStats::U16(typed) => typed_encode!(stats, typed, src_validity, u16),
60 ErasedStats::U32(typed) => typed_encode!(stats, typed, src_validity, u32),
61 ErasedStats::U64(typed) => typed_encode!(stats, typed, src_validity, u64),
62 ErasedStats::I8(typed) => typed_encode!(stats, typed, src_validity, i8),
63 ErasedStats::I16(typed) => typed_encode!(stats, typed, src_validity, i16),
64 ErasedStats::I32(typed) => typed_encode!(stats, typed, src_validity, i32),
65 ErasedStats::I64(typed) => typed_encode!(stats, typed, src_validity, i64),
66 }
67}
68
/// Stateless marker type that carries the `Encode` implementations generated by
/// `impl_encode!` for every supported (value type, code type) pair.
struct DictEncoder;
70
/// Encoding of values of type `T` into dictionary codes of type `I`.
trait Encode<T, I> {
    /// Returns one code per element of `values`, in order.
    ///
    /// In the implementations in this file, the code of a value is its position within
    /// `distinct`, and a value that does not occur in `distinct` is given the default code
    /// (`0`).
    fn encode(distinct: &[T], values: &[T]) -> Buffer<I>;
}
75
/// Generates `Encode<$typ, I>` implementations on `DictEncoder`.
///
/// * `impl_encode!(T)` generates the implementations for the code types `u8`, `u16` and `u32`.
/// * `impl_encode!(T, I1, I2, ...)` generates one implementation per listed code type.
///
/// Each generated `encode` builds a value -> code lookup table from `distinct` (the code is the
/// value's position in `distinct`) and then emits one code per element of `values`.
macro_rules! impl_encode {
    // Single-type form: expand to all three supported code widths.
    ($typ:ty) => { impl_encode!($typ, u8, u16, u32); };
    ($typ:ty, $($ityp:ty),+) => {
        $(
            impl Encode<$typ, $ityp> for DictEncoder {
                #[allow(
                    clippy::cast_possible_truncation,
                    reason = "the caller selects a code type wide enough for `distinct.len()`"
                )]
                fn encode(distinct: &[$typ], values: &[$typ]) -> Buffer<$ityp> {
                    // Lookup table from each distinct value to its position in `distinct`.
                    let mut codes =
                        vortex_utils::aliases::hash_map::HashMap::<$typ, $ityp>::with_capacity(
                            distinct.len(),
                        );
                    for (code, &value) in distinct.iter().enumerate() {
                        codes.insert(value, code as $ityp);
                    }

                    let mut output = vortex_buffer::BufferMut::with_capacity(values.len());
                    for value in values {
                        // A value missing from `distinct` falls back to the default code 0
                        // (presumably only null slots, whose code is never observed; verify
                        // against how the distinct values are collected).
                        let code = codes.get(value).copied().unwrap_or_default();
                        // SAFETY: `output` was allocated with capacity `values.len()` and this
                        // loop pushes exactly one code per element of `values`, so the push
                        // never exceeds the reserved capacity.
                        unsafe { output.push_unchecked(code) };
                    }

                    output.freeze()
                }
            }
        )+
    };
}
105
// Instantiate the encoder for every native integer type handled by `dictionary_encode`;
// each invocation covers the `u8`, `u16` and `u32` code widths.
impl_encode!(u8);
impl_encode!(u16);
impl_encode!(u32);
impl_encode!(u64);
impl_encode!(i8);
impl_encode!(i16);
impl_encode!(i32);
impl_encode!(i64);
114
#[cfg(test)]
mod tests {
    use vortex_array::Array;
    use vortex_array::IntoArray;
    use vortex_array::arrays::BoolArray;
    use vortex_array::arrays::PrimitiveArray;
    use vortex_array::assert_arrays_eq;
    use vortex_array::validity::Validity;
    use vortex_buffer::buffer;

    use super::IntegerStats;
    use super::dictionary_encode;
    use crate::CompressorStats;

    /// Dictionary-encodes a nullable `i32` array with one null slot and checks the dictionary
    /// size, the code count, and that the encoded array compares equal to the expected values.
    #[test]
    fn test_dict_encode_integer_stats() {
        // Validity mask shared by the input and the expected output: slot 3 is null.
        let nullable = || {
            Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array())
        };

        let input = PrimitiveArray::new(buffer![100i32, 200, 100, 0, 100], nullable());
        let encoded = dictionary_encode(&IntegerStats::generate(&input));

        // Only 100 and 200 end up in the dictionary; there is one code per input slot.
        assert_eq!(encoded.values().len(), 2);
        assert_eq!(encoded.codes().len(), 5);

        // The value written under the null slot differs from the input (100 instead of 0);
        // presumably the comparison ignores values at null positions.
        let expected =
            PrimitiveArray::new(buffer![100i32, 200, 100, 100, 100], nullable()).into_array();
        assert_arrays_eq!(encoded.as_ref(), expected.as_ref());
    }
}