vortex_btrblocks/integer/
dictionary.rs1use vortex_array::IntoArray;
7use vortex_array::arrays::{DictArray, PrimitiveArray};
8use vortex_array::validity::Validity;
9use vortex_array::vtable::ValidityHelper;
10use vortex_buffer::Buffer;
11
12use crate::integer::IntegerStats;
13use crate::integer::stats::ErasedStats;
14
/// Dictionary-encodes the source array of `$stats` for the element type `$typ`.
///
/// * `$stats` - the stats object; its `src` array supplies the values to encode.
/// * `$typed` - the typed stats whose `distinct_values` keys become the dictionary.
/// * `$validity` - the validity of the source array; it is attached to the codes.
/// * `$typ` - the native integer type of the source array.
///
/// Evaluates to a `DictArray` whose codes use the narrowest of `u8`, `u16` and `u32`
/// that can hold the largest code.
macro_rules! typed_encode {
    ($stats:ident, $typed:ident, $validity:ident, $typ:ty) => {{
        // The dictionary is the set of distinct values gathered by the stats; each key
        // exposes the native value as its first tuple field.
        let values: Buffer<$typ> = $typed.distinct_values.keys().map(|x| x.0).collect();

        // Codes are positions in `values`, so the largest code is `len - 1`, not `len`.
        // Comparing the count itself against the type maximum would push a dictionary of
        // exactly 256 (or 65536) entries into the next wider code type for no reason.
        let max_code = values.len().saturating_sub(1);
        let codes = if max_code <= u8::MAX as usize {
            let buf =
                <DictEncoder as Encode<$typ, u8>>::encode(&values, $stats.src.as_slice::<$typ>());
            PrimitiveArray::new(buf, $validity.clone()).into_array()
        } else if max_code <= u16::MAX as usize {
            let buf =
                <DictEncoder as Encode<$typ, u16>>::encode(&values, $stats.src.as_slice::<$typ>());
            PrimitiveArray::new(buf, $validity.clone()).into_array()
        } else {
            let buf =
                <DictEncoder as Encode<$typ, u32>>::encode(&values, $stats.src.as_slice::<$typ>());
            PrimitiveArray::new(buf, $validity.clone()).into_array()
        };

        // Nullability is carried by the codes. The dictionary values only mirror whether
        // the source was nullable at all; they never carry a null mask of their own.
        let values_validity = match $validity {
            Validity::NonNullable => Validity::NonNullable,
            _ => Validity::AllValid,
        };

        let values = PrimitiveArray::new(values, values_validity).into_array();
        // SAFETY: every code produced by `DictEncoder` for a value present in `values` is
        // that value's position in `values`; source values that are absent from the
        // dictionary fall back to code 0.
        // NOTE(review): assumes absent values only occur at null positions, and that an
        // empty dictionary (all-null input) is acceptable to `new_unchecked` - confirm
        // against its documented contract.
        unsafe { DictArray::new_unchecked(codes, values) }
    }};
}
44
45#[allow(clippy::cognitive_complexity)]
47pub fn dictionary_encode(stats: &IntegerStats) -> DictArray {
48 let src_validity = stats.src.validity();
50
51 match &stats.typed {
52 ErasedStats::U8(typed) => typed_encode!(stats, typed, src_validity, u8),
53 ErasedStats::U16(typed) => typed_encode!(stats, typed, src_validity, u16),
54 ErasedStats::U32(typed) => typed_encode!(stats, typed, src_validity, u32),
55 ErasedStats::U64(typed) => typed_encode!(stats, typed, src_validity, u64),
56 ErasedStats::I8(typed) => typed_encode!(stats, typed, src_validity, i8),
57 ErasedStats::I16(typed) => typed_encode!(stats, typed, src_validity, i16),
58 ErasedStats::I32(typed) => typed_encode!(stats, typed, src_validity, i32),
59 ErasedStats::I64(typed) => typed_encode!(stats, typed, src_validity, i64),
60 }
61}
62
63struct DictEncoder;
64
65trait Encode<T, I> {
66 fn encode(distinct: &[T], values: &[T]) -> Buffer<I>;
68}
69
70macro_rules! impl_encode {
71 ($typ:ty) => { impl_encode!($typ, u8, u16, u32); };
72 ($typ:ty, $($ityp:ty),+) => {
73 $(
74 impl Encode<$typ, $ityp> for DictEncoder {
75 #[allow(clippy::cast_possible_truncation)]
76 fn encode(distinct: &[$typ], values: &[$typ]) -> Buffer<$ityp> {
77 let mut codes =
78 vortex_utils::aliases::hash_map::HashMap::<$typ, $ityp>::with_capacity(
79 distinct.len(),
80 );
81 for (code, &value) in distinct.iter().enumerate() {
82 codes.insert(value, code as $ityp);
83 }
84
85 let mut output = vortex_buffer::BufferMut::with_capacity(values.len());
86 for value in values {
87 unsafe { output.push_unchecked(codes.get(value).copied().unwrap_or_default()) };
91 }
92
93 return output.freeze();
94 }
95 }
96 )*
97 };
98}
99
100impl_encode!(u8);
101impl_encode!(u16);
102impl_encode!(u32);
103impl_encode!(u64);
104impl_encode!(i8);
105impl_encode!(i16);
106impl_encode!(i32);
107impl_encode!(i64);
108
#[cfg(test)]
mod tests {
    use vortex_array::arrays::{BoolArray, PrimitiveArray};
    use vortex_array::validity::Validity;
    use vortex_array::{Array, IntoArray, assert_arrays_eq};
    use vortex_buffer::buffer;

    use crate::CompressorStats;
    use crate::integer::IntegerStats;
    use crate::integer::dictionary::dictionary_encode;

    /// Validity mask used for both the input and the expected output: only index 3 is null.
    fn null_at_index_three() -> Validity {
        Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array())
    }

    #[test]
    fn test_dict_encode_integer_stats() {
        // Five elements, two distinct valid values (100 and 200) and one null slot.
        let input = PrimitiveArray::new(buffer![100i32, 200, 100, 0, 100], null_at_index_three());

        let encoded = dictionary_encode(&IntegerStats::generate(&input));
        assert_eq!(encoded.values().len(), 2);
        assert_eq!(encoded.codes().len(), 5);

        // The number stored under the null slot is a placeholder; the mask marks it null.
        let expected = PrimitiveArray::new(
            buffer![100i32, 200, 100, 100, 100],
            null_at_index_three(),
        )
        .into_array();
        assert_arrays_eq!(encoded.as_ref(), expected.as_ref());
    }
}