vortex_btrblocks/integer/
dictionary.rs1use vortex_array::IntoArray;
4use vortex_array::arrays::PrimitiveArray;
5use vortex_array::validity::Validity;
6use vortex_array::vtable::ValidityHelper;
7use vortex_buffer::Buffer;
8use vortex_dict::DictArray;
9use vortex_error::VortexResult;
10
11use crate::integer::IntegerStats;
12use crate::integer::stats::ErasedStats;
13
/// Expands to an expression that dictionary-encodes one concrete integer type.
///
/// Arguments:
/// * `$stats` - identifier of the stats object; its `src` array supplies the values to encode.
/// * `$typed` - identifier of the typed stats; the keys of its `distinct_values` map become the
///   dictionary values.
/// * `$validity` - identifier of the source validity, cloned onto the codes array.
/// * `$typ` - the native integer type of the source array.
///
/// The expression evaluates to whatever `DictArray::try_new` returns.
macro_rules! typed_encode {
    ($stats:ident, $typed:ident, $validity:ident, $typ:ty) => {{
        let dictionary: Buffer<$typ> = $typed.distinct_values.keys().map(|key| key.0).collect();
        let source = $stats.src.as_slice::<$typ>();
        let distinct_count = dictionary.len();

        // Choose the code width from the number of dictionary entries. The comparison is made
        // against the entry count rather than the largest code, so a dictionary of exactly 256
        // (or 65536) entries lands on the next wider code type.
        let codes = if distinct_count > u16::MAX as usize {
            PrimitiveArray::new(
                <DictEncoder as Encode<$typ, u32>>::encode(&dictionary, source),
                $validity.clone(),
            )
            .into_array()
        } else if distinct_count > u8::MAX as usize {
            PrimitiveArray::new(
                <DictEncoder as Encode<$typ, u16>>::encode(&dictionary, source),
                $validity.clone(),
            )
            .into_array()
        } else {
            PrimitiveArray::new(
                <DictEncoder as Encode<$typ, u8>>::encode(&dictionary, source),
                $validity.clone(),
            )
            .into_array()
        };

        // The dictionary values carry no per-element validity of their own: they are
        // non-nullable when the source is, and all-valid otherwise.
        let dictionary_validity = if matches!($validity, Validity::NonNullable) {
            Validity::NonNullable
        } else {
            Validity::AllValid
        };

        DictArray::try_new(
            codes,
            PrimitiveArray::new(dictionary, dictionary_validity).into_array(),
        )
    }};
}
42
43#[allow(clippy::cognitive_complexity)]
44pub fn dictionary_encode(stats: &IntegerStats) -> VortexResult<DictArray> {
45 let src_validity = stats.src.validity();
47
48 match &stats.typed {
49 ErasedStats::U8(typed) => typed_encode!(stats, typed, src_validity, u8),
50 ErasedStats::U16(typed) => typed_encode!(stats, typed, src_validity, u16),
51 ErasedStats::U32(typed) => typed_encode!(stats, typed, src_validity, u32),
52 ErasedStats::U64(typed) => typed_encode!(stats, typed, src_validity, u64),
53 ErasedStats::I8(typed) => typed_encode!(stats, typed, src_validity, i8),
54 ErasedStats::I16(typed) => typed_encode!(stats, typed, src_validity, i16),
55 ErasedStats::I32(typed) => typed_encode!(stats, typed, src_validity, i32),
56 ErasedStats::I64(typed) => typed_encode!(stats, typed, src_validity, i64),
57 }
58}
59
/// Zero-sized marker type that carries the [`Encode`] implementations for every supported
/// combination of value type and code type.
struct DictEncoder;
61
62trait Encode<T, I> {
63 fn encode(distinct: &[T], values: &[T]) -> Buffer<I>;
65}
66
/// Implements [`Encode`] on [`DictEncoder`] for a value type and one or more code types.
///
/// * `impl_encode!(T)` generates the impls for `T` with `u8`, `u16` and `u32` codes.
/// * `impl_encode!(T, C1, C2, ...)` generates one impl per listed code type.
///
/// Each generated `encode` assigns every entry of `distinct` its position as a code and then
/// emits one code per element of `values`. Elements that are not present in `distinct` receive
/// the code type's default value (zero).
macro_rules! impl_encode {
    ($typ:ty) => { impl_encode!($typ, u8, u16, u32); };
    ($typ:ty, $($ityp:ty),+) => {
        $(
            impl Encode<$typ, $ityp> for DictEncoder {
                // Codes are positions within `distinct`; `typed_encode!` selects the code type
                // from the dictionary length before calling in, so the narrowing cast is
                // intentional.
                #[allow(clippy::cast_possible_truncation)]
                fn encode(distinct: &[$typ], values: &[$typ]) -> Buffer<$ityp> {
                    let mut codes =
                        vortex_array::aliases::hash_map::HashMap::<$typ, $ityp>::with_capacity(
                            distinct.len(),
                        );
                    for (code, &value) in distinct.iter().enumerate() {
                        codes.insert(value, code as $ityp);
                    }

                    let mut output = vortex_buffer::BufferMut::with_capacity(values.len());
                    for value in values {
                        // Values missing from the dictionary fall back to code zero.
                        let code = codes.get(value).copied().unwrap_or_default();
                        // SAFETY: `output` was created with capacity `values.len()` and this
                        // loop pushes exactly one element per entry of `values`, so the push
                        // never goes beyond the reserved capacity.
                        unsafe { output.push_unchecked(code) };
                    }

                    output.freeze()
                }
            }
        )+
    };
}
96
// Generate `Encode` impls (with u8, u16 and u32 codes) for every integer type that
// `dictionary_encode` dispatches on.
impl_encode!(u8);
impl_encode!(u16);
impl_encode!(u32);
impl_encode!(u64);
impl_encode!(i8);
impl_encode!(i16);
impl_encode!(i32);
impl_encode!(i64);
105
#[cfg(test)]
mod tests {
    use vortex_array::arrays::{BoolArray, PrimitiveArray};
    use vortex_array::validity::Validity;
    use vortex_array::{Array, IntoArray, ToCanonical};
    use vortex_buffer::buffer;

    use crate::CompressorStats;
    use crate::integer::IntegerStats;
    use crate::integer::dictionary::dictionary_encode;

    /// Encodes a nullable array and checks the dictionary size, the code count and the values
    /// obtained after decoding.
    #[test]
    fn test_dict_encode_integer_stats() {
        // Five elements; the fourth one (raw value 0) is masked out as null.
        let null_mask = BoolArray::from_iter([true, true, true, false, true]).into_array();
        let input = PrimitiveArray::new(
            buffer![100i32, 200, 100, 0, 100],
            Validity::Array(null_mask),
        );

        let encoded = dictionary_encode(&IntegerStats::generate(&input)).unwrap();

        // Only the two non-null distinct values end up in the dictionary.
        assert_eq!(encoded.values().len(), 2);
        assert_eq!(encoded.codes().len(), 5);

        let decoded = encoded.to_primitive().unwrap();

        // The null slot is not in the dictionary, so it is given the default code and decodes
        // to whichever dictionary entry that code refers to (expected here to be 100).
        assert_eq!(decoded.as_slice::<i32>(), &[100i32, 200, 100, 100, 100]);
    }
}