vortex_sampling_compressor/compressors/
bitpacked.rs

1#![allow(clippy::cast_possible_truncation)]
2use vortex_array::aliases::hash_set::HashSet;
3use vortex_array::array::PrimitiveArray;
4use vortex_array::variants::PrimitiveArrayTrait;
5use vortex_array::{Array, Encoding, EncodingId, IntoArray, IntoArrayVariant};
6use vortex_dtype::match_each_integer_ptype;
7use vortex_error::{vortex_bail, vortex_err, vortex_panic, VortexResult};
8use vortex_fastlanes::{
9    bitpack_unchecked, count_exceptions, find_best_bit_width, find_min_patchless_bit_width,
10    gather_patches, BitPackedArray, BitPackedEncoding,
11};
12
13use crate::compressors::{CompressedArray, CompressionTree, EncodingCompressor};
14use crate::{constants, SamplingCompressor};
15
16pub const BITPACK_WITH_PATCHES: BitPackedCompressor = BitPackedCompressor {
17    allow_patches: true,
18};
19pub const BITPACK_NO_PATCHES: BitPackedCompressor = BitPackedCompressor {
20    allow_patches: false,
21};
22
23#[derive(Debug)]
24pub struct BitPackedCompressor {
25    allow_patches: bool,
26}
27
28impl BitPackedCompressor {
29    fn find_bit_width(&self, array: &PrimitiveArray) -> VortexResult<u8> {
30        if self.allow_patches {
31            find_best_bit_width(array)
32        } else {
33            find_min_patchless_bit_width(array)
34        }
35    }
36}
37
38impl EncodingCompressor for BitPackedCompressor {
39    fn id(&self) -> &str {
40        if self.allow_patches {
41            "fastlanes.bitpacked"
42        } else {
43            "fastlanes.bitpacked_no_patches"
44        }
45    }
46
47    fn cost(&self) -> u8 {
48        if self.allow_patches {
49            constants::BITPACKED_WITH_PATCHES_COST
50        } else {
51            constants::BITPACKED_NO_PATCHES_COST
52        }
53    }
54
55    fn can_compress(&self, array: &Array) -> Option<&dyn EncodingCompressor> {
56        // Only support primitive arrays
57        let parray = PrimitiveArray::maybe_from(array)?;
58
59        // Only integer arrays can be bit-packed
60        if !parray.ptype().is_int() {
61            return None;
62        }
63
64        // Only arrays with non-negative values can be bit-packed
65        if !parray.ptype().is_unsigned_int() {
66            let has_negative_elements = match_each_integer_ptype!(parray.ptype(), |$P| {
67                parray.statistics().compute_min::<Option<$P>>().unwrap_or_default().unwrap_or_default() < 0
68            });
69
70            if has_negative_elements {
71                return None;
72            }
73        }
74
75        let bit_width = self.find_bit_width(&parray).ok()?;
76
77        // Check that the bit width is less than the type's bit width
78        if bit_width == parray.ptype().bit_width() as u8 {
79            return None;
80        }
81
82        Some(self)
83    }
84
85    fn compress<'a>(
86        &'a self,
87        array: &Array,
88        _like: Option<CompressionTree<'a>>,
89        ctx: SamplingCompressor<'a>,
90    ) -> VortexResult<CompressedArray<'a>> {
91        let parray = array.clone().into_primitive()?;
92        // Only arrays with non-negative values can be bit-packed
93        if !parray.ptype().is_unsigned_int() {
94            let has_negative_elements = match_each_integer_ptype!(parray.ptype(), |$P| {
95                parray.statistics().compute_min::<Option<$P>>().unwrap_or_default().unwrap_or_default() < 0
96            });
97
98            if has_negative_elements {
99                vortex_bail!("Cannot BitPackCompressor::compress an array with negative values");
100            }
101        }
102
103        let bit_width_freq = parray
104            .statistics()
105            .compute_bit_width_freq()
106            .ok_or_else(|| vortex_err!(ComputeError: "missing bit width frequency"))?;
107
108        let bit_width = self.find_bit_width(&parray)?;
109        let num_exceptions = count_exceptions(bit_width, &bit_width_freq);
110        if !self.allow_patches && num_exceptions > 0 {
111            vortex_panic!(
112                "Found {} exceptions with patchless bit width {}",
113                num_exceptions,
114                bit_width
115            )
116        }
117
118        if bit_width == parray.ptype().bit_width() as u8 {
119            // Nothing we can do
120            return Ok(CompressedArray::uncompressed(array.clone()));
121        }
122
123        let validity = ctx.compress_validity(parray.validity())?;
124        // SAFETY: we check that the array only contains non-negative values.
125        let packed_buffer = unsafe { bitpack_unchecked(&parray, bit_width)? };
126        let patches = (num_exceptions > 0)
127            .then(|| {
128                gather_patches(&parray, bit_width, num_exceptions).map(|p| {
129                    ctx.auxiliary("patches")
130                        .excluding(&BITPACK_WITH_PATCHES)
131                        .including(&BITPACK_NO_PATCHES)
132                        .compress_patches(p)
133                })
134            })
135            .flatten()
136            .transpose()?;
137
138        Ok(CompressedArray::compressed(
139            // SAFETY: we ensure the array contains no negative values.
140            unsafe {
141                BitPackedArray::new_unchecked(
142                    packed_buffer,
143                    parray.ptype(),
144                    validity,
145                    patches,
146                    bit_width,
147                    parray.len(),
148                )?
149            }
150            .into_array(),
151            Some(CompressionTree::new(self, vec![])),
152            array,
153        ))
154    }
155
156    fn used_encodings(&self) -> HashSet<EncodingId> {
157        HashSet::from([BitPackedEncoding::ID])
158    }
159}
160
161#[cfg(test)]
162mod tests {
163    use vortex_array::array::ConstantArray;
164    use vortex_array::IntoArray;
165    use vortex_buffer::buffer;
166
167    use crate::compressors::bitpacked::{BITPACK_NO_PATCHES, BITPACK_WITH_PATCHES};
168    use crate::compressors::EncodingCompressor;
169    use crate::SamplingCompressor;
170
171    #[test]
172    fn cannot_compress() {
173        // cannot compress when array contains negative values
174        assert!(BITPACK_NO_PATCHES
175            .can_compress(&buffer![-1i32, 0i32, 1i32].into_array())
176            .is_none());
177
178        // Non-integer primitive array.
179        assert!(BITPACK_NO_PATCHES
180            .can_compress(&buffer![0f32, 1f32].into_array())
181            .is_none());
182
183        // non-PrimitiveArray
184        assert!(BITPACK_NO_PATCHES
185            .can_compress(&ConstantArray::new(3u32, 10).into_array())
186            .is_none());
187    }
188
189    #[test]
190    fn can_compress() {
191        // Unsigned integers
192        assert!(BITPACK_NO_PATCHES
193            .can_compress(&buffer![0u32, 1u32, 2u32].into_array())
194            .is_some());
195
196        // Signed non-negative integers
197        assert!(BITPACK_WITH_PATCHES
198            .can_compress(&buffer![0i32, 1i32, 2i32].into_array())
199            .is_some());
200    }
201
202    #[test]
203    fn compress_negatives_fails() {
204        assert!(BITPACK_NO_PATCHES
205            .compress(
206                &buffer![-1i32, 0i32].into_array(),
207                None,
208                SamplingCompressor::default(),
209            )
210            .is_err());
211    }
212}