Skip to main content

vortex_fastlanes/bitpacking/array/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::fmt::Display;
5use std::fmt::Formatter;
6
7use fastlanes::BitPacking;
8use vortex_array::ArrayRef;
9use vortex_array::ExecutionCtx;
10use vortex_array::TypedArrayRef;
11use vortex_array::array_slots;
12use vortex_array::arrays::Primitive;
13use vortex_array::arrays::PrimitiveArray;
14use vortex_array::buffer::BufferHandle;
15use vortex_array::dtype::DType;
16use vortex_array::dtype::NativePType;
17use vortex_array::dtype::PType;
18use vortex_array::patches::PatchSlotIndices;
19use vortex_array::patches::Patches;
20use vortex_array::patches::PatchesData;
21use vortex_array::validity::Validity;
22use vortex_array::vtable::child_to_validity;
23use vortex_error::VortexResult;
24use vortex_error::vortex_ensure;
25use vortex_error::vortex_err;
26
27pub mod bitpack_compress;
28pub mod bitpack_decompress;
29pub mod unpack_iter;
30
31use crate::BitPackedArray;
32use crate::bitpack_compress::bitpack_encode;
33use crate::unpack_iter::BitPacked as BitPackedIter;
34use crate::unpack_iter::BitUnpackedChunks;
35
36#[array_slots(crate::BitPacked)]
37pub struct BitPackedSlots {
38    /// The indices of exception values that don't fit in the bit-packed representation.
39    pub patch_indices: Option<ArrayRef>,
40    /// The exception values that don't fit in the bit-packed representation.
41    pub patch_values: Option<ArrayRef>,
42    /// Chunk offsets for the patch indices/values.
43    pub patch_chunk_offsets: Option<ArrayRef>,
44    /// The validity bitmap indicating which elements are non-null.
45    pub validity_child: Option<ArrayRef>,
46}
47
48pub(crate) const PATCH_SLOTS: PatchSlotIndices = PatchSlotIndices {
49    indices: BitPackedSlots::PATCH_INDICES,
50    values: BitPackedSlots::PATCH_VALUES,
51    chunk_offsets: BitPackedSlots::PATCH_CHUNK_OFFSETS,
52};
53
54pub struct BitPackedDataParts {
55    pub offset: u16,
56    pub bit_width: u8,
57    pub len: usize,
58    pub packed: BufferHandle,
59    pub patches: Option<Patches>,
60    pub validity: Validity,
61}
62
63#[derive(Clone, Debug)]
64pub struct BitPackedData {
65    /// The offset within the first block (created with a slice).
66    /// 0 <= offset < 1024
67    pub(super) offset: u16,
68    pub(super) bit_width: u8,
69    pub(super) packed: BufferHandle,
70    /// Patch metadata for reconstructing Patches from slots.
71    pub(super) patches_data: Option<PatchesData>,
72}
73
74impl Display for BitPackedData {
75    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
76        write!(f, "bit_width: {}, offset: {}", self.bit_width, self.offset)
77    }
78}
79
80impl BitPackedData {
81    /// Create a new bitpacked array using a buffer of packed data.
82    ///
83    /// The packed data should be interpreted as a sequence of values with size `bit_width`.
84    ///
85    /// # Errors
86    ///
87    /// This method returns errors if any of the metadata is inconsistent, for example the packed
88    /// buffer provided does not have the right size according to the supplied length and target
89    /// PType.
90    ///
91    /// # Safety
92    ///
93    /// For signed arrays, it is the caller's responsibility to ensure that there are no values
94    /// that can be interpreted once unpacked to the provided PType.
95    ///
96    /// This invariant is upheld by the compressor, but callers must ensure this if they wish to
97    /// construct a new `BitPackedArray` from parts.
98    ///
99    /// See also the [`encode`][Self::encode] method on this type for a safe path to create a new
100    /// bit-packed array.
101    /// A safe constructor for a `BitPackedArray` from its components:
102    ///
103    /// * `packed` is ByteBuffer holding the compressed data that was packed with FastLanes
104    ///   bit-packing to a `bit_width` bits per value. `length` is the length of the original
105    ///   vector. Note that the packed is padded with zeros to the next multiple of 1024 elements
106    ///   if `length` is not divisible by 1024.
107    /// * `ptype` of the original data
108    /// * `validity` to track any nulls
109    /// * `patches` optionally provided for values that did not pack
110    ///
111    /// Any failure in validation will result in an error.
112    ///
113    /// # Validation
114    ///
115    /// * The `ptype` must be an integer
116    /// * `validity` must have `length` len
117    /// * Any patches must have any `array_len` equal to `length`
118    /// * The `packed` buffer must be exactly sized to hold `length` values of `bit_width` rounded
119    ///   up to the next multiple of 1024.
120    ///
121    /// Any violation of these preconditions will result in an error.
122    pub fn try_new(
123        packed: BufferHandle,
124        patches: Option<Patches>,
125        bit_width: u8,
126        offset: u16,
127    ) -> VortexResult<Self> {
128        vortex_ensure!(bit_width <= 64, "Unsupported bit width {bit_width}");
129        vortex_ensure!(
130            offset < 1024,
131            "Offset must be less than the full block i.e., 1024, got {offset}"
132        );
133
134        Ok(Self {
135            offset,
136            bit_width,
137            packed,
138            patches_data: patches.as_ref().map(PatchesData::from_patches),
139        })
140    }
141
142    pub(crate) fn validate(
143        packed: &BufferHandle,
144        ptype: PType,
145        validity: &Validity,
146        patches: Option<&Patches>,
147        bit_width: u8,
148        length: usize,
149        offset: u16,
150    ) -> VortexResult<()> {
151        vortex_ensure!(ptype.is_int(), MismatchedTypes: "integer", ptype);
152        vortex_ensure!(bit_width <= 64, "Unsupported bit width {bit_width}");
153
154        if let Some(validity_len) = validity.maybe_len() {
155            vortex_ensure!(
156                validity_len == length,
157                "BitPackedArray validity length {validity_len} != array length {length}",
158            );
159        }
160
161        // Validate patches
162        if let Some(patches) = patches {
163            Self::validate_patches(patches, ptype, length)?;
164        }
165
166        // Validate packed buffer
167        let expected_packed_len =
168            (length + offset as usize).div_ceil(1024) * (128 * bit_width as usize);
169        vortex_ensure!(
170            packed.len() == expected_packed_len,
171            "Expected {} packed bytes, got {}",
172            expected_packed_len,
173            packed.len()
174        );
175
176        Ok(())
177    }
178
179    fn validate_patches(patches: &Patches, ptype: PType, len: usize) -> VortexResult<()> {
180        // Ensure that array and patches have same ptype
181        vortex_ensure!(
182            patches.dtype().eq_ignore_nullability(ptype.into()),
183            "Patches DType {} does not match BitPackedArray dtype {}",
184            patches.dtype().as_nonnullable(),
185            ptype
186        );
187
188        vortex_ensure!(
189            patches.array_len() == len,
190            "BitPackedArray patches length {} != expected {len}",
191            patches.array_len(),
192        );
193
194        Ok(())
195    }
196
197    pub fn ptype(&self, dtype: &DType) -> PType {
198        dtype.as_ptype()
199    }
200
201    /// Underlying bit packed values as byte array
202    #[inline]
203    pub fn packed(&self) -> &BufferHandle {
204        &self.packed
205    }
206
207    /// Access the slice of packed values as an array of `T`
208    #[inline]
209    pub fn packed_slice<T: NativePType + BitPacking>(&self) -> &[T] {
210        let packed_bytes = self.packed().as_host();
211        let packed_ptr: *const T = packed_bytes.as_ptr().cast();
212        // Return number of elements of type `T` packed in the buffer
213        let packed_len = packed_bytes.len() / size_of::<T>();
214
215        // SAFETY: as_slice points to buffer memory that outlives the lifetime of `self`.
216        //  Unfortunately Rust cannot understand this, so we reconstruct the slice from raw parts
217        //  to get it to reinterpret the lifetime.
218        unsafe { std::slice::from_raw_parts(packed_ptr, packed_len) }
219    }
220
221    /// Accessor for bit unpacked chunks
222    pub fn unpacked_chunks<T: BitPackedIter>(
223        &self,
224        dtype: &DType,
225        len: usize,
226    ) -> VortexResult<BitUnpackedChunks<T>> {
227        assert_eq!(
228            T::PTYPE,
229            self.ptype(dtype),
230            "Requested type doesn't match the array ptype"
231        );
232        BitUnpackedChunks::try_new(self, len)
233    }
234
235    /// Bit-width of the packed values
236    #[inline]
237    pub fn bit_width(&self) -> u8 {
238        self.bit_width
239    }
240
241    #[inline]
242    pub fn offset(&self) -> u16 {
243        self.offset
244    }
245
246    /// Bit-pack an array of primitive integers down to the target bit-width using the FastLanes
247    /// SIMD-accelerated packing kernels.
248    ///
249    /// # Errors
250    ///
251    /// If the provided array is not an integer type, an error will be returned.
252    ///
253    /// If the provided array contains negative values, an error will be returned.
254    ///
255    /// If the requested bit-width for packing is larger than the array's native width, an
256    /// error will be returned.
257    pub fn encode(
258        array: &ArrayRef,
259        bit_width: u8,
260        ctx: &mut ExecutionCtx,
261    ) -> VortexResult<BitPackedArray> {
262        let parray: PrimitiveArray = array
263            .clone()
264            .try_downcast::<Primitive>()
265            .map_err(|a| vortex_err!(InvalidArgument: "Bitpacking can only encode primitive arrays, got {}", a.encoding_id()))?;
266        bitpack_encode(&parray, bit_width, None, ctx)
267    }
268
269    /// Calculate the maximum value that **can** be contained by this array, given its bit-width.
270    ///
271    /// Note that this value need not actually be present in the array.
272    #[inline]
273    pub fn max_packed_value(&self) -> usize {
274        (1 << self.bit_width()) - 1
275    }
276}
277
278pub trait BitPackedArrayExt: BitPackedArraySlotsExt {
279    #[inline]
280    fn packed(&self) -> &BufferHandle {
281        BitPackedData::packed(self)
282    }
283
284    #[inline]
285    fn bit_width(&self) -> u8 {
286        BitPackedData::bit_width(self)
287    }
288
289    #[inline]
290    fn offset(&self) -> u16 {
291        BitPackedData::offset(self)
292    }
293
294    #[inline]
295    fn patches(&self) -> Option<Patches> {
296        PatchesData::patches_from_slots(
297            self.patches_data.as_ref(),
298            self.as_ref().len(),
299            self.as_ref().slots(),
300            PATCH_SLOTS,
301        )
302    }
303
304    #[inline]
305    fn validity(&self) -> Validity {
306        child_to_validity(self.validity_child(), self.as_ref().dtype().nullability())
307    }
308
309    #[inline]
310    fn packed_slice<T: NativePType + BitPacking>(&self) -> &[T] {
311        BitPackedData::packed_slice::<T>(self)
312    }
313
314    #[inline]
315    fn unpacked_chunks<T: BitPackedIter>(&self) -> VortexResult<BitUnpackedChunks<T>> {
316        BitPackedData::unpacked_chunks::<T>(self, self.as_ref().dtype(), self.as_ref().len())
317    }
318}
319
320impl<T: TypedArrayRef<crate::BitPacked>> BitPackedArrayExt for T {}
321
322#[cfg(test)]
323mod test {
324    use std::sync::LazyLock;
325
326    use vortex_array::IntoArray;
327    use vortex_array::VortexSessionExecute;
328    use vortex_array::arrays::PrimitiveArray;
329    use vortex_array::assert_arrays_eq;
330    use vortex_array::session::ArraySession;
331    use vortex_buffer::Buffer;
332    use vortex_session::VortexSession;
333
334    use crate::BitPackedData;
335    use crate::bitpacking::array::BitPackedArrayExt;
336
337    static SESSION: LazyLock<VortexSession> =
338        LazyLock::new(|| VortexSession::empty().with::<ArraySession>());
339
340    #[test]
341    fn test_encode() {
342        let mut ctx = SESSION.create_execution_ctx();
343        let values = [
344            Some(1u64),
345            None,
346            Some(1),
347            None,
348            Some(1),
349            None,
350            Some(u64::MAX),
351        ];
352        let uncompressed = PrimitiveArray::from_option_iter(values);
353        let packed = BitPackedData::encode(&uncompressed.into_array(), 1, &mut ctx).unwrap();
354        let expected = PrimitiveArray::from_option_iter(values);
355        let packed_primitive = packed
356            .as_array()
357            .clone()
358            .execute::<PrimitiveArray>(&mut ctx)
359            .unwrap();
360        assert_arrays_eq!(packed_primitive, expected);
361    }
362
363    #[test]
364    fn test_encode_too_wide() {
365        let mut ctx = SESSION.create_execution_ctx();
366        let values = [Some(1u8), None, Some(1), None, Some(1), None];
367        let uncompressed = PrimitiveArray::from_option_iter(values);
368        let _packed = BitPackedData::encode(&uncompressed.clone().into_array(), 8, &mut ctx)
369            .expect_err("Cannot pack value into the same width");
370        let _packed = BitPackedData::encode(&uncompressed.into_array(), 9, &mut ctx)
371            .expect_err("Cannot pack value into larger width");
372    }
373
374    #[test]
375    fn signed_with_patches() {
376        let mut ctx = SESSION.create_execution_ctx();
377        let values: Buffer<i32> = (0i32..=512).collect();
378        let parray = values.clone().into_array();
379
380        let packed_with_patches = BitPackedData::encode(&parray, 9, &mut ctx).unwrap();
381        assert!(packed_with_patches.patches().is_some());
382        let packed_primitive = packed_with_patches
383            .as_array()
384            .clone()
385            .execute::<PrimitiveArray>(&mut ctx)
386            .unwrap();
387        assert_arrays_eq!(
388            packed_primitive,
389            PrimitiveArray::new(values, vortex_array::validity::Validity::NonNullable)
390        );
391    }
392}