compressed_intvec/fixed/
builder.rs

1//! # [`FixedVec`] Builders
2//!
3//! This module provides builders for constructing an owned [`FixedVec`].
4//!
5//! There are two main builders:
6//! - [`FixedVecBuilder`]: For building from a slice (`&[T]`). It can automatically
7//!   determine the optimal bit width from the data.
8//! - [`FixedVecFromIterBuilder`]: For building from an iterator. This is useful
9//!   for large datasets, but requires the bit width to be specified manually.
10//!
11//! # Examples
12//!
13//! ## Building from a slice
14//!
15//! ```
16//! use compressed_intvec::fixed::{FixedVec, BitWidth, UFixedVec};
17//! use compressed_intvec::fixed::builder::FixedVecBuilder;
18//!
19//! let data: &[u32] = &[10, 20, 30, 40, 50];
20//!
21//! // The builder can infer the minimal bit width.
22//! let vec: UFixedVec<u32> = FixedVecBuilder::new()
23//!     .build(data)
24//!     .unwrap();
25//!
26//! assert_eq!(vec.bit_width(), 6); // 50 requires 6 bits
27//!
28//! // Or a specific strategy can be chosen.
29//! let vec_pow2: UFixedVec<u32> = FixedVecBuilder::new()
30//!     .bit_width(BitWidth::PowerOfTwo)
31//!     .build(data)
32//!     .unwrap();
33//!
34//! assert_eq!(vec_pow2.bit_width(), 8);
35//! ```
36
37use crate::fixed::traits::{Storable, Word};
38use crate::fixed::{BitWidth, Error, FixedVec};
39use dsi_bitstream::{
40    impls::{BufBitWriter, MemWordWriterVec},
41    prelude::{BitWrite, Endianness},
42};
43use std::marker::PhantomData;
44
45/// A builder for creating a [`FixedVec`] from a slice of integers.
46///
47/// This builder analyzes a slice to determine the `bit_width` based on
48/// the selected [`BitWidth`] strategy and then constructs the compressed vector.
49#[derive(Debug, Clone)]
50pub struct FixedVecBuilder<T: Storable<W>, W: Word, E: Endianness> {
51    bit_width_strategy: BitWidth,
52    _phantom: PhantomData<(T, W, E)>,
53}
54
55impl<T, W, E> Default for FixedVecBuilder<T, W, E>
56where
57    T: Storable<W>,
58    W: Word,
59    E: Endianness,
60{
61    fn default() -> Self {
62        Self {
63            bit_width_strategy: BitWidth::default(),
64            _phantom: PhantomData,
65        }
66    }
67}
68
69impl<T, W, E> FixedVecBuilder<T, W, E>
70where
71    T: Storable<W>,
72    W: Word,
73    E: Endianness,
74    BufBitWriter<E, MemWordWriterVec<W, Vec<W>>>: BitWrite<E, Error = std::convert::Infallible>,
75{
76    /// Creates a new builder with the default [`BitWidth::Minimal`] strategy.
77    pub fn new() -> Self {
78        Self::default()
79    }
80
81    /// Sets the strategy for determining the number of bits for each element.
82    pub fn bit_width(mut self, strategy: BitWidth) -> Self {
83        self.bit_width_strategy = strategy;
84        self
85    }
86
87    /// Builds the [`FixedVec`] from a slice of data.
88    ///
89    /// This method consumes the builder and returns a new [`FixedVec`].
90    ///
91    /// # Errors
92    ///
93    /// Returns an [`Error`] if a value in the input is too large for the
94    /// specified `bit_width`, or if the parameters are invalid.
95    pub fn build(self, input: &[T]) -> Result<FixedVec<T, W, E, Vec<W>>, Error> {
96        let bits_per_word = <W as crate::fixed::traits::Word>::BITS;
97
98        // Determine the final bit width based on the chosen strategy.
99        let final_bit_width = match self.bit_width_strategy {
100            // If the user specified an exact bit width, use it.
101            BitWidth::Explicit(n) => n,
102            // Otherwise, analyze the data to find the maximum value.
103            _ => {
104                let max_val: W = input
105                    .iter()
106                    .map(|&val| <T as Storable<W>>::into_word(val))
107                    .max()
108                    .unwrap_or(W::ZERO);
109
110                // Calculate the minimum number of bits required.
111                let min_bits = (bits_per_word - max_val.leading_zeros() as usize).max(1);
112
113                // Apply the selected strategy.
114                match self.bit_width_strategy {
115                    BitWidth::Minimal => min_bits,
116                    BitWidth::PowerOfTwo => min_bits.next_power_of_two().min(bits_per_word),
117                    BitWidth::Explicit(_) => unreachable!(), // Handled above.
118                }
119            }
120        };
121
122        if !input.is_empty() && final_bit_width == 0 {
123            return Err(Error::InvalidParameters(
124                "bit_width cannot be zero for a non-empty vector".to_string(),
125            ));
126        }
127
128        if final_bit_width > bits_per_word {
129            return Err(Error::InvalidParameters(format!(
130                "bit_width ({final_bit_width}) cannot be greater than the word size ({bits_per_word})",
131            )));
132        }
133
134        // Handle the empty case separately.
135        if input.is_empty() {
136            return Ok(unsafe { FixedVec::new_unchecked(Vec::new(), 0, final_bit_width) });
137        }
138
139        // Allocate the buffer for the bit-packed data.
140        let total_bits = input.len() * final_bit_width;
141        let num_words = total_bits.div_ceil(bits_per_word);
142        let buffer = vec![W::ZERO; num_words + 1]; // +1 for padding.
143
144        let mut writer = BufBitWriter::new(MemWordWriterVec::new(buffer));
145
146        // Define the upper bound for values to check against.
147        let limit = if final_bit_width < bits_per_word {
148            W::ONE << final_bit_width
149        } else {
150            W::max_value()
151        };
152
153        // Write each value to the bitstream.
154        for (i, &value_t) in input.iter().enumerate() {
155            let value_w = <T as Storable<W>>::into_word(value_t);
156            // Ensure the value fits within the calculated bit width.
157            if final_bit_width < bits_per_word && value_w >= limit {
158                return Err(Error::ValueTooLarge {
159                    value: value_w.to_u128().unwrap(),
160                    index: i,
161                    bit_width: final_bit_width,
162                });
163            }
164            writer
165                .write_bits(value_w.to_u64().unwrap(), final_bit_width)
166                .unwrap();
167        }
168
169        writer.flush().unwrap();
170        let data = writer.into_inner().unwrap().into_inner();
171
172        Ok(unsafe { FixedVec::new_unchecked(data, input.len(), final_bit_width) })
173    }
174}
175
176/// A builder for creating a [`FixedVec`] from an iterator.
177///
178/// This builder is designed for streaming data from an iterator without first
179/// collecting it into a slice. It requires the `bit_width` to be specified
180/// manually, as it cannot analyze the data in advance.
181#[derive(Debug)]
182pub struct FixedVecFromIterBuilder<
183    T: Storable<W>,
184    W: Word,
185    E: Endianness,
186    I: IntoIterator<Item = T>,
187> {
188    iter: I,
189    bit_width: usize,
190    _phantom: PhantomData<(T, W, E)>,
191}
192
193impl<T, W, E, I> FixedVecFromIterBuilder<T, W, E, I>
194where
195    T: Storable<W>,
196    W: Word,
197    E: Endianness,
198    I: IntoIterator<Item = T>,
199    BufBitWriter<E, MemWordWriterVec<W, Vec<W>>>: BitWrite<E, Error = std::convert::Infallible>,
200{
201    /// Creates a new builder from an iterator and a specified `bit_width`.
202    pub fn new(iter: I, bit_width: usize) -> Self {
203        Self {
204            iter,
205            bit_width,
206            _phantom: PhantomData,
207        }
208    }
209
210    /// Builds the [`FixedVec`] by consuming the iterator.
211    ///
212    /// # Errors
213    ///
214    /// Returns an [`Error`] if a value from the iterator is too large for the
215    /// specified `bit_width`, or if the parameters are invalid.
216    pub fn build(self) -> Result<FixedVec<T, W, E, Vec<W>>, Error> {
217        let bits_per_word = <W as crate::fixed::traits::Word>::BITS;
218        if self.bit_width > bits_per_word {
219            return Err(Error::InvalidParameters(format!(
220                "bit_width ({}) cannot be greater than the word size ({})",
221                self.bit_width, bits_per_word
222            )));
223        }
224
225        let mut writer = BufBitWriter::new(MemWordWriterVec::new(Vec::<W>::new()));
226
227        let mut len = 0;
228        let limit = if self.bit_width < bits_per_word {
229            W::ONE << self.bit_width
230        } else {
231            W::max_value()
232        };
233
234        // Write each value from the iterator to the bitstream.
235        for (i, value_t) in self.iter.into_iter().enumerate() {
236            let value_w = <T as Storable<W>>::into_word(value_t);
237            if self.bit_width < bits_per_word && value_w >= limit {
238                return Err(Error::ValueTooLarge {
239                    value: value_w.to_u128().unwrap(),
240                    index: i,
241                    bit_width: self.bit_width,
242                });
243            }
244            writer
245                .write_bits(value_w.to_u64().unwrap(), self.bit_width)
246                .unwrap();
247            len += 1;
248        }
249
250        writer.flush().unwrap();
251        let mut data = writer.into_inner().unwrap().into_inner();
252
253        // Add padding word to the end of the buffer.
254        if len > 0 {
255            data.push(W::ZERO);
256        }
257        data.shrink_to_fit();
258
259        Ok(unsafe { FixedVec::new_unchecked(data, len, self.bit_width) })
260    }
261}