Skip to main content

compressed_intvec/fixed/
builder.rs

1//! # [`FixedVec`] Builders
2//!
3//! This module provides builders for constructing an owned [`FixedVec`].
4//!
5//! There are two main builders:
6//! - [`FixedVecBuilder`]: For building from a slice (`&[T]`). It can automatically
7//!   determine the optimal bit width from the data.
8//! - [`FixedVecFromIterBuilder`]: For building from an iterator. This is useful
9//!   for large datasets, but requires the bit width to be specified manually.
10//!
11//! # Examples
12//!
13//! ## Building from a slice
14//!
15//! ```
16//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
17//! use compressed_intvec::fixed::{FixedVec, BitWidth, UFixedVec};
18//! use compressed_intvec::fixed::builder::FixedVecBuilder;
19//!
20//! let data: &[u32] = &[10, 20, 30, 40, 50];
21//!
22//! // The builder can infer the minimal bit width.
23//! let vec: UFixedVec<u32> = FixedVecBuilder::new()
24//!     .build(data)?;
25//!
26//! assert_eq!(vec.bit_width(), 6); // 50 requires 6 bits
27//!
28//! // Or a specific strategy can be chosen.
29//! let vec_pow2: UFixedVec<u32> = FixedVecBuilder::new()
30//!     .bit_width(BitWidth::PowerOfTwo)
31//!     .build(data)?;
32//!
33//! assert_eq!(vec_pow2.bit_width(), 8);
34//! # Ok(())
35//! # }
36//! ```
37
38use crate::fixed::traits::{Storable, Word};
39use crate::fixed::{BitWidth, Error, FixedVec};
40use dsi_bitstream::{
41    impls::{BufBitWriter, MemWordWriterVec},
42    prelude::{BitWrite, Endianness},
43};
44use std::marker::PhantomData;
45
46/// A builder for creating a [`FixedVec`] from a slice of integers.
47///
48/// This builder analyzes a slice to determine the `bit_width` based on
49/// the selected [`BitWidth`] strategy and then constructs the compressed vector.
50#[derive(Debug, Clone)]
51pub struct FixedVecBuilder<T: Storable<W>, W: Word, E: Endianness> {
52    bit_width_strategy: BitWidth,
53    _phantom: PhantomData<(T, W, E)>,
54}
55
56impl<T, W, E> Default for FixedVecBuilder<T, W, E>
57where
58    T: Storable<W>,
59    W: Word,
60    E: Endianness,
61{
62    fn default() -> Self {
63        Self {
64            bit_width_strategy: BitWidth::default(),
65            _phantom: PhantomData,
66        }
67    }
68}
69
70impl<T, W, E> FixedVecBuilder<T, W, E>
71where
72    T: Storable<W>,
73    W: Word,
74    E: Endianness,
75    BufBitWriter<E, MemWordWriterVec<W, Vec<W>>>: BitWrite<E, Error = std::convert::Infallible>,
76{
77    /// Creates a new builder with the default [`BitWidth::Minimal`] strategy.
78    pub fn new() -> Self {
79        Self::default()
80    }
81
82    /// Sets the strategy for determining the number of bits for each element.
83    pub fn bit_width(mut self, strategy: BitWidth) -> Self {
84        self.bit_width_strategy = strategy;
85        self
86    }
87
88    /// Builds the [`FixedVec`] from a slice of data.
89    ///
90    /// This method consumes the builder and returns a new [`FixedVec`].
91    ///
92    /// # Errors
93    ///
94    /// Returns an [`Error`] if a value in the input is too large for the
95    /// specified `bit_width`, or if the parameters are invalid.
96    pub fn build(self, input: &[T]) -> Result<FixedVec<T, W, E, Vec<W>>, Error> {
97        let bits_per_word = <W as crate::fixed::traits::Word>::BITS;
98
99        // Determine the final bit width based on the chosen strategy.
100        let final_bit_width = match self.bit_width_strategy {
101            // If the user specified an exact bit width, use it.
102            BitWidth::Explicit(n) => n,
103            // Otherwise, analyze the data to find the maximum value.
104            _ => {
105                let max_val: W = input
106                    .iter()
107                    .map(|&val| <T as Storable<W>>::into_word(val))
108                    .max()
109                    .unwrap_or(W::ZERO);
110
111                // Calculate the minimum number of bits required.
112                let min_bits = (bits_per_word - max_val.leading_zeros() as usize).max(1);
113
114                // Apply the selected strategy.
115                match self.bit_width_strategy {
116                    BitWidth::Minimal => min_bits,
117                    BitWidth::PowerOfTwo => min_bits.next_power_of_two().min(bits_per_word),
118                    BitWidth::Explicit(_) => unreachable!(), // Handled above.
119                }
120            }
121        };
122
123        if !input.is_empty() && final_bit_width == 0 {
124            return Err(Error::InvalidParameters(
125                "bit_width cannot be zero for a non-empty vector".to_string(),
126            ));
127        }
128
129        if final_bit_width > bits_per_word {
130            return Err(Error::InvalidParameters(format!(
131                "bit_width ({final_bit_width}) cannot be greater than the word size ({bits_per_word})",
132            )));
133        }
134
135        // Handle the empty case separately.
136        if input.is_empty() {
137            return Ok(unsafe { FixedVec::new_unchecked(Vec::new(), 0, final_bit_width) });
138        }
139
140        // Allocate the buffer for the bit-packed data.
141        let total_bits = input.len() * final_bit_width;
142        let num_words = total_bits.div_ceil(bits_per_word);
143        let buffer = vec![W::ZERO; num_words + 1]; // +1 for padding.
144
145        let mut writer = BufBitWriter::new(MemWordWriterVec::new(buffer));
146
147        // Define the upper bound for values to check against.
148        let limit = if final_bit_width < bits_per_word {
149            W::ONE << final_bit_width
150        } else {
151            W::max_value()
152        };
153
154        // Write each value to the bitstream.
155        for (i, &value_t) in input.iter().enumerate() {
156            let value_w = <T as Storable<W>>::into_word(value_t);
157            // Ensure the value fits within the calculated bit width.
158            if final_bit_width < bits_per_word && value_w >= limit {
159                return Err(Error::ValueTooLarge {
160                    value: value_w.to_u128().unwrap(),
161                    index: i,
162                    bit_width: final_bit_width,
163                });
164            }
165            writer
166                .write_bits(value_w.to_u64().unwrap(), final_bit_width)
167                .unwrap();
168        }
169
170        writer.flush().unwrap();
171        let data = writer.into_inner().unwrap().into_inner();
172
173        Ok(unsafe { FixedVec::new_unchecked(data, input.len(), final_bit_width) })
174    }
175}
176
177/// A builder for creating a [`FixedVec`] from an iterator.
178///
179/// This builder is designed for streaming data from an iterator without first
180/// collecting it into a slice. It requires the `bit_width` to be specified
181/// manually, as it cannot analyze the data in advance.
182#[derive(Debug)]
183pub struct FixedVecFromIterBuilder<
184    T: Storable<W>,
185    W: Word,
186    E: Endianness,
187    I: IntoIterator<Item = T>,
188> {
189    iter: I,
190    bit_width: usize,
191    _phantom: PhantomData<(T, W, E)>,
192}
193
194impl<T, W, E, I> FixedVecFromIterBuilder<T, W, E, I>
195where
196    T: Storable<W>,
197    W: Word,
198    E: Endianness,
199    I: IntoIterator<Item = T>,
200    BufBitWriter<E, MemWordWriterVec<W, Vec<W>>>: BitWrite<E, Error = std::convert::Infallible>,
201{
202    /// Creates a new builder from an iterator and a specified `bit_width`.
203    pub fn new(iter: I, bit_width: usize) -> Self {
204        Self {
205            iter,
206            bit_width,
207            _phantom: PhantomData,
208        }
209    }
210
211    /// Builds the [`FixedVec`] by consuming the iterator.
212    ///
213    /// # Errors
214    ///
215    /// Returns an [`Error`] if a value from the iterator is too large for the
216    /// specified `bit_width`, or if the parameters are invalid.
217    pub fn build(self) -> Result<FixedVec<T, W, E, Vec<W>>, Error> {
218        let bits_per_word = <W as crate::fixed::traits::Word>::BITS;
219        if self.bit_width > bits_per_word {
220            return Err(Error::InvalidParameters(format!(
221                "bit_width ({}) cannot be greater than the word size ({})",
222                self.bit_width, bits_per_word
223            )));
224        }
225
226        let mut writer = BufBitWriter::new(MemWordWriterVec::new(Vec::<W>::new()));
227
228        let mut len = 0;
229        let limit = if self.bit_width < bits_per_word {
230            W::ONE << self.bit_width
231        } else {
232            W::max_value()
233        };
234
235        // Write each value from the iterator to the bitstream.
236        for (i, value_t) in self.iter.into_iter().enumerate() {
237            let value_w = <T as Storable<W>>::into_word(value_t);
238            if self.bit_width < bits_per_word && value_w >= limit {
239                return Err(Error::ValueTooLarge {
240                    value: value_w.to_u128().unwrap(),
241                    index: i,
242                    bit_width: self.bit_width,
243                });
244            }
245            writer
246                .write_bits(value_w.to_u64().unwrap(), self.bit_width)
247                .unwrap();
248            len += 1;
249        }
250
251        writer.flush().unwrap();
252        let mut data = writer.into_inner().unwrap().into_inner();
253
254        // Add padding word to the end of the buffer.
255        if len > 0 {
256            data.push(W::ZERO);
257        }
258        data.shrink_to_fit();
259
260        Ok(unsafe { FixedVec::new_unchecked(data, len, self.bit_width) })
261    }
262}