Skip to main content

compressed_intvec/variable/
builder.rs

//! Builders for constructing a [`VarVec`].
//!
//! This module provides the two primary builders for creating a [`VarVec`]:
//!
//! - [`VarVecBuilder`]: For building from an existing slice of data. This is the
//!   most flexible builder, as it can analyze the data to automatically select
//!   an optimal compression codec.
//! - [`VarVecFromIterBuilder`]: For building from an iterator. This is suitable
//!   for large datasets that are generated on the fly, but it requires the
//!   compression codec to be specified manually.
//!
//! [`VarVec`]: crate::variable::VarVec
13
14use super::{codec, codec::Codec, traits::Storable, VarVec, VarVecBitWriter, VarVecError};
15use crate::common::codec_writer::CodecWriter;
16use crate::fixed::{BitWidth, FixedVec};
17use dsi_bitstream::{
18    dispatch::StaticCodeWrite,
19    impls::MemWordWriterVec,
20    prelude::{BitWrite, CodesWrite, Endianness, LE},
21};
22use std::marker::PhantomData;
23
24/// A builder for creating an [`VarVec`] from a slice of integers.
25///
26/// This builder is the primary entry point for constructing a compressed vector
27/// when the data is already available in memory. It allows for detailed
28/// configuration of the sampling rate (`k`) and the compression codec.
29///
30/// This builder always produces an owned [`VarVec`]. It is obtained
31/// by calling [`VarVec::builder`].
32#[derive(Debug)]
33pub struct VarVecBuilder<T: Storable, E: Endianness> {
34    k: usize,
35    codec_spec: Codec,
36    _markers: PhantomData<(T, E)>,
37}
38
39impl<T: Storable, E: Endianness> VarVecBuilder<T, E> {
40    /// Creates a new builder for a [`VarVec`] with default settings.
41    ///
42    /// By default, the sampling rate is `k=32` and the codec is chosen
43    /// automatically via [`Codec::Auto`].
44    pub(super) fn new() -> Self {
45        Self {
46            k: 32,
47            codec_spec: Codec::Auto,
48            _markers: PhantomData,
49        }
50    }
51
52    /// Sets the sampling rate `k` for random access.
53    ///
54    /// The sampling rate determines how many elements are stored between each
55    /// sample point. A smaller `k` results in faster random access but uses
56    /// more memory for the sampling table. See the [module-level documentation](super)
57    /// for a detailed explanation.
58    ///
59    /// # Panics
60    ///
61    /// The [`build`](VarVecBuilder::build) method will return an error if `k` is 0.
62    pub fn k(mut self, k: usize) -> Self {
63        self.k = k;
64        self
65    }
66
67    /// Sets the compression codec to use.
68    ///
69    /// The choice of codec can significantly impact the compression ratio.
70    /// By default, this is [`Codec::Auto`], which analyzes the data
71    /// to select the best codec.
72    pub fn codec(mut self, codec_spec: Codec) -> Self {
73        self.codec_spec = codec_spec;
74        self
75    }
76
77    /// Builds the [`VarVec`] from a slice of data, consuming the builder.
78    ///
79    /// This method performs the compression and builds the sampling table.
80    ///
81    /// # Errors
82    ///
83    /// Returns an [`VarVecError`] if the parameters are invalid (e.g., `k=0`) or
84    /// if an error occurs during compression.
85    ///
86    /// # Examples
87    ///
88    /// ```
89    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
90    /// use compressed_intvec::variable::{VarVec, SVarVec, Codec};
91    ///
92    /// let data: &[i16] = &[-100, 0, 50, -2, 1000];
93    ///
94    /// let vec: SVarVec<i16> = VarVec::builder()
95    ///     .k(2) // Smaller `k` for faster access
96    ///     .codec(Codec::Delta) // Explicitly choose Delta coding
97    ///     .build(data)?;
98    ///
99    /// assert_eq!(vec.len(), 5);
100    /// assert_eq!(vec.get(0), Some(-100));
101    /// # Ok(())
102    /// # }
103    /// ```
104    pub fn build(self, input: &[T]) -> Result<VarVec<T, E, Vec<u64>>, VarVecError>
105    where
106        VarVecBitWriter<E>: BitWrite<E, Error = core::convert::Infallible> + CodesWrite<E>,
107    {
108        if self.k == 0 {
109            return Err(VarVecError::InvalidParameters(
110                "Sampling rate k cannot be zero".to_string(),
111            ));
112        }
113
114        // Resolve codec: only iterate for analysis when necessary.
115        let resolved_code = if self.codec_spec.requires_analysis() {
116            // Analysis needed: iterate input once, convert on-the-fly.
117            codec::resolve_codec_from_iter(input.iter().map(|&x| x.to_word()), self.codec_spec)?
118        } else {
119            // No analysis needed: resolve directly without data access.
120            codec::resolve_codec(&[] as &[u64], self.codec_spec)?
121        };
122
123        if input.is_empty() {
124            let empty_samples = FixedVec::<u64, u64, LE>::builder()
125                .build(&[0u64; 0])
126                .unwrap();
127            return Ok(unsafe {
128                VarVec::new_unchecked(Vec::new(), empty_samples, self.k, 0, resolved_code)
129            });
130        }
131
132        let word_writer = MemWordWriterVec::new(Vec::new());
133        let mut writer = VarVecBitWriter::<E>::new(word_writer);
134
135        let sample_capacity = input.len().div_ceil(self.k);
136        let mut temp_samples = Vec::with_capacity(sample_capacity);
137        let mut current_bit_offset = 0;
138
139        // Resolve the codec dispatch ONCE at the beginning.
140        // This eliminates per-element match overhead for common codecs.
141        let code_writer = CodecWriter::new(resolved_code);
142
143        // Iterate through the data, writing compressed values and recording samples.
144        for (i, &value) in input.iter().enumerate() {
145            if i % self.k == 0 {
146                temp_samples.push(current_bit_offset as u64);
147            }
148
149            let bits_written = code_writer.write(&mut writer, value.to_word())?;
150            current_bit_offset += bits_written;
151        }
152        // Write a final stopper to ensure the last value can always be read safely.
153        writer.write_bits(u64::MAX, 64).unwrap();
154
155        // Compress the recorded samples into a FixedVec.
156        let samples = FixedVec::<u64, u64, LE>::builder()
157            .bit_width(BitWidth::Minimal)
158            .build(&temp_samples)
159            .unwrap();
160
161        writer.flush().unwrap();
162        let mut data = writer.into_inner().unwrap().into_inner();
163        data.shrink_to_fit();
164
165        Ok(unsafe { VarVec::new_unchecked(data, samples, self.k, input.len(), resolved_code) })
166    }
167}
168
169/// A builder for creating an [`VarVec`] from an iterator.
170///
171/// This builder is designed for constructing an [`VarVec`] from a data source that
172/// is an iterator. It consumes the iterator and compresses its elements on the fly.
173/// It is obtained by calling [`VarVec::from_iter_builder`].
174///
175/// # Limitations
176///
177/// This builder does **not** support automatic codec selection (i.e., [`Codec::Auto`])
178/// or automatic parameter estimation for codecs like [`Rice`](Codec::Rice) or [`Golomb`](Codec::Golomb). Since the
179/// iterator is consumed in a single pass, the data cannot be pre-analyzed to
180/// determine its statistical properties. The user must specify a concrete codec.
181#[derive(Debug)]
182pub struct VarVecFromIterBuilder<T: Storable, E: Endianness, I: IntoIterator<Item = T>> {
183    iter: I,
184    k: usize,
185    codec_spec: Codec,
186    _markers: PhantomData<(T, E)>,
187}
188
189impl<T: Storable, E: Endianness, I: IntoIterator<Item = T>> VarVecFromIterBuilder<T, E, I> {
190    /// Creates a new builder from an iterator with default settings.
191    ///
192    /// By default, the sampling rate is `k=32` and the codec is [`Codec::Gamma`],
193    /// as automatic selection is not possible.
194    pub(super) fn new(iter: I) -> Self {
195        Self {
196            iter,
197            k: 32,
198            codec_spec: Codec::Gamma,
199            _markers: PhantomData,
200        }
201    }
202
203    /// Sets the sampling rate `k` for random access.
204    pub fn k(mut self, k: usize) -> Self {
205        self.k = k;
206        self
207    }
208
209    /// Sets the compression codec to use.
210    ///
211    /// # Errors
212    ///
213    /// The [`build`](Self::build) method will return an error if a codec specification that
214    /// requires data analysis is provided (e.g., [`Codec::Auto`]).
215    pub fn codec(mut self, codec_spec: Codec) -> Self {
216        self.codec_spec = codec_spec;
217        self
218    }
219
220    /// Builds the [`VarVec`] by consuming the iterator.
221    ///
222    /// This method iterates through the provided data source, compresses it,
223    /// and builds the sampling table in a single pass.
224    ///
225    /// # Errors
226    ///
227    /// Returns an [`VarVecError`] if an automatic codec spec is used or if `k` is 0.
228    ///
229    /// # Examples
230    ///
231    /// ```
232    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
233    /// use compressed_intvec::variable::{VarVec, UVarVec, Codec};
234    ///
235    /// // Create a vector from a range iterator
236    /// let data_iter = 0..1000u32;
237    ///
238    /// let vec: UVarVec<u32> = VarVec::from_iter_builder(data_iter)
239    ///     .k(64)
240    ///     .codec(Codec::Gamma) // Must be specified
241    ///     .build()?;
242    ///
243    /// assert_eq!(vec.len(), 1000);
244    /// assert_eq!(vec.get(999), Some(999));
245    /// # Ok(())
246    /// # }
247    /// ```
248    pub fn build(self) -> Result<VarVec<T, E, Vec<u64>>, VarVecError>
249    where
250        VarVecBitWriter<E>: BitWrite<E, Error = core::convert::Infallible> + CodesWrite<E>,
251    {
252        // Resolve the codec, but return an error if it requires data analysis.
253        let resolved_code = match self.codec_spec {
254            Codec::Auto
255            | Codec::Rice { log2_b: None }
256            | Codec::Zeta { k: None }
257            | Codec::Golomb { b: None } => {
258                return Err(VarVecError::InvalidParameters("Automatic parameter selection is not supported for iterator-based construction. Please provide fixed parameters.".to_string()));
259            }
260            // Pass an empty slice for validation; the parameters are explicit.
261            spec => codec::resolve_codec(&[0u64; 0], spec)?,
262        };
263
264        if self.k == 0 {
265            return Err(VarVecError::InvalidParameters(
266                "Sampling rate k cannot be zero".to_string(),
267            ));
268        }
269
270        let word_writer = MemWordWriterVec::new(Vec::new());
271        let mut writer = VarVecBitWriter::<E>::new(word_writer);
272        let mut len = 0;
273
274        let mut temp_samples = Vec::new();
275        let mut current_bit_offset = 0;
276
277        // Resolve the codec dispatch ONCE at the beginning.
278        let code_writer = CodecWriter::new(resolved_code);
279
280        for (i, value) in self.iter.into_iter().enumerate() {
281            if i % self.k == 0 {
282                temp_samples.push(current_bit_offset as u64);
283            }
284
285            let bits_written = code_writer.write(&mut writer, value.to_word())?;
286            current_bit_offset += bits_written;
287            len += 1;
288        }
289        writer.write_bits(u64::MAX, 64).unwrap(); // Stopper
290
291        let samples = FixedVec::<u64, u64, LE>::builder()
292            .bit_width(BitWidth::Minimal)
293            .build(&temp_samples)
294            .unwrap();
295
296        writer.flush().unwrap();
297        let mut data = writer.into_inner().unwrap().into_inner();
298        data.shrink_to_fit();
299
300        Ok(unsafe { VarVec::new_unchecked(data, samples, self.k, len, resolved_code) })
301    }
302}