compressed_intvec/variable/
builder.rs

1//! Builders for constructing an [`IntVec`].
2//!
3//! This module provides the two primary builders for creating an [`IntVec`]:
4//!
5//! - [`IntVecBuilder`]: For building from an existing slice of data. This is the
6//!   most flexible builder, as it can analyze the data to automatically select
7//!   an optimal compression codec.
8//! - [`IntVecFromIterBuilder`]: For building from an iterator. This is suitable
9//!   for large datasets that are generated on the fly, but it requires the
10//!   compression codec to be specified manually.
11//!
12//! [`IntVec`]: crate::variable::IntVec
13
14use super::{codec, traits::Storable, IntVec, IntVecBitWriter, IntVecError, VariableCodecSpec};
15use crate::fixed::{BitWidth, FixedVec};
16use dsi_bitstream::{
17    codes::{
18        DeltaWrite, ExpGolombWrite, GammaWrite, GolombWrite, OmegaWrite, PiWrite, RiceWrite,
19        VByteBeWrite, VByteLeWrite, ZetaWrite,
20    },
21    impls::MemWordWriterVec,
22    prelude::{BitWrite, Codes, CodesWrite, Endianness, LE},
23};
24use std::marker::PhantomData;
25
26/// A builder for creating an [`IntVec`] from a slice of integers.
27///
28/// This builder is the primary entry point for constructing a compressed vector
29/// when the data is already available in memory. It allows for detailed
30/// configuration of the sampling rate (`k`) and the compression codec.
31///
32/// This builder always produces an owned `IntVec<T, E, Vec<u64>>`. It is obtained
33/// by calling [`IntVec::builder`].
34#[derive(Debug)]
35pub struct IntVecBuilder<T: Storable, E: Endianness> {
36    k: usize,
37    codec_spec: VariableCodecSpec,
38    _markers: PhantomData<(T, E)>,
39}
40
41impl<T: Storable, E: Endianness> IntVecBuilder<T, E> {
42    /// Creates a new builder for an `IntVec` with default settings.
43    ///
44    /// By default, the sampling rate is `k=32` and the codec is chosen
45    /// automatically via [`VariableCodecSpec::Auto`].
46    pub(super) fn new() -> Self {
47        Self {
48            k: 32,
49            codec_spec: VariableCodecSpec::Auto,
50            _markers: PhantomData,
51        }
52    }
53
54    /// Sets the sampling rate `k` for random access.
55    ///
56    /// The sampling rate determines how many elements are stored between each
57    /// sample point. A smaller `k` results in faster random access but uses
58    /// more memory for the sampling table. See the [module-level documentation](super)
59    /// for a detailed explanation.
60    ///
61    /// # Panics
62    ///
63    /// The [`build`](IntVecBuilder::build) method will return an error if `k` is 0.
64    pub fn k(mut self, k: usize) -> Self {
65        self.k = k;
66        self
67    }
68
69    /// Sets the compression codec to use.
70    ///
71    /// The choice of codec can significantly impact the compression ratio.
72    /// By default, this is [`VariableCodecSpec::Auto`], which analyzes the data
73    /// to select the best codec.
74    pub fn codec(mut self, codec_spec: VariableCodecSpec) -> Self {
75        self.codec_spec = codec_spec;
76        self
77    }
78
79    /// Builds the [`IntVec`] from a slice of data, consuming the builder.
80    ///
81    /// This method performs the compression and builds the sampling table.
82    ///
83    /// # Errors
84    ///
85    /// Returns an [`IntVecError`] if the parameters are invalid (e.g., `k=0`) or
86    /// if an error occurs during compression.
87    ///
88    /// # Examples
89    ///
90    /// ``` 
91    /// use compressed_intvec::variable::{IntVec, SIntVec, VariableCodecSpec};
92    ///
93    /// let data: &[i16] = &[-100, 0, 50, -2, 1000];
94    ///
95    /// let vec: SIntVec<i16> = IntVec::builder()
96    ///     .k(2) // Smaller `k` for faster access
97    ///     .codec(VariableCodecSpec::Delta) // Explicitly choose Delta coding
98    ///     .build(data)
99    ///     .unwrap();
100    ///
101    /// assert_eq!(vec.len(), 5);
102    /// assert_eq!(vec.get(0), Some(-100));
103    /// ```
104    pub fn build(self, input: &[T]) -> Result<IntVec<T, E, Vec<u64>>, IntVecError>
105    where
106        IntVecBitWriter<E>: BitWrite<E, Error = core::convert::Infallible> + CodesWrite<E>,
107    {
108        if self.k == 0 {
109            return Err(IntVecError::InvalidParameters(
110                "Sampling rate k cannot be zero".to_string(),
111            ));
112        }
113
114        // Convert the input data to a vector of u64 words for analysis and compression.
115        let words: Vec<u64> = input.iter().map(|&x| x.to_word()).collect();
116        let resolved_code = codec::resolve_codec(&words, self.codec_spec)?;
117
118        if input.is_empty() {
119            let empty_samples = FixedVec::<u64, u64, LE>::builder()
120                .build(&[0u64; 0])
121                .unwrap();
122            return Ok(unsafe {
123                IntVec::new_unchecked(Vec::new(), empty_samples, self.k, 0, resolved_code)
124            });
125        }
126
127        let word_writer = MemWordWriterVec::new(Vec::new());
128        let mut writer = IntVecBitWriter::<E>::new(word_writer);
129
130        let sample_capacity = input.len().div_ceil(self.k);
131        let mut temp_samples = Vec::with_capacity(sample_capacity);
132        let mut current_bit_offset = 0;
133
134        // Iterate through the data, writing compressed values and recording samples.
135        for (i, &value) in input.iter().enumerate() {
136            if i % self.k == 0 {
137                temp_samples.push(current_bit_offset as u64);
138            }
139
140            // Use our own dispatcher to call the appropriate write method.
141            // This avoids the limitations of dsi-bitstream's FuncCodeWriter.
142            let bits_written = match resolved_code {
143                Codes::Gamma => writer.write_gamma(value.to_word()).unwrap(),
144                Codes::Delta => writer.write_delta(value.to_word()).unwrap(),
145                Codes::Zeta { k } => writer.write_zeta(value.to_word(), k).unwrap(),
146                Codes::Golomb { b } => writer.write_golomb(value.to_word(), b as u64).unwrap(),
147                Codes::Rice { log2_b } => writer.write_rice(value.to_word(), log2_b).unwrap(),
148                Codes::Unary => writer.write_unary(value.to_word()).unwrap(),
149                Codes::Omega => writer.write_omega(value.to_word()).unwrap(),
150                Codes::Pi { k } => writer.write_pi(value.to_word(), k).unwrap(),
151                Codes::ExpGolomb { k } => writer.write_exp_golomb(value.to_word(), k).unwrap(),
152                Codes::VByteLe => writer.write_vbyte_le(value.to_word()).unwrap(),
153                Codes::VByteBe => writer.write_vbyte_be(value.to_word()).unwrap(),
154                _ => {
155                    return Err(IntVecError::InvalidParameters(
156                        "The specified codec is not supported for slice-based construction."
157                            .to_string(),
158                    ));
159                }
160            };
161            current_bit_offset += bits_written;
162        }
163        // Write a final stopper to ensure the last value can always be read safely.
164        writer.write_bits(u64::MAX, 64).unwrap();
165
166        // Compress the recorded samples into a FixedVec.
167        let samples = FixedVec::<u64, u64, LE>::builder()
168            .bit_width(BitWidth::Minimal)
169            .build(&temp_samples)
170            .unwrap();
171
172        writer.flush().unwrap();
173        let mut data = writer.into_inner().unwrap().into_inner();
174        data.shrink_to_fit();
175
176        Ok(unsafe {
177            IntVec::new_unchecked(data, samples, self.k, input.len(), resolved_code)
178        })
179    }
180}
181
182/// A builder for creating an [`IntVec`] from an iterator.
183///
184/// This builder is designed for constructing an [`IntVec`] from a data source that
185/// is an iterator. It consumes the iterator and compresses its elements on the fly.
186/// It is obtained by calling [`IntVec::from_iter_builder`].
187///
188/// # Limitations
189///
190/// This builder does **not** support automatic codec selection (i.e., [`VariableCodecSpec::Auto`])
191/// or automatic parameter estimation for codecs like [`Rice`](VariableCodecSpec::Rice) or [`Golomb`](VariableCodecSpec::Golomb). Since the
192/// iterator is consumed in a single pass, the data cannot be pre-analyzed to
193/// determine its statistical properties. The user must specify a concrete codec.
194#[derive(Debug)]
195pub struct IntVecFromIterBuilder<T: Storable, E: Endianness, I: IntoIterator<Item = T>> {
196    iter: I,
197    k: usize,
198    codec_spec: VariableCodecSpec,
199    _markers: PhantomData<(T, E)>,
200}
201
202impl<T: Storable, E: Endianness, I: IntoIterator<Item = T>> IntVecFromIterBuilder<T, E, I> {
203    /// Creates a new builder from an iterator with default settings.
204    ///
205    /// By default, the sampling rate is `k=32` and the codec is [`VariableCodecSpec::Gamma`],
206    /// as automatic selection is not possible.
207    pub(super) fn new(iter: I) -> Self {
208        Self {
209            iter,
210            k: 32,
211            codec_spec: VariableCodecSpec::Gamma,
212            _markers: PhantomData,
213        }
214    }
215
216    /// Sets the sampling rate `k` for random access.
217    pub fn k(mut self, k: usize) -> Self {
218        self.k = k;
219        self
220    }
221
222    /// Sets the compression codec to use.
223    ///
224    /// # Errors
225    ///
226    /// The [`build`](Self::build) method will return an error if a codec specification that
227    /// requires data analysis is provided (e.g., [`VariableCodecSpec::Auto`]).
228    pub fn codec(mut self, codec_spec: VariableCodecSpec) -> Self {
229        self.codec_spec = codec_spec;
230        self
231    }
232
233    /// Builds the [`IntVec`] by consuming the iterator.
234    ///
235    /// This method iterates through the provided data source, compresses it,
236    /// and builds the sampling table in a single pass.
237    ///
238    /// # Errors
239    ///
240    /// Returns an [`IntVecError`] if an automatic codec spec is used or if `k` is 0.
241    ///
242    /// # Examples
243    ///
244    /// ```
245    /// use compressed_intvec::variable::{IntVec, UIntVec, VariableCodecSpec};
246    ///
247    /// // Create a vector from a range iterator
248    /// let data_iter = 0..1000u32;
249    ///
250    /// let vec: UIntVec<u32> = IntVec::from_iter_builder(data_iter)
251    ///     .k(64)
252    ///     .codec(VariableCodecSpec::Gamma) // Must be specified
253    ///     .build()
254    ///     .unwrap();
255    ///
256    /// assert_eq!(vec.len(), 1000);
257    /// assert_eq!(vec.get(999), Some(999));
258    /// ```
259    pub fn build(self) -> Result<IntVec<T, E, Vec<u64>>, IntVecError>
260    where
261        IntVecBitWriter<E>: BitWrite<E, Error = core::convert::Infallible> + CodesWrite<E>,
262    {
263        // Resolve the codec, but return an error if it requires data analysis.
264        let resolved_code = match self.codec_spec {
265            VariableCodecSpec::Auto
266            | VariableCodecSpec::Rice { log2_b: None }
267            | VariableCodecSpec::Zeta { k: None }
268            | VariableCodecSpec::Golomb { b: None } => {
269                return Err(IntVecError::InvalidParameters("Automatic parameter selection is not supported for iterator-based construction. Please provide fixed parameters.".to_string()));
270            }
271            // Pass an empty slice for validation; the parameters are explicit.
272            spec => codec::resolve_codec(&[0u64; 0], spec)?,
273        };
274
275        if self.k == 0 {
276            return Err(IntVecError::InvalidParameters(
277                "Sampling rate k cannot be zero".to_string(),
278            ));
279        }
280
281        let word_writer = MemWordWriterVec::new(Vec::new());
282        let mut writer = IntVecBitWriter::<E>::new(word_writer);
283        let mut len = 0;
284
285        let mut temp_samples = Vec::new();
286        let mut current_bit_offset = 0;
287
288        for (i, value) in self.iter.into_iter().enumerate() {
289            if i % self.k == 0 {
290                temp_samples.push(current_bit_offset as u64);
291            }
292
293            // Use our own dispatcher to call the appropriate write method.
294            let bits_written = match resolved_code {
295                Codes::Gamma => writer.write_gamma(value.to_word()).unwrap(),
296                Codes::Delta => writer.write_delta(value.to_word()).unwrap(),
297                Codes::Zeta { k } => writer.write_zeta(value.to_word(), k).unwrap(),
298                Codes::Golomb { b } => writer.write_golomb(value.to_word(), b as u64).unwrap(),
299                Codes::Rice { log2_b } => writer.write_rice(value.to_word(), log2_b).unwrap(),
300                Codes::Unary => writer.write_unary(value.to_word()).unwrap(),
301                Codes::Omega => writer.write_omega(value.to_word()).unwrap(),
302                Codes::Pi { k } => writer.write_pi(value.to_word(), k).unwrap(),
303                Codes::ExpGolomb { k } => writer.write_exp_golomb(value.to_word(), k).unwrap(),
304                Codes::VByteLe => writer.write_vbyte_le(value.to_word()).unwrap(),
305                Codes::VByteBe => writer.write_vbyte_be(value.to_word()).unwrap(),
306                _ => {
307                    return Err(IntVecError::InvalidParameters(
308                        "The specified codec is not supported for iterator-based construction."
309                            .to_string(),
310                    ));
311                }
312            };
313            current_bit_offset += bits_written;
314            len += 1;
315        }
316        writer.write_bits(u64::MAX, 64).unwrap(); // Stopper
317
318        let samples = FixedVec::<u64, u64, LE>::builder()
319            .bit_width(BitWidth::Minimal)
320            .build(&temp_samples)
321            .unwrap();
322
323        writer.flush().unwrap();
324        let mut data = writer.into_inner().unwrap().into_inner();
325        data.shrink_to_fit();
326
327        Ok(unsafe { IntVec::new_unchecked(data, samples, self.k, len, resolved_code) })
328    }
329}