compressed_intvec/variable/builder.rs
//! Builders for constructing a [`VarVec`].
//!
//! This module provides the two primary builders for creating a [`VarVec`]:
//!
//! - [`VarVecBuilder`]: For building from an existing slice of data. This is the
//!   most flexible builder, as it can analyze the data to automatically select
//!   an optimal compression codec.
//! - [`VarVecFromIterBuilder`]: For building from an iterator. This is suitable
//!   for large datasets that are generated on the fly, but it requires the
//!   compression codec to be specified manually.
//!
//! [`VarVec`]: crate::variable::VarVec
13
14use super::{codec, codec::Codec, traits::Storable, VarVec, VarVecBitWriter, VarVecError};
15use crate::common::codec_writer::CodecWriter;
16use crate::fixed::{BitWidth, FixedVec};
17use dsi_bitstream::{
18 dispatch::StaticCodeWrite,
19 impls::MemWordWriterVec,
20 prelude::{BitWrite, CodesWrite, Endianness, LE},
21};
22use std::marker::PhantomData;
23
24/// A builder for creating an [`VarVec`] from a slice of integers.
25///
26/// This builder is the primary entry point for constructing a compressed vector
27/// when the data is already available in memory. It allows for detailed
28/// configuration of the sampling rate (`k`) and the compression codec.
29///
30/// This builder always produces an owned [`VarVec`]. It is obtained
31/// by calling [`VarVec::builder`].
32#[derive(Debug)]
33pub struct VarVecBuilder<T: Storable, E: Endianness> {
34 k: usize,
35 codec_spec: Codec,
36 _markers: PhantomData<(T, E)>,
37}
38
39impl<T: Storable, E: Endianness> VarVecBuilder<T, E> {
40 /// Creates a new builder for a [`VarVec`] with default settings.
41 ///
42 /// By default, the sampling rate is `k=32` and the codec is chosen
43 /// automatically via [`Codec::Auto`].
44 pub(super) fn new() -> Self {
45 Self {
46 k: 32,
47 codec_spec: Codec::Auto,
48 _markers: PhantomData,
49 }
50 }
51
52 /// Sets the sampling rate `k` for random access.
53 ///
54 /// The sampling rate determines how many elements are stored between each
55 /// sample point. A smaller `k` results in faster random access but uses
56 /// more memory for the sampling table. See the [module-level documentation](super)
57 /// for a detailed explanation.
58 ///
59 /// # Panics
60 ///
61 /// The [`build`](VarVecBuilder::build) method will return an error if `k` is 0.
62 pub fn k(mut self, k: usize) -> Self {
63 self.k = k;
64 self
65 }
66
67 /// Sets the compression codec to use.
68 ///
69 /// The choice of codec can significantly impact the compression ratio.
70 /// By default, this is [`Codec::Auto`], which analyzes the data
71 /// to select the best codec.
72 pub fn codec(mut self, codec_spec: Codec) -> Self {
73 self.codec_spec = codec_spec;
74 self
75 }
76
77 /// Builds the [`VarVec`] from a slice of data, consuming the builder.
78 ///
79 /// This method performs the compression and builds the sampling table.
80 ///
81 /// # Errors
82 ///
83 /// Returns an [`VarVecError`] if the parameters are invalid (e.g., `k=0`) or
84 /// if an error occurs during compression.
85 ///
86 /// # Examples
87 ///
88 /// ```
89 /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
90 /// use compressed_intvec::variable::{VarVec, SVarVec, Codec};
91 ///
92 /// let data: &[i16] = &[-100, 0, 50, -2, 1000];
93 ///
94 /// let vec: SVarVec<i16> = VarVec::builder()
95 /// .k(2) // Smaller `k` for faster access
96 /// .codec(Codec::Delta) // Explicitly choose Delta coding
97 /// .build(data)?;
98 ///
99 /// assert_eq!(vec.len(), 5);
100 /// assert_eq!(vec.get(0), Some(-100));
101 /// # Ok(())
102 /// # }
103 /// ```
104 pub fn build(self, input: &[T]) -> Result<VarVec<T, E, Vec<u64>>, VarVecError>
105 where
106 VarVecBitWriter<E>: BitWrite<E, Error = core::convert::Infallible> + CodesWrite<E>,
107 {
108 if self.k == 0 {
109 return Err(VarVecError::InvalidParameters(
110 "Sampling rate k cannot be zero".to_string(),
111 ));
112 }
113
114 // Resolve codec: only iterate for analysis when necessary.
115 let resolved_code = if self.codec_spec.requires_analysis() {
116 // Analysis needed: iterate input once, convert on-the-fly.
117 codec::resolve_codec_from_iter(input.iter().map(|&x| x.to_word()), self.codec_spec)?
118 } else {
119 // No analysis needed: resolve directly without data access.
120 codec::resolve_codec(&[] as &[u64], self.codec_spec)?
121 };
122
123 if input.is_empty() {
124 let empty_samples = FixedVec::<u64, u64, LE>::builder()
125 .build(&[0u64; 0])
126 .unwrap();
127 return Ok(unsafe {
128 VarVec::new_unchecked(Vec::new(), empty_samples, self.k, 0, resolved_code)
129 });
130 }
131
132 let word_writer = MemWordWriterVec::new(Vec::new());
133 let mut writer = VarVecBitWriter::<E>::new(word_writer);
134
135 let sample_capacity = input.len().div_ceil(self.k);
136 let mut temp_samples = Vec::with_capacity(sample_capacity);
137 let mut current_bit_offset = 0;
138
139 // Resolve the codec dispatch ONCE at the beginning.
140 // This eliminates per-element match overhead for common codecs.
141 let code_writer = CodecWriter::new(resolved_code);
142
143 // Iterate through the data, writing compressed values and recording samples.
144 for (i, &value) in input.iter().enumerate() {
145 if i % self.k == 0 {
146 temp_samples.push(current_bit_offset as u64);
147 }
148
149 let bits_written = code_writer.write(&mut writer, value.to_word())?;
150 current_bit_offset += bits_written;
151 }
152 // Write a final stopper to ensure the last value can always be read safely.
153 writer.write_bits(u64::MAX, 64).unwrap();
154
155 // Compress the recorded samples into a FixedVec.
156 let samples = FixedVec::<u64, u64, LE>::builder()
157 .bit_width(BitWidth::Minimal)
158 .build(&temp_samples)
159 .unwrap();
160
161 writer.flush().unwrap();
162 let mut data = writer.into_inner().unwrap().into_inner();
163 data.shrink_to_fit();
164
165 Ok(unsafe { VarVec::new_unchecked(data, samples, self.k, input.len(), resolved_code) })
166 }
167}
168
169/// A builder for creating an [`VarVec`] from an iterator.
170///
171/// This builder is designed for constructing an [`VarVec`] from a data source that
172/// is an iterator. It consumes the iterator and compresses its elements on the fly.
173/// It is obtained by calling [`VarVec::from_iter_builder`].
174///
175/// # Limitations
176///
177/// This builder does **not** support automatic codec selection (i.e., [`Codec::Auto`])
178/// or automatic parameter estimation for codecs like [`Rice`](Codec::Rice) or [`Golomb`](Codec::Golomb). Since the
179/// iterator is consumed in a single pass, the data cannot be pre-analyzed to
180/// determine its statistical properties. The user must specify a concrete codec.
181#[derive(Debug)]
182pub struct VarVecFromIterBuilder<T: Storable, E: Endianness, I: IntoIterator<Item = T>> {
183 iter: I,
184 k: usize,
185 codec_spec: Codec,
186 _markers: PhantomData<(T, E)>,
187}
188
189impl<T: Storable, E: Endianness, I: IntoIterator<Item = T>> VarVecFromIterBuilder<T, E, I> {
190 /// Creates a new builder from an iterator with default settings.
191 ///
192 /// By default, the sampling rate is `k=32` and the codec is [`Codec::Gamma`],
193 /// as automatic selection is not possible.
194 pub(super) fn new(iter: I) -> Self {
195 Self {
196 iter,
197 k: 32,
198 codec_spec: Codec::Gamma,
199 _markers: PhantomData,
200 }
201 }
202
203 /// Sets the sampling rate `k` for random access.
204 pub fn k(mut self, k: usize) -> Self {
205 self.k = k;
206 self
207 }
208
209 /// Sets the compression codec to use.
210 ///
211 /// # Errors
212 ///
213 /// The [`build`](Self::build) method will return an error if a codec specification that
214 /// requires data analysis is provided (e.g., [`Codec::Auto`]).
215 pub fn codec(mut self, codec_spec: Codec) -> Self {
216 self.codec_spec = codec_spec;
217 self
218 }
219
220 /// Builds the [`VarVec`] by consuming the iterator.
221 ///
222 /// This method iterates through the provided data source, compresses it,
223 /// and builds the sampling table in a single pass.
224 ///
225 /// # Errors
226 ///
227 /// Returns an [`VarVecError`] if an automatic codec spec is used or if `k` is 0.
228 ///
229 /// # Examples
230 ///
231 /// ```
232 /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
233 /// use compressed_intvec::variable::{VarVec, UVarVec, Codec};
234 ///
235 /// // Create a vector from a range iterator
236 /// let data_iter = 0..1000u32;
237 ///
238 /// let vec: UVarVec<u32> = VarVec::from_iter_builder(data_iter)
239 /// .k(64)
240 /// .codec(Codec::Gamma) // Must be specified
241 /// .build()?;
242 ///
243 /// assert_eq!(vec.len(), 1000);
244 /// assert_eq!(vec.get(999), Some(999));
245 /// # Ok(())
246 /// # }
247 /// ```
248 pub fn build(self) -> Result<VarVec<T, E, Vec<u64>>, VarVecError>
249 where
250 VarVecBitWriter<E>: BitWrite<E, Error = core::convert::Infallible> + CodesWrite<E>,
251 {
252 // Resolve the codec, but return an error if it requires data analysis.
253 let resolved_code = match self.codec_spec {
254 Codec::Auto
255 | Codec::Rice { log2_b: None }
256 | Codec::Zeta { k: None }
257 | Codec::Golomb { b: None } => {
258 return Err(VarVecError::InvalidParameters("Automatic parameter selection is not supported for iterator-based construction. Please provide fixed parameters.".to_string()));
259 }
260 // Pass an empty slice for validation; the parameters are explicit.
261 spec => codec::resolve_codec(&[0u64; 0], spec)?,
262 };
263
264 if self.k == 0 {
265 return Err(VarVecError::InvalidParameters(
266 "Sampling rate k cannot be zero".to_string(),
267 ));
268 }
269
270 let word_writer = MemWordWriterVec::new(Vec::new());
271 let mut writer = VarVecBitWriter::<E>::new(word_writer);
272 let mut len = 0;
273
274 let mut temp_samples = Vec::new();
275 let mut current_bit_offset = 0;
276
277 // Resolve the codec dispatch ONCE at the beginning.
278 let code_writer = CodecWriter::new(resolved_code);
279
280 for (i, value) in self.iter.into_iter().enumerate() {
281 if i % self.k == 0 {
282 temp_samples.push(current_bit_offset as u64);
283 }
284
285 let bits_written = code_writer.write(&mut writer, value.to_word())?;
286 current_bit_offset += bits_written;
287 len += 1;
288 }
289 writer.write_bits(u64::MAX, 64).unwrap(); // Stopper
290
291 let samples = FixedVec::<u64, u64, LE>::builder()
292 .bit_width(BitWidth::Minimal)
293 .build(&temp_samples)
294 .unwrap();
295
296 writer.flush().unwrap();
297 let mut data = writer.into_inner().unwrap().into_inner();
298 data.shrink_to_fit();
299
300 Ok(unsafe { VarVec::new_unchecked(data, samples, self.k, len, resolved_code) })
301 }
302}