compressed_intvec/variable/builder.rs
//! Builders for constructing an [`IntVec`].
//!
//! This module provides the two primary builders for creating an [`IntVec`]:
//!
//! - [`IntVecBuilder`]: For building from an existing slice of data. This is the
//!   most flexible builder, as it can analyze the data to automatically select
//!   an optimal compression codec.
//! - [`IntVecFromIterBuilder`]: For building from an iterator. This is suitable
//!   for large datasets that are generated on the fly, but it requires the
//!   compression codec to be specified manually.
//!
//! [`IntVec`]: crate::variable::IntVec
13
14use super::{codec, traits::Storable, IntVec, IntVecBitWriter, IntVecError, VariableCodecSpec};
15use crate::fixed::{BitWidth, FixedVec};
16use dsi_bitstream::{
17 codes::{
18 DeltaWrite, ExpGolombWrite, GammaWrite, GolombWrite, OmegaWrite, PiWrite, RiceWrite,
19 VByteBeWrite, VByteLeWrite, ZetaWrite,
20 },
21 impls::MemWordWriterVec,
22 prelude::{BitWrite, Codes, CodesWrite, Endianness, LE},
23};
24use std::marker::PhantomData;
25
26/// A builder for creating an [`IntVec`] from a slice of integers.
27///
28/// This builder is the primary entry point for constructing a compressed vector
29/// when the data is already available in memory. It allows for detailed
30/// configuration of the sampling rate (`k`) and the compression codec.
31///
32/// This builder always produces an owned `IntVec<T, E, Vec<u64>>`. It is obtained
33/// by calling [`IntVec::builder`].
34#[derive(Debug)]
35pub struct IntVecBuilder<T: Storable, E: Endianness> {
36 k: usize,
37 codec_spec: VariableCodecSpec,
38 _markers: PhantomData<(T, E)>,
39}
40
41impl<T: Storable, E: Endianness> IntVecBuilder<T, E> {
42 /// Creates a new builder for an `IntVec` with default settings.
43 ///
44 /// By default, the sampling rate is `k=32` and the codec is chosen
45 /// automatically via [`VariableCodecSpec::Auto`].
46 pub(super) fn new() -> Self {
47 Self {
48 k: 32,
49 codec_spec: VariableCodecSpec::Auto,
50 _markers: PhantomData,
51 }
52 }
53
54 /// Sets the sampling rate `k` for random access.
55 ///
56 /// The sampling rate determines how many elements are stored between each
57 /// sample point. A smaller `k` results in faster random access but uses
58 /// more memory for the sampling table. See the [module-level documentation](super)
59 /// for a detailed explanation.
60 ///
61 /// # Panics
62 ///
63 /// The [`build`](IntVecBuilder::build) method will return an error if `k` is 0.
64 pub fn k(mut self, k: usize) -> Self {
65 self.k = k;
66 self
67 }
68
69 /// Sets the compression codec to use.
70 ///
71 /// The choice of codec can significantly impact the compression ratio.
72 /// By default, this is [`VariableCodecSpec::Auto`], which analyzes the data
73 /// to select the best codec.
74 pub fn codec(mut self, codec_spec: VariableCodecSpec) -> Self {
75 self.codec_spec = codec_spec;
76 self
77 }
78
79 /// Builds the [`IntVec`] from a slice of data, consuming the builder.
80 ///
81 /// This method performs the compression and builds the sampling table.
82 ///
83 /// # Errors
84 ///
85 /// Returns an [`IntVecError`] if the parameters are invalid (e.g., `k=0`) or
86 /// if an error occurs during compression.
87 ///
88 /// # Examples
89 ///
90 /// ```
91 /// use compressed_intvec::variable::{IntVec, SIntVec, VariableCodecSpec};
92 ///
93 /// let data: &[i16] = &[-100, 0, 50, -2, 1000];
94 ///
95 /// let vec: SIntVec<i16> = IntVec::builder()
96 /// .k(2) // Smaller `k` for faster access
97 /// .codec(VariableCodecSpec::Delta) // Explicitly choose Delta coding
98 /// .build(data)
99 /// .unwrap();
100 ///
101 /// assert_eq!(vec.len(), 5);
102 /// assert_eq!(vec.get(0), Some(-100));
103 /// ```
104 pub fn build(self, input: &[T]) -> Result<IntVec<T, E, Vec<u64>>, IntVecError>
105 where
106 IntVecBitWriter<E>: BitWrite<E, Error = core::convert::Infallible> + CodesWrite<E>,
107 {
108 if self.k == 0 {
109 return Err(IntVecError::InvalidParameters(
110 "Sampling rate k cannot be zero".to_string(),
111 ));
112 }
113
114 // Convert the input data to a vector of u64 words for analysis and compression.
115 let words: Vec<u64> = input.iter().map(|&x| x.to_word()).collect();
116 let resolved_code = codec::resolve_codec(&words, self.codec_spec)?;
117
118 if input.is_empty() {
119 let empty_samples = FixedVec::<u64, u64, LE>::builder()
120 .build(&[0u64; 0])
121 .unwrap();
122 return Ok(unsafe {
123 IntVec::new_unchecked(Vec::new(), empty_samples, self.k, 0, resolved_code)
124 });
125 }
126
127 let word_writer = MemWordWriterVec::new(Vec::new());
128 let mut writer = IntVecBitWriter::<E>::new(word_writer);
129
130 let sample_capacity = input.len().div_ceil(self.k);
131 let mut temp_samples = Vec::with_capacity(sample_capacity);
132 let mut current_bit_offset = 0;
133
134 // Iterate through the data, writing compressed values and recording samples.
135 for (i, &value) in input.iter().enumerate() {
136 if i % self.k == 0 {
137 temp_samples.push(current_bit_offset as u64);
138 }
139
140 // Use our own dispatcher to call the appropriate write method.
141 // This avoids the limitations of dsi-bitstream's FuncCodeWriter.
142 let bits_written = match resolved_code {
143 Codes::Gamma => writer.write_gamma(value.to_word()).unwrap(),
144 Codes::Delta => writer.write_delta(value.to_word()).unwrap(),
145 Codes::Zeta { k } => writer.write_zeta(value.to_word(), k).unwrap(),
146 Codes::Golomb { b } => writer.write_golomb(value.to_word(), b as u64).unwrap(),
147 Codes::Rice { log2_b } => writer.write_rice(value.to_word(), log2_b).unwrap(),
148 Codes::Unary => writer.write_unary(value.to_word()).unwrap(),
149 Codes::Omega => writer.write_omega(value.to_word()).unwrap(),
150 Codes::Pi { k } => writer.write_pi(value.to_word(), k).unwrap(),
151 Codes::ExpGolomb { k } => writer.write_exp_golomb(value.to_word(), k).unwrap(),
152 Codes::VByteLe => writer.write_vbyte_le(value.to_word()).unwrap(),
153 Codes::VByteBe => writer.write_vbyte_be(value.to_word()).unwrap(),
154 _ => {
155 return Err(IntVecError::InvalidParameters(
156 "The specified codec is not supported for slice-based construction."
157 .to_string(),
158 ));
159 }
160 };
161 current_bit_offset += bits_written;
162 }
163 // Write a final stopper to ensure the last value can always be read safely.
164 writer.write_bits(u64::MAX, 64).unwrap();
165
166 // Compress the recorded samples into a FixedVec.
167 let samples = FixedVec::<u64, u64, LE>::builder()
168 .bit_width(BitWidth::Minimal)
169 .build(&temp_samples)
170 .unwrap();
171
172 writer.flush().unwrap();
173 let mut data = writer.into_inner().unwrap().into_inner();
174 data.shrink_to_fit();
175
176 Ok(unsafe {
177 IntVec::new_unchecked(data, samples, self.k, input.len(), resolved_code)
178 })
179 }
180}
181
182/// A builder for creating an [`IntVec`] from an iterator.
183///
184/// This builder is designed for constructing an [`IntVec`] from a data source that
185/// is an iterator. It consumes the iterator and compresses its elements on the fly.
186/// It is obtained by calling [`IntVec::from_iter_builder`].
187///
188/// # Limitations
189///
190/// This builder does **not** support automatic codec selection (i.e., [`VariableCodecSpec::Auto`])
191/// or automatic parameter estimation for codecs like [`Rice`](VariableCodecSpec::Rice) or [`Golomb`](VariableCodecSpec::Golomb). Since the
192/// iterator is consumed in a single pass, the data cannot be pre-analyzed to
193/// determine its statistical properties. The user must specify a concrete codec.
194#[derive(Debug)]
195pub struct IntVecFromIterBuilder<T: Storable, E: Endianness, I: IntoIterator<Item = T>> {
196 iter: I,
197 k: usize,
198 codec_spec: VariableCodecSpec,
199 _markers: PhantomData<(T, E)>,
200}
201
202impl<T: Storable, E: Endianness, I: IntoIterator<Item = T>> IntVecFromIterBuilder<T, E, I> {
203 /// Creates a new builder from an iterator with default settings.
204 ///
205 /// By default, the sampling rate is `k=32` and the codec is [`VariableCodecSpec::Gamma`],
206 /// as automatic selection is not possible.
207 pub(super) fn new(iter: I) -> Self {
208 Self {
209 iter,
210 k: 32,
211 codec_spec: VariableCodecSpec::Gamma,
212 _markers: PhantomData,
213 }
214 }
215
216 /// Sets the sampling rate `k` for random access.
217 pub fn k(mut self, k: usize) -> Self {
218 self.k = k;
219 self
220 }
221
222 /// Sets the compression codec to use.
223 ///
224 /// # Errors
225 ///
226 /// The [`build`](Self::build) method will return an error if a codec specification that
227 /// requires data analysis is provided (e.g., [`VariableCodecSpec::Auto`]).
228 pub fn codec(mut self, codec_spec: VariableCodecSpec) -> Self {
229 self.codec_spec = codec_spec;
230 self
231 }
232
233 /// Builds the [`IntVec`] by consuming the iterator.
234 ///
235 /// This method iterates through the provided data source, compresses it,
236 /// and builds the sampling table in a single pass.
237 ///
238 /// # Errors
239 ///
240 /// Returns an [`IntVecError`] if an automatic codec spec is used or if `k` is 0.
241 ///
242 /// # Examples
243 ///
244 /// ```
245 /// use compressed_intvec::variable::{IntVec, UIntVec, VariableCodecSpec};
246 ///
247 /// // Create a vector from a range iterator
248 /// let data_iter = 0..1000u32;
249 ///
250 /// let vec: UIntVec<u32> = IntVec::from_iter_builder(data_iter)
251 /// .k(64)
252 /// .codec(VariableCodecSpec::Gamma) // Must be specified
253 /// .build()
254 /// .unwrap();
255 ///
256 /// assert_eq!(vec.len(), 1000);
257 /// assert_eq!(vec.get(999), Some(999));
258 /// ```
259 pub fn build(self) -> Result<IntVec<T, E, Vec<u64>>, IntVecError>
260 where
261 IntVecBitWriter<E>: BitWrite<E, Error = core::convert::Infallible> + CodesWrite<E>,
262 {
263 // Resolve the codec, but return an error if it requires data analysis.
264 let resolved_code = match self.codec_spec {
265 VariableCodecSpec::Auto
266 | VariableCodecSpec::Rice { log2_b: None }
267 | VariableCodecSpec::Zeta { k: None }
268 | VariableCodecSpec::Golomb { b: None } => {
269 return Err(IntVecError::InvalidParameters("Automatic parameter selection is not supported for iterator-based construction. Please provide fixed parameters.".to_string()));
270 }
271 // Pass an empty slice for validation; the parameters are explicit.
272 spec => codec::resolve_codec(&[0u64; 0], spec)?,
273 };
274
275 if self.k == 0 {
276 return Err(IntVecError::InvalidParameters(
277 "Sampling rate k cannot be zero".to_string(),
278 ));
279 }
280
281 let word_writer = MemWordWriterVec::new(Vec::new());
282 let mut writer = IntVecBitWriter::<E>::new(word_writer);
283 let mut len = 0;
284
285 let mut temp_samples = Vec::new();
286 let mut current_bit_offset = 0;
287
288 for (i, value) in self.iter.into_iter().enumerate() {
289 if i % self.k == 0 {
290 temp_samples.push(current_bit_offset as u64);
291 }
292
293 // Use our own dispatcher to call the appropriate write method.
294 let bits_written = match resolved_code {
295 Codes::Gamma => writer.write_gamma(value.to_word()).unwrap(),
296 Codes::Delta => writer.write_delta(value.to_word()).unwrap(),
297 Codes::Zeta { k } => writer.write_zeta(value.to_word(), k).unwrap(),
298 Codes::Golomb { b } => writer.write_golomb(value.to_word(), b as u64).unwrap(),
299 Codes::Rice { log2_b } => writer.write_rice(value.to_word(), log2_b).unwrap(),
300 Codes::Unary => writer.write_unary(value.to_word()).unwrap(),
301 Codes::Omega => writer.write_omega(value.to_word()).unwrap(),
302 Codes::Pi { k } => writer.write_pi(value.to_word(), k).unwrap(),
303 Codes::ExpGolomb { k } => writer.write_exp_golomb(value.to_word(), k).unwrap(),
304 Codes::VByteLe => writer.write_vbyte_le(value.to_word()).unwrap(),
305 Codes::VByteBe => writer.write_vbyte_be(value.to_word()).unwrap(),
306 _ => {
307 return Err(IntVecError::InvalidParameters(
308 "The specified codec is not supported for iterator-based construction."
309 .to_string(),
310 ));
311 }
312 };
313 current_bit_offset += bits_written;
314 len += 1;
315 }
316 writer.write_bits(u64::MAX, 64).unwrap(); // Stopper
317
318 let samples = FixedVec::<u64, u64, LE>::builder()
319 .bit_width(BitWidth::Minimal)
320 .build(&temp_samples)
321 .unwrap();
322
323 writer.flush().unwrap();
324 let mut data = writer.into_inner().unwrap().into_inner();
325 data.shrink_to_fit();
326
327 Ok(unsafe { IntVec::new_unchecked(data, samples, self.k, len, resolved_code) })
328 }
329}