compressed_intvec/intvec.rs
1//! # Compressed IntVec
2//!
3//! This module provides the `IntVec` structure for efficiently compressing a series of `u64` values
4//! using a specified instantaneous code from the [dsi-bitstream](https://crates.io/crates/dsi_bitstream) crate.
5//!
6//! ## How It Works
7//!
8//! The `IntVec` structure compresses a series of `u64` values by encoding them with a
9//! specified codec. Every kth element is sampled during encoding, meaning its bit-offset
10//! is recorded. This sampling information allows the decoder to jump directly to the
11//! vicinity of any desired element, thereby minimizing the number of bits that need to be
12//! processed during random access.
13//!
14//! ## Features
15//!
16//! - **Efficient Compression:** Reduces storage requirements by encoding data into a compact bitstream.
17//! - **Fast Random Access:** Leverages sampled bit-offsets to quickly decode individual elements.
18//! - **Iterative Decoding:** Provides an iterator that decodes values on the fly for sequential access.
19//!
20//! ## Example Usage
21//!
22//! ```rust
23//! use compressed_intvec::intvec::IntVec;
24//! use compressed_intvec::codecs::GammaCodec;
25//! use dsi_bitstream::traits::BE;
26//!
27//! let input = vec![1, 2, 3, 4, 5, 6, 7, 8];
28//! let sampling_rate = 2;
29//! let intvec = IntVec::<BE, _, GammaCodec>::from(&input, sampling_rate).unwrap();
30//!
31//! // Fast random access using the sampled bit offsets
32//! assert_eq!(intvec.get(3), 4);
33//!
34//! // Decode the entire vector back to its original form
35//! let decompressed: Vec<u64> = intvec.into_vec();
36//! assert_eq!(decompressed, input);
37//! ```
38//!
39//! ## Type Aliases
40//!
41//! For convenience, this module provides type aliases for common configurations:
42//!
43//! - **BEIntVec:** A big-endian version of the compressed vector.
44//!
45//! ```rust
46//! use compressed_intvec::intvec::BEIntVec;
47//! use compressed_intvec::codecs::GammaCodec;
48//!
49//! let input = vec![10, 20, 30];
50//! let intvec = BEIntVec::<GammaCodec>::from(&input, 2).unwrap();
51//! ```
52//!
53//! - **LEIntVec:** A little-endian version of the compressed vector.
54//!
55//! ```rust
56//! use compressed_intvec::intvec::LEIntVec;
57//! use compressed_intvec::codecs::GammaCodec;
58//!
59//! let input = vec![9, 8, 7];
60//! let intvec = LEIntVec::<GammaCodec>::from(&input, 2).unwrap();
61//! ```
62
63use crate::codecs::Codec;
64use dsi_bitstream::{codes::params::DefaultReadParams, prelude::*};
65use mem_dbg::{MemDbg, MemSize};
66
67/// A compressed vector of unsigned 64-bit integers using a specified codec.
68///
69/// The `IntVec` structure compresses a sequence of `u64` values by encoding them using a
70/// provided codec. It supports random access via sampling and iteration over the decompressed
71/// values.
72///
73/// The type parameters are:
74/// - `E`: Endianness to use (`BE` or `LE`).
75/// - `W`: A bit writer implementing `BitWrite<E>`.
76/// - `C`: A codec implementing `Codec<E, W>`.
77///
78/// # Examples
79///
80/// ```rust
81/// use compressed_intvec::intvec::IntVec;
82/// use compressed_intvec::codecs::GammaCodec;
83/// use dsi_bitstream::traits::BE;
84///
85/// let input = vec![1, 2, 3, 4, 5];
86/// let k = 2;
87/// let intvec = IntVec::<BE, _, GammaCodec>::from(&input, k).unwrap();
88/// assert_eq!(intvec.len(), input.len());
89/// ```
90#[derive(Debug, Clone, MemDbg, MemSize)]
91#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
92#[cfg_attr(
93 feature = "serde",
94 serde(bound(
95 serialize = "C::Params: Serialize",
96 deserialize = "C::Params: for<'a> Deserialize<'a>"
97 ))
98)]
99pub struct IntVec<E: Endianness, W: BitWrite<E>, C: Codec<E, W>> {
100 data: Vec<u64>,
101 samples: Vec<usize>,
102 k: usize,
103 len: usize,
104 codec_param: C::Params,
105 codec: std::marker::PhantomData<C>,
106 endian: std::marker::PhantomData<E>,
107}
108
109impl<E, C> IntVec<E, BufBitWriter<E, MemWordWriterVec<u64, Vec<u64>>>, C>
110where
111 E: Endianness,
112 C: Codec<E, BufBitWriter<E, MemWordWriterVec<u64, Vec<u64>>>>,
113 C::Params: Copy,
114 BufBitWriter<E, MemWordWriterVec<u64, Vec<u64>>>: BitWrite<E>,
115 for<'a> BufBitReader<E, MemWordReader<u64, &'a Vec<u64>>>:
116 GammaRead<E> + DeltaRead<E> + ZetaRead<E> + ZetaReadParam<E> + DeltaReadParam<E>,
117{
118 /// Constructs an [`IntVec`] from a slice of `u64` values using the provided codec parameters.
119 ///
120 /// This method encodes the input values using the specified codec. It also builds sampling
121 /// information for efficient random access. The parameter `k` determines the sampling rate,
122 /// i.e. every kth element's bit position is recorded as a sample.
123 ///
124 /// # Parameters
125 ///
126 /// - `input`: A slice of unsigned 64-bit integers to compress.
127 /// - `k`: The sampling rate; every kth element will have its bit offset stored.
128 /// - `codec_param`: The parameters for the codec used for encoding.
129 ///
130 /// # Returns
131 ///
132 /// Returns `Ok(IntVec)` on success or an error if encoding fails.
133 ///
134 /// # Examples
135 ///
136 /// ```rust
137 /// use compressed_intvec::intvec::IntVec;
138 /// use compressed_intvec::codecs::GammaCodec;
139 /// use dsi_bitstream::traits::BE;
140 ///
141 /// let input = vec![10, 20, 30, 40];
142 /// let k = 2;
143 /// let intvec = IntVec::<BE, _, GammaCodec>::from(&input, k).unwrap();
144 /// assert_eq!(intvec.len(), input.len());
145 /// ```
146 pub fn from_with_param(
147 input: &[u64],
148 k: usize,
149 codec_param: C::Params,
150 ) -> Result<Self, Box<dyn std::error::Error>> {
151 let word_writer = MemWordWriterVec::new(Vec::new());
152 let mut writer = BufBitWriter::<E, MemWordWriterVec<u64, Vec<u64>>>::new(word_writer);
153 let mut samples = Vec::new();
154 let mut total_bits = 0;
155
156 for (i, &x) in input.iter().enumerate() {
157 if i % k == 0 {
158 samples.push(total_bits);
159 }
160 total_bits += C::encode(&mut writer, x, codec_param)?;
161 }
162
163 writer.flush()?;
164 let data = writer.into_inner()?.into_inner();
165
166 Ok(Self {
167 data,
168 samples,
169 codec: std::marker::PhantomData,
170 k,
171 len: input.len(),
172 codec_param,
173 endian: std::marker::PhantomData,
174 })
175 }
176
177 /// Constructs an [`IntVec`] from a slice of `u64` values using the default codec parameters.
178 ///
179 /// This is a convenience method that calls `from_with_param` with `C::Params::default()`.
180 ///
181 /// # Parameters
182 ///
183 /// - `input`: A slice of unsigned 64-bit integers to compress.
184 /// - `k`: The sampling rate; every kth element will have its bit offset stored.
185 ///
186 /// # Returns
187 ///
188 /// Returns `Ok(IntVec)` on success or an error if encoding fails.
189 ///
190 /// # Examples
191 ///
192 /// ```rust
193 /// use compressed_intvec::intvec::IntVec;
194 /// use compressed_intvec::codecs::GammaCodec;
195 /// use dsi_bitstream::traits::BE;
196 ///
197 /// let input = vec![5, 10, 15, 20];
198 /// let k = 2;
199 /// let intvec = IntVec::<BE, _, GammaCodec>::from(&input, k).unwrap();
200 /// assert_eq!(intvec.len(), input.len());
201 /// ```
202 pub fn from(input: &[u64], k: usize) -> Result<Self, Box<dyn std::error::Error>>
203 where
204 C::Params: Default,
205 {
206 Self::from_with_param(input, k, C::Params::default())
207 }
208
209 /// Retrieves the element at the specified index after decompressing the value.
210 ///
211 /// This method uses the sampling information to quickly locate the encoded bit-stream
212 /// position and decodes the required value.
213 ///
214 /// # Panics
215 ///
216 /// Panics if the index is out of bounds.
217 ///
218 /// # Parameters
219 ///
220 /// - `index`: The index of the desired element.
221 ///
222 /// # Returns
223 ///
224 /// Returns the `u64` value at the given index.
225 ///
226 /// # Examples
227 ///
228 /// ```rust
229 /// use compressed_intvec::intvec::IntVec;
230 /// use compressed_intvec::codecs::GammaCodec;
231 /// use dsi_bitstream::traits::BE;
232 ///
233 /// let input = vec![3, 6, 9];
234 /// let k = 1;
235 /// let intvec = IntVec::<BE, _, GammaCodec>::from(&input, k).unwrap();
236 /// assert_eq!(intvec.get(1), 6);
237 /// ```
238 #[inline(always)]
239 pub fn get(&self, index: usize) -> u64 {
240 if index >= self.len {
241 panic!("Index {} is out of bounds", index);
242 }
243
244 let sample_index = index / self.k;
245 let start_bit = self.samples[sample_index];
246 let mut reader =
247 BufBitReader::<E, MemWordReader<u64, &'_ Vec<u64>>, DefaultReadParams>::new(
248 MemWordReader::new(&self.data),
249 );
250 reader.skip_bits(start_bit).unwrap();
251
252 let mut value = 0;
253 let start_index = sample_index * self.k;
254 for _ in start_index..=index {
255 value = C::decode(&mut reader, self.codec_param).unwrap();
256 }
257 value
258 }
259
260 /// Consumes the [`IntVec`] and returns a vector containing all decompressed `u64` values.
261 ///
262 /// This method sequentially decodes all the values from the compressed representation
263 /// into a standard `Vec<u64>`.
264 ///
265 /// # Returns
266 ///
267 /// A vector of decompressed values.
268 ///
269 /// # Examples
270 ///
271 /// ```rust
272 /// use compressed_intvec::intvec::IntVec;
273 /// use compressed_intvec::codecs::GammaCodec;
274 /// use dsi_bitstream::traits::LE;
275 ///
276 /// let input = vec![7, 14, 21];
277 /// let k = 2;
278 /// let intvec = IntVec::<LE, _, GammaCodec>::from(&input, k).unwrap();
279 /// let output = intvec.into_vec();
280 /// assert_eq!(output, input);
281 /// ```
282 pub fn into_vec(self) -> Vec<u64> {
283 let word_reader = MemWordReader::new(&self.data);
284 let mut reader =
285 BufBitReader::<E, MemWordReader<u64, &'_ Vec<u64>>, DefaultReadParams>::new(
286 word_reader,
287 );
288 let mut values = Vec::with_capacity(self.len);
289
290 for _ in 0..self.len {
291 values.push(C::decode(&mut reader, self.codec_param).unwrap());
292 }
293 values
294 }
295
296 /// Returns the underlying storage (limbs) of the compressed bitstream.
297 ///
298 /// The limbs are stored as a vector of `u64`, which represents the raw compressed data.
299 ///
300 /// # Returns
301 ///
302 /// A clone of the internal vector of limbs.
303 ///
304 /// # Examples
305 ///
306 /// ```rust
307 /// use compressed_intvec::intvec::IntVec;
308 /// use compressed_intvec::codecs::GammaCodec;
309 /// use dsi_bitstream::traits::BE;
310 ///
311 /// let input = vec![2, 4, 6, 8];
312 /// let k = 2;
313 /// let intvec = IntVec::<BE, _, GammaCodec>::from(&input, k).unwrap();
314 /// let limbs = intvec.limbs();
315 /// assert!(!limbs.is_empty());
316 /// ```
317 pub fn limbs(&self) -> Vec<u64> {
318 self.data.clone()
319 }
320
321 /// Returns the number of elements encoded in the [`IntVec`].
322 ///
323 /// # Returns
324 ///
325 /// The length (number of original elements) as a usize.
326 ///
327 /// # Examples
328 ///
329 /// ```rust
330 /// use compressed_intvec::intvec::IntVec;
331 /// use compressed_intvec::codecs::GammaCodec;
332 /// use dsi_bitstream::traits::BE;
333 ///
334 /// let input = vec![1, 2, 3];
335 /// let k = 2;
336 /// let intvec = IntVec::<BE, _, GammaCodec>::from(&input, k).unwrap();
337 /// assert_eq!(intvec.len(), input.len());
338 /// ```
339 pub fn len(&self) -> usize {
340 self.len
341 }
342
343 /// Returns the sampling rate `k` used during encoding.
344 ///
345 /// This value indicates that every kth element's bit offset was stored for efficient access.
346 ///
347 /// # Returns
348 ///
349 /// The sampling rate as usize.
350 ///
351 /// # Examples
352 ///
353 /// ```rust
354 /// use compressed_intvec::intvec::IntVec;
355 /// use compressed_intvec::codecs::GammaCodec;
356 /// use dsi_bitstream::traits::BE;
357 ///
358 /// let input = vec![5, 10, 15, 20];
359 /// let k = 2;
360 /// let intvec = IntVec::<BE, _, GammaCodec>::from(&input, k).unwrap();
361 /// assert_eq!(intvec.get_sampling_rate(), k);
362 /// ```
363 pub fn get_sampling_rate(&self) -> usize {
364 self.k
365 }
366
367 /// Returns the recorded sample bit positions used for random access.
368 ///
369 /// The returned vector contains the bit-offset positions for every kth element in the original data.
370 ///
371 /// # Returns
372 ///
373 /// A vector of sample positions.
374 ///
375 /// # Examples
376 ///
377 /// ```rust
378 /// use compressed_intvec::intvec::IntVec;
379 /// use compressed_intvec::codecs::GammaCodec;
380 /// use dsi_bitstream::traits::BE;
381 ///
382 /// let input = vec![10, 20, 30, 40, 50];
383 /// let k = 2;
384 /// let intvec = IntVec::<BE, _, GammaCodec>::from(&input, k).unwrap();
385 /// let samples = intvec.get_samples();
386 /// assert!(!samples.is_empty());
387 /// ```
388 pub fn get_samples(&self) -> Vec<usize> {
389 self.samples.clone()
390 }
391
392 /// Checks if the [`IntVec`] contains no encoded elements.
393 ///
394 /// # Returns
395 ///
396 /// `true` if there are no elements, `false` otherwise.
397 ///
398 /// # Examples
399 ///
400 /// ```rust
401 /// use compressed_intvec::intvec::IntVec;
402 /// use compressed_intvec::codecs::GammaCodec;
403 /// use dsi_bitstream::traits::BE;
404 ///
405 /// let empty_intvec: IntVec<BE, _, GammaCodec> = IntVec::from(&[], 2).unwrap();
406 /// assert!(empty_intvec.is_empty());
407 /// ```
408 pub fn is_empty(&self) -> bool {
409 self.len == 0
410 }
411
412 /// Returns an iterator over the decompressed values contained in the [`IntVec`].
413 ///
414 /// The iterator decodes each value in sequence and yields it.
415 ///
416 /// # Returns
417 ///
418 /// An `IntVecIter` instance that implements `Iterator<Item = u64>`.
419 ///
420 /// # Examples
421 ///
422 /// ```rust
423 /// use compressed_intvec::intvec::IntVec;
424 /// use compressed_intvec::codecs::GammaCodec;
425 /// use dsi_bitstream::traits::BE;
426 ///
427 /// let input = vec![1, 3, 5, 7];
428 /// let k = 1;
429 /// let intvec = IntVec::<BE, _, GammaCodec>::from(&input, k).unwrap();
430 /// let mut iter = intvec.iter();
431 /// for value in input {
432 /// assert_eq!(iter.next(), Some(value));
433 /// }
434 /// assert_eq!(iter.next(), None);
435 /// ```
436 pub fn iter(&self) -> IntVecIter<E, C> {
437 let word_reader = MemWordReader::new(&self.data);
438 let reader = BufBitReader::<E, MemWordReader<u64, &'_ Vec<u64>>, DefaultReadParams>::new(
439 word_reader,
440 );
441 IntVecIter {
442 intvec: self,
443 reader,
444 current_index: 0,
445 }
446 }
447}
448
449/// An iterator over the values of an [`IntVec`].
450///
451/// The iterator holds a reference to the [`IntVec`] and uses a bit reader to decode each value on the fly.
452///
453/// # Examples
454///
455/// ```rust
456/// use compressed_intvec::intvec::IntVec;
457/// use compressed_intvec::codecs::GammaCodec;
458/// use dsi_bitstream::traits::BE;
459///
460/// let input = vec![2, 4, 6];
461/// let k = 1;
462/// let intvec = IntVec::<BE, _, GammaCodec>::from(&input, k).unwrap();
463/// let mut iter = intvec.iter();
464/// assert_eq!(iter.next(), Some(2));
465/// assert_eq!(iter.next(), Some(4));
466/// assert_eq!(iter.next(), Some(6));
467/// assert_eq!(iter.next(), None);
468/// ```
469type IntVecWriter<E> = BufBitWriter<E, MemWordWriterVec<u64, Vec<u64>>>;
470
471/// An iterator for the [`IntVec`] structure.
472///
473/// This structure holds a reference to the compressed integer vector alongside a
474/// bit-buffered reader and the current index. It encapsulates the necessary
475/// components to manage and traverse the vector efficiently.
476///
477/// Fields:
478/// - `intvec`: A reference to an [`IntVec`] instance, parameterized over its element type,
479/// the associated writer type (`IntVecWriter<E>`), and compression format (`C`).
480/// - `reader`: An instance of `BufBitReader` configured with a memory word reader
481/// (`MemWordReader<u64, &'a Vec<u64>>`) and default read parameters (`DefaultReadParams`),
482/// used for extracting bit-level information from the underlying data.
483/// - `current_index`: A usize value tracking the current position within the integer vector.
484pub struct IntVecIter<'a, E, C>
485where
486 E: Endianness,
487 C: Codec<E, IntVecWriter<E>>,
488 IntVecWriter<E>: dsi_bitstream::traits::BitWrite<E>,
489{
490 intvec: &'a IntVec<E, IntVecWriter<E>, C>,
491 reader: BufBitReader<E, MemWordReader<u64, &'a Vec<u64>>, DefaultReadParams>,
492 current_index: usize,
493}
494
495impl<'a, E, C> Iterator for IntVecIter<'a, E, C>
496where
497 E: Endianness,
498 C: Codec<E, BufBitWriter<E, MemWordWriterVec<u64, Vec<u64>>>>,
499 C::Params: Copy,
500 BufBitWriter<E, MemWordWriterVec<u64, Vec<u64>>>: BitWrite<E>,
501 BufBitReader<E, MemWordReader<u64, &'a Vec<u64>>, DefaultReadParams>:
502 GammaRead<E> + DeltaRead<E> + ZetaRead<E> + ZetaReadParam<E> + DeltaReadParam<E>,
503{
504 type Item = u64;
505
506 /// Advances the iterator and returns the next decompressed value.
507 ///
508 /// Returns [`None`] when iteration is finished.
509 ///
510 /// # Examples
511 ///
512 /// ```rust
513 /// use compressed_intvec::intvec::IntVec;
514 /// use compressed_intvec::codecs::GammaCodec;
515 /// use dsi_bitstream::traits::BE;
516 ///
517 /// let input = vec![5, 10];
518 /// let k = 1;
519 /// let intvec = IntVec::<BE, _, GammaCodec>::from(&input, k).unwrap();
520 /// let mut iter = intvec.iter();
521 /// assert_eq!(iter.next(), Some(5));
522 /// assert_eq!(iter.next(), Some(10));
523 /// assert_eq!(iter.next(), None);
524 /// ```
525 fn next(&mut self) -> Option<Self::Item> {
526 if self.current_index >= self.intvec.len {
527 return None;
528 }
529
530 match C::decode(&mut self.reader, self.intvec.codec_param) {
531 Ok(value) => {
532 self.current_index += 1;
533 Some(value)
534 }
535 Err(_) => None,
536 }
537 }
538}
539
540/// Type alias for an [`IntVec`] with big-endian encoding and a default codec.
541pub type BEIntVec<C> = IntVec<BE, BufBitWriter<BE, MemWordWriterVec<u64, Vec<u64>>>, C>;
542
543/// Type alias for an [`IntVec`] with little-endian encoding and a default codec.
544pub type LEIntVec<C> = IntVec<LE, BufBitWriter<LE, MemWordWriterVec<u64, Vec<u64>>>, C>;