compressed_intvec/
intvec.rs

1//! # Compressed IntVec
2//!
3//! This module provides the `IntVec` structure for efficiently compressing a series of `u64` values
4//! using a specified instantaneous code from the [dsi-bitstream](https://crates.io/crates/dsi_bitstream) crate.
5//!
6//! ## How It Works
7//!
8//! The `IntVec` structure compresses a series of `u64` values by encoding them with a
9//! specified codec. Every kth element is sampled during encoding, meaning its bit-offset
10//! is recorded. This sampling information allows the decoder to jump directly to the
11//! vicinity of any desired element, thereby minimizing the number of bits that need to be
12//! processed during random access.
13//!
14//! ## Features
15//!
16//! - **Efficient Compression:** Reduces storage requirements by encoding data into a compact bitstream.
17//! - **Fast Random Access:** Leverages sampled bit-offsets to quickly decode individual elements.
18//! - **Iterative Decoding:** Provides an iterator that decodes values on the fly for sequential access.
19//!
20//! ## Example Usage
21//!
22//! ```rust
23//! use compressed_intvec::intvec::IntVec;
24//! use compressed_intvec::codecs::GammaCodec;
25//! use dsi_bitstream::traits::BE;
26//!
27//! let input = vec![1, 2, 3, 4, 5, 6, 7, 8];
28//! let sampling_rate = 2;
29//! let intvec = IntVec::<BE, _, GammaCodec>::from(&input, sampling_rate).unwrap();
30//!
31//! // Fast random access using the sampled bit offsets
32//! assert_eq!(intvec.get(3), 4);
33//!
34//! // Decode the entire vector back to its original form
35//! let decompressed: Vec<u64> = intvec.into_vec();
36//! assert_eq!(decompressed, input);
37//! ```
38//!
39//! ## Type Aliases
40//!
41//! For convenience, this module provides type aliases for common configurations:
42//!
43//! - **BEIntVec:** A big-endian version of the compressed vector.
44//!
45//!   ```rust
46//!   use compressed_intvec::intvec::BEIntVec;
47//!   use compressed_intvec::codecs::GammaCodec;
48//!
49//!   let input = vec![10, 20, 30];
50//!   let intvec = BEIntVec::<GammaCodec>::from(&input, 2).unwrap();
51//!   ```
52//!
53//! - **LEIntVec:** A little-endian version of the compressed vector.
54//!
55//!   ```rust
56//!   use compressed_intvec::intvec::LEIntVec;
57//!   use compressed_intvec::codecs::GammaCodec;
58//!
59//!   let input = vec![9, 8, 7];
60//!   let intvec = LEIntVec::<GammaCodec>::from(&input, 2).unwrap();
61//!   ```
62
63use crate::codecs::Codec;
64use dsi_bitstream::{codes::params::DefaultReadParams, prelude::*};
65use mem_dbg::{MemDbg, MemSize};
66
67/// A compressed vector of unsigned 64-bit integers using a specified codec.
68///
69/// The `IntVec` structure compresses a sequence of `u64` values by encoding them using a
70/// provided codec. It supports random access via sampling and iteration over the decompressed
71/// values.
72///
73/// The type parameters are:
74/// - `E`: Endianness to use (`BE` or `LE`).
75/// - `W`: A bit writer implementing `BitWrite<E>`.
76/// - `C`: A codec implementing `Codec<E, W>`.
77///
78/// # Examples
79///
80/// ```rust
81/// use compressed_intvec::intvec::IntVec;
82/// use compressed_intvec::codecs::GammaCodec;
83/// use dsi_bitstream::traits::BE;
84///
85/// let input = vec![1, 2, 3, 4, 5];
86/// let k = 2;
87/// let intvec = IntVec::<BE, _, GammaCodec>::from(&input, k).unwrap();
88/// assert_eq!(intvec.len(), input.len());
89/// ```
90#[derive(Debug, Clone, MemDbg, MemSize)]
91#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
92#[cfg_attr(
93    feature = "serde",
94    serde(bound(
95        serialize = "C::Params: Serialize",
96        deserialize = "C::Params: for<'a> Deserialize<'a>"
97    ))
98)]
99pub struct IntVec<E: Endianness, W: BitWrite<E>, C: Codec<E, W>> {
100    data: Vec<u64>,
101    samples: Vec<usize>,
102    k: usize,
103    len: usize,
104    codec_param: C::Params,
105    codec: std::marker::PhantomData<C>,
106    endian: std::marker::PhantomData<E>,
107}
108
109impl<E, C> IntVec<E, BufBitWriter<E, MemWordWriterVec<u64, Vec<u64>>>, C>
110where
111    E: Endianness,
112    C: Codec<E, BufBitWriter<E, MemWordWriterVec<u64, Vec<u64>>>>,
113    C::Params: Copy,
114    BufBitWriter<E, MemWordWriterVec<u64, Vec<u64>>>: BitWrite<E>,
115    for<'a> BufBitReader<E, MemWordReader<u64, &'a Vec<u64>>>:
116        GammaRead<E> + DeltaRead<E> + ZetaRead<E> + ZetaReadParam<E> + DeltaReadParam<E>,
117{
118    /// Constructs an [`IntVec`] from a slice of `u64` values using the provided codec parameters.
119    ///
120    /// This method encodes the input values using the specified codec. It also builds sampling
121    /// information for efficient random access. The parameter `k` determines the sampling rate,
122    /// i.e. every kth element's bit position is recorded as a sample.
123    ///
124    /// # Parameters
125    ///
126    /// - `input`: A slice of unsigned 64-bit integers to compress.
127    /// - `k`: The sampling rate; every kth element will have its bit offset stored.
128    /// - `codec_param`: The parameters for the codec used for encoding.
129    ///
130    /// # Returns
131    ///
132    /// Returns `Ok(IntVec)` on success or an error if encoding fails.
133    ///
134    /// # Examples
135    ///
136    /// ```rust
137    /// use compressed_intvec::intvec::IntVec;
138    /// use compressed_intvec::codecs::GammaCodec;
139    /// use dsi_bitstream::traits::BE;
140    ///
141    /// let input = vec![10, 20, 30, 40];
142    /// let k = 2;
143    /// let intvec = IntVec::<BE, _, GammaCodec>::from(&input, k).unwrap();
144    /// assert_eq!(intvec.len(), input.len());
145    /// ```
146    pub fn from_with_param(
147        input: &[u64],
148        k: usize,
149        codec_param: C::Params,
150    ) -> Result<Self, Box<dyn std::error::Error>> {
151        let word_writer = MemWordWriterVec::new(Vec::new());
152        let mut writer = BufBitWriter::<E, MemWordWriterVec<u64, Vec<u64>>>::new(word_writer);
153        let mut samples = Vec::new();
154        let mut total_bits = 0;
155
156        for (i, &x) in input.iter().enumerate() {
157            if i % k == 0 {
158                samples.push(total_bits);
159            }
160            total_bits += C::encode(&mut writer, x, codec_param)?;
161        }
162
163        writer.flush()?;
164        let data = writer.into_inner()?.into_inner();
165
166        Ok(Self {
167            data,
168            samples,
169            codec: std::marker::PhantomData,
170            k,
171            len: input.len(),
172            codec_param,
173            endian: std::marker::PhantomData,
174        })
175    }
176
177    /// Constructs an [`IntVec`] from a slice of `u64` values using the default codec parameters.
178    ///
179    /// This is a convenience method that calls `from_with_param` with `C::Params::default()`.
180    ///
181    /// # Parameters
182    ///
183    /// - `input`: A slice of unsigned 64-bit integers to compress.
184    /// - `k`: The sampling rate; every kth element will have its bit offset stored.
185    ///
186    /// # Returns
187    ///
188    /// Returns `Ok(IntVec)` on success or an error if encoding fails.
189    ///
190    /// # Examples
191    ///
192    /// ```rust
193    /// use compressed_intvec::intvec::IntVec;
194    /// use compressed_intvec::codecs::GammaCodec;
195    /// use dsi_bitstream::traits::BE;
196    ///
197    /// let input = vec![5, 10, 15, 20];
198    /// let k = 2;
199    /// let intvec = IntVec::<BE, _, GammaCodec>::from(&input, k).unwrap();
200    /// assert_eq!(intvec.len(), input.len());
201    /// ```
202    pub fn from(input: &[u64], k: usize) -> Result<Self, Box<dyn std::error::Error>>
203    where
204        C::Params: Default,
205    {
206        Self::from_with_param(input, k, C::Params::default())
207    }
208
209    /// Retrieves the element at the specified index after decompressing the value.
210    ///
211    /// This method uses the sampling information to quickly locate the encoded bit-stream
212    /// position and decodes the required value.
213    ///
214    /// # Panics
215    ///
216    /// Panics if the index is out of bounds.
217    ///
218    /// # Parameters
219    ///
220    /// - `index`: The index of the desired element.
221    ///
222    /// # Returns
223    ///
224    /// Returns the `u64` value at the given index.
225    ///
226    /// # Examples
227    ///
228    /// ```rust
229    /// use compressed_intvec::intvec::IntVec;
230    /// use compressed_intvec::codecs::GammaCodec;
231    /// use dsi_bitstream::traits::BE;
232    ///
233    /// let input = vec![3, 6, 9];
234    /// let k = 1;
235    /// let intvec = IntVec::<BE, _, GammaCodec>::from(&input, k).unwrap();
236    /// assert_eq!(intvec.get(1), 6);
237    /// ```
238    #[inline(always)]
239    pub fn get(&self, index: usize) -> u64 {
240        if index >= self.len {
241            panic!("Index {} is out of bounds", index);
242        }
243
244        let sample_index = index / self.k;
245        let start_bit = self.samples[sample_index];
246        let mut reader =
247            BufBitReader::<E, MemWordReader<u64, &'_ Vec<u64>>, DefaultReadParams>::new(
248                MemWordReader::new(&self.data),
249            );
250        reader.skip_bits(start_bit).unwrap();
251
252        let mut value = 0;
253        let start_index = sample_index * self.k;
254        for _ in start_index..=index {
255            value = C::decode(&mut reader, self.codec_param).unwrap();
256        }
257        value
258    }
259
260    /// Consumes the [`IntVec`] and returns a vector containing all decompressed `u64` values.
261    ///
262    /// This method sequentially decodes all the values from the compressed representation
263    /// into a standard `Vec<u64>`.
264    ///
265    /// # Returns
266    ///
267    /// A vector of decompressed values.
268    ///
269    /// # Examples
270    ///
271    /// ```rust
272    /// use compressed_intvec::intvec::IntVec;
273    /// use compressed_intvec::codecs::GammaCodec;
274    /// use dsi_bitstream::traits::LE;
275    ///
276    /// let input = vec![7, 14, 21];
277    /// let k = 2;
278    /// let intvec = IntVec::<LE, _, GammaCodec>::from(&input, k).unwrap();
279    /// let output = intvec.into_vec();
280    /// assert_eq!(output, input);
281    /// ```
282    pub fn into_vec(self) -> Vec<u64> {
283        let word_reader = MemWordReader::new(&self.data);
284        let mut reader =
285            BufBitReader::<E, MemWordReader<u64, &'_ Vec<u64>>, DefaultReadParams>::new(
286                word_reader,
287            );
288        let mut values = Vec::with_capacity(self.len);
289
290        for _ in 0..self.len {
291            values.push(C::decode(&mut reader, self.codec_param).unwrap());
292        }
293        values
294    }
295
296    /// Returns the underlying storage (limbs) of the compressed bitstream.
297    ///
298    /// The limbs are stored as a vector of `u64`, which represents the raw compressed data.
299    ///
300    /// # Returns
301    ///
302    /// A clone of the internal vector of limbs.
303    ///
304    /// # Examples
305    ///
306    /// ```rust
307    /// use compressed_intvec::intvec::IntVec;
308    /// use compressed_intvec::codecs::GammaCodec;
309    /// use dsi_bitstream::traits::BE;
310    ///
311    /// let input = vec![2, 4, 6, 8];
312    /// let k = 2;
313    /// let intvec = IntVec::<BE, _, GammaCodec>::from(&input, k).unwrap();
314    /// let limbs = intvec.limbs();
315    /// assert!(!limbs.is_empty());
316    /// ```
317    pub fn limbs(&self) -> Vec<u64> {
318        self.data.clone()
319    }
320
321    /// Returns the number of elements encoded in the [`IntVec`].
322    ///
323    /// # Returns
324    ///
325    /// The length (number of original elements) as a usize.
326    ///
327    /// # Examples
328    ///
329    /// ```rust
330    /// use compressed_intvec::intvec::IntVec;
331    /// use compressed_intvec::codecs::GammaCodec;
332    /// use dsi_bitstream::traits::BE;
333    ///
334    /// let input = vec![1, 2, 3];
335    /// let k = 2;
336    /// let intvec = IntVec::<BE, _, GammaCodec>::from(&input, k).unwrap();
337    /// assert_eq!(intvec.len(), input.len());
338    /// ```
339    pub fn len(&self) -> usize {
340        self.len
341    }
342
343    /// Returns the sampling rate `k` used during encoding.
344    ///
345    /// This value indicates that every kth element's bit offset was stored for efficient access.
346    ///
347    /// # Returns
348    ///
349    /// The sampling rate as usize.
350    ///
351    /// # Examples
352    ///
353    /// ```rust
354    /// use compressed_intvec::intvec::IntVec;
355    /// use compressed_intvec::codecs::GammaCodec;
356    /// use dsi_bitstream::traits::BE;
357    ///
358    /// let input = vec![5, 10, 15, 20];
359    /// let k = 2;
360    /// let intvec = IntVec::<BE, _, GammaCodec>::from(&input, k).unwrap();
361    /// assert_eq!(intvec.get_sampling_rate(), k);
362    /// ```
363    pub fn get_sampling_rate(&self) -> usize {
364        self.k
365    }
366
367    /// Returns the recorded sample bit positions used for random access.
368    ///
369    /// The returned vector contains the bit-offset positions for every kth element in the original data.
370    ///
371    /// # Returns
372    ///
373    /// A vector of sample positions.
374    ///
375    /// # Examples
376    ///
377    /// ```rust
378    /// use compressed_intvec::intvec::IntVec;
379    /// use compressed_intvec::codecs::GammaCodec;
380    /// use dsi_bitstream::traits::BE;
381    ///
382    /// let input = vec![10, 20, 30, 40, 50];
383    /// let k = 2;
384    /// let intvec = IntVec::<BE, _, GammaCodec>::from(&input, k).unwrap();
385    /// let samples = intvec.get_samples();
386    /// assert!(!samples.is_empty());
387    /// ```
388    pub fn get_samples(&self) -> Vec<usize> {
389        self.samples.clone()
390    }
391
392    /// Checks if the [`IntVec`] contains no encoded elements.
393    ///
394    /// # Returns
395    ///
396    /// `true` if there are no elements, `false` otherwise.
397    ///
398    /// # Examples
399    ///
400    /// ```rust
401    /// use compressed_intvec::intvec::IntVec;
402    /// use compressed_intvec::codecs::GammaCodec;
403    /// use dsi_bitstream::traits::BE;
404    ///
405    /// let empty_intvec: IntVec<BE, _, GammaCodec> = IntVec::from(&[], 2).unwrap();
406    /// assert!(empty_intvec.is_empty());
407    /// ```
408    pub fn is_empty(&self) -> bool {
409        self.len == 0
410    }
411
412    /// Returns an iterator over the decompressed values contained in the [`IntVec`].
413    ///
414    /// The iterator decodes each value in sequence and yields it.
415    ///
416    /// # Returns
417    ///
418    /// An `IntVecIter` instance that implements `Iterator<Item = u64>`.
419    ///
420    /// # Examples
421    ///
422    /// ```rust
423    /// use compressed_intvec::intvec::IntVec;
424    /// use compressed_intvec::codecs::GammaCodec;
425    /// use dsi_bitstream::traits::BE;
426    ///
427    /// let input = vec![1, 3, 5, 7];
428    /// let k = 1;
429    /// let intvec = IntVec::<BE, _, GammaCodec>::from(&input, k).unwrap();
430    /// let mut iter = intvec.iter();
431    /// for value in input {
432    ///     assert_eq!(iter.next(), Some(value));
433    /// }
434    /// assert_eq!(iter.next(), None);
435    /// ```
436    pub fn iter(&self) -> IntVecIter<E, C> {
437        let word_reader = MemWordReader::new(&self.data);
438        let reader = BufBitReader::<E, MemWordReader<u64, &'_ Vec<u64>>, DefaultReadParams>::new(
439            word_reader,
440        );
441        IntVecIter {
442            intvec: self,
443            reader,
444            current_index: 0,
445        }
446    }
447}
448
449/// An iterator over the values of an [`IntVec`].
450///
451/// The iterator holds a reference to the [`IntVec`] and uses a bit reader to decode each value on the fly.
452///
453/// # Examples
454///
455/// ```rust
456/// use compressed_intvec::intvec::IntVec;
457/// use compressed_intvec::codecs::GammaCodec;
458/// use dsi_bitstream::traits::BE;
459///
460/// let input = vec![2, 4, 6];
461/// let k = 1;
462/// let intvec = IntVec::<BE, _, GammaCodec>::from(&input, k).unwrap();
463/// let mut iter = intvec.iter();
464/// assert_eq!(iter.next(), Some(2));
465/// assert_eq!(iter.next(), Some(4));
466/// assert_eq!(iter.next(), Some(6));
467/// assert_eq!(iter.next(), None);
468/// ```
469type IntVecWriter<E> = BufBitWriter<E, MemWordWriterVec<u64, Vec<u64>>>;
470
471/// An iterator for the [`IntVec`] structure.
472///
473/// This structure holds a reference to the compressed integer vector alongside a
474/// bit-buffered reader and the current index. It encapsulates the necessary
475/// components to manage and traverse the vector efficiently.
476///
477/// Fields:
478/// - `intvec`: A reference to an [`IntVec`] instance, parameterized over its element type,
479///   the associated writer type (`IntVecWriter<E>`), and compression format (`C`).
480/// - `reader`: An instance of `BufBitReader` configured with a memory word reader
481///   (`MemWordReader<u64, &'a Vec<u64>>`) and default read parameters (`DefaultReadParams`),
482///   used for extracting bit-level information from the underlying data.
483/// - `current_index`: A usize value tracking the current position within the integer vector.
484pub struct IntVecIter<'a, E, C>
485where
486    E: Endianness,
487    C: Codec<E, IntVecWriter<E>>,
488    IntVecWriter<E>: dsi_bitstream::traits::BitWrite<E>,
489{
490    intvec: &'a IntVec<E, IntVecWriter<E>, C>,
491    reader: BufBitReader<E, MemWordReader<u64, &'a Vec<u64>>, DefaultReadParams>,
492    current_index: usize,
493}
494
495impl<'a, E, C> Iterator for IntVecIter<'a, E, C>
496where
497    E: Endianness,
498    C: Codec<E, BufBitWriter<E, MemWordWriterVec<u64, Vec<u64>>>>,
499    C::Params: Copy,
500    BufBitWriter<E, MemWordWriterVec<u64, Vec<u64>>>: BitWrite<E>,
501    BufBitReader<E, MemWordReader<u64, &'a Vec<u64>>, DefaultReadParams>:
502        GammaRead<E> + DeltaRead<E> + ZetaRead<E> + ZetaReadParam<E> + DeltaReadParam<E>,
503{
504    type Item = u64;
505
506    /// Advances the iterator and returns the next decompressed value.
507    ///
508    /// Returns [`None`] when iteration is finished.
509    ///
510    /// # Examples
511    ///
512    /// ```rust
513    /// use compressed_intvec::intvec::IntVec;
514    /// use compressed_intvec::codecs::GammaCodec;
515    /// use dsi_bitstream::traits::BE;
516    ///
517    /// let input = vec![5, 10];
518    /// let k = 1;
519    /// let intvec = IntVec::<BE, _, GammaCodec>::from(&input, k).unwrap();
520    /// let mut iter = intvec.iter();
521    /// assert_eq!(iter.next(), Some(5));
522    /// assert_eq!(iter.next(), Some(10));
523    /// assert_eq!(iter.next(), None);
524    /// ```
525    fn next(&mut self) -> Option<Self::Item> {
526        if self.current_index >= self.intvec.len {
527            return None;
528        }
529
530        match C::decode(&mut self.reader, self.intvec.codec_param) {
531            Ok(value) => {
532                self.current_index += 1;
533                Some(value)
534            }
535            Err(_) => None,
536        }
537    }
538}
539
540/// Type alias for an [`IntVec`] with big-endian encoding and a default codec.
541pub type BEIntVec<C> = IntVec<BE, BufBitWriter<BE, MemWordWriterVec<u64, Vec<u64>>>, C>;
542
543/// Type alias for an [`IntVec`] with little-endian encoding and a default codec.
544pub type LEIntVec<C> = IntVec<LE, BufBitWriter<LE, MemWordWriterVec<u64, Vec<u64>>>, C>;