compressed_intvec/
intvec.rs

1//! # Compressed IntVec Module
2//!
3//! This module delivers an efficient compressed vector for storing sequences of unsigned 64-bit integers.
4//! By leveraging bit-level encoding, it minimizes storage space while supporting fast random access.
5//!
6//! ## Overview
7//!
8//! The principal data structure, [`IntVec`], encapsulates a compressed bitstream along with sampling offsets.
9//! These offsets enable fast, random access to individual elements without decompressing the entire stream.
10//! The module offers two variants based on endianness:
11//!
12//! - **Big-Endian** ([`BEIntVec`])
13//! - **Little-Endian** ([`LEIntVec`])
14//!
15//! Both variants operate with codecs that implement the [`Codec`] trait, allowing for flexible and configurable
16//! encoding/decoding strategies. Some codecs even accept extra runtime parameters to fine-tune the compression.
17//!
18//! > **Note:** The [`IntVec`] structure is generic and you are not supposed to interact with it directly. Use the two endianness-specific types instead.
19//!
20//! ## Key Features
21//!
22//! - **Compact Storage:** Compresses sequences of integers into a streamlined bitstream.
23//! - **Fast Random Access:** Employs periodic sampling (every *k*-th element) to quickly locate individual elements.
24//! - **Flexible Codec Integration:** Compatible with any codec conforming to the [`Codec`] trait.
25//! - **Endianness Options:** Provides both big-endian and little-endian formats to suit various interoperability needs.
26//!
27//! ## Components
28//! - **[`IntVec`]:** The core structure holding compressed data, sampling offsets, codec parameters, and metadata.
29//!   Direct interaction with this structure is not permitted, and you should use the endianness-specific types instead.
30//! - **[`BEIntVec`] / [`LEIntVec`]:** Type aliases for the big-endian and little-endian implementations of [`IntVec`].
31//! - **Iterators:** [`BEIntVecIter`] and [`LEIntVecIter`] facilitate on-the-fly decoding as you iterate through the vector.
32//!
33//! ## Usage Examples
34//!
35//! ### Creating a Big-Endian Compressed Vector
36//!
37//! ```rust
38//! use compressed_intvec::intvec::BEIntVec;
39//! use compressed_intvec::codecs::ExpGolombCodec;
40//!
41//! // Define a vector of unsigned 64-bit integers.
42//! let input = vec![1, 5, 3, 1991, 42];
43//!
44//! // Create a Big-Endian compressed vector using ExpGolombCodec with an extra codec parameter (e.g., 3)
45//! // and sample every 2 elements.
46//! let intvec = BEIntVec::<ExpGolombCodec>::from_with_param(&input, 2, 3).unwrap();
47//!
48//! // Retrieve a specific element by its index.
49//! let value = intvec.get(3);
50//! assert_eq!(value, 1991);
51//!
52//! // Decompress the entire structure back into a standard vector.
53//! let decoded = intvec.into_vec();
54//! assert_eq!(decoded, input);
55//! ```
56//!
57//! ### Creating a Little-Endian Compressed Vector
58//!
59//! ```rust
60//! use compressed_intvec::intvec::LEIntVec;
61//! use compressed_intvec::codecs::GammaCodec;
62//!
63//! // Define a vector of unsigned 64-bit integers.
64//! let input = vec![10, 20, 30, 40, 50];
65//!
66//! // Create a Little-Endian compressed vector using GammaCodec without additional codec parameters,
67//! // sampling every 2 elements.
68//! let intvec = LEIntVec::<GammaCodec>::from(&input, 2).unwrap();
69//!
70//! // Verify that random access works correctly.
71//! assert_eq!(intvec.get(2), 30);
72//! ```
73//!
74//! ## Design Considerations
75//!
76//! - **Bitstream Representation:** The compressed data is stored as a vector of 64-bit words ([`Vec<u64>`]).
77//! - **Sampling Strategy:** To ensure fast random access, sampling offsets are recorded
78//!   for every *k*-th integer.
79//! - **Codec Abstraction:** The module is codec-agnostic; any codec that implements the [`Codec`] trait may be employed.
80//! - **Endianness Management:** Endianness is seamlessly handled via phantom types, supporting both big-endian and
81//!   little-endian variants without affecting performance.
82
83use crate::codecs::Codec;
84use dsi_bitstream::prelude::*;
85use mem_dbg::{MemDbg, MemSize};
86use std::marker::PhantomData;
87
88/// A compressed vector of integers.
89///
90/// The [`IntVec`] stores values in a compressed bitstream along with sample offsets for
91/// fast random access. The type is generic over an endianness (`E`) and codec (`C`).
92///
93/// # Type Parameters
94///
95/// - `E`: Endianness marker (e.g. [`BE`] or [`LE`]).
96/// - `C`: The codec used for compression. Must implement [`Codec<E, MyBitWrite<E>, MyBitRead<E>>`].
97///
98/// # Fields
99///
100/// - `data`: The compressed bitstream data.
101/// - `samples`: Offsets (in bits) into `data` for every k-th value, used to jump-start decoding.
102/// - `k`: The sampling period.
103/// - `len`: The total number of integers stored.
104/// - `codec_param`: Extra parameters for `C`.
105///
106/// # Examples
107///
108/// ```
109/// use compressed_intvec::intvec::BEIntVec;
110/// use compressed_intvec::codecs::GammaCodec;
111///
112/// // Create a compressed vector using a codec without extra parameters.
113/// let input = vec![1, 2, 3, 4, 5];
114/// let intvec = BEIntVec::<GammaCodec>::from(&input, 2).unwrap();
115/// let value = intvec.get(3);
116/// assert_eq!(value, 4);
117/// assert_eq!(intvec.len(), 5);
118/// ```
119#[derive(Debug, Clone, MemDbg, MemSize)]
120#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
121#[cfg_attr(
122    feature = "serde",
123    serde(bound(
124        serialize = "C::Params: Serialize",
125        deserialize = "C::Params: for<'a> Deserialize<'a>"
126    ))
127)]
128pub struct IntVec<E: Endianness, W: BitWrite<E>, C: Codec<E, W>> {
129    data: Vec<u64>,
130    samples: Vec<usize>,
131    codec: PhantomData<C>,
132    k: usize,
133    len: usize,
134    codec_param: C::Params,
135    endian: PhantomData<E>,
136}
137/// Big-endian variant of `IntVec`.
138pub type BEIntVec<C> = IntVec<BE, BufBitWriter<BE, MemWordWriterVec<u64, Vec<u64>>>, C>;
139
140impl<C: Codec<BE, BufBitWriter<BE, MemWordWriterVec<u64, Vec<u64>>>>> BEIntVec<C>
141where
142    C::Params: Copy,
143{
144    /// Creates a new [`BEIntVec`] from a vector of unsigned 64-bit integers.
145    ///
146    /// Values are encoded with the specified codec parameter.
147    ///
148    /// # Arguments
149    ///
150    /// - `input`: The values to be compressed.
151    /// - `k`: The sampling rate (every k-th value is stored as a sample).
152    /// - `codec_param`: Parameters for the codec.
153    ///
154    /// # Examples
155    ///
156    /// ```
157    /// use compressed_intvec::intvec::BEIntVec;
158    /// use compressed_intvec::codecs::ExpGolombCodec;
159    ///
160    /// let input = vec![1, 5, 3, 1991, 42];
161    /// let intvec = BEIntVec::<ExpGolombCodec>::from_with_param(&input, 2, 3).unwrap();
162    ///
163    /// let value = intvec.get(3);
164    /// assert_eq!(value, 1991);
165    /// ```
166    pub fn from_with_param(
167        input: &[u64],
168        k: usize,
169        codec_param: C::Params,
170    ) -> Result<Self, Box<dyn std::error::Error>> {
171        let word_writer = MemWordWriterVec::new(Vec::new());
172        let mut writer = BufBitWriter::<BE, MemWordWriterVec<u64, Vec<u64>>>::new(word_writer);
173        let mut samples = Vec::new();
174        let mut total_bits = 0;
175
176        for (i, &x) in input.iter().enumerate() {
177            if i % k == 0 {
178                samples.push(total_bits);
179            }
180            total_bits += C::encode(&mut writer, x, codec_param)?;
181        }
182
183        writer.flush()?;
184        let data = writer.into_inner()?.into_inner();
185
186        Ok(IntVec {
187            data,
188            samples,
189            codec: PhantomData,
190            k,
191            len: input.len(),
192            codec_param,
193            endian: PhantomData,
194        })
195    }
196
197    /// Retrieves the value at the given index.
198    ///
199    /// Panics if the index is out of bounds.
200    ///
201    /// # Examples
202    ///
203    /// ```
204    /// use compressed_intvec::intvec::BEIntVec;
205    /// use compressed_intvec::codecs::GammaCodec;
206    ///
207    /// let input = vec![1, 5, 3, 12, 42];
208    /// let intvec = BEIntVec::<GammaCodec>::from(&input, 2).unwrap();
209    /// let value = intvec.get(3);
210    /// assert_eq!(value, 12);
211    /// ```
212    ///
213    #[inline(always)]
214    pub fn get(&self, index: usize) -> u64 {
215        if index >= self.len {
216            panic!("Index {} is out of bounds", index);
217        }
218
219        let sample_index = index / self.k;
220        let start_bit = self.samples[sample_index];
221        let mut reader =
222            BufBitReader::<BE, MemWordReader<u64, &Vec<u64>>>::new(MemWordReader::new(&self.data));
223
224        reader.set_bit_pos(start_bit as u64).unwrap();
225
226        let mut value = 0;
227        let start_index = sample_index * self.k;
228        for _ in start_index..=index {
229            value = C::decode(&mut reader, self.codec_param).unwrap();
230        }
231        value
232    }
233
234    /// Returns the original vector of integers.
235    ///
236    /// This operation is expensive as it requires decoding the entire bitstream.
237    ///
238    /// # Examples
239    ///
240    /// ```
241    /// use compressed_intvec::intvec::BEIntVec;
242    /// use compressed_intvec::codecs::GammaCodec;
243    ///
244    /// let input = vec![43, 12, 5, 1991, 42];
245    /// let intvec = BEIntVec::<GammaCodec>::from(&input, 2).unwrap();
246    /// let values = intvec.into_vec();
247    /// assert_eq!(values, input);
248    /// ```
249    ///
250    pub fn into_vec(self) -> Vec<u64> {
251        let word_reader = MemWordReader::new(&self.data);
252        let mut reader = BufBitReader::<BE, MemWordReader<u64, &Vec<u64>>>::new(word_reader);
253        let mut values = Vec::with_capacity(self.len);
254
255        for _ in 0..self.len {
256            values.push(C::decode(&mut reader, self.codec_param).unwrap());
257        }
258
259        values
260    }
261
262    /// Returns an iterator over the decompressed integer values stored in this compressed vector.
263    ///
264    /// The iterator decodes values on the fly and does not modify the underlying data.
265    ///
266    /// # Example
267    ///
268    /// ```rust
269    /// use compressed_intvec::intvec::BEIntVec;
270    /// use compressed_intvec::codecs::GammaCodec;
271    ///
272    /// let input = vec![1, 5, 3, 12, 42];
273    /// let intvec = BEIntVec::<GammaCodec>::from(&input, 2).unwrap();
274    ///
275    /// // Iterate over the vector and print each value.
276    /// for (i, value) in intvec.iter().enumerate() {
277    ///     assert_eq!(value, input[i]);
278    /// }
279    /// ```
280    pub fn iter(&self) -> BEIntVecIter<C> {
281        let word_reader = MemWordReader::new(&self.data);
282        let reader = BufBitReader::new(word_reader);
283        BEIntVecIter {
284            intvec: self,
285            reader,
286            current_index: 0,
287        }
288    }
289
290    /// Returns a clone of the internal bitstream data as a vector of 64-bit unsigned integers.
291    ///
292    /// This can be used for debugging or low-level operations where access to the raw
293    /// compressed limb data is required.
294    pub fn limbs(&self) -> Vec<u64> {
295        self.data.clone()
296    }
297
298    /// Returns the number of integers stored in the compressed vector.
299    ///
300    /// This value represents the total count of decompressed integers.
301    pub fn len(&self) -> usize {
302        self.len
303    }
304
305    /// Returns the sampling rate used by the compressed vector.
306    pub fn get_sampling_rate(&self) -> usize {
307        self.k
308    }
309
310    /// Returns the codec parameter used by the compressed vector.
311    pub fn get_samples(&self) -> Vec<usize> {
312        self.samples.clone()
313    }
314
315    /// Checks whether the compressed vector contains no elements.
316    ///
317    /// Returns `true` if the vector is empty, and `false` otherwise.
318    pub fn is_empty(&self) -> bool {
319        self.len == 0
320    }
321}
322
323/// Convenience constructor for codecs with no extra runtime parameter.
324impl<C: Codec<BE, BufBitWriter<BE, MemWordWriterVec<u64, Vec<u64>>>, Params = ()>> BEIntVec<C> {
325    pub fn from(input: &[u64], k: usize) -> Result<Self, Box<dyn std::error::Error>> {
326        Self::from_with_param(input, k, ())
327    }
328}
329
330/// Iterator over the values stored in a [`BEIntVec`].
331/// The iterator decodes values on the fly.
332///
333/// # Examples
334///
335/// ```rust
336/// use compressed_intvec::intvec::BEIntVec;
337/// use compressed_intvec::codecs::GammaCodec;
338///
339/// // Create a big-endian compressed vector using GammaCodec.
340/// let input = vec![1, 5, 3, 1991, 42];
341/// let intvec = BEIntVec::<GammaCodec>::from(&input, 2).unwrap();
342///
343/// // Iterate over the compressed vector.
344/// for (i, value) in intvec.iter().enumerate() {
345///    assert_eq!(value, input[i]);
346/// }
347/// ```
348pub struct BEIntVecIter<'a, C>
349where
350    C: Codec<BE, BufBitWriter<BE, MemWordWriterVec<u64, Vec<u64>>>>,
351{
352    intvec: &'a BEIntVec<C>,
353    reader: BufBitReader<BE, MemWordReader<u64, &'a Vec<u64>>>,
354    current_index: usize,
355}
356
357impl<C> Iterator for BEIntVecIter<'_, C>
358where
359    C: Codec<BE, BufBitWriter<BE, MemWordWriterVec<u64, Vec<u64>>>>,
360    C::Params: Copy,
361{
362    type Item = u64;
363
364    fn next(&mut self) -> Option<Self::Item> {
365        if self.current_index >= self.intvec.len {
366            return None;
367        }
368
369        match C::decode(&mut self.reader, self.intvec.codec_param) {
370            Ok(value) => {
371                self.current_index += 1;
372                Some(value)
373            }
374            Err(_) => None,
375        }
376    }
377}
378
379/// Little-endian variant of `IntVec`.
380pub type LEIntVec<C> = IntVec<LE, BufBitWriter<LE, MemWordWriterVec<u64, Vec<u64>>>, C>;
381
382impl<C: Codec<LE, BufBitWriter<LE, MemWordWriterVec<u64, Vec<u64>>>>> LEIntVec<C>
383where
384    C: Codec<LE, BufBitWriter<LE, MemWordWriterVec<u64, Vec<u64>>>>,
385    C::Params: Copy,
386{
387    /// Creates a new [`LEIntVec`] from a vector of unsigned 64-bit integers.
388    ///
389    /// Values are encoded with the specified codec parameter.
390    ///
391    /// # Arguments
392    ///
393    /// - `input`: The values to be compressed.
394    /// - `k`: The sampling rate (every k-th value is stored as a sample).
395    /// - `codec_param`: Parameters for the codec.
396    ///
397    /// # Examples
398    ///
399    /// ```
400    /// use compressed_intvec::intvec::LEIntVec;
401    /// use compressed_intvec::codecs::ExpGolombCodec;
402    ///
403    /// let input = vec![1, 5, 3, 1991, 42];
404    /// let intvec = LEIntVec::<ExpGolombCodec>::from_with_param(&input, 2, 3).unwrap();
405    ///
406    /// let value = intvec.get(3);
407    /// assert_eq!(value, 1991);
408    /// ```
409    pub fn from_with_param(
410        input: &[u64],
411        k: usize,
412        codec_param: C::Params,
413    ) -> Result<Self, Box<dyn std::error::Error>> {
414        let word_writer = MemWordWriterVec::new(Vec::new());
415        let mut writer = BufBitWriter::<LE, MemWordWriterVec<u64, Vec<u64>>>::new(word_writer);
416        let mut samples = Vec::new();
417        let mut total_bits = 0;
418
419        for (i, &x) in input.iter().enumerate() {
420            if i % k == 0 {
421                samples.push(total_bits);
422            }
423            total_bits += C::encode(&mut writer, x, codec_param).unwrap();
424        }
425
426        writer.flush().unwrap();
427        let data = writer.into_inner().unwrap().into_inner();
428
429        Ok(IntVec {
430            data,
431            samples,
432            codec: PhantomData,
433            k,
434            len: input.len(),
435            codec_param,
436            endian: PhantomData,
437        })
438    }
439    /// Retrieves the value at the given index.
440    ///
441    /// Returns `None` if the index is out of bounds.
442    ///
443    /// # Examples
444    ///
445    /// ```
446    /// use compressed_intvec::intvec::LEIntVec;
447    /// use compressed_intvec::codecs::GammaCodec;
448    ///
449    /// let input = vec![1, 5, 3, 1991, 42];
450    /// let intvec = LEIntVec::<GammaCodec>::from(&input, 2).unwrap();
451    /// let value = intvec.get(3);
452    /// assert_eq!(value, 1991);
453    /// ```
454    ///
455    #[inline(always)]
456    pub fn get(&self, index: usize) -> u64 {
457        if index >= self.len {
458            panic!("Index {} is out of bounds", index);
459        }
460
461        let sample_index = index / self.k; // this is an integer division
462        let start_bit = self.samples[sample_index];
463        let mut reader =
464            BufBitReader::<LE, MemWordReader<u64, &Vec<u64>>>::new(MemWordReader::new(&self.data));
465
466        reader.set_bit_pos(start_bit as u64).unwrap();
467
468        let mut value = 0;
469        let start_index = sample_index * self.k;
470        for _ in start_index..=index {
471            value = C::decode(&mut reader, self.codec_param).unwrap();
472        }
473        value
474    }
475
476    /// Returns the original vector of integers.
477    ///
478    /// This operation is expensive as it requires decoding the entire bitstream.
479    ///
480    /// # Examples
481    ///
482    /// ```
483    /// use compressed_intvec::intvec::LEIntVec;
484    /// use compressed_intvec::codecs::GammaCodec;
485    ///
486    /// let input = vec![43, 12, 5, 1991, 42];
487    /// let intvec = LEIntVec::<GammaCodec>::from(&input, 2).unwrap();
488    /// let values = intvec.into_vec();
489    /// assert_eq!(values, input);
490    /// ```
491    ///
492    pub fn into_vec(self) -> Vec<u64> {
493        let word_reader = MemWordReader::new(&self.data);
494        let mut reader = BufBitReader::<LE, MemWordReader<u64, &Vec<u64>>>::new(word_reader);
495        let mut values = Vec::with_capacity(self.len);
496
497        for _ in 0..self.len {
498            values.push(C::decode(&mut reader, self.codec_param).unwrap());
499        }
500
501        values
502    }
503
504    /// Returns an iterator over the values stored in the vector.
505    pub fn iter(&self) -> LEIntVecIter<C> {
506        let word_reader = MemWordReader::new(&self.data);
507        let reader = BufBitReader::new(word_reader);
508        LEIntVecIter {
509            intvec: self,
510            reader,
511            current_index: 0,
512        }
513    }
514
515    /// Returns a clone of the internal bitstream data as a vector of 64-bit unsigned integers.
516    ///
517    /// This can be used for debugging or low-level operations where access to the raw
518    /// compressed limb data is required.
519    pub fn limbs(&self) -> Vec<u64> {
520        self.data.clone()
521    }
522
523    /// Returns the number of integers stored in the compressed vector.
524    ///
525    /// This value represents the total count of decompressed integers.
526    pub fn len(&self) -> usize {
527        self.len
528    }
529
530    /// Returns the sampling rate used by the compressed vector.
531    pub fn get_sampling_rate(&self) -> usize {
532        self.k
533    }
534
535    /// Returns the codec parameter used by the compressed vector.
536    pub fn get_samples(&self) -> Vec<usize> {
537        self.samples.clone()
538    }
539
540    /// Checks whether the compressed vector contains no elements.
541    ///
542    /// Returns `true` if the vector is empty, and `false` otherwise.
543    pub fn is_empty(&self) -> bool {
544        self.len == 0
545    }
546}
547
548/// Convenience constructor for codecs with no extra runtime parameter.
549impl<C: Codec<LE, BufBitWriter<LE, MemWordWriterVec<u64, Vec<u64>>>, Params = ()>> LEIntVec<C> {
550    pub fn from(input: &[u64], k: usize) -> Result<Self, Box<dyn std::error::Error>> {
551        Self::from_with_param(input, k, ())
552    }
553}
554
555/// Iterator over the values stored in a [`LEIntVec`].
556///
557/// This iterator decodes values on the fly using a bit‐stream reader.
558/// # Examples
559///
560/// ```rust
561/// use compressed_intvec::intvec::LEIntVec;
562/// use compressed_intvec::codecs::GammaCodec;
563///
564/// // Create a little-endian compressed vector using GammaCodec.
565/// // Note: Ensure your codec implements the required traits for iteration.
566/// let input = vec![10, 20, 30, 40, 50];
567/// let intvec = LEIntVec::<GammaCodec>::from(&input, 2).unwrap();
568///
569/// // Iterate over the compressed vector.
570/// for (i, value) in intvec.iter().enumerate() {
571///     assert_eq!(value, input[i]);
572/// }
573/// ```
574pub struct LEIntVecIter<'a, C>
575where
576    C: Codec<LE, BufBitWriter<LE, MemWordWriterVec<u64, Vec<u64>>>>,
577{
578    intvec: &'a LEIntVec<C>,
579    reader: BufBitReader<LE, MemWordReader<u64, &'a Vec<u64>>>,
580    current_index: usize,
581}
582
583impl<C> Iterator for LEIntVecIter<'_, C>
584where
585    C: Codec<LE, BufBitWriter<LE, MemWordWriterVec<u64, Vec<u64>>>>,
586    C::Params: Copy,
587{
588    type Item = u64;
589
590    fn next(&mut self) -> Option<Self::Item> {
591        if self.current_index >= self.intvec.len {
592            return None;
593        }
594
595        match C::decode(&mut self.reader, self.intvec.codec_param) {
596            Ok(value) => {
597                self.current_index += 1;
598                Some(value)
599            }
600            Err(_) => None,
601        }
602    }
603}