compressed_intvec/intvec.rs
1//! # Compressed IntVec Module
2//!
3//! This module delivers an efficient compressed vector for storing sequences of unsigned 64-bit integers.
4//! By leveraging bit-level encoding, it minimizes storage space while supporting fast random access.
5//!
6//! ## Overview
7//!
8//! The principal data structure, [`IntVec`], encapsulates a compressed bitstream along with sampling offsets.
9//! These offsets enable fast, random access to individual elements without decompressing the entire stream.
10//! The module offers two variants based on endianness:
11//!
12//! - **Big-Endian** ([`BEIntVec`])
13//! - **Little-Endian** ([`LEIntVec`])
14//!
15//! Both variants operate with codecs that implement the [`Codec`] trait, allowing for flexible and configurable
16//! encoding/decoding strategies. Some codecs even accept extra runtime parameters to fine-tune the compression.
17//!
18//! > **Note:** The [`IntVec`] structure is generic and you are not supposed to interact with it directly. Use the two endianness-specific types instead.
19//!
20//! ## Key Features
21//!
22//! - **Compact Storage:** Compresses sequences of integers into a streamlined bitstream.
23//! - **Fast Random Access:** Employs periodic sampling (every *k*-th element) to quickly locate individual elements.
24//! - **Flexible Codec Integration:** Compatible with any codec conforming to the [`Codec`] trait.
25//! - **Endianness Options:** Provides both big-endian and little-endian formats to suit various interoperability needs.
26//!
27//! ## Components
28//! - **[`IntVec`]:** The core structure holding compressed data, sampling offsets, codec parameters, and metadata.
29//! Direct interaction with this structure is not permitted, and you should use the endianness-specific types instead.
30//! - **[`BEIntVec`] / [`LEIntVec`]:** Type aliases for the big-endian and little-endian implementations of [`IntVec`].
31//! - **Iterators:** [`BEIntVecIter`] and [`LEIntVecIter`] facilitate on-the-fly decoding as you iterate through the vector.
32//!
33//! ## Usage Examples
34//!
35//! ### Creating a Big-Endian Compressed Vector
36//!
37//! ```rust
38//! use compressed_intvec::intvec::BEIntVec;
39//! use compressed_intvec::codecs::ExpGolombCodec;
40//!
41//! // Define a vector of unsigned 64-bit integers.
42//! let input = vec![1, 5, 3, 1991, 42];
43//!
44//! // Create a Big-Endian compressed vector using ExpGolombCodec with an extra codec parameter (e.g., 3)
45//! // and sample every 2 elements.
46//! let intvec = BEIntVec::<ExpGolombCodec>::from_with_param(&input, 2, 3).unwrap();
47//!
48//! // Retrieve a specific element by its index.
49//! let value = intvec.get(3);
50//! assert_eq!(value, 1991);
51//!
52//! // Decompress the entire structure back into a standard vector.
53//! let decoded = intvec.into_vec();
54//! assert_eq!(decoded, input);
55//! ```
56//!
57//! ### Creating a Little-Endian Compressed Vector
58//!
59//! ```rust
60//! use compressed_intvec::intvec::LEIntVec;
61//! use compressed_intvec::codecs::GammaCodec;
62//!
63//! // Define a vector of unsigned 64-bit integers.
64//! let input = vec![10, 20, 30, 40, 50];
65//!
66//! // Create a Little-Endian compressed vector using GammaCodec without additional codec parameters,
67//! // sampling every 2 elements.
68//! let intvec = LEIntVec::<GammaCodec>::from(&input, 2).unwrap();
69//!
70//! // Verify that random access works correctly.
71//! assert_eq!(intvec.get(2), 30);
72//! ```
73//!
74//! ## Design Considerations
75//!
76//! - **Bitstream Representation:** The compressed data is stored as a vector of 64-bit words ([`Vec<u64>`]).
77//! - **Sampling Strategy:** To ensure fast random access, sampling offsets are recorded
78//! for every *k*-th integer.
79//! - **Codec Abstraction:** The module is codec-agnostic; any codec that implements the [`Codec`] trait may be employed.
80//! - **Endianness Management:** Endianness is seamlessly handled via phantom types, supporting both big-endian and
81//! little-endian variants without affecting performance.
82
83use crate::codecs::Codec;
84use dsi_bitstream::prelude::*;
85use mem_dbg::{MemDbg, MemSize};
86use std::marker::PhantomData;
87
88/// A compressed vector of integers.
89///
90/// The [`IntVec`] stores values in a compressed bitstream along with sample offsets for
91/// fast random access. The type is generic over an endianness (`E`) and codec (`C`).
92///
93/// # Type Parameters
94///
95/// - `E`: Endianness marker (e.g. [`BE`] or [`LE`]).
96/// - `C`: The codec used for compression. Must implement [`Codec<E, MyBitWrite<E>, MyBitRead<E>>`].
97///
98/// # Fields
99///
100/// - `data`: The compressed bitstream data.
101/// - `samples`: Offsets (in bits) into `data` for every k-th value, used to jump-start decoding.
102/// - `k`: The sampling period.
103/// - `len`: The total number of integers stored.
104/// - `codec_param`: Extra parameters for `C`.
105///
106/// # Examples
107///
108/// ```
109/// use compressed_intvec::intvec::BEIntVec;
110/// use compressed_intvec::codecs::GammaCodec;
111///
112/// // Create a compressed vector using a codec without extra parameters.
113/// let input = vec![1, 2, 3, 4, 5];
114/// let intvec = BEIntVec::<GammaCodec>::from(&input, 2).unwrap();
115/// let value = intvec.get(3);
116/// assert_eq!(value, 4);
117/// assert_eq!(intvec.len(), 5);
118/// ```
119#[derive(Debug, Clone, MemDbg, MemSize)]
120#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
121#[cfg_attr(
122 feature = "serde",
123 serde(bound(
124 serialize = "C::Params: Serialize",
125 deserialize = "C::Params: for<'a> Deserialize<'a>"
126 ))
127)]
128pub struct IntVec<E: Endianness, W: BitWrite<E>, C: Codec<E, W>> {
129 data: Vec<u64>,
130 samples: Vec<usize>,
131 codec: PhantomData<C>,
132 k: usize,
133 len: usize,
134 codec_param: C::Params,
135 endian: PhantomData<E>,
136}
137/// Big-endian variant of `IntVec`.
138pub type BEIntVec<C> = IntVec<BE, BufBitWriter<BE, MemWordWriterVec<u64, Vec<u64>>>, C>;
139
140impl<C: Codec<BE, BufBitWriter<BE, MemWordWriterVec<u64, Vec<u64>>>>> BEIntVec<C>
141where
142 C::Params: Copy,
143{
144 /// Creates a new [`BEIntVec`] from a vector of unsigned 64-bit integers.
145 ///
146 /// Values are encoded with the specified codec parameter.
147 ///
148 /// # Arguments
149 ///
150 /// - `input`: The values to be compressed.
151 /// - `k`: The sampling rate (every k-th value is stored as a sample).
152 /// - `codec_param`: Parameters for the codec.
153 ///
154 /// # Examples
155 ///
156 /// ```
157 /// use compressed_intvec::intvec::BEIntVec;
158 /// use compressed_intvec::codecs::ExpGolombCodec;
159 ///
160 /// let input = vec![1, 5, 3, 1991, 42];
161 /// let intvec = BEIntVec::<ExpGolombCodec>::from_with_param(&input, 2, 3).unwrap();
162 ///
163 /// let value = intvec.get(3);
164 /// assert_eq!(value, 1991);
165 /// ```
166 pub fn from_with_param(
167 input: &[u64],
168 k: usize,
169 codec_param: C::Params,
170 ) -> Result<Self, Box<dyn std::error::Error>> {
171 let word_writer = MemWordWriterVec::new(Vec::new());
172 let mut writer = BufBitWriter::<BE, MemWordWriterVec<u64, Vec<u64>>>::new(word_writer);
173 let mut samples = Vec::new();
174 let mut total_bits = 0;
175
176 for (i, &x) in input.iter().enumerate() {
177 if i % k == 0 {
178 samples.push(total_bits);
179 }
180 total_bits += C::encode(&mut writer, x, codec_param)?;
181 }
182
183 writer.flush()?;
184 let data = writer.into_inner()?.into_inner();
185
186 Ok(IntVec {
187 data,
188 samples,
189 codec: PhantomData,
190 k,
191 len: input.len(),
192 codec_param,
193 endian: PhantomData,
194 })
195 }
196
197 /// Retrieves the value at the given index.
198 ///
199 /// Panics if the index is out of bounds.
200 ///
201 /// # Examples
202 ///
203 /// ```
204 /// use compressed_intvec::intvec::BEIntVec;
205 /// use compressed_intvec::codecs::GammaCodec;
206 ///
207 /// let input = vec![1, 5, 3, 12, 42];
208 /// let intvec = BEIntVec::<GammaCodec>::from(&input, 2).unwrap();
209 /// let value = intvec.get(3);
210 /// assert_eq!(value, 12);
211 /// ```
212 ///
213 #[inline(always)]
214 pub fn get(&self, index: usize) -> u64 {
215 if index >= self.len {
216 panic!("Index {} is out of bounds", index);
217 }
218
219 let sample_index = index / self.k;
220 let start_bit = self.samples[sample_index];
221 let mut reader =
222 BufBitReader::<BE, MemWordReader<u64, &Vec<u64>>>::new(MemWordReader::new(&self.data));
223
224 reader.set_bit_pos(start_bit as u64).unwrap();
225
226 let mut value = 0;
227 let start_index = sample_index * self.k;
228 for _ in start_index..=index {
229 value = C::decode(&mut reader, self.codec_param).unwrap();
230 }
231 value
232 }
233
234 /// Returns the original vector of integers.
235 ///
236 /// This operation is expensive as it requires decoding the entire bitstream.
237 ///
238 /// # Examples
239 ///
240 /// ```
241 /// use compressed_intvec::intvec::BEIntVec;
242 /// use compressed_intvec::codecs::GammaCodec;
243 ///
244 /// let input = vec![43, 12, 5, 1991, 42];
245 /// let intvec = BEIntVec::<GammaCodec>::from(&input, 2).unwrap();
246 /// let values = intvec.into_vec();
247 /// assert_eq!(values, input);
248 /// ```
249 ///
250 pub fn into_vec(self) -> Vec<u64> {
251 let word_reader = MemWordReader::new(&self.data);
252 let mut reader = BufBitReader::<BE, MemWordReader<u64, &Vec<u64>>>::new(word_reader);
253 let mut values = Vec::with_capacity(self.len);
254
255 for _ in 0..self.len {
256 values.push(C::decode(&mut reader, self.codec_param).unwrap());
257 }
258
259 values
260 }
261
262 /// Returns an iterator over the decompressed integer values stored in this compressed vector.
263 ///
264 /// The iterator decodes values on the fly and does not modify the underlying data.
265 ///
266 /// # Example
267 ///
268 /// ```rust
269 /// use compressed_intvec::intvec::BEIntVec;
270 /// use compressed_intvec::codecs::GammaCodec;
271 ///
272 /// let input = vec![1, 5, 3, 12, 42];
273 /// let intvec = BEIntVec::<GammaCodec>::from(&input, 2).unwrap();
274 ///
275 /// // Iterate over the vector and print each value.
276 /// for (i, value) in intvec.iter().enumerate() {
277 /// assert_eq!(value, input[i]);
278 /// }
279 /// ```
280 pub fn iter(&self) -> BEIntVecIter<C> {
281 let word_reader = MemWordReader::new(&self.data);
282 let reader = BufBitReader::new(word_reader);
283 BEIntVecIter {
284 intvec: self,
285 reader,
286 current_index: 0,
287 }
288 }
289
290 /// Returns a clone of the internal bitstream data as a vector of 64-bit unsigned integers.
291 ///
292 /// This can be used for debugging or low-level operations where access to the raw
293 /// compressed limb data is required.
294 pub fn limbs(&self) -> Vec<u64> {
295 self.data.clone()
296 }
297
298 /// Returns the number of integers stored in the compressed vector.
299 ///
300 /// This value represents the total count of decompressed integers.
301 pub fn len(&self) -> usize {
302 self.len
303 }
304
305 /// Returns the sampling rate used by the compressed vector.
306 pub fn get_sampling_rate(&self) -> usize {
307 self.k
308 }
309
310 /// Returns the codec parameter used by the compressed vector.
311 pub fn get_samples(&self) -> Vec<usize> {
312 self.samples.clone()
313 }
314
315 /// Checks whether the compressed vector contains no elements.
316 ///
317 /// Returns `true` if the vector is empty, and `false` otherwise.
318 pub fn is_empty(&self) -> bool {
319 self.len == 0
320 }
321}
322
323/// Convenience constructor for codecs with no extra runtime parameter.
324impl<C: Codec<BE, BufBitWriter<BE, MemWordWriterVec<u64, Vec<u64>>>, Params = ()>> BEIntVec<C> {
325 pub fn from(input: &[u64], k: usize) -> Result<Self, Box<dyn std::error::Error>> {
326 Self::from_with_param(input, k, ())
327 }
328}
329
330/// Iterator over the values stored in a [`BEIntVec`].
331/// The iterator decodes values on the fly.
332///
333/// # Examples
334///
335/// ```rust
336/// use compressed_intvec::intvec::BEIntVec;
337/// use compressed_intvec::codecs::GammaCodec;
338///
339/// // Create a big-endian compressed vector using GammaCodec.
340/// let input = vec![1, 5, 3, 1991, 42];
341/// let intvec = BEIntVec::<GammaCodec>::from(&input, 2).unwrap();
342///
343/// // Iterate over the compressed vector.
344/// for (i, value) in intvec.iter().enumerate() {
345/// assert_eq!(value, input[i]);
346/// }
347/// ```
348pub struct BEIntVecIter<'a, C>
349where
350 C: Codec<BE, BufBitWriter<BE, MemWordWriterVec<u64, Vec<u64>>>>,
351{
352 intvec: &'a BEIntVec<C>,
353 reader: BufBitReader<BE, MemWordReader<u64, &'a Vec<u64>>>,
354 current_index: usize,
355}
356
357impl<C> Iterator for BEIntVecIter<'_, C>
358where
359 C: Codec<BE, BufBitWriter<BE, MemWordWriterVec<u64, Vec<u64>>>>,
360 C::Params: Copy,
361{
362 type Item = u64;
363
364 fn next(&mut self) -> Option<Self::Item> {
365 if self.current_index >= self.intvec.len {
366 return None;
367 }
368
369 match C::decode(&mut self.reader, self.intvec.codec_param) {
370 Ok(value) => {
371 self.current_index += 1;
372 Some(value)
373 }
374 Err(_) => None,
375 }
376 }
377}
378
379/// Little-endian variant of `IntVec`.
380pub type LEIntVec<C> = IntVec<LE, BufBitWriter<LE, MemWordWriterVec<u64, Vec<u64>>>, C>;
381
382impl<C: Codec<LE, BufBitWriter<LE, MemWordWriterVec<u64, Vec<u64>>>>> LEIntVec<C>
383where
384 C: Codec<LE, BufBitWriter<LE, MemWordWriterVec<u64, Vec<u64>>>>,
385 C::Params: Copy,
386{
387 /// Creates a new [`LEIntVec`] from a vector of unsigned 64-bit integers.
388 ///
389 /// Values are encoded with the specified codec parameter.
390 ///
391 /// # Arguments
392 ///
393 /// - `input`: The values to be compressed.
394 /// - `k`: The sampling rate (every k-th value is stored as a sample).
395 /// - `codec_param`: Parameters for the codec.
396 ///
397 /// # Examples
398 ///
399 /// ```
400 /// use compressed_intvec::intvec::LEIntVec;
401 /// use compressed_intvec::codecs::ExpGolombCodec;
402 ///
403 /// let input = vec![1, 5, 3, 1991, 42];
404 /// let intvec = LEIntVec::<ExpGolombCodec>::from_with_param(&input, 2, 3).unwrap();
405 ///
406 /// let value = intvec.get(3);
407 /// assert_eq!(value, 1991);
408 /// ```
409 pub fn from_with_param(
410 input: &[u64],
411 k: usize,
412 codec_param: C::Params,
413 ) -> Result<Self, Box<dyn std::error::Error>> {
414 let word_writer = MemWordWriterVec::new(Vec::new());
415 let mut writer = BufBitWriter::<LE, MemWordWriterVec<u64, Vec<u64>>>::new(word_writer);
416 let mut samples = Vec::new();
417 let mut total_bits = 0;
418
419 for (i, &x) in input.iter().enumerate() {
420 if i % k == 0 {
421 samples.push(total_bits);
422 }
423 total_bits += C::encode(&mut writer, x, codec_param).unwrap();
424 }
425
426 writer.flush().unwrap();
427 let data = writer.into_inner().unwrap().into_inner();
428
429 Ok(IntVec {
430 data,
431 samples,
432 codec: PhantomData,
433 k,
434 len: input.len(),
435 codec_param,
436 endian: PhantomData,
437 })
438 }
439 /// Retrieves the value at the given index.
440 ///
441 /// Returns `None` if the index is out of bounds.
442 ///
443 /// # Examples
444 ///
445 /// ```
446 /// use compressed_intvec::intvec::LEIntVec;
447 /// use compressed_intvec::codecs::GammaCodec;
448 ///
449 /// let input = vec![1, 5, 3, 1991, 42];
450 /// let intvec = LEIntVec::<GammaCodec>::from(&input, 2).unwrap();
451 /// let value = intvec.get(3);
452 /// assert_eq!(value, 1991);
453 /// ```
454 ///
455 #[inline(always)]
456 pub fn get(&self, index: usize) -> u64 {
457 if index >= self.len {
458 panic!("Index {} is out of bounds", index);
459 }
460
461 let sample_index = index / self.k; // this is an integer division
462 let start_bit = self.samples[sample_index];
463 let mut reader =
464 BufBitReader::<LE, MemWordReader<u64, &Vec<u64>>>::new(MemWordReader::new(&self.data));
465
466 reader.set_bit_pos(start_bit as u64).unwrap();
467
468 let mut value = 0;
469 let start_index = sample_index * self.k;
470 for _ in start_index..=index {
471 value = C::decode(&mut reader, self.codec_param).unwrap();
472 }
473 value
474 }
475
476 /// Returns the original vector of integers.
477 ///
478 /// This operation is expensive as it requires decoding the entire bitstream.
479 ///
480 /// # Examples
481 ///
482 /// ```
483 /// use compressed_intvec::intvec::LEIntVec;
484 /// use compressed_intvec::codecs::GammaCodec;
485 ///
486 /// let input = vec![43, 12, 5, 1991, 42];
487 /// let intvec = LEIntVec::<GammaCodec>::from(&input, 2).unwrap();
488 /// let values = intvec.into_vec();
489 /// assert_eq!(values, input);
490 /// ```
491 ///
492 pub fn into_vec(self) -> Vec<u64> {
493 let word_reader = MemWordReader::new(&self.data);
494 let mut reader = BufBitReader::<LE, MemWordReader<u64, &Vec<u64>>>::new(word_reader);
495 let mut values = Vec::with_capacity(self.len);
496
497 for _ in 0..self.len {
498 values.push(C::decode(&mut reader, self.codec_param).unwrap());
499 }
500
501 values
502 }
503
504 /// Returns an iterator over the values stored in the vector.
505 pub fn iter(&self) -> LEIntVecIter<C> {
506 let word_reader = MemWordReader::new(&self.data);
507 let reader = BufBitReader::new(word_reader);
508 LEIntVecIter {
509 intvec: self,
510 reader,
511 current_index: 0,
512 }
513 }
514
515 /// Returns a clone of the internal bitstream data as a vector of 64-bit unsigned integers.
516 ///
517 /// This can be used for debugging or low-level operations where access to the raw
518 /// compressed limb data is required.
519 pub fn limbs(&self) -> Vec<u64> {
520 self.data.clone()
521 }
522
523 /// Returns the number of integers stored in the compressed vector.
524 ///
525 /// This value represents the total count of decompressed integers.
526 pub fn len(&self) -> usize {
527 self.len
528 }
529
530 /// Returns the sampling rate used by the compressed vector.
531 pub fn get_sampling_rate(&self) -> usize {
532 self.k
533 }
534
535 /// Returns the codec parameter used by the compressed vector.
536 pub fn get_samples(&self) -> Vec<usize> {
537 self.samples.clone()
538 }
539
540 /// Checks whether the compressed vector contains no elements.
541 ///
542 /// Returns `true` if the vector is empty, and `false` otherwise.
543 pub fn is_empty(&self) -> bool {
544 self.len == 0
545 }
546}
547
548/// Convenience constructor for codecs with no extra runtime parameter.
549impl<C: Codec<LE, BufBitWriter<LE, MemWordWriterVec<u64, Vec<u64>>>, Params = ()>> LEIntVec<C> {
550 pub fn from(input: &[u64], k: usize) -> Result<Self, Box<dyn std::error::Error>> {
551 Self::from_with_param(input, k, ())
552 }
553}
554
555/// Iterator over the values stored in a [`LEIntVec`].
556///
557/// This iterator decodes values on the fly using a bitāstream reader.
558/// # Examples
559///
560/// ```rust
561/// use compressed_intvec::intvec::LEIntVec;
562/// use compressed_intvec::codecs::GammaCodec;
563///
564/// // Create a little-endian compressed vector using GammaCodec.
565/// // Note: Ensure your codec implements the required traits for iteration.
566/// let input = vec![10, 20, 30, 40, 50];
567/// let intvec = LEIntVec::<GammaCodec>::from(&input, 2).unwrap();
568///
569/// // Iterate over the compressed vector.
570/// for (i, value) in intvec.iter().enumerate() {
571/// assert_eq!(value, input[i]);
572/// }
573/// ```
574pub struct LEIntVecIter<'a, C>
575where
576 C: Codec<LE, BufBitWriter<LE, MemWordWriterVec<u64, Vec<u64>>>>,
577{
578 intvec: &'a LEIntVec<C>,
579 reader: BufBitReader<LE, MemWordReader<u64, &'a Vec<u64>>>,
580 current_index: usize,
581}
582
583impl<C> Iterator for LEIntVecIter<'_, C>
584where
585 C: Codec<LE, BufBitWriter<LE, MemWordWriterVec<u64, Vec<u64>>>>,
586 C::Params: Copy,
587{
588 type Item = u64;
589
590 fn next(&mut self) -> Option<Self::Item> {
591 if self.current_index >= self.intvec.len {
592 return None;
593 }
594
595 match C::decode(&mut self.reader, self.intvec.codec_param) {
596 Ok(value) => {
597 self.current_index += 1;
598 Some(value)
599 }
600 Err(_) => None,
601 }
602 }
603}