compressed-intvec 0.6.0

Space-efficient integer vectors with fixed-width, variable-length, and sequence-oriented encodings.
Documentation
use compressed_intvec::prelude::*;
use criterion::{Criterion, Throughput, black_box, criterion_group, criterion_main};
use dsi_bitstream::{
    codes::{len_rice, len_zeta},
    utils::sample_implied_distribution,
};
use rand::{RngExt, SeedableRng, rngs::SmallRng};
use std::time::Duration;

#[cfg(feature = "parallel")]
use rayon::prelude::*;

/// Defines the data distributions for testing.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Distribution {
    UniformLow,
    UniformHigh,
    RiceImplied,
    ZetaImplied,
}

impl Distribution {
    /// Generates a vector of data according to the distribution.
    fn generate(&self, size: usize) -> Vec<u64> {
        match self {
            Distribution::UniformLow => {
                let mut rng = SmallRng::seed_from_u64(42);
                (0..size).map(|_| rng.random_range(0..1_000)).collect()
            }
            Distribution::UniformHigh => {
                let mut rng = SmallRng::seed_from_u64(42);
                (0..size).map(|_| rng.random_range(0..1 << 32)).collect()
            }
            Distribution::RiceImplied => {
                let mut rng = SmallRng::seed_from_u64(42);
                sample_implied_distribution(|v| len_rice(v, 4), &mut rng)
                    .take(size)
                    .collect()
            }
            Distribution::ZetaImplied => {
                let mut rng = SmallRng::seed_from_u64(42);
                sample_implied_distribution(|v| len_zeta(v, 3), &mut rng)
                    .take(size)
                    .collect()
            }
        }
    }

    fn name(&self) -> &'static str {
        match self {
            Distribution::UniformLow => "UniformLow",
            Distribution::UniformHigh => "UniformHigh",
            Distribution::RiceImplied => "RiceImplied",
            Distribution::ZetaImplied => "ZetaImplied",
        }
    }
}

fn benchmark_sequential_ops(c: &mut Criterion) {
    const VECTOR_SIZE: usize = 1_000_000;
    const K_VALUE: usize = 32; // A typical k value

    let distributions = [
        Distribution::UniformLow,
        Distribution::UniformHigh,
        Distribution::RiceImplied,
        Distribution::ZetaImplied,
    ];

    let codecs_to_test = [
        ("Gamma", Codec::Gamma),
        ("Delta", Codec::Delta),
        ("Unary", Codec::Unary),
        ("Rice", Codec::Rice { log2_b: None }),
        ("Zeta", Codec::Zeta { k: None }),
        ("Omega", Codec::Omega),
        ("VByteLe", Codec::VByteLe),
        ("VByteBe", Codec::VByteBe),
        ("Pi", Codec::Pi { k: Some(3) }),
        ("Golomb", Codec::Golomb { b: Some(8) }),
        ("ExpGolomb", Codec::ExpGolomb { k: Some(2) }),
    ];

    for distribution in distributions {
        let mut group = c.benchmark_group(format!("SequentialOps/{}", distribution.name()));
        group.throughput(Throughput::Elements(VECTOR_SIZE as u64));
        let data = distribution.generate(VECTOR_SIZE);

        // --- Baseline benchmark on the original Vec<u64> ---
        group.bench_function("Baseline/iter_sum", |b| {
            b.iter(|| {
                // The sum operation ensures the compiler cannot optimize away the loop.
                black_box(black_box(&data).iter().sum::<u64>());
            })
        });

        #[cfg(feature = "parallel")]
        group.bench_function("Baseline/par_iter_sum", |b| {
            b.iter(|| {
                black_box(black_box(&data).par_iter().sum::<u64>());
            })
        });

        // --- Benchmarks for each VarVec codec ---
        for (spec_name, codec_spec) in codecs_to_test {
            // Skip combinations known to be extremely slow or impractical.
            if (matches!(
                distribution,
                Distribution::UniformHigh | Distribution::ZetaImplied
            ) && matches!(
                codec_spec,
                Codec::Unary | Codec::Rice { .. } | Codec::Golomb { .. }
            )) || (matches!(distribution, Distribution::RiceImplied)
                && matches!(codec_spec, Codec::Unary))
            {
                println!(
                    "Skipping codec {} for {} distribution (impractical).",
                    spec_name,
                    distribution.name()
                );
                continue;
            }

            let intvec = LEVarVec::builder()
                .k(K_VALUE)
                .codec(codec_spec)
                .build(&data)
                .expect("Failed to build VarVec");

            // 1. Sequential Iteration
            group.bench_function(format!("{}/iter_sum", spec_name), |b| {
                b.iter(|| {
                    black_box(black_box(&intvec).iter().sum::<u64>());
                })
            });

            // 2. Parallel Iteration
            #[cfg(feature = "parallel")]
            group.bench_function(format!("{}/par_iter_sum", spec_name), |b| {
                b.iter(|| {
                    black_box(black_box(&intvec).par_iter().sum::<u64>());
                })
            });
        }
        group.finish();
    }
}

criterion_group! {
    name = benches;
    config = Criterion::default()
        .sample_size(20)
        .warm_up_time(Duration::from_millis(100))
        .measurement_time(Duration::from_secs(2));
    targets = benchmark_sequential_ops
}
criterion_main!(benches);