// vortex-btrblocks 0.67.0
//
// BtrBlocks-style compressor — documentation header (scrape residue, kept as a comment).
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

//! Compression scheme traits. This is the interface each encoding implements to participate in
//! compression.
//!
//! [`Scheme`] is the core trait. Each encoding (e.g. BitPacking, ALP, Dict) implements it with
//! two key methods: [`Scheme::expected_compression_ratio`] to estimate how well it compresses
//! the data, and [`Scheme::compress`] to apply the encoding. Type-specific sub-traits
//! ([`IntegerScheme`], [`FloatScheme`], [`StringScheme`]) bind schemes to the appropriate stats
//! and code types.
//!
//! [`SchemeExt`] provides the default ratio estimation strategy. It samples ~1% of the array
//! (minimum [`SAMPLE_SIZE`] values), compresses the sample, and returns the before/after byte
//! ratio. Schemes can override [`Scheme::expected_compression_ratio`] if they have a cheaper
//! heuristic.
//!
//! [`IntegerScheme`]: crate::compressor::integer::IntegerScheme
//! [`FloatScheme`]: crate::compressor::float::FloatScheme
//! [`StringScheme`]: crate::compressor::string::StringScheme
//! [`SAMPLE_SIZE`]: crate::stats::SAMPLE_SIZE

use std::fmt::Debug;
use std::hash::Hash;
use std::hash::Hasher;

use vortex_array::ArrayRef;
use vortex_error::VortexResult;

use crate::BtrBlocksCompressor;
use crate::CompressorContext;
use crate::CompressorStats;
use crate::sample::sample_count_approx_one_percent;
use crate::stats::SAMPLE_SIZE;

/// Top-level compression scheme trait.
///
/// Variants are specialized for each data type, e.g. see `IntegerScheme`, `FloatScheme`, etc.
pub trait Scheme: Debug {
    /// Type of the stats generated by the compression scheme.
    ///
    /// Sampling-based estimation relies on this type's `source()`, `sample()` and
    /// `clone()` operations (see `SchemeExt`).
    type StatsType: CompressorStats;
    /// Type of the code used to uniquely identify the compression scheme.
    ///
    /// The bounds (`Copy + Eq + Hash + Ord`) allow codes to be used as cheap set/map
    /// keys and to appear in the `excludes` slices passed around during compression.
    type CodeType: Copy + Eq + Hash + Ord;

    /// Scheme unique identifier.
    ///
    /// Equality and hashing of `dyn Scheme` trait objects are defined purely in
    /// terms of this code (see the `PartialEq`/`Hash` impls for `dyn Scheme`).
    fn code(&self) -> Self::CodeType;

    /// True if this is the singular Constant scheme for this data type.
    ///
    /// Defaults to `false`; a data type's Constant scheme overrides this.
    fn is_constant(&self) -> bool {
        false
    }

    /// Estimate the compression ratio for running this scheme (and its children)
    /// for the given input.
    ///
    /// Depth is the depth in the encoding tree we've already reached before considering this
    /// scheme.
    ///
    /// Returns the estimated compression ratio as well as the tree of compressors to use.
    ///
    /// The default implementation delegates to the sampling-based estimator, which is
    /// available on `self` via the blanket `SchemeExt` impl at the bottom of this file.
    /// Schemes with a cheaper heuristic may override this.
    fn expected_compression_ratio(
        &self,
        compressor: &BtrBlocksCompressor,
        stats: &Self::StatsType,
        ctx: CompressorContext,
        excludes: &[Self::CodeType],
    ) -> VortexResult<f64> {
        self.estimate_compression_ratio_with_sampling(compressor, stats, ctx, excludes)
    }

    /// Compress the input with this scheme, yielding a new array.
    ///
    /// `excludes` lists scheme codes that must not be chosen for child encodings
    /// (e.g. to avoid recursing into the same scheme — NOTE(review): inferred from
    /// usage pattern, confirm against the per-type implementations).
    fn compress(
        &self,
        compressor: &BtrBlocksCompressor,
        stats: &Self::StatsType,
        ctx: CompressorContext,
        excludes: &[Self::CodeType],
    ) -> VortexResult<ArrayRef>;
}

// `dyn Scheme` trait objects are identified entirely by their code: two schemes
// compare equal — and hash identically — iff `code()` returns the same value.
impl<Code: Copy + Eq + Hash + Ord, Stats: CompressorStats> PartialEq
    for dyn Scheme<CodeType = Code, StatsType = Stats>
{
    fn eq(&self, rhs: &Self) -> bool {
        self.code() == rhs.code()
    }
}

impl<Code: Copy + Eq + Hash + Ord, Stats: CompressorStats> Eq
    for dyn Scheme<CodeType = Code, StatsType = Stats>
{
}

impl<Code: Copy + Eq + Hash + Ord, Stats: CompressorStats> Hash
    for dyn Scheme<CodeType = Code, StatsType = Stats>
{
    fn hash<H: Hasher>(&self, hasher: &mut H) {
        // Hash only the code, keeping `Hash` consistent with `PartialEq`.
        self.code().hash(hasher)
    }
}

/// Extension trait providing sampling-based compression ratio estimation for schemes.
/// Extension trait providing sampling-based compression ratio estimation for schemes.
pub trait SchemeExt: Scheme {
    /// Estimates the compression ratio by compressing a small sample of the data.
    ///
    /// Unless `ctx` already refers to a sample, roughly 1% of the source is drawn
    /// (as `sample_count` chunks of [`SAMPLE_SIZE`] values) and compressed with this
    /// scheme; the estimate is uncompressed-bytes / compressed-bytes of that sample.
    fn estimate_compression_ratio_with_sampling(
        &self,
        btr_blocks_compressor: &BtrBlocksCompressor,
        stats: &Self::StatsType,
        ctx: CompressorContext,
        excludes: &[Self::CodeType],
    ) -> VortexResult<f64> {
        // Don't re-sample data that is itself already a sample.
        let sampled_stats = if ctx.is_sample {
            stats.clone()
        } else {
            let len = stats.source().len();
            let chunks = sample_count_approx_one_percent(len);

            tracing::trace!(
                "Sampling {} values out of {}",
                SAMPLE_SIZE as u64 * chunks as u64,
                len
            );

            stats.sample(SAMPLE_SIZE, chunks)
        };

        // Compress the sample, then compare byte sizes before/after.
        let compressed = self
            .compress(btr_blocks_compressor, &sampled_stats, ctx.as_sample(), excludes)?
            .nbytes();
        let uncompressed = sampled_stats.source().nbytes();
        let ratio = uncompressed as f64 / compressed as f64;

        tracing::debug!(
            "estimate_compression_ratio_with_sampling(compressor={self:#?} ctx={ctx:?}) = {}",
            ratio
        );

        Ok(ratio)
    }
}

// Blanket implementation for all Scheme types. `?Sized` means `dyn Scheme` trait
// objects are covered too, and this impl is what lets the default body of
// `Scheme::expected_compression_ratio` call the sampling estimator on `self`.
impl<T: Scheme + ?Sized> SchemeExt for T {}