vortex_sampling_compressor/
lib.rs

1use std::sync::{Arc, LazyLock};
2
3use compressors::bitpacked::BITPACK_WITH_PATCHES;
4use compressors::chunked::DEFAULT_CHUNKED_COMPRESSOR;
5use compressors::constant::ConstantCompressor;
6use compressors::delta::DeltaCompressor;
7use compressors::fsst::FSSTCompressor;
8use compressors::struct_::StructCompressor;
9use compressors::varbin::VarBinCompressor;
10use compressors::{CompressedArray, CompressorRef};
11use vortex_alp::{ALPEncoding, ALPRDEncoding};
12use vortex_array::array::{
13    BoolEncoding, ChunkedEncoding, ConstantEncoding, ListEncoding, NullEncoding, PrimitiveEncoding,
14    StructEncoding, VarBinEncoding, VarBinViewEncoding,
15};
16use vortex_array::{Context, ContextRef};
17use vortex_bytebool::ByteBoolEncoding;
18use vortex_datetime_parts::DateTimePartsEncoding;
19use vortex_dict::DictEncoding;
20use vortex_fastlanes::{BitPackedEncoding, DeltaEncoding, FoREncoding};
21use vortex_fsst::FSSTEncoding;
22use vortex_runend::RunEndEncoding;
23use vortex_zigzag::ZigZagEncoding;
24
25use crate::compressors::alp::ALPCompressor;
26use crate::compressors::date_time_parts::DateTimePartsCompressor;
27use crate::compressors::dict::DictCompressor;
28use crate::compressors::list::ListCompressor;
29use crate::compressors::r#for::FoRCompressor;
30use crate::compressors::runend::DEFAULT_RUN_END_COMPRESSOR;
31use crate::compressors::sparse::SparseCompressor;
32use crate::compressors::zigzag::ZigZagCompressor;
33
34#[cfg(feature = "arbitrary")]
35pub mod arbitrary;
36pub mod compressors;
37mod constants;
38mod downscale;
39mod sampling;
40mod sampling_compressor;
41
42pub use sampling_compressor::*;
43use vortex_sparse::SparseEncoding;
44
45use crate::compressors::alp_rd::ALPRDCompressor;
46
47pub const DEFAULT_COMPRESSORS: [CompressorRef; 15] = [
48    &ALPCompressor as CompressorRef,
49    &ALPRDCompressor,
50    &BITPACK_WITH_PATCHES,
51    &DEFAULT_CHUNKED_COMPRESSOR,
52    &ConstantCompressor,
53    &DateTimePartsCompressor,
54    // &DeltaCompressor,
55    &DictCompressor,
56    &FoRCompressor,
57    &FSSTCompressor,
58    &DEFAULT_RUN_END_COMPRESSOR,
59    &SparseCompressor,
60    &StructCompressor,
61    &ListCompressor,
62    &VarBinCompressor,
63    &ZigZagCompressor,
64];
65
66pub const ALL_COMPRESSORS: [CompressorRef; 16] = [
67    &ALPCompressor as CompressorRef,
68    &ALPRDCompressor,
69    &BITPACK_WITH_PATCHES,
70    &DEFAULT_CHUNKED_COMPRESSOR,
71    &ConstantCompressor,
72    &DateTimePartsCompressor,
73    &DeltaCompressor,
74    &DictCompressor,
75    &FoRCompressor,
76    &FSSTCompressor,
77    &ListCompressor,
78    &DEFAULT_RUN_END_COMPRESSOR,
79    &SparseCompressor,
80    &StructCompressor,
81    &VarBinCompressor,
82    &ZigZagCompressor,
83];
84
85pub static ALL_ENCODINGS_CONTEXT: LazyLock<ContextRef> = LazyLock::new(|| {
86    Arc::new(Context::default().with_encodings([
87        ALPEncoding::vtable(),
88        ALPRDEncoding::vtable(),
89        BitPackedEncoding::vtable(),
90        BoolEncoding::vtable(),
91        ByteBoolEncoding::vtable(),
92        ChunkedEncoding::vtable(),
93        ConstantEncoding::vtable(),
94        DateTimePartsEncoding::vtable(),
95        DeltaEncoding::vtable(),
96        DictEncoding::vtable(),
97        FoREncoding::vtable(),
98        FSSTEncoding::vtable(),
99        ListEncoding::vtable(),
100        NullEncoding::vtable(),
101        PrimitiveEncoding::vtable(),
102        RunEndEncoding::vtable(),
103        SparseEncoding::vtable(),
104        StructEncoding::vtable(),
105        VarBinEncoding::vtable(),
106        VarBinViewEncoding::vtable(),
107        ZigZagEncoding::vtable(),
108    ]))
109});
110
/// The quantity a compression result is scored against.
///
/// See `Objective::evaluate` for how each variant turns a compressed array into a
/// score.
#[derive(Debug, Clone)]
pub enum Objective {
    /// Score by compressed size relative to the base (uncompressed) size.
    MinSize,
}
115
116impl Objective {
117    pub fn starting_value(&self) -> f64 {
118        1.0
119    }
120
121    pub fn evaluate(
122        array: &CompressedArray,
123        base_size_bytes: usize,
124        config: &CompressConfig,
125    ) -> f64 {
126        match &config.objective {
127            Objective::MinSize => (array.nbytes() as f64) / (base_size_bytes as f64),
128        }
129    }
130}
131
132#[derive(Debug, Clone)]
133pub struct CompressConfig {
134    /// Size of each sample slice
135    sample_size: u16,
136    /// Number of sample slices
137    sample_count: u16,
138    /// Random number generator seed
139    rng_seed: u64,
140
141    // Maximum depth of compression tree
142    max_cost: u8,
143    // Are we minimizing size or maximizing performance?
144    objective: Objective,
145
146    // Target chunk size in bytes
147    target_block_bytesize: usize,
148    // Target chunk size in row count
149    target_block_size: usize,
150}
151
152impl CompressConfig {
153    pub fn with_sample_size(mut self, sample_size: u16) -> Self {
154        self.sample_size = sample_size;
155        self
156    }
157
158    pub fn with_sample_count(mut self, sample_count: u16) -> Self {
159        self.sample_count = sample_count;
160        self
161    }
162}
163
164impl Default for CompressConfig {
165    fn default() -> Self {
166        let kib = 1 << 10;
167        let mib = 1 << 20;
168        Self {
169            // Sample length should always be multiple of 1024
170            sample_size: 64,
171            sample_count: 16,
172            max_cost: constants::DEFAULT_MAX_COST,
173            objective: Objective::MinSize,
174            target_block_bytesize: 16 * mib,
175            target_block_size: 64 * kib,
176            rng_seed: 0,
177        }
178    }
179}