Skip to main content

vortex_btrblocks/
builder.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Builder for configuring `BtrBlocksCompressor` instances.
5
6use vortex_utils::aliases::hash_set::HashSet;
7
8use crate::BtrBlocksCompressor;
9use crate::CascadingCompressor;
10use crate::Scheme;
11use crate::SchemeExt;
12use crate::SchemeId;
13use crate::schemes::binary;
14use crate::schemes::bool;
15use crate::schemes::decimal;
16use crate::schemes::float;
17use crate::schemes::integer;
18use crate::schemes::string;
19use crate::schemes::temporal;
20
21/// All available compression schemes.
22///
23/// This list is order-sensitive: the builder preserves this order when constructing
24/// the final scheme list, so that tie-breaking is deterministic.
25pub const ALL_SCHEMES: &[&dyn Scheme] = &[
26    ////////////////////////////////////////////////////////////////////////////////////////////////
27    // Bool schemes.
28    ////////////////////////////////////////////////////////////////////////////////////////////////
29    &bool::BoolConstantScheme,
30    ////////////////////////////////////////////////////////////////////////////////////////////////
31    // Integer schemes.
32    ////////////////////////////////////////////////////////////////////////////////////////////////
33    &integer::IntConstantScheme,
34    // NOTE: FoR must precede BitPacking to avoid unnecessary patches.
35    &integer::FoRScheme,
36    // NOTE: ZigZag should precede BitPacking because we don't want negative numbers.
37    &integer::ZigZagScheme,
38    &integer::BitPackingScheme,
39    &integer::SparseScheme,
40    &integer::IntDictScheme,
41    &integer::RunEndScheme,
42    &integer::SequenceScheme,
43    &integer::IntRLEScheme,
44    ////////////////////////////////////////////////////////////////////////////////////////////////
45    // Float schemes.
46    ////////////////////////////////////////////////////////////////////////////////////////////////
47    &float::FloatConstantScheme,
48    &float::ALPScheme,
49    &float::ALPRDScheme,
50    &float::FloatDictScheme,
51    &float::NullDominatedSparseScheme,
52    &float::FloatRLEScheme,
53    ////////////////////////////////////////////////////////////////////////////////////////////////
54    // String schemes.
55    ////////////////////////////////////////////////////////////////////////////////////////////////
56    &string::StringDictScheme,
57    // Both string-fragmentation schemes are registered; the sample-based
58    // selector keeps whichever is smaller per column.
59    &string::FSSTScheme,
60    #[cfg(feature = "unstable_encodings")]
61    &string::OnPairScheme,
62    &string::StringConstantScheme,
63    &string::NullDominatedSparseScheme,
64    ////////////////////////////////////////////////////////////////////////////////////////////////
65    // Binary schemes.
66    ////////////////////////////////////////////////////////////////////////////////////////////////
67    &binary::BinaryDictScheme,
68    &binary::BinaryConstantScheme,
69    // Decimal schemes.
70    &decimal::DecimalScheme,
71    // Temporal schemes.
72    &temporal::TemporalScheme,
73];
74
75/// Builder for creating configured [`BtrBlocksCompressor`] instances.
76///
77/// By default, all schemes in [`ALL_SCHEMES`] are enabled. Feature-gated schemes (Pco, Zstd)
78/// are not in `ALL_SCHEMES` and must be added explicitly via
79/// [`with_scheme`](BtrBlocksCompressorBuilder::with_new_scheme) or
80/// [`with_compact`](BtrBlocksCompressorBuilder::with_compact).
81///
82/// # Examples
83///
84/// ```rust
85/// use vortex_btrblocks::{BtrBlocksCompressorBuilder, Scheme, SchemeExt};
86/// use vortex_btrblocks::schemes::integer::IntDictScheme;
87///
88/// // Default compressor with all schemes in ALL_SCHEMES.
89/// let compressor = BtrBlocksCompressorBuilder::default().build();
90///
91/// // Remove specific schemes.
92/// let compressor = BtrBlocksCompressorBuilder::default()
93///     .exclude_schemes([IntDictScheme.id()])
94///     .build();
95/// ```
96#[derive(Debug, Clone)]
97pub struct BtrBlocksCompressorBuilder {
98    schemes: Vec<&'static dyn Scheme>,
99}
100
101impl Default for BtrBlocksCompressorBuilder {
102    fn default() -> Self {
103        Self {
104            schemes: ALL_SCHEMES.to_vec(),
105        }
106    }
107}
108
109impl BtrBlocksCompressorBuilder {
110    /// Creates a builder with no schemes registered.
111    ///
112    /// Useful when the caller wants explicit, scheme-by-scheme control over the compressor.
113    pub fn empty() -> Self {
114        Self {
115            schemes: Vec::new(),
116        }
117    }
118
119    /// Adds an external compression scheme not in [`ALL_SCHEMES`].
120    ///
121    /// This allows encoding crates outside of `vortex-btrblocks` to register their own schemes
122    /// with the compressor.
123    ///
124    /// # Panics
125    ///
126    /// Panics if a scheme with the same [`SchemeId`] is already present.
127    pub fn with_new_scheme(mut self, scheme: &'static dyn Scheme) -> Self {
128        assert!(
129            !self.schemes.iter().any(|s| s.id() == scheme.id()),
130            "scheme {:?} is already present in the builder",
131            scheme.id(),
132        );
133
134        self.schemes.push(scheme);
135        self
136    }
137
138    /// Adds compact encoding schemes (Zstd for strings, Pco for numerics).
139    ///
140    /// This provides better compression ratios than the default, especially for floating-point
141    /// heavy datasets. Requires the `zstd` feature. When the `pco` feature is also enabled,
142    /// Pco schemes for integers and floats are included.
143    ///
144    /// # Panics
145    ///
146    /// Panics if any of the compact schemes are already present.
147    #[cfg(feature = "zstd")]
148    pub fn with_compact(self) -> Self {
149        let builder = self.with_new_scheme(&string::ZstdScheme);
150
151        #[cfg(feature = "pco")]
152        let builder = builder
153            .with_new_scheme(&integer::PcoScheme)
154            .with_new_scheme(&float::PcoScheme);
155
156        builder
157    }
158
159    /// Adds the TurboQuant lossy vector quantization scheme.
160    ///
161    /// When enabled, [`Vector`] extension arrays are compressed using the TurboQuant algorithm
162    /// with MSE-optimal scalar quantization.
163    ///
164    /// # Panics
165    ///
166    /// Panics if the TurboQuant scheme is already present.
167    ///
168    /// [`Vector`]: vortex_tensor::vector::Vector
169    #[cfg(feature = "unstable_encodings")]
170    pub fn with_turboquant(self) -> Self {
171        use vortex_tensor::encodings::turboquant::TurboQuantScheme;
172        self.with_new_scheme(&TurboQuantScheme)
173    }
174
175    /// Excludes schemes without CUDA kernel support and adds Zstd for string compression.
176    ///
177    /// With the `unstable_encodings` feature, buffer-level Zstd compression is used which
178    /// preserves the array buffer layout for zero-conversion GPU decompression. Without it,
179    /// interleaved Zstd compression is used.
180    pub fn only_cuda_compatible(self) -> Self {
181        // String fragmentation schemes (OnPair, FSST) require host-side
182        // dictionary expansion at decode time, which is incompatible with
183        // pure-GPU decompression paths. Strip whichever string-fragment
184        // scheme is enabled by feature.
185        #[cfg_attr(not(feature = "unstable_encodings"), allow(unused_mut))]
186        let mut excluded: Vec<SchemeId> = vec![
187            integer::SparseScheme.id(),
188            integer::IntRLEScheme.id(),
189            float::FloatRLEScheme.id(),
190            float::NullDominatedSparseScheme.id(),
191            string::StringDictScheme.id(),
192            string::FSSTScheme.id(),
193            binary::BinaryDictScheme.id(),
194        ];
195        #[cfg(feature = "unstable_encodings")]
196        excluded.push(string::OnPairScheme.id());
197        let builder = self.exclude_schemes(excluded);
198
199        #[cfg(all(feature = "zstd", feature = "unstable_encodings"))]
200        let builder = builder.with_new_scheme(&string::ZstdBuffersScheme);
201        #[cfg(all(feature = "zstd", not(feature = "unstable_encodings")))]
202        let builder = builder.with_new_scheme(&string::ZstdScheme);
203
204        builder
205    }
206
207    /// Removes the specified compression schemes by their [`SchemeId`].
208    pub fn exclude_schemes(mut self, ids: impl IntoIterator<Item = SchemeId>) -> Self {
209        let ids: HashSet<_> = ids.into_iter().collect();
210        self.schemes.retain(|s| !ids.contains(&s.id()));
211        self
212    }
213
214    /// Builds the configured [`BtrBlocksCompressor`].
215    pub fn build(self) -> BtrBlocksCompressor {
216        BtrBlocksCompressor(CascadingCompressor::new(self.schemes))
217    }
218}
219
#[cfg(test)]
mod tests {
    use super::*;

    /// `empty()` must not pre-register any scheme.
    #[test]
    fn empty_starts_with_no_schemes() {
        let builder = BtrBlocksCompressorBuilder::empty();
        assert!(builder.schemes.is_empty());
    }

    /// `default()` must register exactly the schemes listed in `ALL_SCHEMES`.
    #[test]
    fn default_includes_all_schemes() {
        let builder = BtrBlocksCompressorBuilder::default();
        assert_eq!(builder.schemes.len(), ALL_SCHEMES.len());
    }

    /// Excluding an id that is registered by default must remove that scheme.
    #[test]
    fn exclude_schemes_removes_matching_ids() {
        let builder =
            BtrBlocksCompressorBuilder::default().exclude_schemes([integer::IntDictScheme.id()]);
        assert!(builder.schemes.len() < ALL_SCHEMES.len());
        assert!(
            !builder
                .schemes
                .iter()
                .any(|s| s.id() == integer::IntDictScheme.id())
        );
    }

    /// Registering a scheme whose id is already present must panic.
    #[test]
    #[should_panic(expected = "is already present in the builder")]
    fn with_new_scheme_rejects_duplicates() {
        let _ = BtrBlocksCompressorBuilder::default().with_new_scheme(&integer::IntDictScheme);
    }
}