vortex_btrblocks/builder.rs
1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Builder for configuring `BtrBlocksCompressor` instances.
5
6use vortex_utils::aliases::hash_set::HashSet;
7
8use crate::BtrBlocksCompressor;
9use crate::CascadingCompressor;
10use crate::Scheme;
11use crate::SchemeExt;
12use crate::SchemeId;
13use crate::schemes::binary;
14use crate::schemes::bool;
15use crate::schemes::decimal;
16use crate::schemes::float;
17use crate::schemes::integer;
18use crate::schemes::string;
19use crate::schemes::temporal;
20
21/// All available compression schemes.
22///
23/// This list is order-sensitive: the builder preserves this order when constructing
24/// the final scheme list, so that tie-breaking is deterministic.
25pub const ALL_SCHEMES: &[&dyn Scheme] = &[
26 ////////////////////////////////////////////////////////////////////////////////////////////////
27 // Bool schemes.
28 ////////////////////////////////////////////////////////////////////////////////////////////////
29 &bool::BoolConstantScheme,
30 ////////////////////////////////////////////////////////////////////////////////////////////////
31 // Integer schemes.
32 ////////////////////////////////////////////////////////////////////////////////////////////////
33 &integer::IntConstantScheme,
34 // NOTE: FoR must precede BitPacking to avoid unnecessary patches.
35 &integer::FoRScheme,
36 // NOTE: ZigZag should precede BitPacking because we don't want negative numbers.
37 &integer::ZigZagScheme,
38 &integer::BitPackingScheme,
39 &integer::SparseScheme,
40 &integer::IntDictScheme,
41 &integer::RunEndScheme,
42 &integer::SequenceScheme,
43 &integer::IntRLEScheme,
44 ////////////////////////////////////////////////////////////////////////////////////////////////
45 // Float schemes.
46 ////////////////////////////////////////////////////////////////////////////////////////////////
47 &float::FloatConstantScheme,
48 &float::ALPScheme,
49 &float::ALPRDScheme,
50 &float::FloatDictScheme,
51 &float::NullDominatedSparseScheme,
52 &float::FloatRLEScheme,
53 ////////////////////////////////////////////////////////////////////////////////////////////////
54 // String schemes.
55 ////////////////////////////////////////////////////////////////////////////////////////////////
56 &string::StringDictScheme,
57 // Both string-fragmentation schemes are registered; the sample-based
58 // selector keeps whichever is smaller per column.
59 &string::FSSTScheme,
60 #[cfg(feature = "unstable_encodings")]
61 &string::OnPairScheme,
62 &string::StringConstantScheme,
63 &string::NullDominatedSparseScheme,
64 ////////////////////////////////////////////////////////////////////////////////////////////////
65 // Binary schemes.
66 ////////////////////////////////////////////////////////////////////////////////////////////////
67 &binary::BinaryDictScheme,
68 &binary::BinaryConstantScheme,
69 // Decimal schemes.
70 &decimal::DecimalScheme,
71 // Temporal schemes.
72 &temporal::TemporalScheme,
73];
74
75/// Builder for creating configured [`BtrBlocksCompressor`] instances.
76///
77/// By default, all schemes in [`ALL_SCHEMES`] are enabled. Feature-gated schemes (Pco, Zstd)
78/// are not in `ALL_SCHEMES` and must be added explicitly via
79/// [`with_scheme`](BtrBlocksCompressorBuilder::with_new_scheme) or
80/// [`with_compact`](BtrBlocksCompressorBuilder::with_compact).
81///
82/// # Examples
83///
84/// ```rust
85/// use vortex_btrblocks::{BtrBlocksCompressorBuilder, Scheme, SchemeExt};
86/// use vortex_btrblocks::schemes::integer::IntDictScheme;
87///
88/// // Default compressor with all schemes in ALL_SCHEMES.
89/// let compressor = BtrBlocksCompressorBuilder::default().build();
90///
91/// // Remove specific schemes.
92/// let compressor = BtrBlocksCompressorBuilder::default()
93/// .exclude_schemes([IntDictScheme.id()])
94/// .build();
95/// ```
96#[derive(Debug, Clone)]
97pub struct BtrBlocksCompressorBuilder {
98 schemes: Vec<&'static dyn Scheme>,
99}
100
101impl Default for BtrBlocksCompressorBuilder {
102 fn default() -> Self {
103 Self {
104 schemes: ALL_SCHEMES.to_vec(),
105 }
106 }
107}
108
109impl BtrBlocksCompressorBuilder {
110 /// Creates a builder with no schemes registered.
111 ///
112 /// Useful when the caller wants explicit, scheme-by-scheme control over the compressor.
113 pub fn empty() -> Self {
114 Self {
115 schemes: Vec::new(),
116 }
117 }
118
119 /// Adds an external compression scheme not in [`ALL_SCHEMES`].
120 ///
121 /// This allows encoding crates outside of `vortex-btrblocks` to register their own schemes
122 /// with the compressor.
123 ///
124 /// # Panics
125 ///
126 /// Panics if a scheme with the same [`SchemeId`] is already present.
127 pub fn with_new_scheme(mut self, scheme: &'static dyn Scheme) -> Self {
128 assert!(
129 !self.schemes.iter().any(|s| s.id() == scheme.id()),
130 "scheme {:?} is already present in the builder",
131 scheme.id(),
132 );
133
134 self.schemes.push(scheme);
135 self
136 }
137
138 /// Adds compact encoding schemes (Zstd for strings and binary, Pco for numerics).
139 ///
140 /// This provides better compression ratios than the default, especially for floating-point
141 /// heavy datasets. Requires the `zstd` feature. When the `pco` feature is also enabled,
142 /// Pco schemes for integers and floats are included.
143 ///
144 /// # Panics
145 ///
146 /// Panics if any of the compact schemes are already present.
147 #[cfg(feature = "zstd")]
148 pub fn with_compact(self) -> Self {
149 let builder = self
150 .with_new_scheme(&string::ZstdScheme)
151 .with_new_scheme(&binary::ZstdScheme);
152
153 #[cfg(feature = "pco")]
154 let builder = builder
155 .with_new_scheme(&integer::PcoScheme)
156 .with_new_scheme(&float::PcoScheme);
157
158 builder
159 }
160
161 /// Excludes schemes without CUDA kernel support and adds Zstd for string and binary compression.
162 ///
163 /// With the `unstable_encodings` feature, buffer-level Zstd compression is used which
164 /// preserves the array buffer layout for zero-conversion GPU decompression. Without it,
165 /// interleaved Zstd compression is used.
166 pub fn only_cuda_compatible(self) -> Self {
167 // String fragmentation schemes (OnPair, FSST) require host-side
168 // dictionary expansion at decode time, which is incompatible with
169 // pure-GPU decompression paths. Strip whichever string-fragment
170 // scheme is enabled by feature.
171 #[cfg_attr(not(feature = "unstable_encodings"), allow(unused_mut))]
172 let mut excluded: Vec<SchemeId> = vec![
173 integer::SparseScheme.id(),
174 integer::IntRLEScheme.id(),
175 float::FloatRLEScheme.id(),
176 float::NullDominatedSparseScheme.id(),
177 string::StringDictScheme.id(),
178 string::FSSTScheme.id(),
179 binary::BinaryDictScheme.id(),
180 ];
181 #[cfg(feature = "unstable_encodings")]
182 excluded.push(string::OnPairScheme.id());
183 let builder = self.exclude_schemes(excluded);
184
185 #[cfg(all(feature = "zstd", feature = "unstable_encodings"))]
186 let builder = builder
187 .with_new_scheme(&string::ZstdBuffersScheme)
188 .with_new_scheme(&binary::ZstdBuffersScheme);
189 #[cfg(all(feature = "zstd", not(feature = "unstable_encodings")))]
190 let builder = builder
191 .with_new_scheme(&string::ZstdScheme)
192 .with_new_scheme(&binary::ZstdScheme);
193
194 builder
195 }
196
197 /// Removes the specified compression schemes by their [`SchemeId`].
198 pub fn exclude_schemes(mut self, ids: impl IntoIterator<Item = SchemeId>) -> Self {
199 let ids: HashSet<_> = ids.into_iter().collect();
200 self.schemes.retain(|s| !ids.contains(&s.id()));
201 self
202 }
203
204 /// Builds the configured [`BtrBlocksCompressor`].
205 pub fn build(self) -> BtrBlocksCompressor {
206 BtrBlocksCompressor(CascadingCompressor::new(self.schemes))
207 }
208}
209
#[cfg(test)]
mod tests {
    use super::*;

    /// `empty()` must not register any scheme.
    #[test]
    fn empty_starts_with_no_schemes() {
        let registered = BtrBlocksCompressorBuilder::empty().schemes;
        assert!(registered.is_empty());
    }

    /// `default()` must register exactly as many schemes as `ALL_SCHEMES` lists.
    #[test]
    fn default_includes_all_schemes() {
        let registered = BtrBlocksCompressorBuilder::default().schemes;
        assert_eq!(registered.len(), ALL_SCHEMES.len());
    }
}
225}