vortex_compressor/scheme.rs
1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Unified compression scheme trait and exclusion rules.
5
6use std::fmt;
7use std::fmt::Debug;
8use std::hash::Hash;
9use std::hash::Hasher;
10
11use vortex_array::ArrayRef;
12use vortex_array::Canonical;
13use vortex_error::VortexResult;
14
15use crate::CascadingCompressor;
16use crate::ctx::CompressorContext;
17use crate::estimate::CompressionEstimate;
18use crate::stats::ArrayAndStats;
19use crate::stats::GenerateStatsOptions;
20
21/// Unique identifier for a compression scheme.
22///
23/// The only way to obtain a [`SchemeId`] is through [`SchemeExt::id()`], which is
24/// auto-implemented for all [`Scheme`] types. There is no public constructor.
25///
26/// The only exception to this is for `ROOT_SCHEME_ID` in `compressor.rs`.
27#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
28pub struct SchemeId {
29 /// Only constructable within `vortex-compressor`.
30 ///
31 /// The only public way to obtain a [`SchemeId`] is through [`SchemeExt::id()`].
32 pub(super) name: &'static str,
33}
34
35impl fmt::Display for SchemeId {
36 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
37 f.write_str(self.name)
38 }
39}
40
41/// Selects which children of a cascading scheme a rule applies to.
42#[derive(Debug, Clone, Copy)]
43pub enum ChildSelection {
44 /// Rule applies to all children.
45 All,
46 /// Rule applies to a single child.
47 One(usize),
48 /// Rule applies to multiple specific children.
49 Many(&'static [usize]),
50}
51
52impl ChildSelection {
53 /// Returns `true` if this selection includes the given child index.
54 pub fn contains(&self, child_index: usize) -> bool {
55 match self {
56 ChildSelection::All => true,
57 ChildSelection::One(idx) => *idx == child_index,
58 ChildSelection::Many(indices) => indices.contains(&child_index),
59 }
60 }
61}
62
63/// Push rule: declared by a cascading scheme to exclude another scheme from the subtree
64/// rooted at the specified children.
65///
66/// Use this when the declaring scheme (the ancestor) knows about the excluded scheme. For example,
67/// `ZigZag` excludes `Dict` from all its children.
68#[derive(Debug, Clone, Copy)]
69pub struct DescendantExclusion {
70 /// The scheme to exclude from descendants.
71 pub excluded: SchemeId,
72 /// Which children of the declaring scheme this rule applies to.
73 pub children: ChildSelection,
74}
75
76/// Pull rule: declared by a scheme to exclude itself when the specified ancestor is in the
77/// cascade chain.
78///
79/// Use this when the excluded scheme (the descendant) knows about the ancestor. For example,
80/// `Sequence` excludes itself when `IntDict` is an ancestor on its codes child.
81#[derive(Debug, Clone, Copy)]
82pub struct AncestorExclusion {
83 /// The ancestor scheme that makes the declaring scheme ineligible.
84 pub ancestor: SchemeId,
85 /// Which children of the ancestor this rule applies to.
86 pub children: ChildSelection,
87}
88
89// TODO(connor): Remove all default implemented methods.
90/// A single compression encoding that the [`CascadingCompressor`] can select from.
91///
92/// The compressor evaluates every registered scheme whose [`matches`] returns `true` for a given
93/// array, picks the one with the highest [`expected_compression_ratio`], and calls [`compress`] on
94/// the winner.
95///
96/// One of the key features of the compressor in this crate is that schemes may "cascade". A
97/// scheme's [`compress`] can call back into the compressor via
98/// [`CascadingCompressor::compress_child`] to compress child or transformed arrays, building up
99/// multiple encoding layers (e.g. frame-of-reference and then bit-packing).
100///
101/// # Scheme IDs
102///
103/// Every scheme has a globally unique name returned by [`scheme_name`]. The [`SchemeExt::id`]
104/// method (auto-implemented, cannot be overridden) wraps that name in an opaque [`SchemeId`] used
105/// for equality, hashing, and exclusion rules (see below).
106///
107/// # Cascading and children
108///
109/// Schemes that produce child arrays for further compression must declare [`num_children`] > 0.
110/// Each child should be identified by a stable index. Cascading schemes should use
111/// [`CascadingCompressor::compress_child`] to compress each child array, which handles cascade
112/// level / budget tracking and context management automatically.
113///
114/// No scheme may appear twice in a cascade (descendant) chain (enforced by the compressor). This
115/// keeps the search space a tree.
116///
117/// # Exclusion rules
118///
119/// Schemes declare exclusion rules to prevent incompatible scheme combinations in the cascade
120/// chain:
121///
122/// - [`descendant_exclusions`] (push): "exclude scheme X from my child Y's subtree." Used when the
123/// declaring scheme knows about the excluded scheme.
124/// - [`ancestor_exclusions`] (pull): "exclude me if ancestor X's child Y is above me." Used when
125/// the declaring scheme knows about the ancestor.
126///
127/// We do this because different schemes will live in different crates, and we cannot know the
128/// dependency direction ahead of time.
129///
130/// # Implementing a scheme
131///
132/// [`expected_compression_ratio`] should return
133/// `CompressionEstimate::Deferred(DeferredEstimate::Sample)` when a cheap heuristic is not
134/// available, asking the compressor to estimate via sampling. Implementors should return an
135/// immediate [`CompressionEstimate::Verdict`] when possible.
136///
137/// Schemes that need statistics that may be expensive to compute should override [`stats_options`]
138/// to declare what they require. The compressor merges all eligible schemes' options before
139/// generating stats, so each stat is always computed at most once for a given array.
140///
141/// [`scheme_name`]: Scheme::scheme_name
142/// [`matches`]: Scheme::matches
143/// [`compress`]: Scheme::compress
144/// [`expected_compression_ratio`]: Scheme::expected_compression_ratio
145/// [`stats_options`]: Scheme::stats_options
146/// [`num_children`]: Scheme::num_children
147/// [`descendant_exclusions`]: Scheme::descendant_exclusions
148/// [`ancestor_exclusions`]: Scheme::ancestor_exclusions
149pub trait Scheme: Debug + Send + Sync {
150 /// The globally unique name for this scheme (e.g. `"vortex.int.bitpacking"`).
151 fn scheme_name(&self) -> &'static str;
152
153 /// Whether this scheme can compress the given canonical array.
154 fn matches(&self, canonical: &Canonical) -> bool;
155
156 /// Returns the stats generation options this scheme requires. The compressor merges all
157 /// eligible schemes' options before generating stats so that a single stats pass satisfies
158 /// every scheme.
159 fn stats_options(&self) -> GenerateStatsOptions {
160 GenerateStatsOptions::default()
161 }
162
163 /// The number of child arrays this scheme produces when cascading. Returns 0 for leaf
164 /// schemes that produce a final encoded array.
165 fn num_children(&self) -> usize {
166 0
167 }
168
169 /// Schemes to exclude from specific children's subtrees (push direction).
170 ///
171 /// Each rule says: "when I cascade through child Y, do not use scheme X anywhere in that
172 /// subtree." Only meaningful when [`num_children`](Scheme::num_children) > 0.
173 fn descendant_exclusions(&self) -> Vec<DescendantExclusion> {
174 Vec::new()
175 }
176
177 /// Ancestors that make this scheme ineligible (pull direction).
178 ///
179 /// Each rule says: "if ancestor X cascaded through child Y somewhere above me in the chain, do
180 /// not try me."
181 fn ancestor_exclusions(&self) -> Vec<AncestorExclusion> {
182 Vec::new()
183 }
184
185 /// Cheaply estimate the compression ratio for this scheme on the given array.
186 ///
187 /// This method should be fast and infallible. Any expensive or fallible work should be
188 /// deferred to the compressor by returning
189 /// `CompressionEstimate::Deferred(DeferredEstimate::Sample)` or
190 /// `CompressionEstimate::Deferred(DeferredEstimate::Callback(...))`.
191 ///
192 /// The compressor will ask all schemes what their expected compression ratio is given the array
193 /// and statistics. The scheme with the highest estimated ratio will then be applied to the
194 /// entire array.
195 ///
196 /// [`CompressionEstimate::Verdict`] means the scheme already knows the terminal
197 /// [`crate::estimate::EstimateVerdict`]. `CompressionEstimate::Deferred(DeferredEstimate::Sample)`
198 /// asks the compressor to sample. `CompressionEstimate::Deferred(DeferredEstimate::Callback(...))`
199 /// asks the compressor to run custom deferred work. Deferred callbacks must return a
200 /// [`crate::estimate::EstimateVerdict`] directly, never another deferred request.
201 ///
202 /// Note that the compressor will also use this method when compressing samples, so some
203 /// statistics that might hold for the samples may not hold for the entire array (e.g.,
204 /// `Constant`). Implementations should check `ctx.is_sample` to make sure that they are
205 /// returning the correct information.
206 ///
207 /// The compressor guarantees that empty and all-null arrays are handled before this method is
208 /// called. Implementations may assume the array has at least one valid element. However, a
209 /// constant scheme should still be registered with the compressor to detect single-value arrays
210 /// that are not all-null.
211 fn expected_compression_ratio(
212 &self,
213 _data: &mut ArrayAndStats,
214 _ctx: CompressorContext,
215 ) -> CompressionEstimate;
216
217 /// Compress the array using this scheme.
218 ///
219 /// # Errors
220 ///
221 /// Returns an error if compression fails.
222 fn compress(
223 &self,
224 compressor: &CascadingCompressor,
225 data: &mut ArrayAndStats,
226 ctx: CompressorContext,
227 ) -> VortexResult<ArrayRef>;
228}
229
230impl PartialEq for dyn Scheme {
231 fn eq(&self, other: &Self) -> bool {
232 self.id() == other.id()
233 }
234}
235
236impl Eq for dyn Scheme {}
237
238impl Hash for dyn Scheme {
239 fn hash<H: Hasher>(&self, state: &mut H) {
240 self.id().hash(state);
241 }
242}
243
244/// Extension trait providing [`id`](SchemeExt::id) for all [`Scheme`] implementors.
245///
246/// This trait is automatically implemented for every type that implements [`Scheme`]. Because the
247/// blanket implementation covers all types, external crates cannot override `id()`.
248pub trait SchemeExt: Scheme {
249 /// Unique identifier derived from [`scheme_name`](Scheme::scheme_name).
250 fn id(&self) -> SchemeId {
251 SchemeId {
252 name: self.scheme_name(),
253 }
254 }
255}
256
257impl<T: Scheme + ?Sized> SchemeExt for T {}