Skip to main content

vortex_compressor/
scheme.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Unified compression scheme trait and exclusion rules.
5
6use std::fmt;
7use std::fmt::Debug;
8use std::hash::Hash;
9use std::hash::Hasher;
10
11use vortex_array::ArrayRef;
12use vortex_array::Canonical;
13use vortex_error::VortexResult;
14
15use crate::CascadingCompressor;
16use crate::ctx::CompressorContext;
17use crate::estimate::CompressionEstimate;
18use crate::stats::ArrayAndStats;
19use crate::stats::GenerateStatsOptions;
20
21/// Unique identifier for a compression scheme.
22///
23/// The only way to obtain a [`SchemeId`] is through [`SchemeExt::id()`], which is
24/// auto-implemented for all [`Scheme`] types. There is no public constructor.
25///
26/// The only exception to this is for `ROOT_SCHEME_ID` in `compressor.rs`.
27#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
28pub struct SchemeId {
29    /// Only constructable within `vortex-compressor`.
30    ///
31    /// The only public way to obtain a [`SchemeId`] is through [`SchemeExt::id()`].
32    pub(super) name: &'static str,
33}
34
35impl fmt::Display for SchemeId {
36    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
37        f.write_str(self.name)
38    }
39}
40
41/// Selects which children of a cascading scheme a rule applies to.
42#[derive(Debug, Clone, Copy)]
43pub enum ChildSelection {
44    /// Rule applies to all children.
45    All,
46    /// Rule applies to a single child.
47    One(usize),
48    /// Rule applies to multiple specific children.
49    Many(&'static [usize]),
50}
51
52impl ChildSelection {
53    /// Returns `true` if this selection includes the given child index.
54    pub fn contains(&self, child_index: usize) -> bool {
55        match self {
56            ChildSelection::All => true,
57            ChildSelection::One(idx) => *idx == child_index,
58            ChildSelection::Many(indices) => indices.contains(&child_index),
59        }
60    }
61}
62
63/// Push rule: declared by a cascading scheme to exclude another scheme from the subtree
64/// rooted at the specified children.
65///
66/// Use this when the declaring scheme (the ancestor) knows about the excluded scheme. For example,
67/// `ZigZag` excludes `Dict` from all its children.
68#[derive(Debug, Clone, Copy)]
69pub struct DescendantExclusion {
70    /// The scheme to exclude from descendants.
71    pub excluded: SchemeId,
72    /// Which children of the declaring scheme this rule applies to.
73    pub children: ChildSelection,
74}
75
76/// Pull rule: declared by a scheme to exclude itself when the specified ancestor is in the
77/// cascade chain.
78///
79/// Use this when the excluded scheme (the descendant) knows about the ancestor. For example,
80/// `Sequence` excludes itself when `IntDict` is an ancestor on its codes child.
81#[derive(Debug, Clone, Copy)]
82pub struct AncestorExclusion {
83    /// The ancestor scheme that makes the declaring scheme ineligible.
84    pub ancestor: SchemeId,
85    /// Which children of the ancestor this rule applies to.
86    pub children: ChildSelection,
87}
88
89// TODO(connor): Remove all default implemented methods.
90/// A single compression encoding that the [`CascadingCompressor`] can select from.
91///
92/// The compressor evaluates every registered scheme whose [`matches`] returns `true` for a given
93/// array, picks the one with the highest [`expected_compression_ratio`], and calls [`compress`] on
94/// the winner.
95///
96/// One of the key features of the compressor in this crate is that schemes may "cascade". A
97/// scheme's [`compress`] can call back into the compressor via
98/// [`CascadingCompressor::compress_child`] to compress child or transformed arrays, building up
99/// multiple encoding layers (e.g. frame-of-reference and then bit-packing).
100///
101/// # Scheme IDs
102///
103/// Every scheme has a globally unique name returned by [`scheme_name`]. The [`SchemeExt::id`]
104/// method (auto-implemented, cannot be overridden) wraps that name in an opaque [`SchemeId`] used
105/// for equality, hashing, and exclusion rules (see below).
106///
107/// # Cascading and children
108///
109/// Schemes that produce child arrays for further compression must declare [`num_children`] > 0.
110/// Each child should be identified by a stable index. Cascading schemes should use
111/// [`CascadingCompressor::compress_child`] to compress each child array, which handles cascade
112/// level / budget tracking and context management automatically.
113///
114/// No scheme may appear twice in a cascade (descendant) chain (enforced by the compressor). This
115/// keeps the search space a tree.
116///
117/// # Exclusion rules
118///
119/// Schemes declare exclusion rules to prevent incompatible scheme combinations in the cascade
120/// chain:
121///
122/// - [`descendant_exclusions`] (push): "exclude scheme X from my child Y's subtree." Used when the
123///   declaring scheme knows about the excluded scheme.
124/// - [`ancestor_exclusions`] (pull): "exclude me if ancestor X's child Y is above me." Used when
125///   the declaring scheme knows about the ancestor.
126///
127/// We do this because different schemes will live in different crates, and we cannot know the
128/// dependency direction ahead of time.
129///
130/// # Implementing a scheme
131///
132/// [`expected_compression_ratio`] should return
133/// `CompressionEstimate::Deferred(DeferredEstimate::Sample)` when a cheap heuristic is not
134/// available, asking the compressor to estimate via sampling. Implementors should return an
135/// immediate [`CompressionEstimate::Verdict`] when possible.
136///
137/// Schemes that need statistics that may be expensive to compute should override [`stats_options`]
138/// to declare what they require. The compressor merges all eligible schemes' options before
139/// generating stats, so each stat is always computed at most once for a given array.
140///
141/// [`scheme_name`]: Scheme::scheme_name
142/// [`matches`]: Scheme::matches
143/// [`compress`]: Scheme::compress
144/// [`expected_compression_ratio`]: Scheme::expected_compression_ratio
145/// [`stats_options`]: Scheme::stats_options
146/// [`num_children`]: Scheme::num_children
147/// [`descendant_exclusions`]: Scheme::descendant_exclusions
148/// [`ancestor_exclusions`]: Scheme::ancestor_exclusions
149pub trait Scheme: Debug + Send + Sync {
150    /// The globally unique name for this scheme (e.g. `"vortex.int.bitpacking"`).
151    fn scheme_name(&self) -> &'static str;
152
153    /// Whether this scheme can compress the given canonical array.
154    fn matches(&self, canonical: &Canonical) -> bool;
155
156    /// Returns the stats generation options this scheme requires. The compressor merges all
157    /// eligible schemes' options before generating stats so that a single stats pass satisfies
158    /// every scheme.
159    fn stats_options(&self) -> GenerateStatsOptions {
160        GenerateStatsOptions::default()
161    }
162
163    /// The number of child arrays this scheme produces when cascading. Returns 0 for leaf
164    /// schemes that produce a final encoded array.
165    fn num_children(&self) -> usize {
166        0
167    }
168
169    /// Schemes to exclude from specific children's subtrees (push direction).
170    ///
171    /// Each rule says: "when I cascade through child Y, do not use scheme X anywhere in that
172    /// subtree." Only meaningful when [`num_children`](Scheme::num_children) > 0.
173    fn descendant_exclusions(&self) -> Vec<DescendantExclusion> {
174        Vec::new()
175    }
176
177    /// Ancestors that make this scheme ineligible (pull direction).
178    ///
179    /// Each rule says: "if ancestor X cascaded through child Y somewhere above me in the chain, do
180    /// not try me."
181    fn ancestor_exclusions(&self) -> Vec<AncestorExclusion> {
182        Vec::new()
183    }
184
185    /// Cheaply estimate the compression ratio for this scheme on the given array.
186    ///
187    /// This method should be fast and infallible. Any expensive or fallible work should be
188    /// deferred to the compressor by returning
189    /// `CompressionEstimate::Deferred(DeferredEstimate::Sample)` or
190    /// `CompressionEstimate::Deferred(DeferredEstimate::Callback(...))`.
191    ///
192    /// The compressor will ask all schemes what their expected compression ratio is given the array
193    /// and statistics. The scheme with the highest estimated ratio will then be applied to the
194    /// entire array.
195    ///
196    /// [`CompressionEstimate::Verdict`] means the scheme already knows the terminal
197    /// [`crate::estimate::EstimateVerdict`]. `CompressionEstimate::Deferred(DeferredEstimate::Sample)`
198    /// asks the compressor to sample. `CompressionEstimate::Deferred(DeferredEstimate::Callback(...))`
199    /// asks the compressor to run custom deferred work. Deferred callbacks must return a
200    /// [`crate::estimate::EstimateVerdict`] directly, never another deferred request.
201    ///
202    /// Note that the compressor will also use this method when compressing samples, so some
203    /// statistics that might hold for the samples may not hold for the entire array (e.g.,
204    /// `Constant`). Implementations should check `ctx.is_sample` to make sure that they are
205    /// returning the correct information.
206    ///
207    /// The compressor guarantees that empty and all-null arrays are handled before this method is
208    /// called. Implementations may assume the array has at least one valid element. However, a
209    /// constant scheme should still be registered with the compressor to detect single-value arrays
210    /// that are not all-null.
211    fn expected_compression_ratio(
212        &self,
213        _data: &mut ArrayAndStats,
214        _ctx: CompressorContext,
215    ) -> CompressionEstimate;
216
217    /// Compress the array using this scheme.
218    ///
219    /// # Errors
220    ///
221    /// Returns an error if compression fails.
222    fn compress(
223        &self,
224        compressor: &CascadingCompressor,
225        data: &mut ArrayAndStats,
226        ctx: CompressorContext,
227    ) -> VortexResult<ArrayRef>;
228}
229
230impl PartialEq for dyn Scheme {
231    fn eq(&self, other: &Self) -> bool {
232        self.id() == other.id()
233    }
234}
235
236impl Eq for dyn Scheme {}
237
238impl Hash for dyn Scheme {
239    fn hash<H: Hasher>(&self, state: &mut H) {
240        self.id().hash(state);
241    }
242}
243
244/// Extension trait providing [`id`](SchemeExt::id) for all [`Scheme`] implementors.
245///
246/// This trait is automatically implemented for every type that implements [`Scheme`]. Because the
247/// blanket implementation covers all types, external crates cannot override `id()`.
248pub trait SchemeExt: Scheme {
249    /// Unique identifier derived from [`scheme_name`](Scheme::scheme_name).
250    fn id(&self) -> SchemeId {
251        SchemeId {
252            name: self.scheme_name(),
253        }
254    }
255}
256
257impl<T: Scheme + ?Sized> SchemeExt for T {}