Skip to main content

vortex_compressor/
scheme.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Unified compression scheme trait and exclusion rules.
5
6use std::fmt;
7use std::fmt::Debug;
8use std::hash::Hash;
9use std::hash::Hasher;
10
11use vortex_array::ArrayRef;
12use vortex_array::Canonical;
13use vortex_array::ExecutionCtx;
14use vortex_error::VortexResult;
15
16use crate::CascadingCompressor;
17use crate::ctx::CompressorContext;
18use crate::estimate::CompressionEstimate;
19use crate::stats::ArrayAndStats;
20use crate::stats::GenerateStatsOptions;
21
22/// Unique identifier for a compression scheme.
23///
24/// The only way to obtain a [`SchemeId`] is through [`SchemeExt::id()`], which is auto-implemented
25/// for all [`Scheme`] types. There is no public constructor.
26///
27/// The only exception to this is for the compressor's synthetic `ROOT_SCHEME_ID`.
28#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
29pub struct SchemeId {
30    /// Only constructable within `vortex-compressor`.
31    ///
32    /// The only public way to obtain a [`SchemeId`] is through [`SchemeExt::id()`].
33    pub(super) name: &'static str,
34}
35
36impl fmt::Display for SchemeId {
37    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
38        f.write_str(self.name)
39    }
40}
41
42/// Selects which children of a cascading scheme a rule applies to.
43#[derive(Debug, Clone, Copy)]
44pub enum ChildSelection {
45    /// Rule applies to all children.
46    All,
47    /// Rule applies to a single child.
48    One(usize),
49    /// Rule applies to multiple specific children.
50    Many(&'static [usize]),
51}
52
53impl ChildSelection {
54    /// Returns `true` if this selection includes the given child index.
55    pub fn contains(&self, child_index: usize) -> bool {
56        match self {
57            ChildSelection::All => true,
58            ChildSelection::One(idx) => *idx == child_index,
59            ChildSelection::Many(indices) => indices.contains(&child_index),
60        }
61    }
62}
63
64/// Push rule: declared by a cascading scheme to exclude another scheme from the subtree
65/// rooted at the specified children.
66///
67/// Use this when the declaring scheme (the ancestor) knows about the excluded scheme. For example,
68/// `ZigZag` excludes `Dict` from all its children.
69#[derive(Debug, Clone, Copy)]
70pub struct DescendantExclusion {
71    /// The scheme to exclude from descendants.
72    pub excluded: SchemeId,
73    /// Which children of the declaring scheme this rule applies to.
74    pub children: ChildSelection,
75}
76
77/// Pull rule: declared by a scheme to exclude itself when the specified ancestor is in the
78/// cascade chain.
79///
80/// Use this when the excluded scheme (the descendant) knows about the ancestor. For example,
81/// `Sequence` excludes itself when `IntDict` is an ancestor on its codes child.
82#[derive(Debug, Clone, Copy)]
83pub struct AncestorExclusion {
84    /// The ancestor scheme that makes the declaring scheme ineligible.
85    pub ancestor: SchemeId,
86    /// Which children of the ancestor this rule applies to.
87    pub children: ChildSelection,
88}
89
90// TODO(connor): Remove all default implemented methods.
91/// A single compression encoding that the [`CascadingCompressor`] can select from.
92///
93/// The compressor evaluates every registered scheme whose [`matches`] returns `true` for a given
94/// array, picks the one with the highest [`expected_compression_ratio`], and calls [`compress`] on
95/// the winner.
96///
97/// One of the key features of the compressor in this crate is that schemes may "cascade". A
98/// scheme's [`compress`] can call back into the compressor via
99/// [`CascadingCompressor::compress_child`] to compress child or transformed arrays, building up
100/// multiple encoding layers (e.g. frame-of-reference and then bit-packing).
101///
102/// # Scheme IDs
103///
104/// Every scheme has a globally unique name returned by [`scheme_name`]. The [`SchemeExt::id`]
105/// method (auto-implemented, cannot be overridden) wraps that name in an opaque [`SchemeId`] used
106/// for equality, hashing, and exclusion rules (see below).
107///
108/// # Cascading and children
109///
110/// Schemes that produce child arrays for further compression must declare [`num_children`] > 0.
111/// Each child should be identified by a stable index. Cascading schemes should use
112/// [`CascadingCompressor::compress_child`] to compress each child array, which handles cascade
113/// level / budget tracking and context management automatically.
114///
115/// No scheme may appear twice in a cascade (descendant) chain (enforced by the compressor). This
116/// keeps the search space a tree.
117///
118/// # Exclusion rules
119///
120/// Schemes declare exclusion rules to prevent incompatible scheme combinations in the cascade
121/// chain:
122///
123/// - [`descendant_exclusions`] (push): "exclude scheme X from my child Y's subtree." Used when the
124///   declaring scheme knows about the excluded scheme.
125/// - [`ancestor_exclusions`] (pull): "exclude me if ancestor X's child Y is above me." Used when
126///   the declaring scheme knows about the ancestor.
127///
128/// We do this because different schemes will live in different crates, and we cannot know the
129/// dependency direction ahead of time.
130///
131/// # Implementing a scheme
132///
133/// [`expected_compression_ratio`] should return
134/// `CompressionEstimate::Deferred(DeferredEstimate::Sample)` when a cheap heuristic is not
135/// available, asking the compressor to estimate via sampling. Implementors should return an
136/// immediate [`CompressionEstimate::Verdict`] when possible.
137///
138/// Schemes that need statistics that may be expensive to compute should override [`stats_options`]
139/// to declare what they require. The compressor merges all eligible schemes' options before
140/// generating stats, so each stat is always computed at most once for a given array.
141///
142/// [`scheme_name`]: Scheme::scheme_name
143/// [`matches`]: Scheme::matches
144/// [`compress`]: Scheme::compress
145/// [`expected_compression_ratio`]: Scheme::expected_compression_ratio
146/// [`stats_options`]: Scheme::stats_options
147/// [`num_children`]: Scheme::num_children
148/// [`descendant_exclusions`]: Scheme::descendant_exclusions
149/// [`ancestor_exclusions`]: Scheme::ancestor_exclusions
150pub trait Scheme: Debug + Send + Sync {
151    /// The globally unique name for this scheme (e.g. `"vortex.int.bitpacking"`).
152    fn scheme_name(&self) -> &'static str;
153
154    /// Whether this scheme can compress the given canonical array.
155    fn matches(&self, canonical: &Canonical) -> bool;
156
157    /// Returns the stats generation options this scheme requires. The compressor merges all
158    /// eligible schemes' options before generating stats so that a single stats pass satisfies
159    /// every scheme.
160    fn stats_options(&self) -> GenerateStatsOptions {
161        GenerateStatsOptions::default()
162    }
163
164    /// The number of child arrays this scheme produces when cascading. Returns 0 for leaf
165    /// schemes that produce a final encoded array.
166    fn num_children(&self) -> usize {
167        0
168    }
169
170    /// Schemes to exclude from specific children's subtrees (push direction).
171    ///
172    /// Each rule says: "when I cascade through child Y, do not use scheme X anywhere in that
173    /// subtree." Only meaningful when [`num_children`](Scheme::num_children) > 0.
174    fn descendant_exclusions(&self) -> Vec<DescendantExclusion> {
175        Vec::new()
176    }
177
178    /// Ancestors that make this scheme ineligible (pull direction).
179    ///
180    /// Each rule says: "if ancestor X cascaded through child Y somewhere above me in the chain, do
181    /// not try me."
182    fn ancestor_exclusions(&self) -> Vec<AncestorExclusion> {
183        Vec::new()
184    }
185
186    /// Cheaply estimate the compression ratio for this scheme on the given array.
187    ///
188    /// This method should be fast and infallible. Any expensive or fallible work should be
189    /// deferred to the compressor by returning
190    /// `CompressionEstimate::Deferred(DeferredEstimate::Sample)` or
191    /// `CompressionEstimate::Deferred(DeferredEstimate::Callback(...))`.
192    ///
193    /// The compressor will ask all schemes what their expected compression ratio is given the array
194    /// and statistics. The scheme with the highest estimated ratio will then be applied to the
195    /// entire array.
196    ///
197    /// [`CompressionEstimate::Verdict`] means the scheme already knows the terminal
198    /// [`crate::estimate::EstimateVerdict`]. `CompressionEstimate::Deferred(DeferredEstimate::Sample)`
199    /// asks the compressor to sample. `CompressionEstimate::Deferred(DeferredEstimate::Callback(...))`
200    /// asks the compressor to run custom deferred work. Deferred callbacks must return a
201    /// [`crate::estimate::EstimateVerdict`] directly, never another deferred request.
202    ///
203    /// Note that the compressor will also use this method when compressing samples, so some
204    /// statistics that might hold for the samples may not hold for the entire array (e.g.,
205    /// `Constant`). Implementations should check `ctx.is_sample` to make sure that they are
206    /// returning the correct information.
207    ///
208    /// The compressor guarantees that empty and all-null arrays are handled before this method is
209    /// called. Implementations may assume the array has at least one valid element. However, a
210    /// constant scheme should still be registered with the compressor to detect single-value arrays
211    /// that are not all-null.
212    fn expected_compression_ratio(
213        &self,
214        _data: &ArrayAndStats,
215        _compress_ctx: CompressorContext,
216        _exec_ctx: &mut ExecutionCtx,
217    ) -> CompressionEstimate;
218
219    /// Compress the array using this scheme.
220    ///
221    /// # Errors
222    ///
223    /// Returns an error if compression fails.
224    fn compress(
225        &self,
226        compressor: &CascadingCompressor,
227        data: &ArrayAndStats,
228        compress_ctx: CompressorContext,
229        exec_ctx: &mut ExecutionCtx,
230    ) -> VortexResult<ArrayRef>;
231}
232
233impl PartialEq for dyn Scheme {
234    fn eq(&self, other: &Self) -> bool {
235        self.id() == other.id()
236    }
237}
238
239impl Eq for dyn Scheme {}
240
241impl Hash for dyn Scheme {
242    fn hash<H: Hasher>(&self, state: &mut H) {
243        self.id().hash(state);
244    }
245}
246
247/// Extension trait providing [`id`](SchemeExt::id) for all [`Scheme`] implementors.
248///
249/// This trait is automatically implemented for every type that implements [`Scheme`]. Because the
250/// blanket implementation covers all types, external crates cannot override `id()`.
251pub trait SchemeExt: Scheme {
252    /// Unique identifier derived from [`scheme_name`](Scheme::scheme_name).
253    fn id(&self) -> SchemeId {
254        SchemeId {
255            name: self.scheme_name(),
256        }
257    }
258}
259
260impl<T: Scheme + ?Sized> SchemeExt for T {}