Skip to main content

vortex_compressor/
scheme.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Unified compression scheme trait and exclusion rules.
5
6use std::fmt;
7use std::fmt::Debug;
8use std::hash::Hash;
9use std::hash::Hasher;
10
11use vortex_array::ArrayRef;
12use vortex_array::Canonical;
13use vortex_error::VortexResult;
14
15use crate::CascadingCompressor;
16use crate::ctx::CompressorContext;
17use crate::estimate::CompressionEstimate;
18use crate::stats::ArrayAndStats;
19use crate::stats::GenerateStatsOptions;
20
21/// Unique identifier for a compression scheme.
22///
23/// The only way to obtain a [`SchemeId`] is through [`SchemeExt::id()`], which is
24/// auto-implemented for all [`Scheme`] types. There is no public constructor.
25///
26/// The only exception to this is for `ROOT_SCHEME_ID` in `compressor.rs`.
27#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
28pub struct SchemeId {
29    /// Only constructable within `vortex-compressor`.
30    ///
31    /// The only public way to obtain a [`SchemeId`] is through [`SchemeExt::id()`].
32    pub(super) name: &'static str,
33}
34
35impl fmt::Display for SchemeId {
36    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
37        f.write_str(self.name)
38    }
39}
40
41/// Selects which children of a cascading scheme a rule applies to.
42#[derive(Debug, Clone, Copy)]
43pub enum ChildSelection {
44    /// Rule applies to all children.
45    All,
46    /// Rule applies to a single child.
47    One(usize),
48    /// Rule applies to multiple specific children.
49    Many(&'static [usize]),
50}
51
52impl ChildSelection {
53    /// Returns `true` if this selection includes the given child index.
54    pub fn contains(&self, child_index: usize) -> bool {
55        match self {
56            ChildSelection::All => true,
57            ChildSelection::One(idx) => *idx == child_index,
58            ChildSelection::Many(indices) => indices.contains(&child_index),
59        }
60    }
61}
62
63/// Push rule: declared by a cascading scheme to exclude another scheme from the subtree
64/// rooted at the specified children.
65///
66/// Use this when the declaring scheme (the ancestor) knows about the excluded scheme. For example,
67/// `ZigZag` excludes `Dict` from all its children.
68#[derive(Debug, Clone, Copy)]
69pub struct DescendantExclusion {
70    /// The scheme to exclude from descendants.
71    pub excluded: SchemeId,
72    /// Which children of the declaring scheme this rule applies to.
73    pub children: ChildSelection,
74}
75
76/// Pull rule: declared by a scheme to exclude itself when the specified ancestor is in the
77/// cascade chain.
78///
79/// Use this when the excluded scheme (the descendant) knows about the ancestor. For example,
80/// `Sequence` excludes itself when `IntDict` is an ancestor on its codes child.
81#[derive(Debug, Clone, Copy)]
82pub struct AncestorExclusion {
83    /// The ancestor scheme that makes the declaring scheme ineligible.
84    pub ancestor: SchemeId,
85    /// Which children of the ancestor this rule applies to.
86    pub children: ChildSelection,
87}
88
89// TODO(connor): Remove all default implemented methods.
90/// A single compression encoding that the [`CascadingCompressor`] can select from.
91///
92/// The compressor evaluates every registered scheme whose [`matches`] returns `true` for a given
93/// array, picks the one with the highest [`expected_compression_ratio`], and calls [`compress`] on
94/// the winner.
95///
96/// One of the key features of the compressor in this crate is that schemes may "cascade". A
97/// scheme's [`compress`] can call back into the compressor via
98/// [`CascadingCompressor::compress_child`] to compress child or transformed arrays, building up
99/// multiple encoding layers (e.g. frame-of-reference and then bit-packing).
100///
101/// # Scheme IDs
102///
103/// Every scheme has a globally unique name returned by [`scheme_name`]. The [`SchemeExt::id`]
104/// method (auto-implemented, cannot be overridden) wraps that name in an opaque [`SchemeId`] used
105/// for equality, hashing, and exclusion rules (see below).
106///
107/// # Cascading and children
108///
109/// Schemes that produce child arrays for further compression must declare [`num_children`] > 0.
110/// Each child should be identified by a stable index. Cascading schemes should use
111/// [`CascadingCompressor::compress_child`] to compress each child array, which handles cascade
112/// level / budget tracking and context management automatically.
113///
114/// No scheme may appear twice in a cascade (descendant) chain (enforced by the compressor). This
115/// keeps the search space a tree.
116///
117/// # Exclusion rules
118///
119/// Schemes declare exclusion rules to prevent incompatible scheme combinations in the cascade
120/// chain:
121///
122/// - [`descendant_exclusions`] (push): "exclude scheme X from my child Y's subtree." Used when the
123///   declaring scheme knows about the excluded scheme.
124/// - [`ancestor_exclusions`] (pull): "exclude me if ancestor X's child Y is above me." Used when
125///   the declaring scheme knows about the ancestor.
126///
127/// We do this because different schemes will live in different crates, and we cannot know the
128/// dependency direction ahead of time.
129///
130/// # Implementing a scheme
131///
132/// [`expected_compression_ratio`] should return [`CompressionEstimate::Sample`] when a cheap
133/// heuristic is not available, asking the compressor to estimate via sampling. Implementors should
134/// return a more specific variant when possible (e.g. [`CompressionEstimate::AlwaysUse`] for
135/// constant detection or [`CompressionEstimate::Skip`] for early rejection based on stats).
136///
137/// Schemes that need statistics that may be expensive to compute should override [`stats_options`]
138/// to declare what they require. The compressor merges all eligible schemes' options before
139/// generating stats, so each stat is always computed at most once for a given array.
140///
141/// [`scheme_name`]: Scheme::scheme_name
142/// [`matches`]: Scheme::matches
143/// [`compress`]: Scheme::compress
144/// [`expected_compression_ratio`]: Scheme::expected_compression_ratio
145/// [`stats_options`]: Scheme::stats_options
146/// [`num_children`]: Scheme::num_children
147/// [`descendant_exclusions`]: Scheme::descendant_exclusions
148/// [`ancestor_exclusions`]: Scheme::ancestor_exclusions
149pub trait Scheme: Debug + Send + Sync {
150    /// The globally unique name for this scheme (e.g. `"vortex.int.bitpacking"`).
151    fn scheme_name(&self) -> &'static str;
152
153    /// Whether this scheme can compress the given canonical array.
154    fn matches(&self, canonical: &Canonical) -> bool;
155
156    /// Returns the stats generation options this scheme requires. The compressor merges all
157    /// eligible schemes' options before generating stats so that a single stats pass satisfies
158    /// every scheme.
159    fn stats_options(&self) -> GenerateStatsOptions {
160        GenerateStatsOptions::default()
161    }
162
163    /// The number of child arrays this scheme produces when cascading. Returns 0 for leaf
164    /// schemes that produce a final encoded array.
165    fn num_children(&self) -> usize {
166        0
167    }
168
169    /// Schemes to exclude from specific children's subtrees (push direction).
170    ///
171    /// Each rule says: "when I cascade through child Y, do not use scheme X anywhere in that
172    /// subtree." Only meaningful when [`num_children`](Scheme::num_children) > 0.
173    fn descendant_exclusions(&self) -> Vec<DescendantExclusion> {
174        Vec::new()
175    }
176
177    /// Ancestors that make this scheme ineligible (pull direction).
178    ///
179    /// Each rule says: "if ancestor X cascaded through child Y somewhere above me in the chain, do
180    /// not try me."
181    fn ancestor_exclusions(&self) -> Vec<AncestorExclusion> {
182        Vec::new()
183    }
184
185    /// Cheaply estimate the compression ratio for this scheme on the given array.
186    ///
187    /// This method should be fast and infallible. Any expensive or fallible work should be deferred
188    /// to the compressor by returning [`CompressionEstimate::Sample`] or
189    /// [`CompressionEstimate::Estimate`].
190    ///
191    /// The compressor will ask all schemes what their expected compression ratio is given the array
192    /// and statistics. The scheme with the highest estimated ratio will then be applied to the
193    /// entire array.
194    ///
195    /// Note that the compressor will also use this method when compressing samples, so some
196    /// statistics that might hold for the samples may not hold for the entire array (e.g.,
197    /// `Constant`). Implementations should check `ctx.is_sample` to make sure that they are
198    /// returning the correct information.
199    ///
200    /// The compressor guarantees that empty and all-null arrays are handled before this method is
201    /// called. Implementations may assume the array has at least one valid element. However, a
202    /// constant scheme should still be registered with the compressor to detect single-value arrays
203    /// that are not all-null.
204    fn expected_compression_ratio(
205        &self,
206        _data: &mut ArrayAndStats,
207        _ctx: CompressorContext,
208    ) -> CompressionEstimate;
209
210    /// Compress the array using this scheme.
211    ///
212    /// # Errors
213    ///
214    /// Returns an error if compression fails.
215    fn compress(
216        &self,
217        compressor: &CascadingCompressor,
218        data: &mut ArrayAndStats,
219        ctx: CompressorContext,
220    ) -> VortexResult<ArrayRef>;
221}
222
223impl PartialEq for dyn Scheme {
224    fn eq(&self, other: &Self) -> bool {
225        self.id() == other.id()
226    }
227}
228
229impl Eq for dyn Scheme {}
230
231impl Hash for dyn Scheme {
232    fn hash<H: Hasher>(&self, state: &mut H) {
233        self.id().hash(state);
234    }
235}
236
237/// Extension trait providing [`id`](SchemeExt::id) for all [`Scheme`] implementors.
238///
239/// This trait is automatically implemented for every type that implements [`Scheme`]. Because the
240/// blanket implementation covers all types, external crates cannot override `id()`.
241pub trait SchemeExt: Scheme {
242    /// Unique identifier derived from [`scheme_name`](Scheme::scheme_name).
243    fn id(&self) -> SchemeId {
244        SchemeId {
245            name: self.scheme_name(),
246        }
247    }
248}
249
250impl<T: Scheme + ?Sized> SchemeExt for T {}