vortex_compressor/estimate.rs
1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Compression ratio estimation types and sampling-based estimation.
5
6use std::fmt;
7
8use vortex_array::ArrayRef;
9use vortex_array::Canonical;
10use vortex_array::IntoArray;
11use vortex_error::VortexResult;
12
13use crate::CascadingCompressor;
14use crate::ctx::CompressorContext;
15use crate::sample::SAMPLE_SIZE;
16use crate::sample::sample;
17use crate::sample::sample_count_approx_one_percent;
18use crate::scheme::Scheme;
19use crate::stats::ArrayAndStats;
20
21/// Closure type for [`CompressionEstimate::Estimate`]. The compressor calls this with the same
22/// arguments it would pass to sampling.
23#[rustfmt::skip]
24pub type EstimateFn = dyn FnOnce(
25 &CascadingCompressor,
26 &mut ArrayAndStats,
27 CompressorContext,
28 ) -> VortexResult<CompressionEstimate>
29 + Send
30 + Sync;
31
32// TODO(connor): We should make use of the fact that some checks are cheap and some checks are
33// expensive (sample or estimate variants).
34/// The result of a [`Scheme`]'s compression ratio estimation.
35///
36/// This type is returned by [`Scheme::expected_compression_ratio`] to tell the compressor how
37/// promising this scheme is for a given array without performing any expensive work.
38///
39/// All expensive or fallible operations (sampling, trial encoding) are deferred to the compressor
40/// via the [`Sample`](CompressionEstimate::Sample) and [`Estimate`](CompressionEstimate::Estimate)
41/// variants.
42///
43/// [`Sample`]: CompressionEstimate::Sample
44/// [`Estimate`]: CompressionEstimate::Estimate
45pub enum CompressionEstimate {
46 /// Do not use this scheme for this array.
47 Skip,
48
49 /// Always use this scheme, as we know it is definitively the best choice.
50 ///
51 /// Some examples include constant detection, decimal byte parts, and temporal decomposition.
52 ///
53 /// The compressor will select this scheme immediately without evaluating further candidates.
54 /// Schemes that return `AlwaysUse` must be mutually exclusive per canonical type (enforced by
55 /// [`Scheme::matches`]), otherwise the winner depends silently on registration order.
56 ///
57 /// [`Scheme::matches`]: crate::scheme::Scheme::matches
58 AlwaysUse,
59
60 /// The estimated compression ratio. This must be greater than `1.0` to be considered by the
61 /// compressor, otherwise it is worse than the canonical encoding.
62 Ratio(f64),
63
64 /// The scheme cannot cheaply estimate its ratio, so the compressor should compress a small
65 /// sample to determine effectiveness.
66 Sample,
67
68 /// A fallible estimation requiring a custom expensive computation. The compressor will call the
69 /// closure and handle the result.
70 ///
71 /// Use this only when the scheme needs to perform trial encoding or other costly checks to
72 /// determine its compression ratio.
73 ///
74 /// The estimation function must **not** return a [`Sample`](CompressionEstimate::Sample) or
75 /// [`Estimate`](CompressionEstimate::Estimate) variant to ensure the estimation process is
76 /// bounded.
77 Estimate(Box<EstimateFn>),
78}
79
80/// Returns `true` if `ratio` is a valid compression ratio (> 1.0, finite, not subnormal) that
81/// beats the current best.
82pub(super) fn is_better_ratio(ratio: f64, best: &Option<(&'static dyn Scheme, f64)>) -> bool {
83 ratio.is_finite() && !ratio.is_subnormal() && ratio > 1.0 && best.is_none_or(|(_, r)| ratio > r)
84}
85
86/// Estimates compression ratio by compressing a ~1% sample of the data.
87///
88/// Creates a new [`ArrayAndStats`] for the sample so that stats are generated from the sample, not
89/// the full array.
90///
91/// # Errors
92///
93/// Returns an error if sample compression fails.
94pub(super) fn estimate_compression_ratio_with_sampling<S: Scheme + ?Sized>(
95 scheme: &S,
96 compressor: &CascadingCompressor,
97 array: &ArrayRef,
98 ctx: CompressorContext,
99) -> VortexResult<f64> {
100 let sample_array = if ctx.is_sample() {
101 array.clone()
102 } else {
103 let source_len = array.len();
104 let sample_count = sample_count_approx_one_percent(source_len);
105
106 tracing::trace!(
107 "Sampling {} values out of {}",
108 SAMPLE_SIZE as u64 * sample_count as u64,
109 source_len
110 );
111
112 // `ArrayAndStats` expects a canonical array (so that it can easily compute lazy stats).
113 let canonical: Canonical =
114 sample(array, SAMPLE_SIZE, sample_count).execute(&mut compressor.execution_ctx())?;
115 canonical.into_array()
116 };
117
118 let mut sample_data = ArrayAndStats::new(sample_array, scheme.stats_options());
119 let sample_ctx = ctx.with_sampling();
120
121 let after = scheme
122 .compress(compressor, &mut sample_data, sample_ctx)?
123 .nbytes();
124 let before = sample_data.array().nbytes();
125
126 // TODO(connor): Issue https://github.com/vortex-data/vortex/issues/7268.
127 // if after == 0 {
128 // tracing::warn!(
129 // scheme = %scheme.id(),
130 // "sample compressed to 0 bytes, which should only happen for constant arrays",
131 // );
132 // }
133
134 let ratio = before as f64 / after as f64;
135
136 tracing::debug!("estimate_compression_ratio_with_sampling(compressor={scheme:#?}) = {ratio}",);
137
138 Ok(ratio)
139}
140
141impl fmt::Debug for CompressionEstimate {
142 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
143 match self {
144 CompressionEstimate::Skip => write!(f, "Skip"),
145 CompressionEstimate::AlwaysUse => write!(f, "AlwaysUse"),
146 CompressionEstimate::Ratio(r) => f.debug_tuple("Ratio").field(r).finish(),
147 CompressionEstimate::Sample => write!(f, "Sample"),
148 CompressionEstimate::Estimate(_) => write!(f, "Estimate(..)"),
149 }
150 }
151}