Skip to main content

vortex_btrblocks/schemes/integer/
sequence.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Sequence integer encoding for sequential patterns.
5
6use vortex_array::ArrayRef;
7use vortex_array::Canonical;
8use vortex_array::ExecutionCtx;
9use vortex_compressor::builtins::BinaryDictScheme;
10use vortex_compressor::builtins::FloatDictScheme;
11use vortex_compressor::builtins::IntDictScheme;
12use vortex_compressor::builtins::StringDictScheme;
13use vortex_compressor::estimate::CompressionEstimate;
14use vortex_compressor::estimate::DeferredEstimate;
15use vortex_compressor::estimate::EstimateScore;
16use vortex_compressor::estimate::EstimateVerdict;
17use vortex_compressor::scheme::AncestorExclusion;
18use vortex_compressor::scheme::ChildSelection;
19use vortex_error::VortexResult;
20use vortex_error::vortex_bail;
21use vortex_error::vortex_err;
22use vortex_sequence::sequence_encode;
23
24use crate::ArrayAndStats;
25use crate::CascadingCompressor;
26use crate::CompressorContext;
27use crate::Scheme;
28use crate::SchemeExt;
29
30/// Sequence encoding for sequential patterns.
31#[derive(Debug, Copy, Clone, PartialEq, Eq)]
32pub struct SequenceScheme;
33
34impl Scheme for SequenceScheme {
35    fn scheme_name(&self) -> &'static str {
36        "vortex.int.sequence"
37    }
38
39    fn matches(&self, canonical: &Canonical) -> bool {
40        canonical.dtype().is_int()
41    }
42
43    /// Sequence encoding on dictionary codes just adds a layer of indirection without compressing
44    /// the data. Dict codes are compact integers that benefit from BitPacking or FoR, not from
45    /// sequence detection.
46    fn ancestor_exclusions(&self) -> Vec<AncestorExclusion> {
47        vec![
48            AncestorExclusion {
49                ancestor: IntDictScheme.id(),
50                children: ChildSelection::One(1),
51            },
52            AncestorExclusion {
53                ancestor: FloatDictScheme.id(),
54                children: ChildSelection::One(1),
55            },
56            AncestorExclusion {
57                ancestor: StringDictScheme.id(),
58                children: ChildSelection::One(1),
59            },
60            AncestorExclusion {
61                ancestor: BinaryDictScheme.id(),
62                children: ChildSelection::One(1),
63            },
64        ]
65    }
66
67    fn expected_compression_ratio(
68        &self,
69        data: &ArrayAndStats,
70        compress_ctx: CompressorContext,
71        exec_ctx: &mut ExecutionCtx,
72    ) -> CompressionEstimate {
73        // It is pointless checking if a sample is a sequence since it will not correspond to the
74        // entire array.
75        if compress_ctx.is_sample() {
76            return CompressionEstimate::Verdict(EstimateVerdict::Skip);
77        }
78        let stats = data.integer_stats(exec_ctx);
79
80        // `SequenceArray` does not support nulls.
81        if stats.null_count() > 0 {
82            return CompressionEstimate::Verdict(EstimateVerdict::Skip);
83        }
84
85        // If the distinct_values_count was computed, and not all values are unique, then this
86        // cannot be encoded as a sequence array.
87        if stats
88            .distinct_count()
89            .is_some_and(|count| count as usize != data.array_len())
90        {
91            return CompressionEstimate::Verdict(EstimateVerdict::Skip);
92        }
93
94        // TODO(connor): `sequence_encode` allocates the encoded array just to confirm feasibility.
95        // A cheaper `is_sequence` probe would let us skip the allocation entirely.
96        CompressionEstimate::Deferred(DeferredEstimate::Callback(Box::new(
97            |_compressor, data, best_so_far, _ctx, exec_ctx| {
98                // `SequenceArray` stores exactly two scalars (base and multiplier), so the best
99                // achievable compression ratio is `array_len / 2`.
100                let compressed_size = 2usize;
101                let max_ratio = data.array_len() as f64 / compressed_size as f64;
102
103                // If we cannot beat the best so far, then we do not want to even try sequence
104                // encoding the data.
105                let threshold = best_so_far.and_then(EstimateScore::finite_ratio);
106                if threshold.is_some_and(|t| max_ratio <= t) {
107                    return Ok(EstimateVerdict::Skip);
108                }
109
110                // TODO(connor): We should pass this array back to the compressor in the case that
111                // we do want to sequence encode this so that we do not need to recompress.
112                if sequence_encode(data.array_as_primitive(), exec_ctx)?.is_none() {
113                    return Ok(EstimateVerdict::Skip);
114                }
115                // TODO(connor): Should we get the actual ratio here?
116                Ok(EstimateVerdict::Ratio(max_ratio))
117            },
118        )))
119    }
120
121    fn compress(
122        &self,
123        _compressor: &CascadingCompressor,
124        data: &ArrayAndStats,
125        _compress_ctx: CompressorContext,
126        exec_ctx: &mut ExecutionCtx,
127    ) -> VortexResult<ArrayRef> {
128        let stats = data.integer_stats(exec_ctx);
129
130        if stats.null_count() > 0 {
131            vortex_bail!("sequence encoding does not support nulls");
132        }
133        sequence_encode(data.array_as_primitive(), exec_ctx)?
134            .ok_or_else(|| vortex_err!("cannot sequence encode array"))
135    }
136}