Skip to main content

vortex_compressor/builtins/constant/
binary.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Constant encoding for binary arrays.
5
6use vortex_array::ArrayRef;
7use vortex_array::Canonical;
8use vortex_array::ExecutionCtx;
9use vortex_array::aggregate_fn::fns::is_constant::is_constant;
10use vortex_error::VortexResult;
11
12use crate::CascadingCompressor;
13use crate::builtins::constant::compress_constant_array_with_validity;
14use crate::ctx::CompressorContext;
15use crate::estimate::CompressionEstimate;
16use crate::estimate::DeferredEstimate;
17use crate::estimate::EstimateVerdict;
18use crate::scheme::Scheme;
19use crate::stats::ArrayAndStats;
20
/// Compression scheme that stores a binary array as a constant when it holds only one distinct
/// value.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct BinaryConstantScheme;
24
25impl Scheme for BinaryConstantScheme {
26    fn scheme_name(&self) -> &'static str {
27        "vortex.binary.constant"
28    }
29
30    fn matches(&self, canonical: &Canonical) -> bool {
31        canonical.dtype().is_binary()
32    }
33
34    fn expected_compression_ratio(
35        &self,
36        data: &ArrayAndStats,
37        compress_ctx: CompressorContext,
38        exec_ctx: &mut ExecutionCtx,
39    ) -> CompressionEstimate {
40        // Constant detection on a sample is a false positive, since the sample being constant does
41        // not mean the full array is constant.
42        if compress_ctx.is_sample() {
43            return CompressionEstimate::Verdict(EstimateVerdict::Skip);
44        }
45
46        let array_len = data.array().len();
47        let stats = data.varbinview_stats(exec_ctx);
48
49        // We want to use `Constant` if there are only nulls in the array.
50        if stats.value_count() == 0 {
51            debug_assert_eq!(stats.null_count() as usize, array_len);
52            return CompressionEstimate::Verdict(EstimateVerdict::AlwaysUse);
53        }
54
55        // Since the estimated distinct count is always going to be less than or equal to the actual
56        // distinct count, if this is not equal to 1 the actual is definitely not equal to 1.
57        if stats.estimated_distinct_count().is_some_and(|c| c > 1) {
58            return CompressionEstimate::Verdict(EstimateVerdict::Skip);
59        }
60
61        // Otherwise our best bet is to actually check if the array is constant.
62        // This is an expensive check, but the alternative of not compressing a constant array is
63        // far less preferable.
64        CompressionEstimate::Deferred(DeferredEstimate::Callback(Box::new(
65            |_compressor, data, _best_so_far, _ctx, exec_ctx| {
66                if is_constant(data.array(), exec_ctx)? {
67                    Ok(EstimateVerdict::AlwaysUse)
68                } else {
69                    Ok(EstimateVerdict::Skip)
70                }
71            },
72        )))
73    }
74
75    fn compress(
76        &self,
77        _compressor: &CascadingCompressor,
78        data: &ArrayAndStats,
79        _compress_ctx: CompressorContext,
80        exec_ctx: &mut ExecutionCtx,
81    ) -> VortexResult<ArrayRef> {
82        compress_constant_array_with_validity(data.array(), exec_ctx)
83    }
84}