Skip to main content

vortex_compressor/builtins/constant/
string.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Constant encoding for string arrays.
5
6use vortex_array::ArrayRef;
7use vortex_array::Canonical;
8use vortex_array::aggregate_fn::fns::is_constant::is_constant;
9use vortex_error::VortexResult;
10
11use super::is_utf8_string;
12use crate::CascadingCompressor;
13use crate::builtins::StringConstantScheme;
14use crate::builtins::constant::compress_constant_array_with_validity;
15use crate::ctx::CompressorContext;
16use crate::estimate::CompressionEstimate;
17use crate::estimate::DeferredEstimate;
18use crate::estimate::EstimateVerdict;
19use crate::scheme::Scheme;
20use crate::stats::ArrayAndStats;
21
22impl Scheme for StringConstantScheme {
23    fn scheme_name(&self) -> &'static str {
24        "vortex.string.constant"
25    }
26
27    fn matches(&self, canonical: &Canonical) -> bool {
28        is_utf8_string(canonical)
29    }
30
31    fn expected_compression_ratio(
32        &self,
33        data: &mut ArrayAndStats,
34        ctx: CompressorContext,
35    ) -> CompressionEstimate {
36        // Constant detection on a sample is a false positive, since the sample being constant does
37        // not mean the full array is constant.
38        if ctx.is_sample() {
39            return CompressionEstimate::Verdict(EstimateVerdict::Skip);
40        }
41
42        let array_len = data.array().len();
43        let stats = data.string_stats();
44
45        // We want to use `Constant` if there are only nulls in the array.
46        if stats.value_count() == 0 {
47            debug_assert_eq!(stats.null_count() as usize, array_len);
48            return CompressionEstimate::Verdict(EstimateVerdict::AlwaysUse);
49        }
50
51        // Since the estimated distinct count is always going to be less than or equal to the actual
52        // distinct count, if this is not equal to 1 the actual is definitely not equal to 1.
53        if stats.estimated_distinct_count().is_some_and(|c| c > 1) {
54            return CompressionEstimate::Verdict(EstimateVerdict::Skip);
55        }
56
57        // Otherwise our best bet is to actually check if the array is constant.
58        // This is an expensive check, but the alternative of not compressing a constant array is
59        // far less preferable.
60        CompressionEstimate::Deferred(DeferredEstimate::Callback(Box::new(
61            |compressor, data, _ctx| {
62                if is_constant(data.array(), &mut compressor.execution_ctx())? {
63                    Ok(EstimateVerdict::AlwaysUse)
64                } else {
65                    Ok(EstimateVerdict::Skip)
66                }
67            },
68        )))
69    }
70
71    fn compress(
72        &self,
73        _compressor: &CascadingCompressor,
74        data: &mut ArrayAndStats,
75        _ctx: CompressorContext,
76    ) -> VortexResult<ArrayRef> {
77        compress_constant_array_with_validity(data.array())
78    }
79}