Skip to main content

vortex_compressor/builtins/constant/
string.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Constant encoding for string arrays.
5
6use vortex_array::ArrayRef;
7use vortex_array::Canonical;
8use vortex_array::ExecutionCtx;
9use vortex_array::aggregate_fn::fns::is_constant::is_constant;
10use vortex_error::VortexResult;
11
12use super::is_utf8_string;
13use crate::CascadingCompressor;
14use crate::builtins::StringConstantScheme;
15use crate::builtins::constant::compress_constant_array_with_validity;
16use crate::ctx::CompressorContext;
17use crate::estimate::CompressionEstimate;
18use crate::estimate::DeferredEstimate;
19use crate::estimate::EstimateVerdict;
20use crate::scheme::Scheme;
21use crate::stats::ArrayAndStats;
22
23impl Scheme for StringConstantScheme {
24    fn scheme_name(&self) -> &'static str {
25        "vortex.string.constant"
26    }
27
28    fn matches(&self, canonical: &Canonical) -> bool {
29        is_utf8_string(canonical)
30    }
31
32    fn expected_compression_ratio(
33        &self,
34        data: &ArrayAndStats,
35        compress_ctx: CompressorContext,
36        exec_ctx: &mut ExecutionCtx,
37    ) -> CompressionEstimate {
38        // Constant detection on a sample is a false positive, since the sample being constant does
39        // not mean the full array is constant.
40        if compress_ctx.is_sample() {
41            return CompressionEstimate::Verdict(EstimateVerdict::Skip);
42        }
43
44        let array_len = data.array().len();
45        let stats = data.string_stats(exec_ctx);
46
47        // We want to use `Constant` if there are only nulls in the array.
48        if stats.value_count() == 0 {
49            debug_assert_eq!(stats.null_count() as usize, array_len);
50            return CompressionEstimate::Verdict(EstimateVerdict::AlwaysUse);
51        }
52
53        // Since the estimated distinct count is always going to be less than or equal to the actual
54        // distinct count, if this is not equal to 1 the actual is definitely not equal to 1.
55        if stats.estimated_distinct_count().is_some_and(|c| c > 1) {
56            return CompressionEstimate::Verdict(EstimateVerdict::Skip);
57        }
58
59        // Otherwise our best bet is to actually check if the array is constant.
60        // This is an expensive check, but the alternative of not compressing a constant array is
61        // far less preferable.
62        CompressionEstimate::Deferred(DeferredEstimate::Callback(Box::new(
63            |_compressor, data, _best_so_far, _ctx, exec_ctx| {
64                if is_constant(data.array(), exec_ctx)? {
65                    Ok(EstimateVerdict::AlwaysUse)
66                } else {
67                    Ok(EstimateVerdict::Skip)
68                }
69            },
70        )))
71    }
72
73    fn compress(
74        &self,
75        _compressor: &CascadingCompressor,
76        data: &ArrayAndStats,
77        _compress_ctx: CompressorContext,
78        exec_ctx: &mut ExecutionCtx,
79    ) -> VortexResult<ArrayRef> {
80        compress_constant_array_with_validity(data.array(), exec_ctx)
81    }
82}