Skip to main content

vortex_compressor/builtins/dict/
string.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! UTF8-specific dictionary encoding implementation.
5//!
6//! Vortex encoders must always produce unsigned integer codes; signed codes are only accepted
7//! for external compatibility.
8
9use vortex_array::ArrayRef;
10use vortex_array::Canonical;
11use vortex_array::IntoArray;
12use vortex_array::arrays::DictArray;
13use vortex_array::arrays::PrimitiveArray;
14use vortex_array::arrays::dict::DictArrayExt;
15use vortex_array::arrays::dict::DictArraySlotsExt;
16use vortex_array::arrays::primitive::PrimitiveArrayExt;
17use vortex_array::builders::dict::dict_encode;
18use vortex_error::VortexExpect;
19use vortex_error::VortexResult;
20
21use crate::CascadingCompressor;
22use crate::builtins::IntDictScheme;
23use crate::builtins::StringDictScheme;
24use crate::builtins::is_utf8_string;
25use crate::ctx::CompressorContext;
26use crate::estimate::CompressionEstimate;
27use crate::estimate::DeferredEstimate;
28use crate::estimate::EstimateVerdict;
29use crate::scheme::ChildSelection;
30use crate::scheme::DescendantExclusion;
31use crate::scheme::Scheme;
32use crate::scheme::SchemeExt;
33use crate::stats::ArrayAndStats;
34use crate::stats::GenerateStatsOptions;
35
36impl Scheme for StringDictScheme {
37    fn scheme_name(&self) -> &'static str {
38        "vortex.string.dict"
39    }
40
41    fn matches(&self, canonical: &Canonical) -> bool {
42        is_utf8_string(canonical)
43    }
44
45    fn stats_options(&self) -> GenerateStatsOptions {
46        GenerateStatsOptions {
47            count_distinct_values: true,
48        }
49    }
50
51    /// Children: values=0, codes=1.
52    fn num_children(&self) -> usize {
53        2
54    }
55
56    /// String dict codes (child 1) are compact unsigned integers that should not be dict-encoded
57    /// again.
58    ///
59    /// Additional exclusions for codes (IntSequenceScheme, FoRScheme, ZigZagScheme, SparseScheme,
60    /// RunEndScheme, RLE, etc.) are expressed as pull rules on those schemes in `vortex-btrblocks`.
61    fn descendant_exclusions(&self) -> Vec<DescendantExclusion> {
62        vec![DescendantExclusion {
63            excluded: IntDictScheme.id(),
64            children: ChildSelection::One(1),
65        }]
66    }
67
68    fn expected_compression_ratio(
69        &self,
70        data: &mut ArrayAndStats,
71        _ctx: CompressorContext,
72    ) -> CompressionEstimate {
73        let stats = data.string_stats();
74
75        if stats.value_count() == 0 {
76            return CompressionEstimate::Verdict(EstimateVerdict::Skip);
77        }
78
79        let estimated_distinct_values_count = stats.estimated_distinct_count().vortex_expect(
80            "this must be present since `DictScheme` declared that we need distinct values",
81        );
82
83        // If > 50% of the values are distinct, skip dictionary scheme.
84        if estimated_distinct_values_count > stats.value_count() / 2 {
85            return CompressionEstimate::Verdict(EstimateVerdict::Skip);
86        }
87
88        // Let sampling determine the expected ratio.
89        CompressionEstimate::Deferred(DeferredEstimate::Sample)
90    }
91
92    fn compress(
93        &self,
94        compressor: &CascadingCompressor,
95        data: &mut ArrayAndStats,
96        ctx: CompressorContext,
97    ) -> VortexResult<ArrayRef> {
98        let dict = dict_encode(data.array())?;
99
100        // Values = child 0.
101        let compressed_values = compressor.compress_child(dict.values(), &ctx, self.id(), 0)?;
102
103        // Codes = child 1.
104        let narrowed_codes = dict
105            .codes()
106            .clone()
107            .execute::<PrimitiveArray>(&mut compressor.execution_ctx())?
108            .narrow()?
109            .into_array();
110        let compressed_codes = compressor.compress_child(&narrowed_codes, &ctx, self.id(), 1)?;
111
112        // SAFETY: compressing codes or values does not alter the invariants.
113        unsafe {
114            Ok(
115                DictArray::new_unchecked(compressed_codes, compressed_values)
116                    .set_all_values_referenced(dict.has_all_values_referenced())
117                    .into_array(),
118            )
119        }
120    }
121}