Skip to main content

vortex_compressor/builtins/dict/
string.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! UTF8-specific dictionary encoding implementation.
5//!
6//! Vortex encoders must always produce unsigned integer codes; signed codes are only accepted
7//! for external compatibility.
8
9use vortex_array::ArrayRef;
10use vortex_array::Canonical;
11use vortex_array::ExecutionCtx;
12use vortex_array::IntoArray;
13use vortex_array::arrays::DictArray;
14use vortex_array::arrays::PrimitiveArray;
15use vortex_array::arrays::dict::DictArrayExt;
16use vortex_array::arrays::dict::DictArraySlotsExt;
17use vortex_array::arrays::primitive::PrimitiveArrayExt;
18use vortex_array::builders::dict::dict_encode;
19use vortex_error::VortexExpect;
20use vortex_error::VortexResult;
21
22use crate::CascadingCompressor;
23use crate::builtins::IntDictScheme;
24use crate::ctx::CompressorContext;
25use crate::estimate::CompressionEstimate;
26use crate::estimate::DeferredEstimate;
27use crate::estimate::EstimateVerdict;
28use crate::scheme::ChildSelection;
29use crate::scheme::DescendantExclusion;
30use crate::scheme::Scheme;
31use crate::scheme::SchemeExt;
32use crate::stats::ArrayAndStats;
33use crate::stats::GenerateStatsOptions;
34
35/// Dictionary encoding for low-cardinality string values.
36#[derive(Debug, Copy, Clone, PartialEq, Eq)]
37pub struct StringDictScheme;
38
39impl Scheme for StringDictScheme {
40    fn scheme_name(&self) -> &'static str {
41        "vortex.string.dict"
42    }
43
44    fn matches(&self, canonical: &Canonical) -> bool {
45        canonical.dtype().is_utf8()
46    }
47
48    fn stats_options(&self) -> GenerateStatsOptions {
49        GenerateStatsOptions {
50            count_distinct_values: true,
51        }
52    }
53
54    /// Children: values=0, codes=1.
55    fn num_children(&self) -> usize {
56        2
57    }
58
59    /// String dict codes (child 1) are compact unsigned integers that should not be dict-encoded
60    /// again.
61    ///
62    /// Additional exclusions for codes (IntSequenceScheme, FoRScheme, ZigZagScheme, SparseScheme,
63    /// RunEndScheme, RLE, etc.) are expressed as pull rules on those schemes in `vortex-btrblocks`.
64    fn descendant_exclusions(&self) -> Vec<DescendantExclusion> {
65        vec![DescendantExclusion {
66            excluded: IntDictScheme.id(),
67            children: ChildSelection::One(1),
68        }]
69    }
70
71    fn expected_compression_ratio(
72        &self,
73        data: &ArrayAndStats,
74        _compress_ctx: CompressorContext,
75        exec_ctx: &mut ExecutionCtx,
76    ) -> CompressionEstimate {
77        let stats = data.varbinview_stats(exec_ctx);
78
79        if stats.value_count() == 0 {
80            return CompressionEstimate::Verdict(EstimateVerdict::Skip);
81        }
82
83        let estimated_distinct_values_count = stats.estimated_distinct_count().vortex_expect(
84            "this must be present since `DictScheme` declared that we need distinct values",
85        );
86
87        // If > 50% of the values are distinct, skip dictionary scheme.
88        if estimated_distinct_values_count > stats.value_count() / 2 {
89            return CompressionEstimate::Verdict(EstimateVerdict::Skip);
90        }
91
92        // Let sampling determine the expected ratio.
93        CompressionEstimate::Deferred(DeferredEstimate::Sample)
94    }
95
96    fn compress(
97        &self,
98        compressor: &CascadingCompressor,
99        data: &ArrayAndStats,
100        compress_ctx: CompressorContext,
101        exec_ctx: &mut ExecutionCtx,
102    ) -> VortexResult<ArrayRef> {
103        let dict = dict_encode(data.array(), exec_ctx)?;
104
105        // Values = child 0.
106        let compressed_values =
107            compressor.compress_child(dict.values(), &compress_ctx, self.id(), 0, exec_ctx)?;
108
109        // Codes = child 1.
110        let narrowed_codes = dict
111            .codes()
112            .clone()
113            .execute::<PrimitiveArray>(exec_ctx)?
114            .narrow(exec_ctx)?
115            .into_array();
116        let compressed_codes =
117            compressor.compress_child(&narrowed_codes, &compress_ctx, self.id(), 1, exec_ctx)?;
118
119        // SAFETY: compressing codes or values does not alter the invariants.
120        unsafe {
121            Ok(
122                DictArray::new_unchecked(compressed_codes, compressed_values)
123                    .set_all_values_referenced(dict.has_all_values_referenced())
124                    .into_array(),
125            )
126        }
127    }
128}