vortex_compressor/builtins/dict/
string.rs1use vortex_array::ArrayRef;
10use vortex_array::Canonical;
11use vortex_array::IntoArray;
12use vortex_array::arrays::DictArray;
13use vortex_array::arrays::PrimitiveArray;
14use vortex_array::arrays::dict::DictArrayExt;
15use vortex_array::arrays::dict::DictArraySlotsExt;
16use vortex_array::arrays::primitive::PrimitiveArrayExt;
17use vortex_array::builders::dict::dict_encode;
18use vortex_error::VortexExpect;
19use vortex_error::VortexResult;
20
21use crate::CascadingCompressor;
22use crate::builtins::IntDictScheme;
23use crate::builtins::StringDictScheme;
24use crate::builtins::is_utf8_string;
25use crate::ctx::CompressorContext;
26use crate::estimate::CompressionEstimate;
27use crate::estimate::DeferredEstimate;
28use crate::estimate::EstimateVerdict;
29use crate::scheme::ChildSelection;
30use crate::scheme::DescendantExclusion;
31use crate::scheme::Scheme;
32use crate::scheme::SchemeExt;
33use crate::stats::ArrayAndStats;
34use crate::stats::GenerateStatsOptions;
35
36impl Scheme for StringDictScheme {
37 fn scheme_name(&self) -> &'static str {
38 "vortex.string.dict"
39 }
40
41 fn matches(&self, canonical: &Canonical) -> bool {
42 is_utf8_string(canonical)
43 }
44
45 fn stats_options(&self) -> GenerateStatsOptions {
46 GenerateStatsOptions {
47 count_distinct_values: true,
48 }
49 }
50
51 fn num_children(&self) -> usize {
53 2
54 }
55
56 fn descendant_exclusions(&self) -> Vec<DescendantExclusion> {
62 vec![DescendantExclusion {
63 excluded: IntDictScheme.id(),
64 children: ChildSelection::One(1),
65 }]
66 }
67
68 fn expected_compression_ratio(
69 &self,
70 data: &mut ArrayAndStats,
71 _ctx: CompressorContext,
72 ) -> CompressionEstimate {
73 let stats = data.string_stats();
74
75 if stats.value_count() == 0 {
76 return CompressionEstimate::Verdict(EstimateVerdict::Skip);
77 }
78
79 let estimated_distinct_values_count = stats.estimated_distinct_count().vortex_expect(
80 "this must be present since `DictScheme` declared that we need distinct values",
81 );
82
83 if estimated_distinct_values_count > stats.value_count() / 2 {
85 return CompressionEstimate::Verdict(EstimateVerdict::Skip);
86 }
87
88 CompressionEstimate::Deferred(DeferredEstimate::Sample)
90 }
91
92 fn compress(
93 &self,
94 compressor: &CascadingCompressor,
95 data: &mut ArrayAndStats,
96 ctx: CompressorContext,
97 ) -> VortexResult<ArrayRef> {
98 let dict = dict_encode(data.array())?;
99
100 let compressed_values = compressor.compress_child(dict.values(), &ctx, self.id(), 0)?;
102
103 let narrowed_codes = dict
105 .codes()
106 .clone()
107 .execute::<PrimitiveArray>(&mut compressor.execution_ctx())?
108 .narrow()?
109 .into_array();
110 let compressed_codes = compressor.compress_child(&narrowed_codes, &ctx, self.id(), 1)?;
111
112 unsafe {
114 Ok(
115 DictArray::new_unchecked(compressed_codes, compressed_values)
116 .set_all_values_referenced(dict.has_all_values_referenced())
117 .into_array(),
118 )
119 }
120 }
121}