vortex_compressor/builtins/dict/
string.rs1use vortex_array::ArrayRef;
10use vortex_array::Canonical;
11use vortex_array::ExecutionCtx;
12use vortex_array::IntoArray;
13use vortex_array::arrays::DictArray;
14use vortex_array::arrays::PrimitiveArray;
15use vortex_array::arrays::dict::DictArrayExt;
16use vortex_array::arrays::dict::DictArraySlotsExt;
17use vortex_array::arrays::primitive::PrimitiveArrayExt;
18use vortex_array::builders::dict::dict_encode;
19use vortex_error::VortexExpect;
20use vortex_error::VortexResult;
21
22use crate::CascadingCompressor;
23use crate::builtins::IntDictScheme;
24use crate::builtins::StringDictScheme;
25use crate::builtins::is_utf8_string;
26use crate::ctx::CompressorContext;
27use crate::estimate::CompressionEstimate;
28use crate::estimate::DeferredEstimate;
29use crate::estimate::EstimateVerdict;
30use crate::scheme::ChildSelection;
31use crate::scheme::DescendantExclusion;
32use crate::scheme::Scheme;
33use crate::scheme::SchemeExt;
34use crate::stats::ArrayAndStats;
35use crate::stats::GenerateStatsOptions;
36
37impl Scheme for StringDictScheme {
38 fn scheme_name(&self) -> &'static str {
39 "vortex.string.dict"
40 }
41
42 fn matches(&self, canonical: &Canonical) -> bool {
43 is_utf8_string(canonical)
44 }
45
46 fn stats_options(&self) -> GenerateStatsOptions {
47 GenerateStatsOptions {
48 count_distinct_values: true,
49 }
50 }
51
52 fn num_children(&self) -> usize {
54 2
55 }
56
57 fn descendant_exclusions(&self) -> Vec<DescendantExclusion> {
63 vec![DescendantExclusion {
64 excluded: IntDictScheme.id(),
65 children: ChildSelection::One(1),
66 }]
67 }
68
69 fn expected_compression_ratio(
70 &self,
71 data: &ArrayAndStats,
72 _compress_ctx: CompressorContext,
73 exec_ctx: &mut ExecutionCtx,
74 ) -> CompressionEstimate {
75 let stats = data.string_stats(exec_ctx);
76
77 if stats.value_count() == 0 {
78 return CompressionEstimate::Verdict(EstimateVerdict::Skip);
79 }
80
81 let estimated_distinct_values_count = stats.estimated_distinct_count().vortex_expect(
82 "this must be present since `DictScheme` declared that we need distinct values",
83 );
84
85 if estimated_distinct_values_count > stats.value_count() / 2 {
87 return CompressionEstimate::Verdict(EstimateVerdict::Skip);
88 }
89
90 CompressionEstimate::Deferred(DeferredEstimate::Sample)
92 }
93
94 fn compress(
95 &self,
96 compressor: &CascadingCompressor,
97 data: &ArrayAndStats,
98 compress_ctx: CompressorContext,
99 exec_ctx: &mut ExecutionCtx,
100 ) -> VortexResult<ArrayRef> {
101 let dict = dict_encode(data.array())?;
102
103 let compressed_values =
105 compressor.compress_child(dict.values(), &compress_ctx, self.id(), 0, exec_ctx)?;
106
107 let narrowed_codes = dict
109 .codes()
110 .clone()
111 .execute::<PrimitiveArray>(exec_ctx)?
112 .narrow()?
113 .into_array();
114 let compressed_codes =
115 compressor.compress_child(&narrowed_codes, &compress_ctx, self.id(), 1, exec_ctx)?;
116
117 unsafe {
119 Ok(
120 DictArray::new_unchecked(compressed_codes, compressed_values)
121 .set_all_values_referenced(dict.has_all_values_referenced())
122 .into_array(),
123 )
124 }
125 }
126}