vortex_compressor/builtins/dict/
string.rs1use vortex_array::ArrayRef;
10use vortex_array::Canonical;
11use vortex_array::IntoArray;
12use vortex_array::arrays::DictArray;
13use vortex_array::arrays::PrimitiveArray;
14use vortex_array::arrays::dict::DictArrayExt;
15use vortex_array::arrays::dict::DictArraySlotsExt;
16use vortex_array::arrays::primitive::PrimitiveArrayExt;
17use vortex_array::builders::dict::dict_encode;
18use vortex_error::VortexExpect;
19use vortex_error::VortexResult;
20
21use crate::CascadingCompressor;
22use crate::builtins::IntDictScheme;
23use crate::builtins::StringDictScheme;
24use crate::builtins::is_utf8_string;
25use crate::ctx::CompressorContext;
26use crate::estimate::CompressionEstimate;
27use crate::scheme::ChildSelection;
28use crate::scheme::DescendantExclusion;
29use crate::scheme::Scheme;
30use crate::scheme::SchemeExt;
31use crate::stats::ArrayAndStats;
32use crate::stats::GenerateStatsOptions;
33
34impl Scheme for StringDictScheme {
35 fn scheme_name(&self) -> &'static str {
36 "vortex.string.dict"
37 }
38
39 fn matches(&self, canonical: &Canonical) -> bool {
40 is_utf8_string(canonical)
41 }
42
43 fn stats_options(&self) -> GenerateStatsOptions {
44 GenerateStatsOptions {
45 count_distinct_values: true,
46 }
47 }
48
49 fn num_children(&self) -> usize {
51 2
52 }
53
54 fn descendant_exclusions(&self) -> Vec<DescendantExclusion> {
60 vec![DescendantExclusion {
61 excluded: IntDictScheme.id(),
62 children: ChildSelection::One(1),
63 }]
64 }
65
66 fn expected_compression_ratio(
67 &self,
68 data: &mut ArrayAndStats,
69 _ctx: CompressorContext,
70 ) -> CompressionEstimate {
71 let stats = data.string_stats();
72
73 if stats.value_count() == 0 {
74 return CompressionEstimate::Skip;
75 }
76
77 let estimated_distinct_values_count = stats.estimated_distinct_count().vortex_expect(
78 "this must be present since `DictScheme` declared that we need distinct values",
79 );
80
81 if estimated_distinct_values_count > stats.value_count() / 2 {
83 return CompressionEstimate::Skip;
84 }
85
86 CompressionEstimate::Sample
88 }
89
90 fn compress(
91 &self,
92 compressor: &CascadingCompressor,
93 data: &mut ArrayAndStats,
94 ctx: CompressorContext,
95 ) -> VortexResult<ArrayRef> {
96 let dict = dict_encode(data.array())?;
97
98 let compressed_values = compressor.compress_child(dict.values(), &ctx, self.id(), 0)?;
100
101 let narrowed_codes = dict
103 .codes()
104 .clone()
105 .execute::<PrimitiveArray>(&mut compressor.execution_ctx())?
106 .narrow()?
107 .into_array();
108 let compressed_codes = compressor.compress_child(&narrowed_codes, &ctx, self.id(), 1)?;
109
110 unsafe {
112 Ok(
113 DictArray::new_unchecked(compressed_codes, compressed_values)
114 .set_all_values_referenced(dict.has_all_values_referenced())
115 .into_array(),
116 )
117 }
118 }
119}