vortex_compressor/builtins/dict/
binary.rs1use vortex_array::ArrayRef;
10use vortex_array::Canonical;
11use vortex_array::ExecutionCtx;
12use vortex_array::IntoArray;
13use vortex_array::arrays::DictArray;
14use vortex_array::arrays::PrimitiveArray;
15use vortex_array::arrays::dict::DictArrayExt;
16use vortex_array::arrays::dict::DictArraySlotsExt;
17use vortex_array::arrays::primitive::PrimitiveArrayExt;
18use vortex_array::builders::dict::dict_encode;
19use vortex_error::VortexExpect;
20use vortex_error::VortexResult;
21
22use crate::CascadingCompressor;
23use crate::builtins::IntDictScheme;
24use crate::ctx::CompressorContext;
25use crate::estimate::CompressionEstimate;
26use crate::estimate::DeferredEstimate;
27use crate::estimate::EstimateVerdict;
28use crate::scheme::ChildSelection;
29use crate::scheme::DescendantExclusion;
30use crate::scheme::Scheme;
31use crate::scheme::SchemeExt;
32use crate::stats::ArrayAndStats;
33use crate::stats::GenerateStatsOptions;
34
35#[derive(Debug, Copy, Clone, PartialEq, Eq)]
37pub struct BinaryDictScheme;
38
39impl Scheme for BinaryDictScheme {
40 fn scheme_name(&self) -> &'static str {
41 "vortex.binary.dict"
42 }
43
44 fn matches(&self, canonical: &Canonical) -> bool {
45 canonical.dtype().is_binary()
46 }
47
48 fn stats_options(&self) -> GenerateStatsOptions {
49 GenerateStatsOptions {
50 count_distinct_values: true,
51 }
52 }
53
54 fn num_children(&self) -> usize {
56 2
57 }
58
59 fn descendant_exclusions(&self) -> Vec<DescendantExclusion> {
65 vec![DescendantExclusion {
66 excluded: IntDictScheme.id(),
67 children: ChildSelection::One(1),
68 }]
69 }
70
71 fn expected_compression_ratio(
72 &self,
73 data: &ArrayAndStats,
74 _compress_ctx: CompressorContext,
75 exec_ctx: &mut ExecutionCtx,
76 ) -> CompressionEstimate {
77 let stats = data.varbinview_stats(exec_ctx);
78
79 if stats.value_count() == 0 {
80 return CompressionEstimate::Verdict(EstimateVerdict::Skip);
81 }
82
83 let estimated_distinct_values_count = stats.estimated_distinct_count().vortex_expect(
84 "this must be present since `DictScheme` declared that we need distinct values",
85 );
86
87 if estimated_distinct_values_count > stats.value_count() / 2 {
89 return CompressionEstimate::Verdict(EstimateVerdict::Skip);
90 }
91
92 CompressionEstimate::Deferred(DeferredEstimate::Sample)
94 }
95
96 fn compress(
97 &self,
98 compressor: &CascadingCompressor,
99 data: &ArrayAndStats,
100 compress_ctx: CompressorContext,
101 exec_ctx: &mut ExecutionCtx,
102 ) -> VortexResult<ArrayRef> {
103 let dict = dict_encode(data.array(), exec_ctx)?;
104
105 let compressed_values =
107 compressor.compress_child(dict.values(), &compress_ctx, self.id(), 0, exec_ctx)?;
108
109 let narrowed_codes = dict
111 .codes()
112 .clone()
113 .execute::<PrimitiveArray>(exec_ctx)?
114 .narrow(exec_ctx)?
115 .into_array();
116 let compressed_codes =
117 compressor.compress_child(&narrowed_codes, &compress_ctx, self.id(), 1, exec_ctx)?;
118
119 unsafe {
121 Ok(
122 DictArray::new_unchecked(compressed_codes, compressed_values)
123 .set_all_values_referenced(dict.has_all_values_referenced())
124 .into_array(),
125 )
126 }
127 }
128}