use vortex_array::ArrayRef;
use vortex_array::Canonical;
use vortex_array::IntoArray;
use vortex_array::arrays::DictArray;
use vortex_array::arrays::PrimitiveArray;
use vortex_array::arrays::dict::DictArrayExt;
use vortex_array::arrays::dict::DictArraySlotsExt;
use vortex_array::arrays::primitive::PrimitiveArrayExt;
use vortex_array::builders::dict::dict_encode;
use vortex_error::VortexExpect;
use vortex_error::VortexResult;
use crate::CascadingCompressor;
use crate::builtins::IntDictScheme;
use crate::builtins::StringDictScheme;
use crate::builtins::is_utf8_string;
use crate::ctx::CompressorContext;
use crate::estimate::CompressionEstimate;
use crate::scheme::ChildSelection;
use crate::scheme::DescendantExclusion;
use crate::scheme::Scheme;
use crate::scheme::SchemeExt;
use crate::stats::ArrayAndStats;
use crate::stats::GenerateStatsOptions;
/// [`Scheme`] implementation that dictionary-encodes arrays accepted by `is_utf8_string`.
///
/// When cascading into children, slot `0` carries the dictionary values and slot `1`
/// carries the codes (see [`Scheme::compress`] below).
impl Scheme for StringDictScheme {
    /// Returns the name of this scheme.
    fn scheme_name(&self) -> &'static str {
        "vortex.string.dict"
    }

    /// Reports whether this scheme applies to `canonical`, delegating to `is_utf8_string`.
    fn matches(&self, canonical: &Canonical) -> bool {
        is_utf8_string(canonical)
    }

    /// Requests distinct-value counting, which `expected_compression_ratio` relies on.
    fn stats_options(&self) -> GenerateStatsOptions {
        GenerateStatsOptions {
            count_distinct_values: true,
        }
    }

    /// Two children: values (index 0) and codes (index 1).
    fn num_children(&self) -> usize {
        2
    }

    /// Excludes `IntDictScheme` from the descendants of child 1 (the codes).
    ///
    /// NOTE(review): presumably this prevents dictionary-encoding the codes a second
    /// time; confirm against the exclusion semantics in `crate::scheme`.
    fn descendant_exclusions(&self) -> Vec<DescendantExclusion> {
        let codes_exclusion = DescendantExclusion {
            excluded: IntDictScheme.id(),
            children: ChildSelection::One(1),
        };
        vec![codes_exclusion]
    }

    /// Decides whether dictionary encoding is worth sampling for `data`.
    ///
    /// Returns [`CompressionEstimate::Skip`] when the string stats report zero values, or
    /// when the estimated distinct count is greater than half of the value count (integer
    /// division). Otherwise returns [`CompressionEstimate::Sample`].
    ///
    /// A missing distinct-count estimate is treated as an invariant violation via
    /// `vortex_expect`, since `stats_options` asks for it.
    fn expected_compression_ratio(
        &self,
        data: &mut ArrayAndStats,
        _ctx: CompressorContext,
    ) -> CompressionEstimate {
        let stats = data.string_stats();

        // Nothing to encode; bail out before touching the distinct-count estimate.
        let total_values = stats.value_count();
        if total_values == 0 {
            return CompressionEstimate::Skip;
        }

        let distinct_estimate = stats.estimated_distinct_count().vortex_expect(
            "this must be present since `DictScheme` declared that we need distinct values",
        );

        // More than half of the values being distinct rules the dictionary out.
        if distinct_estimate > total_values / 2 {
            CompressionEstimate::Skip
        } else {
            CompressionEstimate::Sample
        }
    }

    /// Dictionary-encodes `data` and cascades compression into both children.
    ///
    /// The values are compressed as child `0`. The codes are executed into a
    /// `PrimitiveArray`, narrowed, and then compressed as child `1`. The result is a
    /// `DictArray` rebuilt from the two compressed children, carrying over the
    /// "all values referenced" flag of the freshly encoded dictionary.
    ///
    /// # Errors
    ///
    /// Propagates any error from `dict_encode`, from either `compress_child` call, or from
    /// executing / narrowing the codes.
    fn compress(
        &self,
        compressor: &CascadingCompressor,
        data: &mut ArrayAndStats,
        ctx: CompressorContext,
    ) -> VortexResult<ArrayRef> {
        let encoded = dict_encode(data.array())?;

        // Child 0: the dictionary values.
        let values_child = compressor.compress_child(encoded.values(), &ctx, self.id(), 0)?;

        // Child 1: the codes, narrowed before being handed to the child compressor.
        let primitive_codes = encoded
            .codes()
            .clone()
            .execute::<PrimitiveArray>(&mut compressor.execution_ctx())?;
        let codes_input = primitive_codes.narrow()?.into_array();
        let codes_child = compressor.compress_child(&codes_input, &ctx, self.id(), 1)?;

        // SAFETY: both children originate from the dictionary produced by `dict_encode`.
        // NOTE(review): this assumes `narrow` and `compress_child` only change the physical
        // encoding and keep each child's logical contents intact, so the codes still index
        // into the values; confirm against the `DictArray::new_unchecked` contract.
        let rebuilt = unsafe {
            DictArray::new_unchecked(codes_child, values_child)
                .set_all_values_referenced(encoded.has_all_values_referenced())
        };

        Ok(rebuilt.into_array())
    }
}