use vortex_array::ExecutionCtx;
use vortex_array::arrays::VarBinViewArray;
use vortex_error::VortexExpect;
use vortex_error::VortexResult;
use vortex_error::vortex_err;
use vortex_utils::aliases::hash_set::HashSet;
use super::GenerateStatsOptions;
/// Summary statistics gathered for a string (`VarBinViewArray`) input.
///
/// All counts are stored as `u32`; construction fails if a count does not fit.
#[derive(Clone, Debug)]
pub struct StringStats {
    /// Estimated number of distinct values, or `None` when the estimate was
    /// not requested through `GenerateStatsOptions::count_distinct_values`.
    estimated_distinct_count: Option<u32>,
    /// Number of non-null entries (array length minus `null_count`).
    value_count: u32,
    /// Number of null entries, as reported by the array's statistics.
    null_count: u32,
}
/// Estimates the number of distinct strings in `strings`.
///
/// Rather than comparing full string contents, this collects the low 64 bits
/// of every view into a hash set and reports the size of that set. Two views
/// that agree on those 64 bits are counted as one value, so the result is an
/// approximation, not an exact cardinality.
///
/// NOTE(review): the validity of `strings` is not consulted here, so every
/// entry returned by `views()` contributes to the estimate.
///
/// # Errors
///
/// Returns an error if the number of collected keys does not fit in a `u32`.
fn estimate_distinct_count(strings: &VarBinViewArray) -> VortexResult<u32> {
    let views = strings.views();

    // Pre-size for half the input as a compromise between over-allocating and
    // rehashing while the set grows.
    let mut seen_keys = HashSet::with_capacity(views.len() / 2);

    for &view in views.iter() {
        // The cast intentionally drops the upper 64 bits of the view.
        // Presumably the retained half holds the length and the leading
        // prefix bytes -- confirm against the view layout.
        #[expect(
            clippy::cast_possible_truncation,
            reason = "approximate uniqueness with view prefix"
        )]
        let key = view.as_u128() as u64;
        seen_keys.insert(key);
    }

    let distinct = u32::try_from(seen_keys.len())?;
    Ok(distinct)
}
impl StringStats {
    /// Builds [`StringStats`] for `input`, reporting failures as errors.
    ///
    /// The null count is obtained from the array's statistics (computing it
    /// with `ctx` if needed); the value count is the array length minus that
    /// null count. The distinct-count estimate is only produced when
    /// `opts.count_distinct_values` is set.
    ///
    /// # Errors
    ///
    /// Returns an error if the null count cannot be computed, if the distinct
    /// estimate fails, or if any count does not fit in a `u32`.
    fn generate_opts_fallible(
        input: &VarBinViewArray,
        opts: GenerateStatsOptions,
        ctx: &mut ExecutionCtx,
    ) -> VortexResult<Self> {
        let Some(nulls) = input.statistics().compute_null_count(ctx) else {
            return Err(vortex_err!("Failed to compute null_count"));
        };
        let non_nulls = input.len() - nulls;

        // Skip the hash-set pass over the views unless the caller asked for it.
        let estimated_distinct_count = if opts.count_distinct_values {
            Some(estimate_distinct_count(input)?)
        } else {
            None
        };

        let value_count = u32::try_from(non_nulls)?;
        let null_count = u32::try_from(nulls)?;

        Ok(Self {
            estimated_distinct_count,
            value_count,
            null_count,
        })
    }
}
impl StringStats {
    /// Computes statistics for `input` using the default
    /// [`GenerateStatsOptions`].
    ///
    /// # Panics
    ///
    /// Panics under the same conditions as [`StringStats::generate_opts`].
    pub fn generate(input: &VarBinViewArray, ctx: &mut ExecutionCtx) -> Self {
        let default_opts = GenerateStatsOptions::default();
        Self::generate_opts(input, default_opts, ctx)
    }

    /// Computes statistics for `input` according to `opts`.
    ///
    /// # Panics
    ///
    /// Statistics generation is expected to be infallible; if the underlying
    /// fallible routine returns an error, it is unwrapped via `vortex_expect`
    /// rather than returned to the caller.
    pub fn generate_opts(
        input: &VarBinViewArray,
        opts: GenerateStatsOptions,
        ctx: &mut ExecutionCtx,
    ) -> Self {
        let outcome = Self::generate_opts_fallible(input, opts, ctx);
        outcome.vortex_expect("StringStats::generate_opts should not fail")
    }

    /// Returns the estimated number of distinct values, if one was computed.
    pub fn estimated_distinct_count(&self) -> Option<u32> {
        self.estimated_distinct_count
    }

    /// Returns the number of non-null values.
    pub fn value_count(&self) -> u32 {
        self.value_count
    }

    /// Returns the number of null values.
    pub fn null_count(&self) -> u32 {
        self.null_count
    }
}