vortex_compressor/stats/
string.rs1use vortex_array::ExecutionCtx;
7use vortex_array::arrays::VarBinViewArray;
8use vortex_error::VortexExpect;
9use vortex_error::VortexResult;
10use vortex_error::vortex_err;
11use vortex_utils::aliases::hash_set::HashSet;
12
13use super::GenerateStatsOptions;
14
/// Summary statistics gathered over a `VarBinViewArray` of strings.
///
/// Instances are produced by [`StringStats::generate`] and
/// [`StringStats::generate_opts`]; the fields are read through the accessor
/// methods of the same names.
#[derive(Debug, Clone)]
pub struct StringStats {
    /// Approximate number of distinct strings, or `None` when the generation
    /// options did not request a distinct count.
    estimated_distinct_count: Option<u32>,
    /// Number of non-null entries (array length minus `null_count`).
    value_count: u32,
    /// Number of null entries, as reported by the array's statistics.
    null_count: u32,
}
26
27fn estimate_distinct_count(strings: &VarBinViewArray) -> VortexResult<u32> {
29 let views = strings.views();
30 let mut distinct = HashSet::with_capacity(views.len() / 2);
34 views.iter().for_each(|&view| {
35 #[expect(
36 clippy::cast_possible_truncation,
37 reason = "approximate uniqueness with view prefix"
38 )]
39 let len_and_prefix = view.as_u128() as u64;
40 distinct.insert(len_and_prefix);
41 });
42
43 Ok(u32::try_from(distinct.len())?)
44}
45
46impl StringStats {
47 fn generate_opts_fallible(
49 input: &VarBinViewArray,
50 opts: GenerateStatsOptions,
51 ctx: &mut ExecutionCtx,
52 ) -> VortexResult<Self> {
53 let null_count = input
54 .statistics()
55 .compute_null_count(ctx)
56 .ok_or_else(|| vortex_err!("Failed to compute null_count"))?;
57 let value_count = input.len() - null_count;
58 let estimated_distinct_count = opts
59 .count_distinct_values
60 .then(|| estimate_distinct_count(input))
61 .transpose()?;
62
63 Ok(Self {
64 value_count: u32::try_from(value_count)?,
65 null_count: u32::try_from(null_count)?,
66 estimated_distinct_count,
67 })
68 }
69}
70
71impl StringStats {
72 pub fn generate(input: &VarBinViewArray, ctx: &mut ExecutionCtx) -> Self {
74 Self::generate_opts(input, GenerateStatsOptions::default(), ctx)
75 }
76
77 pub fn generate_opts(
79 input: &VarBinViewArray,
80 opts: GenerateStatsOptions,
81 ctx: &mut ExecutionCtx,
82 ) -> Self {
83 Self::generate_opts_fallible(input, opts, ctx)
84 .vortex_expect("StringStats::generate_opts should not fail")
85 }
86
87 pub fn estimated_distinct_count(&self) -> Option<u32> {
91 self.estimated_distinct_count
92 }
93
94 pub fn value_count(&self) -> u32 {
96 self.value_count
97 }
98
99 pub fn null_count(&self) -> u32 {
101 self.null_count
102 }
103}