vortex_compressor/stats/
string.rs1use vortex_array::arrays::VarBinViewArray;
7use vortex_error::VortexExpect;
8use vortex_error::VortexResult;
9use vortex_error::vortex_err;
10use vortex_utils::aliases::hash_set::HashSet;
11
12use super::GenerateStatsOptions;
13
14#[derive(Clone, Debug)]
16pub struct StringStats {
17 estimated_distinct_count: Option<u32>,
20 value_count: u32,
22 null_count: u32,
24}
25
26fn estimate_distinct_count(strings: &VarBinViewArray) -> VortexResult<u32> {
28 let views = strings.views();
29 let mut distinct = HashSet::with_capacity(views.len() / 2);
33 views.iter().for_each(|&view| {
34 #[expect(
35 clippy::cast_possible_truncation,
36 reason = "approximate uniqueness with view prefix"
37 )]
38 let len_and_prefix = view.as_u128() as u64;
39 distinct.insert(len_and_prefix);
40 });
41
42 Ok(u32::try_from(distinct.len())?)
43}
44
45impl StringStats {
46 fn generate_opts_fallible(
48 input: &VarBinViewArray,
49 opts: GenerateStatsOptions,
50 ) -> VortexResult<Self> {
51 let null_count = input
52 .statistics()
53 .compute_null_count()
54 .ok_or_else(|| vortex_err!("Failed to compute null_count"))?;
55 let value_count = input.len() - null_count;
56 let estimated_distinct_count = opts
57 .count_distinct_values
58 .then(|| estimate_distinct_count(input))
59 .transpose()?;
60
61 Ok(Self {
62 value_count: u32::try_from(value_count)?,
63 null_count: u32::try_from(null_count)?,
64 estimated_distinct_count,
65 })
66 }
67}
68
69impl StringStats {
70 pub fn generate(input: &VarBinViewArray) -> Self {
72 Self::generate_opts(input, GenerateStatsOptions::default())
73 }
74
75 pub fn generate_opts(input: &VarBinViewArray, opts: GenerateStatsOptions) -> Self {
77 Self::generate_opts_fallible(input, opts)
78 .vortex_expect("StringStats::generate_opts should not fail")
79 }
80
81 pub fn estimated_distinct_count(&self) -> Option<u32> {
85 self.estimated_distinct_count
86 }
87
88 pub fn value_count(&self) -> u32 {
90 self.value_count
91 }
92
93 pub fn null_count(&self) -> u32 {
95 self.null_count
96 }
97}