vortex_compressor/stats/
string.rs1use vortex_array::LEGACY_SESSION;
7use vortex_array::VortexSessionExecute;
8use vortex_array::arrays::VarBinViewArray;
9use vortex_error::VortexExpect;
10use vortex_error::VortexResult;
11use vortex_error::vortex_err;
12use vortex_utils::aliases::hash_set::HashSet;
13
14use super::GenerateStatsOptions;
15
16#[derive(Clone, Debug)]
18pub struct StringStats {
19 estimated_distinct_count: Option<u32>,
22 value_count: u32,
24 null_count: u32,
26}
27
28fn estimate_distinct_count(strings: &VarBinViewArray) -> VortexResult<u32> {
30 let views = strings.views();
31 let mut distinct = HashSet::with_capacity(views.len() / 2);
35 views.iter().for_each(|&view| {
36 #[expect(
37 clippy::cast_possible_truncation,
38 reason = "approximate uniqueness with view prefix"
39 )]
40 let len_and_prefix = view.as_u128() as u64;
41 distinct.insert(len_and_prefix);
42 });
43
44 Ok(u32::try_from(distinct.len())?)
45}
46
47impl StringStats {
48 fn generate_opts_fallible(
50 input: &VarBinViewArray,
51 opts: GenerateStatsOptions,
52 ) -> VortexResult<Self> {
53 let null_count = input
54 .statistics()
55 .compute_null_count(&mut LEGACY_SESSION.create_execution_ctx())
56 .ok_or_else(|| vortex_err!("Failed to compute null_count"))?;
57 let value_count = input.len() - null_count;
58 let estimated_distinct_count = opts
59 .count_distinct_values
60 .then(|| estimate_distinct_count(input))
61 .transpose()?;
62
63 Ok(Self {
64 value_count: u32::try_from(value_count)?,
65 null_count: u32::try_from(null_count)?,
66 estimated_distinct_count,
67 })
68 }
69}
70
71impl StringStats {
72 pub fn generate(input: &VarBinViewArray) -> Self {
74 Self::generate_opts(input, GenerateStatsOptions::default())
75 }
76
77 pub fn generate_opts(input: &VarBinViewArray, opts: GenerateStatsOptions) -> Self {
79 Self::generate_opts_fallible(input, opts)
80 .vortex_expect("StringStats::generate_opts should not fail")
81 }
82
83 pub fn estimated_distinct_count(&self) -> Option<u32> {
87 self.estimated_distinct_count
88 }
89
90 pub fn value_count(&self) -> u32 {
92 self.value_count
93 }
94
95 pub fn null_count(&self) -> u32 {
97 self.null_count
98 }
99}