Skip to main content

vortex_compressor/stats/
string.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! String compression statistics.
5
6use vortex_array::ExecutionCtx;
7use vortex_array::arrays::VarBinViewArray;
8use vortex_error::VortexExpect;
9use vortex_error::VortexResult;
10use vortex_error::vortex_err;
11use vortex_utils::aliases::hash_set::HashSet;
12
13use super::GenerateStatsOptions;
14
15/// Array of variable-length byte arrays, and relevant stats for compression.
16#[derive(Clone, Debug)]
17pub struct StringStats {
18    /// The estimated number of distinct strings, or `None` if not computed.
19    /// This _must_ be non-zero.
20    estimated_distinct_count: Option<u32>,
21    /// The number of non-null values.
22    value_count: u32,
23    /// The number of null values.
24    null_count: u32,
25}
26
27/// Estimate the number of distinct strings in the var bin view array.
28fn estimate_distinct_count(strings: &VarBinViewArray) -> VortexResult<u32> {
29    let views = strings.views();
30    // Iterate the views. Two strings which are equal must have the same first 8-bytes.
31    // NOTE: there are cases where this performs pessimally, e.g. when we have strings that all
32    // share a 4-byte prefix and have the same length.
33    let mut distinct = HashSet::with_capacity(views.len() / 2);
34    views.iter().for_each(|&view| {
35        #[expect(
36            clippy::cast_possible_truncation,
37            reason = "approximate uniqueness with view prefix"
38        )]
39        let len_and_prefix = view.as_u128() as u64;
40        distinct.insert(len_and_prefix);
41    });
42
43    Ok(u32::try_from(distinct.len())?)
44}
45
46impl StringStats {
47    /// Generates stats, returning an error on failure.
48    fn generate_opts_fallible(
49        input: &VarBinViewArray,
50        opts: GenerateStatsOptions,
51        ctx: &mut ExecutionCtx,
52    ) -> VortexResult<Self> {
53        let null_count = input
54            .statistics()
55            .compute_null_count(ctx)
56            .ok_or_else(|| vortex_err!("Failed to compute null_count"))?;
57        let value_count = input.len() - null_count;
58        let estimated_distinct_count = opts
59            .count_distinct_values
60            .then(|| estimate_distinct_count(input))
61            .transpose()?;
62
63        Ok(Self {
64            value_count: u32::try_from(value_count)?,
65            null_count: u32::try_from(null_count)?,
66            estimated_distinct_count,
67        })
68    }
69}
70
71impl StringStats {
72    /// Generates stats with default options.
73    pub fn generate(input: &VarBinViewArray, ctx: &mut ExecutionCtx) -> Self {
74        Self::generate_opts(input, GenerateStatsOptions::default(), ctx)
75    }
76
77    /// Generates stats with provided options.
78    pub fn generate_opts(
79        input: &VarBinViewArray,
80        opts: GenerateStatsOptions,
81        ctx: &mut ExecutionCtx,
82    ) -> Self {
83        Self::generate_opts_fallible(input, opts, ctx)
84            .vortex_expect("StringStats::generate_opts should not fail")
85    }
86
87    /// Returns the estimated number of distinct strings, or `None` if not computed.
88    ///
89    /// This estimation is always going to be less than or equal to the actual distinct count.
90    pub fn estimated_distinct_count(&self) -> Option<u32> {
91        self.estimated_distinct_count
92    }
93
94    /// Returns the number of non-null values.
95    pub fn value_count(&self) -> u32 {
96        self.value_count
97    }
98
99    /// Returns the number of null values.
100    pub fn null_count(&self) -> u32 {
101        self.null_count
102    }
103}