Skip to main content

vortex_compressor/stats/
string.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! String compression statistics.
5
6use vortex_array::arrays::VarBinViewArray;
7use vortex_error::VortexExpect;
8use vortex_error::VortexResult;
9use vortex_error::vortex_err;
10use vortex_utils::aliases::hash_set::HashSet;
11
12use super::GenerateStatsOptions;
13
14/// Array of variable-length byte arrays, and relevant stats for compression.
15#[derive(Clone, Debug)]
16pub struct StringStats {
17    /// The estimated number of distinct strings, or `None` if not computed.
18    /// This _must_ be non-zero.
19    estimated_distinct_count: Option<u32>,
20    /// The number of non-null values.
21    value_count: u32,
22    /// The number of null values.
23    null_count: u32,
24}
25
26/// Estimate the number of distinct strings in the var bin view array.
27fn estimate_distinct_count(strings: &VarBinViewArray) -> VortexResult<u32> {
28    let views = strings.views();
29    // Iterate the views. Two strings which are equal must have the same first 8-bytes.
30    // NOTE: there are cases where this performs pessimally, e.g. when we have strings that all
31    // share a 4-byte prefix and have the same length.
32    let mut distinct = HashSet::with_capacity(views.len() / 2);
33    views.iter().for_each(|&view| {
34        #[expect(
35            clippy::cast_possible_truncation,
36            reason = "approximate uniqueness with view prefix"
37        )]
38        let len_and_prefix = view.as_u128() as u64;
39        distinct.insert(len_and_prefix);
40    });
41
42    Ok(u32::try_from(distinct.len())?)
43}
44
45impl StringStats {
46    /// Generates stats, returning an error on failure.
47    fn generate_opts_fallible(
48        input: &VarBinViewArray,
49        opts: GenerateStatsOptions,
50    ) -> VortexResult<Self> {
51        let null_count = input
52            .statistics()
53            .compute_null_count()
54            .ok_or_else(|| vortex_err!("Failed to compute null_count"))?;
55        let value_count = input.len() - null_count;
56        let estimated_distinct_count = opts
57            .count_distinct_values
58            .then(|| estimate_distinct_count(input))
59            .transpose()?;
60
61        Ok(Self {
62            value_count: u32::try_from(value_count)?,
63            null_count: u32::try_from(null_count)?,
64            estimated_distinct_count,
65        })
66    }
67}
68
69impl StringStats {
70    /// Generates stats with default options.
71    pub fn generate(input: &VarBinViewArray) -> Self {
72        Self::generate_opts(input, GenerateStatsOptions::default())
73    }
74
75    /// Generates stats with provided options.
76    pub fn generate_opts(input: &VarBinViewArray, opts: GenerateStatsOptions) -> Self {
77        Self::generate_opts_fallible(input, opts)
78            .vortex_expect("StringStats::generate_opts should not fail")
79    }
80
81    /// Returns the estimated number of distinct strings, or `None` if not computed.
82    ///
83    /// This estimation is always going to be less than or equal to the actual distinct count.
84    pub fn estimated_distinct_count(&self) -> Option<u32> {
85        self.estimated_distinct_count
86    }
87
88    /// Returns the number of non-null values.
89    pub fn value_count(&self) -> u32 {
90        self.value_count
91    }
92
93    /// Returns the number of null values.
94    pub fn null_count(&self) -> u32 {
95        self.null_count
96    }
97}