Skip to main content

vortex_compressor/stats/
string.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! String compression statistics.
5
6use vortex_array::LEGACY_SESSION;
7use vortex_array::VortexSessionExecute;
8use vortex_array::arrays::VarBinViewArray;
9use vortex_error::VortexExpect;
10use vortex_error::VortexResult;
11use vortex_error::vortex_err;
12use vortex_utils::aliases::hash_set::HashSet;
13
14use super::GenerateStatsOptions;
15
16/// Array of variable-length byte arrays, and relevant stats for compression.
17#[derive(Clone, Debug)]
18pub struct StringStats {
19    /// The estimated number of distinct strings, or `None` if not computed.
20    /// This _must_ be non-zero.
21    estimated_distinct_count: Option<u32>,
22    /// The number of non-null values.
23    value_count: u32,
24    /// The number of null values.
25    null_count: u32,
26}
27
28/// Estimate the number of distinct strings in the var bin view array.
29fn estimate_distinct_count(strings: &VarBinViewArray) -> VortexResult<u32> {
30    let views = strings.views();
31    // Iterate the views. Two strings which are equal must have the same first 8-bytes.
32    // NOTE: there are cases where this performs pessimally, e.g. when we have strings that all
33    // share a 4-byte prefix and have the same length.
34    let mut distinct = HashSet::with_capacity(views.len() / 2);
35    views.iter().for_each(|&view| {
36        #[expect(
37            clippy::cast_possible_truncation,
38            reason = "approximate uniqueness with view prefix"
39        )]
40        let len_and_prefix = view.as_u128() as u64;
41        distinct.insert(len_and_prefix);
42    });
43
44    Ok(u32::try_from(distinct.len())?)
45}
46
47impl StringStats {
48    /// Generates stats, returning an error on failure.
49    fn generate_opts_fallible(
50        input: &VarBinViewArray,
51        opts: GenerateStatsOptions,
52    ) -> VortexResult<Self> {
53        let null_count = input
54            .statistics()
55            .compute_null_count(&mut LEGACY_SESSION.create_execution_ctx())
56            .ok_or_else(|| vortex_err!("Failed to compute null_count"))?;
57        let value_count = input.len() - null_count;
58        let estimated_distinct_count = opts
59            .count_distinct_values
60            .then(|| estimate_distinct_count(input))
61            .transpose()?;
62
63        Ok(Self {
64            value_count: u32::try_from(value_count)?,
65            null_count: u32::try_from(null_count)?,
66            estimated_distinct_count,
67        })
68    }
69}
70
71impl StringStats {
72    /// Generates stats with default options.
73    pub fn generate(input: &VarBinViewArray) -> Self {
74        Self::generate_opts(input, GenerateStatsOptions::default())
75    }
76
77    /// Generates stats with provided options.
78    pub fn generate_opts(input: &VarBinViewArray, opts: GenerateStatsOptions) -> Self {
79        Self::generate_opts_fallible(input, opts)
80            .vortex_expect("StringStats::generate_opts should not fail")
81    }
82
83    /// Returns the estimated number of distinct strings, or `None` if not computed.
84    ///
85    /// This estimation is always going to be less than or equal to the actual distinct count.
86    pub fn estimated_distinct_count(&self) -> Option<u32> {
87        self.estimated_distinct_count
88    }
89
90    /// Returns the number of non-null values.
91    pub fn value_count(&self) -> u32 {
92        self.value_count
93    }
94
95    /// Returns the number of null values.
96    pub fn null_count(&self) -> u32 {
97        self.null_count
98    }
99}