Skip to main content

vortex_compressor/stats/
float.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Float compression statistics.
5
6use std::hash::Hash;
7
8use itertools::Itertools;
9use num_traits::Float;
10use rustc_hash::FxBuildHasher;
11use vortex_array::arrays::PrimitiveArray;
12use vortex_array::arrays::primitive::NativeValue;
13use vortex_array::dtype::NativePType;
14use vortex_array::dtype::PType;
15use vortex_array::dtype::half::f16;
16use vortex_error::VortexExpect;
17use vortex_error::VortexResult;
18use vortex_error::vortex_err;
19use vortex_error::vortex_panic;
20use vortex_mask::AllOr;
21use vortex_utils::aliases::hash_set::HashSet;
22
23use super::GenerateStatsOptions;
24
25/// Information about the distinct values in a float array.
26#[derive(Debug, Clone)]
27pub struct DistinctInfo<T> {
28    /// The set of distinct float values.
29    distinct_values: HashSet<NativeValue<T>, FxBuildHasher>,
30    /// The count of unique values. This _must_ be non-zero.
31    distinct_count: u32,
32}
33
34impl<T> DistinctInfo<T> {
35    /// Returns a reference to the distinct values set.
36    pub fn distinct_values(&self) -> &HashSet<NativeValue<T>, FxBuildHasher> {
37        &self.distinct_values
38    }
39}
40
41/// Typed statistics for a specific float type.
42#[derive(Debug, Clone)]
43pub struct TypedStats<T> {
44    /// Distinct value information, or `None` if not computed.
45    distinct: Option<DistinctInfo<T>>,
46}
47
48impl<T> TypedStats<T> {
49    /// Returns the distinct value information, if computed.
50    pub fn distinct(&self) -> Option<&DistinctInfo<T>> {
51        self.distinct.as_ref()
52    }
53}
54
55/// Type-erased container for one of the [`TypedStats`] variants.
56#[derive(Debug, Clone)]
57pub enum ErasedStats {
58    /// Stats for `f16` arrays.
59    F16(TypedStats<f16>),
60    /// Stats for `f32` arrays.
61    F32(TypedStats<f32>),
62    /// Stats for `f64` arrays.
63    F64(TypedStats<f64>),
64}
65
66impl ErasedStats {
67    /// Get the count of distinct values, if we have computed it already.
68    fn distinct_count(&self) -> Option<u32> {
69        match self {
70            ErasedStats::F16(x) => x.distinct.as_ref().map(|d| d.distinct_count),
71            ErasedStats::F32(x) => x.distinct.as_ref().map(|d| d.distinct_count),
72            ErasedStats::F64(x) => x.distinct.as_ref().map(|d| d.distinct_count),
73        }
74    }
75}
76
77/// Implements `From<TypedStats<$T>>` for [`ErasedStats`].
78macro_rules! impl_from_typed {
79    ($T:ty, $variant:path) => {
80        impl From<TypedStats<$T>> for ErasedStats {
81            fn from(typed: TypedStats<$T>) -> Self {
82                $variant(typed)
83            }
84        }
85    };
86}
87
88impl_from_typed!(f16, ErasedStats::F16);
89impl_from_typed!(f32, ErasedStats::F32);
90impl_from_typed!(f64, ErasedStats::F64);
91
92/// Array of floating-point numbers and relevant stats for compression.
93#[derive(Debug, Clone)]
94pub struct FloatStats {
95    /// Cache for `validity.false_count()`.
96    null_count: u32,
97    /// Cache for `validity.true_count()`.
98    value_count: u32,
99    /// The average run length.
100    average_run_length: u32,
101    /// Type-erased typed statistics.
102    erased: ErasedStats,
103}
104
105impl FloatStats {
106    /// Generates stats, returning an error on failure.
107    fn generate_opts_fallible(
108        input: &PrimitiveArray,
109        opts: GenerateStatsOptions,
110    ) -> VortexResult<Self> {
111        match input.ptype() {
112            PType::F16 => typed_float_stats::<f16>(input, opts.count_distinct_values),
113            PType::F32 => typed_float_stats::<f32>(input, opts.count_distinct_values),
114            PType::F64 => typed_float_stats::<f64>(input, opts.count_distinct_values),
115            _ => vortex_panic!("cannot generate FloatStats from ptype {}", input.ptype()),
116        }
117    }
118
119    /// Get the count of distinct values, if we have computed it already.
120    pub fn distinct_count(&self) -> Option<u32> {
121        self.erased.distinct_count()
122    }
123}
124
125impl FloatStats {
126    /// Generates stats with default options.
127    pub fn generate(input: &PrimitiveArray) -> Self {
128        Self::generate_opts(input, GenerateStatsOptions::default())
129    }
130
131    /// Generates stats with provided options.
132    pub fn generate_opts(input: &PrimitiveArray, opts: GenerateStatsOptions) -> Self {
133        Self::generate_opts_fallible(input, opts)
134            .vortex_expect("FloatStats::generate_opts should not fail")
135    }
136
137    /// Returns the number of null values.
138    pub fn null_count(&self) -> u32 {
139        self.null_count
140    }
141
142    /// Returns the number of non-null values.
143    pub fn value_count(&self) -> u32 {
144        self.value_count
145    }
146
147    /// Returns the average run length.
148    pub fn average_run_length(&self) -> u32 {
149        self.average_run_length
150    }
151
152    /// Returns the type-erased typed statistics.
153    pub fn erased(&self) -> &ErasedStats {
154        &self.erased
155    }
156}
157
158/// Computes typed float statistics for a specific float type.
159fn typed_float_stats<T: NativePType + Float>(
160    array: &PrimitiveArray,
161    count_distinct_values: bool,
162) -> VortexResult<FloatStats>
163where
164    NativeValue<T>: Hash + Eq,
165    TypedStats<T>: Into<ErasedStats>,
166{
167    // Special case: empty array.
168    if array.is_empty() {
169        return Ok(FloatStats {
170            null_count: 0,
171            value_count: 0,
172            average_run_length: 0,
173            erased: TypedStats { distinct: None }.into(),
174        });
175    }
176
177    if array.all_invalid()? {
178        return Ok(FloatStats {
179            null_count: u32::try_from(array.len())?,
180            value_count: 0,
181            average_run_length: 0,
182            erased: TypedStats { distinct: None }.into(),
183        });
184    }
185
186    let null_count = array
187        .statistics()
188        .compute_null_count()
189        .ok_or_else(|| vortex_err!("Failed to compute null_count"))?;
190    let value_count = array.len() - null_count;
191
192    // Keep a HashMap of T, then convert the keys into PValue afterward since value is
193    // so much more efficient to hash and search for.
194    let mut distinct_values = if count_distinct_values {
195        HashSet::with_capacity_and_hasher(array.len() / 2, FxBuildHasher)
196    } else {
197        HashSet::with_hasher(FxBuildHasher)
198    };
199
200    let validity = array.validity_mask()?;
201
202    let mut runs = 1;
203    let head_idx = validity
204        .first()
205        .vortex_expect("All null masks have been handled before");
206    let buff = array.to_buffer::<T>();
207    let mut prev = buff[head_idx];
208
209    let first_valid_buff = buff.slice(head_idx..array.len());
210    match validity.bit_buffer() {
211        AllOr::All => {
212            for value in first_valid_buff {
213                if count_distinct_values {
214                    distinct_values.insert(NativeValue(value));
215                }
216
217                if value != prev {
218                    prev = value;
219                    runs += 1;
220                }
221            }
222        }
223        AllOr::None => unreachable!("All invalid arrays have been handled earlier"),
224        AllOr::Some(v) => {
225            for (&value, valid) in first_valid_buff
226                .iter()
227                .zip_eq(v.slice(head_idx..array.len()).iter())
228            {
229                if valid {
230                    if count_distinct_values {
231                        distinct_values.insert(NativeValue(value));
232                    }
233
234                    if value != prev {
235                        prev = value;
236                        runs += 1;
237                    }
238                }
239            }
240        }
241    }
242
243    let null_count = u32::try_from(null_count)?;
244    let value_count = u32::try_from(value_count)?;
245
246    let distinct = count_distinct_values.then(|| DistinctInfo {
247        distinct_count: u32::try_from(distinct_values.len())
248            .vortex_expect("more than u32::MAX distinct values"),
249        distinct_values,
250    });
251
252    Ok(FloatStats {
253        null_count,
254        value_count,
255        average_run_length: value_count / runs,
256        erased: TypedStats { distinct }.into(),
257    })
258}
259
260#[cfg(test)]
261mod tests {
262    use vortex_array::IntoArray;
263    use vortex_array::ToCanonical;
264    use vortex_array::arrays::PrimitiveArray;
265    use vortex_array::validity::Validity;
266    use vortex_buffer::buffer;
267
268    use super::FloatStats;
269
270    #[test]
271    fn test_float_stats() {
272        let floats = buffer![0.0f32, 1.0f32, 2.0f32].into_array();
273        let floats = floats.to_primitive();
274
275        let stats = FloatStats::generate_opts(
276            &floats,
277            crate::stats::GenerateStatsOptions {
278                count_distinct_values: true,
279            },
280        );
281
282        assert_eq!(stats.value_count, 3);
283        assert_eq!(stats.null_count, 0);
284        assert_eq!(stats.average_run_length, 1);
285        assert_eq!(stats.distinct_count().unwrap(), 3);
286    }
287
288    #[test]
289    fn test_float_stats_leading_nulls() {
290        let floats = PrimitiveArray::new(
291            buffer![0.0f32, 1.0f32, 2.0f32],
292            Validity::from_iter([false, true, true]),
293        );
294
295        let stats = FloatStats::generate_opts(
296            &floats,
297            crate::stats::GenerateStatsOptions {
298                count_distinct_values: true,
299            },
300        );
301
302        assert_eq!(stats.value_count, 2);
303        assert_eq!(stats.null_count, 1);
304        assert_eq!(stats.average_run_length, 1);
305        assert_eq!(stats.distinct_count().unwrap(), 2);
306    }
307}