Skip to main content

vortex_compressor/stats/
float.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Float compression statistics.
5
6use std::hash::Hash;
7
8use itertools::Itertools;
9use num_traits::Float;
10use rustc_hash::FxBuildHasher;
11use vortex_array::ExecutionCtx;
12use vortex_array::arrays::PrimitiveArray;
13use vortex_array::arrays::primitive::NativeValue;
14use vortex_array::dtype::NativePType;
15use vortex_array::dtype::PType;
16use vortex_array::dtype::half::f16;
17use vortex_error::VortexExpect;
18use vortex_error::VortexResult;
19use vortex_error::vortex_err;
20use vortex_error::vortex_panic;
21use vortex_mask::AllOr;
22use vortex_utils::aliases::hash_set::HashSet;
23
24use super::GenerateStatsOptions;
25
26/// Information about the distinct values in a float array.
27#[derive(Debug, Clone)]
28pub struct DistinctInfo<T> {
29    /// The set of distinct float values.
30    distinct_values: HashSet<NativeValue<T>, FxBuildHasher>,
31    /// The count of unique values. This _must_ be non-zero.
32    distinct_count: u32,
33}
34
35impl<T> DistinctInfo<T> {
36    /// Returns a reference to the distinct values set.
37    pub fn distinct_values(&self) -> &HashSet<NativeValue<T>, FxBuildHasher> {
38        &self.distinct_values
39    }
40}
41
42/// Typed statistics for a specific float type.
43#[derive(Debug, Clone)]
44pub struct TypedStats<T> {
45    /// Distinct value information, or `None` if not computed.
46    distinct: Option<DistinctInfo<T>>,
47}
48
49impl<T> TypedStats<T> {
50    /// Returns the distinct value information, if computed.
51    pub fn distinct(&self) -> Option<&DistinctInfo<T>> {
52        self.distinct.as_ref()
53    }
54}
55
56/// Type-erased container for one of the [`TypedStats`] variants.
57#[derive(Debug, Clone)]
58pub enum ErasedStats {
59    /// Stats for `f16` arrays.
60    F16(TypedStats<f16>),
61    /// Stats for `f32` arrays.
62    F32(TypedStats<f32>),
63    /// Stats for `f64` arrays.
64    F64(TypedStats<f64>),
65}
66
67impl ErasedStats {
68    /// Get the count of distinct values, if we have computed it already.
69    fn distinct_count(&self) -> Option<u32> {
70        match self {
71            ErasedStats::F16(x) => x.distinct.as_ref().map(|d| d.distinct_count),
72            ErasedStats::F32(x) => x.distinct.as_ref().map(|d| d.distinct_count),
73            ErasedStats::F64(x) => x.distinct.as_ref().map(|d| d.distinct_count),
74        }
75    }
76}
77
78/// Implements `From<TypedStats<$T>>` for [`ErasedStats`].
79macro_rules! impl_from_typed {
80    ($T:ty, $variant:path) => {
81        impl From<TypedStats<$T>> for ErasedStats {
82            fn from(typed: TypedStats<$T>) -> Self {
83                $variant(typed)
84            }
85        }
86    };
87}
88
89impl_from_typed!(f16, ErasedStats::F16);
90impl_from_typed!(f32, ErasedStats::F32);
91impl_from_typed!(f64, ErasedStats::F64);
92
93/// Array of floating-point numbers and relevant stats for compression.
94#[derive(Debug, Clone)]
95pub struct FloatStats {
96    /// Cache for `validity.false_count()`.
97    null_count: u32,
98    /// Cache for `validity.true_count()`.
99    value_count: u32,
100    /// The average run length.
101    average_run_length: u32,
102    /// Type-erased typed statistics.
103    erased: ErasedStats,
104}
105
106impl FloatStats {
107    /// Generates stats, returning an error on failure.
108    fn generate_opts_fallible(
109        input: &PrimitiveArray,
110        opts: GenerateStatsOptions,
111        ctx: &mut ExecutionCtx,
112    ) -> VortexResult<Self> {
113        match input.ptype() {
114            PType::F16 => typed_float_stats::<f16>(input, opts.count_distinct_values, ctx),
115            PType::F32 => typed_float_stats::<f32>(input, opts.count_distinct_values, ctx),
116            PType::F64 => typed_float_stats::<f64>(input, opts.count_distinct_values, ctx),
117            _ => vortex_panic!("cannot generate FloatStats from ptype {}", input.ptype()),
118        }
119    }
120
121    /// Get the count of distinct values, if we have computed it already.
122    pub fn distinct_count(&self) -> Option<u32> {
123        self.erased.distinct_count()
124    }
125}
126
127impl FloatStats {
128    /// Generates stats with default options.
129    pub fn generate(input: &PrimitiveArray, ctx: &mut ExecutionCtx) -> Self {
130        Self::generate_opts(input, GenerateStatsOptions::default(), ctx)
131    }
132
133    /// Generates stats with provided options.
134    pub fn generate_opts(
135        input: &PrimitiveArray,
136        opts: GenerateStatsOptions,
137        ctx: &mut ExecutionCtx,
138    ) -> Self {
139        Self::generate_opts_fallible(input, opts, ctx)
140            .vortex_expect("FloatStats::generate_opts should not fail")
141    }
142
143    /// Returns the number of null values.
144    pub fn null_count(&self) -> u32 {
145        self.null_count
146    }
147
148    /// Returns the number of non-null values.
149    pub fn value_count(&self) -> u32 {
150        self.value_count
151    }
152
153    /// Returns the average run length.
154    pub fn average_run_length(&self) -> u32 {
155        self.average_run_length
156    }
157
158    /// Returns the type-erased typed statistics.
159    pub fn erased(&self) -> &ErasedStats {
160        &self.erased
161    }
162}
163
164/// Computes typed float statistics for a specific float type.
165fn typed_float_stats<T: NativePType + Float>(
166    array: &PrimitiveArray,
167    count_distinct_values: bool,
168    ctx: &mut ExecutionCtx,
169) -> VortexResult<FloatStats>
170where
171    NativeValue<T>: Hash + Eq,
172    TypedStats<T>: Into<ErasedStats>,
173{
174    // Special case: empty array.
175    if array.is_empty() {
176        return Ok(FloatStats {
177            null_count: 0,
178            value_count: 0,
179            average_run_length: 0,
180            erased: TypedStats { distinct: None }.into(),
181        });
182    }
183
184    if array.all_invalid(ctx)? {
185        return Ok(FloatStats {
186            null_count: u32::try_from(array.len())?,
187            value_count: 0,
188            average_run_length: 0,
189            erased: TypedStats {
190                distinct: Some(DistinctInfo {
191                    distinct_values: HashSet::with_capacity_and_hasher(0, FxBuildHasher),
192                    distinct_count: 0,
193                }),
194            }
195            .into(),
196        });
197    }
198
199    let null_count = array
200        .statistics()
201        .compute_null_count(ctx)
202        .ok_or_else(|| vortex_err!("Failed to compute null_count"))?;
203    let value_count = array.len() - null_count;
204
205    // Keep a HashMap of T, then convert the keys into PValue afterward since value is
206    // so much more efficient to hash and search for.
207    let mut distinct_values = if count_distinct_values {
208        HashSet::with_capacity_and_hasher(array.len() / 2, FxBuildHasher)
209    } else {
210        HashSet::with_hasher(FxBuildHasher)
211    };
212
213    let validity = array
214        .as_ref()
215        .validity()?
216        .execute_mask(array.as_ref().len(), ctx)?;
217
218    let mut runs = 1;
219    let head_idx = validity
220        .first()
221        .vortex_expect("All null masks have been handled before");
222    let buff = array.to_buffer::<T>();
223    let mut prev = buff[head_idx];
224
225    let first_valid_buff = buff.slice(head_idx..array.len());
226    match validity.bit_buffer() {
227        AllOr::All => {
228            for value in first_valid_buff {
229                if count_distinct_values {
230                    distinct_values.insert(NativeValue(value));
231                }
232
233                if value != prev {
234                    prev = value;
235                    runs += 1;
236                }
237            }
238        }
239        AllOr::None => unreachable!("All invalid arrays have been handled earlier"),
240        AllOr::Some(v) => {
241            for (&value, valid) in first_valid_buff
242                .iter()
243                .zip_eq(v.slice(head_idx..array.len()).iter())
244            {
245                if valid {
246                    if count_distinct_values {
247                        distinct_values.insert(NativeValue(value));
248                    }
249
250                    if value != prev {
251                        prev = value;
252                        runs += 1;
253                    }
254                }
255            }
256        }
257    }
258
259    let null_count = u32::try_from(null_count)?;
260    let value_count = u32::try_from(value_count)?;
261
262    let distinct = count_distinct_values.then(|| DistinctInfo {
263        distinct_count: u32::try_from(distinct_values.len())
264            .vortex_expect("more than u32::MAX distinct values"),
265        distinct_values,
266    });
267
268    Ok(FloatStats {
269        null_count,
270        value_count,
271        average_run_length: value_count / runs,
272        erased: TypedStats { distinct }.into(),
273    })
274}
275
276#[cfg(test)]
277mod tests {
278    use vortex_array::IntoArray;
279    use vortex_array::LEGACY_SESSION;
280    use vortex_array::VortexSessionExecute;
281    use vortex_array::arrays::PrimitiveArray;
282    use vortex_array::validity::Validity;
283    use vortex_buffer::buffer;
284    use vortex_error::VortexResult;
285
286    use super::FloatStats;
287
288    #[test]
289    fn test_float_stats() -> VortexResult<()> {
290        let mut ctx = LEGACY_SESSION.create_execution_ctx();
291        let floats = buffer![0.0f32, 1.0f32, 2.0f32].into_array();
292        let floats = floats.execute::<PrimitiveArray>(&mut ctx)?;
293
294        let stats = FloatStats::generate_opts(
295            &floats,
296            crate::stats::GenerateStatsOptions {
297                count_distinct_values: true,
298            },
299            &mut ctx,
300        );
301
302        assert_eq!(stats.value_count, 3);
303        assert_eq!(stats.null_count, 0);
304        assert_eq!(stats.average_run_length, 1);
305        assert_eq!(stats.distinct_count().unwrap(), 3);
306        Ok(())
307    }
308
309    #[test]
310    fn test_float_stats_leading_nulls() {
311        let mut ctx = LEGACY_SESSION.create_execution_ctx();
312        let floats = PrimitiveArray::new(
313            buffer![0.0f32, 1.0f32, 2.0f32],
314            Validity::from_iter([false, true, true]),
315        );
316
317        let stats = FloatStats::generate_opts(
318            &floats,
319            crate::stats::GenerateStatsOptions {
320                count_distinct_values: true,
321            },
322            &mut ctx,
323        );
324
325        assert_eq!(stats.value_count, 2);
326        assert_eq!(stats.null_count, 1);
327        assert_eq!(stats.average_run_length, 1);
328        assert_eq!(stats.distinct_count().unwrap(), 2);
329    }
330}