use std::hash::Hash;
use itertools::Itertools;
use num_traits::Float;
use rustc_hash::FxBuildHasher;
use vortex_array::ExecutionCtx;
use vortex_array::arrays::PrimitiveArray;
use vortex_array::arrays::primitive::NativeValue;
use vortex_array::dtype::NativePType;
use vortex_array::dtype::PType;
use vortex_array::dtype::half::f16;
use vortex_error::VortexExpect;
use vortex_error::VortexResult;
use vortex_error::vortex_err;
use vortex_error::vortex_panic;
use vortex_mask::AllOr;
use vortex_utils::aliases::hash_set::HashSet;
use super::GenerateStatsOptions;
/// Distinct-value information gathered for a float array with element type `T`.
#[derive(Debug, Clone)]
pub struct DistinctInfo<T> {
    /// The distinct values that were observed, each wrapped in [`NativeValue`].
    distinct_values: HashSet<NativeValue<T>, FxBuildHasher>,
    /// The number of distinct values, stored as a `u32`.
    distinct_count: u32,
}
impl<T> DistinctInfo<T> {
    /// Borrows the set of distinct values held by this summary.
    pub fn distinct_values(&self) -> &HashSet<NativeValue<T>, FxBuildHasher> {
        let Self {
            distinct_values, ..
        } = self;
        distinct_values
    }
}
/// Statistics that depend on the concrete float element type `T`.
#[derive(Debug, Clone)]
pub struct TypedStats<T> {
    /// Distinct-value information, or `None` when none was recorded.
    distinct: Option<DistinctInfo<T>>,
}
impl<T> TypedStats<T> {
    /// Borrows the distinct-value information, if any was recorded.
    pub fn distinct(&self) -> Option<&DistinctInfo<T>> {
        Option::as_ref(&self.distinct)
    }
}
/// Type-erased wrapper over [`TypedStats`], with one variant per supported float width.
#[derive(Debug, Clone)]
pub enum ErasedStats {
    /// Statistics for `f16` values.
    F16(TypedStats<f16>),
    /// Statistics for `f32` values.
    F32(TypedStats<f32>),
    /// Statistics for `f64` values.
    F64(TypedStats<f64>),
}
impl ErasedStats {
fn distinct_count(&self) -> Option<u32> {
match self {
ErasedStats::F16(x) => x.distinct.as_ref().map(|d| d.distinct_count),
ErasedStats::F32(x) => x.distinct.as_ref().map(|d| d.distinct_count),
ErasedStats::F64(x) => x.distinct.as_ref().map(|d| d.distinct_count),
}
}
}
/// Implements `From<TypedStats<_>> for ErasedStats` for one float type by wrapping the typed
/// stats in the given `ErasedStats` variant.
macro_rules! impl_from_typed {
    ($float:ty, $wrap:path) => {
        impl From<TypedStats<$float>> for ErasedStats {
            fn from(stats: TypedStats<$float>) -> Self {
                $wrap(stats)
            }
        }
    };
}

impl_from_typed!(f16, ErasedStats::F16);
impl_from_typed!(f32, ErasedStats::F32);
impl_from_typed!(f64, ErasedStats::F64);
/// Statistics computed over a float [`PrimitiveArray`].
#[derive(Debug, Clone)]
pub struct FloatStats {
    /// Number of null entries in the array.
    null_count: u32,
    /// Number of non-null entries in the array.
    value_count: u32,
    /// `value_count` divided (integer division) by the number of runs of equal valid values.
    average_run_length: u32,
    /// Element-type-specific statistics, erased behind [`ErasedStats`].
    erased: ErasedStats,
}
impl FloatStats {
    /// Dispatches on the array's primitive type and computes the statistics for it.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by [`typed_float_stats`].
    ///
    /// # Panics
    ///
    /// Panics if the array's ptype is not `F16`, `F32`, or `F64`.
    fn generate_opts_fallible(
        input: &PrimitiveArray,
        opts: GenerateStatsOptions,
        ctx: &mut ExecutionCtx,
    ) -> VortexResult<Self> {
        let count_distinct = opts.count_distinct_values;
        match input.ptype() {
            PType::F16 => typed_float_stats::<f16>(input, count_distinct, ctx),
            PType::F32 => typed_float_stats::<f32>(input, count_distinct, ctx),
            PType::F64 => typed_float_stats::<f64>(input, count_distinct, ctx),
            other => vortex_panic!("cannot generate FloatStats from ptype {}", other),
        }
    }

    /// Returns the number of distinct values, or `None` when no distinct-value information was
    /// recorded.
    pub fn distinct_count(&self) -> Option<u32> {
        let erased = &self.erased;
        erased.distinct_count()
    }
}
impl FloatStats {
    /// Computes statistics for `input` using the default [`GenerateStatsOptions`].
    ///
    /// # Panics
    ///
    /// Panics under the same conditions as [`FloatStats::generate_opts`].
    pub fn generate(input: &PrimitiveArray, ctx: &mut ExecutionCtx) -> Self {
        let default_opts = GenerateStatsOptions::default();
        Self::generate_opts(input, default_opts, ctx)
    }

    /// Computes statistics for `input` using the supplied options.
    ///
    /// # Panics
    ///
    /// Panics if the underlying fallible computation returns an error, or if `input` is not a
    /// float array.
    pub fn generate_opts(
        input: &PrimitiveArray,
        opts: GenerateStatsOptions,
        ctx: &mut ExecutionCtx,
    ) -> Self {
        let outcome = Self::generate_opts_fallible(input, opts, ctx);
        outcome.vortex_expect("FloatStats::generate_opts should not fail")
    }

    /// Number of null entries in the array.
    pub fn null_count(&self) -> u32 {
        let Self { null_count, .. } = self;
        *null_count
    }

    /// Number of non-null entries in the array.
    pub fn value_count(&self) -> u32 {
        let Self { value_count, .. } = self;
        *value_count
    }

    /// Average length of a run of equal valid values (integer division).
    pub fn average_run_length(&self) -> u32 {
        let Self {
            average_run_length, ..
        } = self;
        *average_run_length
    }

    /// Borrows the type-erased, element-type-specific statistics.
    pub fn erased(&self) -> &ErasedStats {
        let Self { erased, .. } = self;
        erased
    }
}
/// Computes [`FloatStats`] for a primitive array whose elements have float type `T`.
///
/// The function makes a single pass over the values starting at the first valid position,
/// counting runs of equal valid values and, when `count_distinct_values` is set, collecting the
/// distinct valid values. Null positions are skipped and do not break a run.
///
/// Run boundaries are detected with [`NativeValue`] equality, the same equality the
/// distinct-value set relies on, instead of the float `!=` operator. Under `!=` a NaN never
/// equals itself, so a leading NaN would be counted as a second run when compared against
/// itself, and a block of identical NaNs would count one run per element, allowing the run
/// count to exceed the value count.
///
/// # Errors
///
/// Returns an error if the validity or null count cannot be computed, or if the null count or
/// the valid-value count does not fit in a `u32`.
///
/// # Panics
///
/// Panics if the validity mask yields no first valid index, or reports no valid entries,
/// after the all-null case has been ruled out.
fn typed_float_stats<T: NativePType + Float>(
    array: &PrimitiveArray,
    count_distinct_values: bool,
    ctx: &mut ExecutionCtx,
) -> VortexResult<FloatStats>
where
    NativeValue<T>: Hash + Eq,
    TypedStats<T>: Into<ErasedStats>,
{
    // Empty input: nothing to scan, and no distinct-value information is reported.
    if array.is_empty() {
        return Ok(FloatStats {
            null_count: 0,
            value_count: 0,
            average_run_length: 0,
            erased: TypedStats { distinct: None }.into(),
        });
    }

    // All-null input: there are no valid values, so there are no runs to count.
    // NOTE(review): this branch reports an empty distinct set even when
    // `count_distinct_values` is false, whereas the empty-array branch above reports none even
    // when it is true. Confirm which behaviour callers expect before unifying the two.
    if array.all_invalid(ctx)? {
        return Ok(FloatStats {
            null_count: u32::try_from(array.len())?,
            value_count: 0,
            average_run_length: 0,
            erased: TypedStats {
                distinct: Some(DistinctInfo {
                    distinct_values: HashSet::with_capacity_and_hasher(0, FxBuildHasher),
                    distinct_count: 0,
                }),
            }
            .into(),
        });
    }

    let null_count = array
        .statistics()
        .compute_null_count(ctx)
        .ok_or_else(|| vortex_err!("Failed to compute null_count"))?;
    let value_count = array.len() - null_count;

    // Range-check the counts before scanning, so an oversized array is rejected with an error
    // before the `u32` run counter below could overflow.
    let null_count = u32::try_from(null_count)?;
    let value_count = u32::try_from(value_count)?;

    let mut distinct_values = if count_distinct_values {
        HashSet::with_capacity_and_hasher(array.len() / 2, FxBuildHasher)
    } else {
        HashSet::with_hasher(FxBuildHasher)
    };

    let validity = array
        .as_ref()
        .validity()?
        .execute_mask(array.as_ref().len(), ctx)?;

    let head_idx = validity
        .first()
        .vortex_expect("All null masks have been handled before");
    let buff = array.to_buffer::<T>();

    // The first valid value opens the first run. The scan below starts at that same position,
    // so the opening value is compared against itself and must compare equal.
    let mut runs: u32 = 1;
    let mut prev = buff[head_idx];
    let first_valid_buff = buff.slice(head_idx..array.len());

    match validity.bit_buffer() {
        AllOr::All => {
            for value in first_valid_buff {
                if count_distinct_values {
                    distinct_values.insert(NativeValue(value));
                }
                if NativeValue(value) != NativeValue(prev) {
                    prev = value;
                    runs += 1;
                }
            }
        }
        AllOr::None => unreachable!("All invalid arrays have been handled earlier"),
        AllOr::Some(v) => {
            for (&value, valid) in first_valid_buff
                .iter()
                .zip_eq(v.slice(head_idx..array.len()).iter())
            {
                // Nulls are skipped entirely: they neither start nor end a run.
                if valid {
                    if count_distinct_values {
                        distinct_values.insert(NativeValue(value));
                    }
                    if NativeValue(value) != NativeValue(prev) {
                        prev = value;
                        runs += 1;
                    }
                }
            }
        }
    }

    // The distinct count can never exceed `value_count`, which already fits in a `u32`, so
    // this conversion failing would indicate a broken invariant.
    let distinct = count_distinct_values.then(|| DistinctInfo {
        distinct_count: u32::try_from(distinct_values.len())
            .vortex_expect("more than u32::MAX distinct values"),
        distinct_values,
    });

    Ok(FloatStats {
        null_count,
        value_count,
        // `runs` starts at 1, so the division cannot be by zero.
        average_run_length: value_count / runs,
        erased: TypedStats { distinct }.into(),
    })
}
#[cfg(test)]
mod tests {
    use vortex_array::IntoArray;
    use vortex_array::LEGACY_SESSION;
    use vortex_array::VortexSessionExecute;
    use vortex_array::arrays::PrimitiveArray;
    use vortex_array::validity::Validity;
    use vortex_buffer::buffer;
    use vortex_error::VortexResult;

    use super::FloatStats;

    /// Options with distinct-value counting switched on, shared by the tests below.
    fn with_distinct_counting() -> crate::stats::GenerateStatsOptions {
        crate::stats::GenerateStatsOptions {
            count_distinct_values: true,
        }
    }

    /// Three different non-null values: three runs of length one and three distinct values.
    #[test]
    fn test_float_stats() -> VortexResult<()> {
        let mut ctx = LEGACY_SESSION.create_execution_ctx();
        let floats = buffer![0.0f32, 1.0f32, 2.0f32]
            .into_array()
            .execute::<PrimitiveArray>(&mut ctx)?;

        let stats = FloatStats::generate_opts(&floats, with_distinct_counting(), &mut ctx);

        assert_eq!(stats.value_count(), 3);
        assert_eq!(stats.null_count(), 0);
        assert_eq!(stats.average_run_length(), 1);
        assert_eq!(stats.distinct_count(), Some(3));
        Ok(())
    }

    /// A leading null is counted as a null and excluded from the run and distinct statistics.
    #[test]
    fn test_float_stats_leading_nulls() {
        let mut ctx = LEGACY_SESSION.create_execution_ctx();
        let values = buffer![0.0f32, 1.0f32, 2.0f32];
        let validity = Validity::from_iter([false, true, true]);
        let floats = PrimitiveArray::new(values, validity);

        let stats = FloatStats::generate_opts(&floats, with_distinct_counting(), &mut ctx);

        assert_eq!(stats.value_count(), 2);
        assert_eq!(stats.null_count(), 1);
        assert_eq!(stats.average_run_length(), 1);
        assert_eq!(stats.distinct_count(), Some(2));
    }
}