arrow2 0.10.1

Unofficial implementation of Apache Arrow spec in safe Rust
Documentation
use crate::datatypes::TimeUnit;
use crate::{datatypes::DataType, types::NativeType};
use parquet2::schema::types::{
    LogicalType, ParquetType, TimeUnit as ParquetTimeUnit, TimestampType,
};
use parquet2::statistics::PrimitiveStatistics as ParquetPrimitiveStatistics;
use parquet2::types::NativeType as ParquetNativeType;
use std::any::Any;

use super::Statistics;
use crate::error::Result;

/// Arrow-deserialized parquet Statistics of a primitive type
#[derive(Debug, Clone, PartialEq)]
pub struct PrimitiveStatistics<T: NativeType> {
    /// the data type
    pub data_type: DataType,
    /// number of nulls
    pub null_count: Option<i64>,
    /// number of dictinct values
    pub distinct_count: Option<i64>,
    /// Minimum
    pub min_value: Option<T>,
    /// Maximum
    pub max_value: Option<T>,
}

impl<T: NativeType> Statistics for PrimitiveStatistics<T> {
    fn data_type(&self) -> &DataType {
        &self.data_type
    }

    fn as_any(&self) -> &dyn Any {
        self
    }

    fn null_count(&self) -> Option<i64> {
        self.null_count
    }
}

impl<T, R> From<(&ParquetPrimitiveStatistics<R>, DataType)> for PrimitiveStatistics<T>
where
    T: NativeType,
    R: ParquetNativeType,
    R: num_traits::AsPrimitive<T>,
{
    fn from((stats, data_type): (&ParquetPrimitiveStatistics<R>, DataType)) -> Self {
        Self {
            data_type,
            null_count: stats.null_count,
            distinct_count: stats.distinct_count,
            min_value: stats.min_value.map(|x| x.as_()),
            max_value: stats.max_value.map(|x| x.as_()),
        }
    }
}

pub(super) fn statistics_from_i32(
    stats: &ParquetPrimitiveStatistics<i32>,
    data_type: DataType,
) -> Result<Box<dyn Statistics>> {
    use DataType::*;
    Ok(match data_type {
        UInt8 => {
            Box::new(PrimitiveStatistics::<u8>::from((stats, data_type))) as Box<dyn Statistics>
        }
        UInt16 => Box::new(PrimitiveStatistics::<u16>::from((stats, data_type))),
        UInt32 => Box::new(PrimitiveStatistics::<u32>::from((stats, data_type))),
        Int8 => Box::new(PrimitiveStatistics::<i8>::from((stats, data_type))),
        Int16 => Box::new(PrimitiveStatistics::<i16>::from((stats, data_type))),
        Decimal(_, _) => Box::new(PrimitiveStatistics::<i128>::from((stats, data_type))),
        _ => Box::new(PrimitiveStatistics::<i32>::from((stats, data_type))),
    })
}

fn timestamp(type_: &ParquetType, time_unit: TimeUnit, x: i64) -> i64 {
    let logical_type = if let ParquetType::PrimitiveType { logical_type, .. } = type_ {
        logical_type
    } else {
        unreachable!()
    };

    let unit = if let Some(LogicalType::TIMESTAMP(TimestampType { unit, .. })) = logical_type {
        unit
    } else {
        return x;
    };

    match (unit, time_unit) {
        (ParquetTimeUnit::MILLIS(_), TimeUnit::Second) => x / 1_000,
        (ParquetTimeUnit::MICROS(_), TimeUnit::Second) => x / 1_000_000,
        (ParquetTimeUnit::NANOS(_), TimeUnit::Second) => x * 1_000_000_000,

        (ParquetTimeUnit::MILLIS(_), TimeUnit::Millisecond) => x,
        (ParquetTimeUnit::MICROS(_), TimeUnit::Millisecond) => x / 1_000,
        (ParquetTimeUnit::NANOS(_), TimeUnit::Millisecond) => x / 1_000_000,

        (ParquetTimeUnit::MILLIS(_), TimeUnit::Microsecond) => x * 1_000,
        (ParquetTimeUnit::MICROS(_), TimeUnit::Microsecond) => x,
        (ParquetTimeUnit::NANOS(_), TimeUnit::Microsecond) => x / 1_000,

        (ParquetTimeUnit::MILLIS(_), TimeUnit::Nanosecond) => x * 1_000_000,
        (ParquetTimeUnit::MICROS(_), TimeUnit::Nanosecond) => x * 1_000,
        (ParquetTimeUnit::NANOS(_), TimeUnit::Nanosecond) => x,
    }
}

pub(super) fn statistics_from_i64(
    stats: &ParquetPrimitiveStatistics<i64>,
    data_type: DataType,
) -> Result<Box<dyn Statistics>> {
    use DataType::*;
    Ok(match data_type {
        UInt64 => {
            Box::new(PrimitiveStatistics::<u64>::from((stats, data_type))) as Box<dyn Statistics>
        }
        Timestamp(time_unit, None) => Box::new(PrimitiveStatistics::<i64> {
            data_type,
            null_count: stats.null_count,
            distinct_count: stats.distinct_count,
            min_value: stats
                .min_value
                .map(|x| timestamp(stats.descriptor.type_(), time_unit, x)),
            max_value: stats
                .max_value
                .map(|x| timestamp(stats.descriptor.type_(), time_unit, x)),
        }),
        Decimal(_, _) => Box::new(PrimitiveStatistics::<i128>::from((stats, data_type))),
        _ => Box::new(PrimitiveStatistics::<i64>::from((stats, data_type))),
    })
}