use crate::draw::RenderSettings;
use arrow::{
array::{Array, GenericBinaryArray, GenericStringArray, OffsetSizeTrait, PrimitiveArray},
datatypes::*,
};
/// Summary statistics gathered for a single column, used to decide how the
/// column should be laid out when rendered.
#[derive(Debug, Clone)]
pub struct ColumnStats {
/// Smallest and largest value seen in the column. Only the integer and
/// float constructors fill this in; it is `None` for every other column
/// type, or when no minimum/maximum could be computed.
pub min_max: Option<MinMax>,
/// Suggested width for the column. `ColumnStats::new` raises it to at least
/// the byte length of the column name, and never below 3.
pub ideal_width: u16,
/// Number of distinct non-null values. Only tracked for string columns, and
/// only when the count fits in a `u8`; `None` otherwise.
pub cardinality: Option<u8>,
}
/// Numeric range of a column, stored as `f64` regardless of the column's
/// native type (integer columns are converted with `as f64`).
#[derive(Debug, Copy, Clone)]
pub struct MinMax {
/// Smallest value observed.
pub min: f64,
/// Largest value observed.
pub max: f64,
}
impl ColumnStats {
    /// Folds the statistics in `other` into `self`.
    ///
    /// * `min_max` becomes the union of both ranges; if only one side has a
    ///   range, that range is kept as-is.
    /// * `ideal_width` becomes the larger of the two widths.
    /// * `cardinality` becomes the larger of the two counts, but only when
    ///   both sides have one; otherwise it is cleared to `None`.
    pub fn merge(&mut self, other: ColumnStats) {
        self.min_max = match (self.min_max, other.min_max) {
            (Some(mine), Some(theirs)) => Some(MinMax {
                min: mine.min.min(theirs.min),
                max: mine.max.max(theirs.max),
            }),
            // At most one side carries a range: keep whichever exists.
            (mine, theirs) => mine.or(theirs),
        };

        if other.ideal_width > self.ideal_width {
            self.ideal_width = other.ideal_width;
        }

        self.cardinality = match (self.cardinality, other.cardinality) {
            (Some(mine), Some(theirs)) => Some(mine.max(theirs)),
            _ => None,
        };
    }
}
impl ColumnStats {
pub fn new(
name: &str,
col: &dyn Array,
settings: &RenderSettings,
) -> anyhow::Result<ColumnStats> {
macro_rules! col {
() => {
col.as_any().downcast_ref().unwrap()
};
}
let mut stats = match col.data_type() {
DataType::Null => ColumnStats::fixed_len(0),
DataType::Boolean => ColumnStats::fixed_len(5),
DataType::UInt8 => ColumnStats::new_integral::<UInt8Type>(col!())?,
DataType::UInt16 => ColumnStats::new_integral::<UInt16Type>(col!())?,
DataType::UInt32 => ColumnStats::new_integral::<UInt32Type>(col!())?,
DataType::UInt64 => ColumnStats::new_integral::<UInt64Type>(col!())?,
DataType::Int8 => ColumnStats::new_integral::<Int8Type>(col!())?,
DataType::Int16 => ColumnStats::new_integral::<Int16Type>(col!())?,
DataType::Int32 => ColumnStats::new_integral::<Int32Type>(col!())?,
DataType::Int64 => ColumnStats::new_integral::<Int64Type>(col!())?,
DataType::Float16 => ColumnStats::new_floating::<Float16Type>(col!(), settings)?,
DataType::Float32 => ColumnStats::new_floating::<Float32Type>(col!(), settings)?,
DataType::Float64 => ColumnStats::new_floating::<Float64Type>(col!(), settings)?,
DataType::Decimal128(_, _) => ColumnStats::fallback(col)?, DataType::Decimal256(_, _) => ColumnStats::fallback(col)?,
DataType::Utf8 => ColumnStats::new_string::<i32>(col!())?,
DataType::LargeUtf8 => ColumnStats::new_string::<i64>(col!())?,
DataType::Binary => ColumnStats::new_binary::<i32>(col!())?,
DataType::LargeBinary => ColumnStats::new_binary::<i64>(col!())?,
DataType::FixedSizeBinary(_) => ColumnStats::fallback(col)?, DataType::Dictionary(_, _) => ColumnStats::fallback(col)?,
DataType::Date32 | DataType::Date64 => ColumnStats::fixed_len(10), DataType::Time32(unit) | DataType::Time64(unit) => ColumnStats::fixed_len(match unit {
TimeUnit::Second => 8, TimeUnit::Millisecond => 8 + 1 + 3, TimeUnit::Microsecond => 8 + 1 + 6, TimeUnit::Nanosecond => 8 + 1 + 9, }),
DataType::Timestamp(unit, tz) => ColumnStats::fixed_len(
20 + match unit {
TimeUnit::Second => 0,
TimeUnit::Millisecond => 3 + 1,
TimeUnit::Microsecond => 6 + 1,
TimeUnit::Nanosecond => 9 + 1,
} + tz
.as_ref()
.map(|tz| tz.to_string().len() as u16)
.unwrap_or(0),
),
DataType::Duration(_) => ColumnStats::fallback(col)?, DataType::Interval(_) => ColumnStats::fallback(col)?,
DataType::Struct(_) => ColumnStats::fallback(col)?,
DataType::Map(_, _) => ColumnStats::fallback(col)?,
DataType::List(_) => ColumnStats::fallback(col)?,
DataType::LargeList(_) => ColumnStats::fallback(col)?,
DataType::FixedSizeList(_, _) => ColumnStats::fallback(col)?,
DataType::Union(_, _) => ColumnStats::fallback(col)?,
DataType::RunEndEncoded(_, _) => ColumnStats::fallback(col)?,
};
stats.ideal_width = stats.ideal_width.max(name.len() as u16).max(3);
Ok(stats)
}
/// Builds statistics for an integer column from its minimum and maximum.
///
/// `ideal_width` is the number of characters needed to print the wider of
/// the two extremes in base 10 (including a leading `-` for negatives), or 0
/// when the array yields no minimum/maximum. `min_max` records the extremes
/// as `f64`. Cardinality is not tracked for integers.
fn new_integral<T: ArrowNumericType>(col: &PrimitiveArray<T>) -> anyhow::Result<ColumnStats>
where
    T::Native: Into<i128>,
{
    /// Number of characters in the decimal rendering of `value`.
    fn decimal_width(value: i128) -> u16 {
        if value == 0 {
            return 1;
        }
        let digits = 1 + value.abs().ilog10() as u16;
        if value < 0 {
            // One extra column for the minus sign.
            digits + 1
        } else {
            digits
        }
    }

    let lowest: Option<i128> = arrow::compute::min(col).map(|v| v.into());
    let highest: Option<i128> = arrow::compute::max(col).map(|v| v.into());

    // `Option` orders `None` below every `Some`, so this picks the wider of
    // the two extremes when either is present.
    let ideal_width =
        std::cmp::max(lowest.map(decimal_width), highest.map(decimal_width)).unwrap_or(0);

    let min_max = match (lowest, highest) {
        (Some(lo), Some(hi)) => Some(MinMax {
            min: lo as f64,
            max: hi as f64,
        }),
        _ => None,
    };

    Ok(ColumnStats {
        min_max,
        ideal_width,
        cardinality: None,
    })
}
/// Builds statistics for a floating-point column from its minimum and maximum.
///
/// The width of a finite value is estimated as: one integer digit, one
/// decimal point, `settings.float_dps` fractional digits, any additional
/// integer digits (`floor(log10(|x|))`), and a leading `-` for negatives.
/// `ideal_width` is the larger estimate of the two extremes, or 0 when the
/// array yields no minimum/maximum. Cardinality is not tracked for floats.
///
/// Infinite extremes are handled separately: `log10(inf)` is infinite, which
/// the `as u16` cast saturates to `u16::MAX`, and adding to that previously
/// overflowed `u16`.
fn new_floating<T: ArrowNumericType>(
    col: &PrimitiveArray<T>,
    settings: &RenderSettings,
) -> anyhow::Result<ColumnStats>
where
    T::Native: Into<f64>,
{
    let min: Option<f64> = arrow::compute::min(col).map(|x| x.into());
    let max: Option<f64> = arrow::compute::max(col).map(|x| x.into());
    let len = |x: f64| -> u16 {
        let sign = u16::from(x < 0.0);
        if x.is_infinite() {
            // NOTE(review): assumes infinities are rendered as `inf` / `-inf`
            // (Rust's standard float formatting) -- confirm against the renderer.
            return 3 + sign;
        }
        // Digits beyond the first integer digit. For |x| < 1 the logarithm is
        // negative, and for NaN it is NaN; the float-to-int cast maps both to 0.
        let extra_int_digits = if x == 0.0 {
            0
        } else {
            x.abs().log10() as u16
        };
        // "d." + fractional digits + extra integer digits + sign, saturating so
        // an unusually large `float_dps` cannot overflow either.
        2u16.saturating_add(settings.float_dps as u16)
            .saturating_add(extra_int_digits)
            .saturating_add(sign)
    };
    let max_len = min
        .map(len)
        .into_iter()
        .chain(max.map(len))
        .max()
        .unwrap_or(0);
    Ok(ColumnStats {
        min_max: min.zip(max).map(|(min, max)| MinMax { min, max }),
        ideal_width: max_len,
        cardinality: None,
    })
}
/// Builds statistics for a UTF-8 string column.
///
/// `ideal_width` is the largest value reported by Arrow's `length` kernel for
/// the column (0 if there is none), saturated at `u16::MAX`. A plain `as u16`
/// cast would wrap instead, so a 65536-byte string would be reported as
/// width 0.
///
/// `cardinality` is the number of distinct non-null values when that number
/// fits in a `u8`, and `None` otherwise.
///
/// # Errors
/// Propagates any error from the `length` kernel.
fn new_string<T: OffsetSizeTrait>(col: &GenericStringArray<T>) -> anyhow::Result<ColumnStats> {
    let lens = arrow::compute::kernels::length::length(col)?;
    // Lengths are never negative, so the only conversion failure is "too
    // large", which is clamped to the widest representable width.
    let max_len = match lens.data_type() {
        DataType::Int32 => u16::try_from(
            arrow::compute::max::<Int32Type>(lens.as_any().downcast_ref().unwrap()).unwrap_or(0),
        )
        .unwrap_or(u16::MAX),
        DataType::Int64 => u16::try_from(
            arrow::compute::max::<Int64Type>(lens.as_any().downcast_ref().unwrap()).unwrap_or(0),
        )
        .unwrap_or(u16::MAX),
        _ => unreachable!(),
    };

    // Count distinct values, but stop as soon as the count can no longer fit
    // in a `u8`: the result is `None` from that point on regardless, so there
    // is no need to hash the rest of the column.
    let limit = usize::from(u8::MAX);
    let mut unique_vals: std::collections::HashSet<&str> = std::collections::HashSet::new();
    for value in col.iter().flatten() {
        unique_vals.insert(value);
        if unique_vals.len() > limit {
            break;
        }
    }

    Ok(ColumnStats {
        min_max: None,
        ideal_width: max_len,
        cardinality: u8::try_from(unique_vals.len()).ok(),
    })
}
fn new_binary<T: OffsetSizeTrait>(col: &GenericBinaryArray<T>) -> anyhow::Result<ColumnStats> {
let lens = arrow::compute::kernels::length::length(col)?;
let max_len = match lens.data_type() {
DataType::Int32 => {
arrow::compute::max::<Int32Type>(lens.as_any().downcast_ref().unwrap()).unwrap_or(0)
as u16
}
DataType::Int64 => {
arrow::compute::max::<Int64Type>(lens.as_any().downcast_ref().unwrap()).unwrap_or(0)
as u16
}
_ => unreachable!(),
};
Ok(ColumnStats {
min_max: None,
ideal_width: max_len,
cardinality: None,
})
}
/// Statistics for a column whose values all render at a known width:
/// only `ideal_width` is set, with no numeric range and no cardinality.
fn fixed_len(max_len: u16) -> ColumnStats {
    let (min_max, cardinality) = (None, None);
    ColumnStats {
        min_max,
        ideal_width: max_len,
        cardinality,
    }
}
fn fallback(col: &dyn Array) -> anyhow::Result<ColumnStats> {
Ok(ColumnStats {
ideal_width: column_width(col)? as u16,
min_max: None,
cardinality: None,
})
}
}
/// Returns the length of the longest formatted row of `col`, using Arrow's
/// `ArrayFormatter` with default `FormatOptions`. An empty array yields 0.
///
/// # Errors
/// Fails if the formatter cannot be built for `col`, or if formatting any row
/// fails.
fn column_width(col: &dyn Array) -> anyhow::Result<usize> {
    use arrow::util::display::*;

    let options = FormatOptions::default();
    let formatter = ArrayFormatter::try_new(col, &options)?;

    // Running maximum over every row; the first formatting error aborts the fold.
    (0..col.len()).try_fold(0usize, |widest, row| {
        fmt_len(formatter.value(row)).map(|len| widest.max(len))
    })
}
/// Returns how many bytes the `Display` output of `x` occupies, without
/// allocating the formatted string.
///
/// Note that this is a UTF-8 byte count, not a character count.
///
/// # Errors
/// Fails if the `Display` implementation of `x` reports an error.
fn fmt_len(x: impl std::fmt::Display) -> anyhow::Result<usize> {
    use std::fmt::Write;

    /// Formatting sink that discards the text and keeps only its byte total.
    struct ByteTally {
        total: usize,
    }

    impl Write for ByteTally {
        #[inline]
        fn write_str(&mut self, chunk: &str) -> std::fmt::Result {
            self.total += chunk.len();
            Ok(())
        }
    }

    let mut tally = ByteTally { total: 0 };
    tally.write_fmt(format_args!("{}", x))?;
    Ok(tally.total)
}