use std::fs::File;
use std::path::PathBuf;
use std::sync::Arc;
use anyhow::{Context, Result};
use arrow::temporal_conversions::{date32_to_datetime, timestamp_ms_to_datetime};
use clap::Parser;
use orc_rust::reader::metadata::read_metadata;
use orc_rust::statistics::ColumnStatistics;
#[derive(Debug, Parser)]
#[command(about = "Print column and stripe statistics from an ORC file")]
pub struct Args {
file: PathBuf,
}
fn print_column_stats(col_stats: &ColumnStatistics) {
if let Some(tstats) = col_stats.type_statistics() {
match tstats {
orc_rust::statistics::TypeStatistics::Integer { min, max, sum } => {
println!("* Data type Integer");
println!("* Minimum: {min}");
println!("* Maximum: {max}");
if let Some(sum) = sum {
println!("* Sum: {sum}");
}
}
orc_rust::statistics::TypeStatistics::Double { min, max, sum } => {
println!("* Data type Double");
println!("* Minimum: {min}");
println!("* Maximum: {max}");
if let Some(sum) = sum {
println!("* Sum: {sum}");
}
}
orc_rust::statistics::TypeStatistics::String {
lower_bound,
upper_bound,
sum,
is_exact_min,
is_exact_max,
} => {
println!("* Data type String");
println!("* Minimum: {lower_bound}");
println!("* Maximum: {upper_bound}");
println!("* Sum: {sum}");
println!("* IsExactMin: {is_exact_min}");
println!("* IsExactMax: {is_exact_max}");
}
orc_rust::statistics::TypeStatistics::Bucket { true_count } => {
println!("* Data type Bucket");
println!("* True count: {true_count}");
}
orc_rust::statistics::TypeStatistics::Decimal { min, max, sum } => {
println!("* Data type Decimal");
println!("* Minimum: {min}");
println!("* Maximum: {max}");
println!("* Sum: {sum}");
}
orc_rust::statistics::TypeStatistics::Date { min, max } => {
println!("* Data type Date");
if let Some(dt) = date32_to_datetime(*min) {
println!("* Minimum: {dt}");
}
if let Some(dt) = date32_to_datetime(*max) {
println!("* Maximum: {dt}");
}
}
orc_rust::statistics::TypeStatistics::Binary { sum } => {
println!("* Data type Binary");
println!("* Sum: {sum}");
}
orc_rust::statistics::TypeStatistics::Timestamp {
min,
max,
min_utc,
max_utc,
} => {
println!("* Data type Timestamp");
println!("* Minimum: {min}");
println!("* Maximum: {max}");
if let Some(ts) = timestamp_ms_to_datetime(*min_utc) {
println!("* Minimum UTC: {ts}");
}
if let Some(ts) = timestamp_ms_to_datetime(*max_utc) {
println!("* Maximum UTC: {ts}");
}
}
orc_rust::statistics::TypeStatistics::Collection {
min_children,
max_children,
total_children,
} => {
println!("* Data type Collection");
println!("* Minimum children: {min_children}");
println!("* Maximum children: {max_children}");
println!("* Total children: {total_children}");
}
}
}
println!("* Num values: {}", col_stats.number_of_values());
println!("* Has nulls: {}", col_stats.has_null());
println!();
}
pub fn run(args: Args) -> Result<()> {
let mut file = File::open(&args.file)
.with_context(|| format!("failed to open {:?}", args.file.display()))?;
let metadata = Arc::new(read_metadata(&mut file)?);
println!("# Column stats");
println!(
"File {:?} has {} columns",
args.file,
metadata.column_file_statistics().len()
);
println!();
for (idx, col_stats) in metadata.column_file_statistics().iter().enumerate() {
println!("## Column {idx}");
print_column_stats(col_stats);
}
println!("# Stripe stats");
println!(
"File {:?} has {} stripes",
args.file,
metadata.stripe_metadatas().len()
);
println!();
for (idm, sm) in metadata.stripe_metadatas().iter().enumerate() {
println!("----- Stripe {idm} -----\n");
for (idc, col_stats) in sm.column_statistics().iter().enumerate() {
println!("## Column {idc}");
print_column_stats(col_stats);
}
}
Ok(())
}