use std::io;
use arrow::{csv, error::ArrowError, json, record_batch::RecordBatch};
use orc_rust::statistics::{ColumnStatistics, TypeStatistics};
/// Output serialization formats that can be chosen as a command-line value
/// (the enum derives `clap::ValueEnum`).
// NOTE(review): the per-variant notes below are plain `//` comments on purpose;
// clap's derive may surface `///` doc comments on variants as CLI help text,
// which would change user-visible output — confirm before converting them.
#[derive(Clone, Debug, PartialEq, clap::ValueEnum)]
pub enum OutputFormat {
// CSV output.
Csv,
// JSON output.
Json,
}
/// A record-batch writer that is either an Arrow CSV writer or an Arrow JSON
/// writer, so callers can hold one value regardless of the chosen format.
///
/// `W` is the underlying byte sink; `F` is the JSON format marker used by the
/// `Json` variant.
// NOTE(review): the lint is silenced rather than boxing a variant; presumably
// the two writer types differ enough in size to trigger it and only one
// `OutputWriter` exists at a time, so the wasted space is negligible — confirm.
#[allow(clippy::large_enum_variant)]
pub enum OutputWriter<W: io::Write, F: json::writer::JsonFormat> {
/// Writes batches through an Arrow CSV writer.
Csv(csv::Writer<W>),
/// Writes batches through an Arrow JSON writer using format `F`.
Json(json::Writer<W, F>),
}
impl<W: io::Write, F: json::writer::JsonFormat> OutputWriter<W, F> {
    /// Writes one record batch through whichever writer this value wraps.
    ///
    /// # Errors
    ///
    /// Propagates the [`ArrowError`] returned by the wrapped writer's `write`.
    pub fn write(&mut self, batch: &RecordBatch) -> Result<(), ArrowError> {
        match self {
            Self::Json(json_writer) => json_writer.write(batch),
            Self::Csv(csv_writer) => csv_writer.write(batch),
        }
    }

    /// Completes the output once all batches have been written.
    ///
    /// For the `Json` variant this calls the JSON writer's `finish`; for the
    /// `Csv` variant no action is taken and `Ok(())` is returned.
    ///
    /// # Errors
    ///
    /// Propagates the [`ArrowError`] returned by the JSON writer's `finish`.
    pub fn finish(&mut self) -> Result<(), ArrowError> {
        match self {
            Self::Json(json_writer) => json_writer.finish(),
            Self::Csv(_) => Ok(()),
        }
    }
}
/// Builds an Arrow CSV writer over `writer` with the builder's header option
/// set to `true`.
pub fn create_csv_writer<W>(writer: W) -> csv::Writer<W>
where
    W: io::Write,
{
    let builder = csv::WriterBuilder::new().with_header(true);
    builder.build(writer)
}
/// Builds an Arrow JSON writer over `writer` that uses the `LineDelimited`
/// JSON format.
pub fn create_json_writer<W>(writer: W) -> json::Writer<W, json::writer::LineDelimited>
where
    W: io::Write,
{
    let builder = json::WriterBuilder::new();
    builder.build::<W, json::writer::LineDelimited>(writer)
}
/// Renders a column's statistics as one line of `key=value` pairs joined by
/// `", "`.
///
/// The line always starts with `values=<count>`. `has_nulls=true` follows only
/// when the statistics report nulls. When type-specific statistics are
/// present, their fields are appended in a fixed, variant-dependent order;
/// when they are absent nothing more is added.
pub fn format_stats(stats: &ColumnStatistics) -> String {
    let value_count = format!("values={}", stats.number_of_values());
    let mut fields = vec![value_count];
    if stats.has_null() {
        fields.push(String::from("has_nulls=true"));
    }

    // Each arm yields the extra fields for one kind of statistics. The arms
    // cannot be merged with `|` patterns because each variant declares its
    // own fields, so every variant gets its own arm.
    let details: Vec<String> = match stats.type_statistics() {
        None => Vec::new(),
        Some(TypeStatistics::Integer { min, max, .. }) => {
            vec![format!("min={min}"), format!("max={max}")]
        }
        Some(TypeStatistics::Double { min, max, .. }) => {
            vec![format!("min={min}"), format!("max={max}")]
        }
        // String bounds are reported under the `min`/`max` labels, followed by
        // the flags saying whether each bound is exact.
        Some(TypeStatistics::String {
            lower_bound,
            upper_bound,
            is_exact_min,
            is_exact_max,
            ..
        }) => vec![
            format!("min={lower_bound}"),
            format!("max={upper_bound}"),
            format!("is_exact_min={is_exact_min}"),
            format!("is_exact_max={is_exact_max}"),
        ],
        Some(TypeStatistics::Bucket { true_count }) => {
            vec![format!("true_count={true_count}")]
        }
        Some(TypeStatistics::Decimal { min, max, .. }) => {
            vec![format!("min={min}"), format!("max={max}")]
        }
        Some(TypeStatistics::Date { min, max }) => {
            vec![format!("min={min}"), format!("max={max}")]
        }
        Some(TypeStatistics::Binary { sum }) => vec![format!("total_bytes={sum}")],
        Some(TypeStatistics::Timestamp { min, max, .. }) => {
            vec![format!("min={min}"), format!("max={max}")]
        }
        Some(TypeStatistics::Collection {
            min_children,
            max_children,
            total_children,
        }) => vec![
            format!("min_children={min_children}"),
            format!("max_children={max_children}"),
            format!("total_children={total_children}"),
        ],
    };

    fields.extend(details);
    fields.join(", ")
}