use std::fs::File;
use std::path::PathBuf;
use anyhow::{anyhow, Context, Result};
use clap::Parser;
use orc_rust::reader::metadata::read_metadata;
use orc_rust::schema::{DataType, RootDataType};
use orc_rust::stripe::Stripe;
use crate::common::format_stats;
// Command-line arguments for this command. The one-line help summary comes from the
// explicit `about` attribute below; the `///` comments on the fields are picked up by
// clap as the help text of the two positional arguments.
#[derive(Debug, Parser)]
#[command(about = "Print row group index information for an ORC column")]
pub struct Args {
    /// Path to the ORC file to inspect
    file: PathBuf,
    /// Name of the top-level column whose row group index is printed
    column: String,
}
/// Looks up a direct child of `root` whose name equals `name`.
///
/// Only the immediate children of the root type are examined. On a match, returns the
/// column index reported by the child's data type, the data type itself, and the child's
/// name; returns `None` when no child carries that name.
fn find_column<'a>(root: &'a RootDataType, name: &str) -> Option<(usize, &'a DataType, &'a str)> {
    for child in root.children().iter() {
        if child.name() != name {
            continue;
        }
        // First match wins, same as a `find` over the children.
        let data_type = child.data_type();
        return Some((data_type.column_index(), data_type, child.name()));
    }
    None
}
/// Prints the row group index of one top-level column for every stripe of an ORC file.
///
/// Output goes to stdout: a header with the file, the column name and index, the column
/// type and the stripe count, followed per stripe by either a note that the column has no
/// row index / no row groups, or one line per row group with its row range and statistics.
///
/// # Errors
///
/// Returns an error when the file cannot be opened, the ORC metadata cannot be read, the
/// requested column is not among the root type's children (the message lists the
/// available names), or a stripe or its row indexes cannot be read. Each failure carries
/// context naming the file or the stripe involved.
pub fn run(args: Args) -> Result<()> {
    let mut file = File::open(&args.file)
        .with_context(|| format!("failed to open {:?}", args.file.display()))?;
    let metadata = read_metadata(&mut file).with_context(|| {
        format!("failed to read ORC metadata from {}", args.file.display())
    })?;

    let Some((column_index, data_type, name)) =
        find_column(metadata.root_data_type(), &args.column)
    else {
        // Help the user recover from a typo by listing what the file actually contains.
        let available = metadata
            .root_data_type()
            .children()
            .iter()
            .map(|c| c.name().to_string())
            .collect::<Vec<_>>()
            .join(", ");
        return Err(anyhow!(
            "column '{}' not found. Available columns: {available}",
            args.column
        ));
    };

    println!(
        "File: {} | Column: {} (index {})",
        args.file.display(),
        name,
        column_index
    );
    println!("Type: {data_type}");
    println!("Stripes: {}", metadata.stripe_metadatas().len());

    for (stripe_idx, stripe_meta) in metadata.stripe_metadatas().iter().enumerate() {
        let stripe = Stripe::new(&mut file, &metadata, metadata.root_data_type(), stripe_meta)
            .with_context(|| format!("failed to read stripe {stripe_idx}"))?;
        let row_index = stripe
            .read_row_indexes(&metadata)
            .with_context(|| format!("failed to read row indexes of stripe {stripe_idx}"))?;

        let Some(col_index) = row_index.column(column_index) else {
            println!("Stripe {stripe_idx}: no row index for column");
            continue;
        };
        if col_index.num_row_groups() == 0 {
            println!("Stripe {stripe_idx}: no row groups recorded");
            continue;
        }

        // Both values are constant for the whole stripe, so fetch them once.
        let rows_per_group = col_index.rows_per_group();
        let total_rows = row_index.total_rows();
        println!("Stripe {stripe_idx}: rows_per_group={rows_per_group} total_rows={total_rows}");

        for (row_group_idx, entry) in col_index.entries().enumerate() {
            let start = row_group_idx * rows_per_group;
            // Clamp the end of the range to the row index's total row count so a final,
            // partially filled group is not reported past the last row.
            let end = (start + rows_per_group).min(total_rows);
            print!("  Row group {row_group_idx} rows [{start},{end})");
            if let Some(stats) = &entry.statistics {
                println!(" -> {}", format_stats(stats));
            } else {
                println!(" -> no statistics");
            }
        }
    }
    Ok(())
}