bdt 0.18.0

Utility for viewing, querying, converting, and comparing files in popular data formats (CSV, Parquet, JSON, Avro)
Documentation
use crate::Error;
use comfy_table::{Cell, Table};
use datafusion::parquet::basic::LogicalType;
use datafusion::parquet::file::reader::{FileReader, SerializedFileReader};
use datafusion::parquet::file::statistics::Statistics;
use std::fs::File;
use std::path::PathBuf;

/// Prints a summary of a Parquet file's metadata to stdout.
///
/// A file-level table (format version, writer string, total rows, number of row groups)
/// is printed first. It is followed by one table per row group listing, for every column
/// chunk: name, logical type, physical type, distinct-value count, null count, and the
/// min/max statistics.
///
/// Cells for which the file carries no information are rendered as `N/A`; min/max values
/// of statistics variants not handled here are rendered as `unsupported`.
///
/// # Errors
///
/// Returns an [`Error`] if the file cannot be opened, if the Parquet reader cannot be
/// constructed from it, or if a row group cannot be obtained from the reader.
pub fn view_parquet_meta(path: PathBuf) -> Result<(), Error> {
    // Border/separator preset shared by every table printed by this function.
    const TABLE_PRESET: &str = "||--+-++|    ++++++";
    // Placeholder for values the file does not provide.
    const NOT_AVAILABLE: &str = "N/A";

    // Appends the rendered min and max values as the last two cells of `row`.
    fn push_min_max(
        row: &mut Vec<String>,
        min: impl std::fmt::Display,
        max: impl std::fmt::Display,
    ) {
        row.push(min.to_string());
        row.push(max.to_string());
    }

    let file = File::open(path).map_err(Error::from)?;
    let reader = SerializedFileReader::new(file).map_err(Error::from)?;

    let parquet_metadata = reader.metadata();
    let file_meta = parquet_metadata.file_metadata();
    let num_row_groups = parquet_metadata.num_row_groups();

    // File-level summary.
    let mut table = Table::new();
    table.load_preset(TABLE_PRESET);
    table.set_header(vec![Cell::new("Key"), Cell::new("Value")]);
    table.add_row(vec![
        Cell::new("Version"),
        Cell::new(file_meta.version().to_string()),
    ]);
    table.add_row(vec![
        Cell::new("Created By"),
        Cell::new(file_meta.created_by().unwrap_or(NOT_AVAILABLE)),
    ]);
    table.add_row(vec![
        Cell::new("Rows"),
        Cell::new(file_meta.num_rows().to_string()),
    ]);
    table.add_row(vec![
        Cell::new("Row Groups"),
        Cell::new(num_row_groups.to_string()),
    ]);
    println!("{}", table);

    // One table per row group, one row per column chunk.
    for i in 0..num_row_groups {
        let row_group_reader = reader.get_row_group(i)?;
        let md = row_group_reader.metadata();
        println!(
            "\nRow Group {} of {} contains {} rows and has {} bytes:\n",
            i,
            num_row_groups,
            md.num_rows(),
            md.total_byte_size()
        );

        let mut table = Table::new();
        table.load_preset(TABLE_PRESET);
        let header: Vec<Cell> = [
            "Column Name",
            "Logical Type",
            "Physical Type",
            "Distinct Values",
            "Nulls",
            "Min",
            "Max",
        ]
        .iter()
        .map(Cell::new)
        .collect();
        table.set_header(header);

        for column in md.columns() {
            let descr = column.column_descr();
            let mut row: Vec<String> = Vec::with_capacity(7);

            row.push(descr.name().to_owned());
            row.push(
                descr
                    .logical_type()
                    .map(|t| format!("{:?}", t))
                    .unwrap_or_else(|| NOT_AVAILABLE.to_owned()),
            );
            // The physical type is part of the column chunk metadata itself, so it can be
            // reported even when the chunk carries no statistics.
            row.push(column.column_type().to_string());

            match column.statistics() {
                Some(stats) => {
                    row.push(
                        stats
                            .distinct_count()
                            .map(|dc| dc.to_string())
                            .unwrap_or_else(|| NOT_AVAILABLE.to_owned()),
                    );
                    row.push(stats.null_count().to_string());

                    if stats.has_min_max_set() {
                        match stats {
                            Statistics::Boolean(v) => push_min_max(&mut row, v.min(), v.max()),
                            Statistics::Int32(v) => push_min_max(&mut row, v.min(), v.max()),
                            Statistics::Int64(v) => push_min_max(&mut row, v.min(), v.max()),
                            Statistics::Float(v) => push_min_max(&mut row, v.min(), v.max()),
                            Statistics::Double(v) => push_min_max(&mut row, v.min(), v.max()),
                            Statistics::ByteArray(v) => match descr.logical_type() {
                                Some(LogicalType::String) => {
                                    // The file is external input: the statistics bytes of a
                                    // string column may still fail UTF-8 decoding. Fall back
                                    // to the raw byte rendering instead of panicking.
                                    let min = v
                                        .min()
                                        .as_utf8()
                                        .map_or_else(|_| v.min().to_string(), str::to_owned);
                                    let max = v
                                        .max()
                                        .as_utf8()
                                        .map_or_else(|_| v.max().to_string(), str::to_owned);
                                    push_min_max(&mut row, min, max);
                                }
                                _ => push_min_max(&mut row, v.min(), v.max()),
                            },
                            _ => push_min_max(&mut row, "unsupported", "unsupported"),
                        }
                    } else {
                        push_min_max(&mut row, NOT_AVAILABLE, NOT_AVAILABLE);
                    }
                }
                None => {
                    // No statistics: distinct values, nulls, min and max are all unknown.
                    row.extend(std::iter::repeat(NOT_AVAILABLE.to_owned()).take(4));
                }
            }
            table.add_row(row);
        }

        println!("{}", table);
    }
    Ok(())
}