Skip to main content

parquet_lite/
statistics.rs

1use crate::types::*;
2
3/// Per-column statistics summary
4#[derive(Debug, Clone)]
5pub struct ColumnStatistics {
6    pub column_name: String,
7    pub physical_type: ParquetType,
8    pub num_values: i64,
9    pub null_count: i64,
10    pub distinct_values: Option<i64>,
11    pub min_value: Option<String>,
12    pub max_value: Option<String>,
13    pub size_bytes: i64,
14    pub compression_ratio: f64,
15}
16
17/// Collects and formats statistics from Parquet metadata.
18pub struct StatisticsCollector;
19
20impl StatisticsCollector {
21    /// Build column statistics from parsed metadata.
22    pub fn from_metadata(metadata: &ParquetMetadata) -> Vec<ColumnStatistics> {
23        metadata
24            .columns
25            .iter()
26            .map(|col| ColumnStatistics {
27                column_name: col.name.clone(),
28                physical_type: col.physical_type,
29                num_values: col.num_values,
30                null_count: 0,
31                distinct_values: None,
32                min_value: None,
33                max_value: None,
34                size_bytes: col.total_uncompressed_size,
35                compression_ratio: if col.total_uncompressed_size > 0 {
36                    col.total_compressed_size as f64 / col.total_uncompressed_size as f64
37                } else {
38                    1.0
39                },
40            })
41            .collect()
42    }
43
44    /// Print a formatted summary table to stdout.
45    pub fn print_summary(metadata: &ParquetMetadata) {
46        let stats = Self::from_metadata(metadata);
47
48        println!("\n=== Parquet File Statistics ===");
49        println!("Version:       {}", metadata.version);
50        println!("Total Rows:    {}", metadata.num_rows);
51        println!("Total Columns: {}", metadata.num_columns);
52        println!("Row Groups:    {}", metadata.row_groups.len());
53
54        let total_size: i64 = stats.iter().map(|s| s.size_bytes).sum();
55        let total_compressed: i64 = metadata
56            .columns
57            .iter()
58            .map(|c| c.total_compressed_size)
59            .sum();
60
61        println!(
62            "Total Size:    {:.2} MB (compressed: {:.2} MB)\n",
63            total_size as f64 / 1024.0 / 1024.0,
64            total_compressed as f64 / 1024.0 / 1024.0,
65        );
66
67        println!(
68            "{:<25} {:<15} {:<12} {:<12} {:<10}",
69            "Column", "Type", "Values", "Size (KB)", "Ratio"
70        );
71        println!("{:-<74}", "");
72
73        for stat in &stats {
74            println!(
75                "{:<25} {:<15} {:<12} {:<12.2} {:<10.3}",
76                truncate(&stat.column_name, 24),
77                format!("{:?}", stat.physical_type),
78                stat.num_values,
79                stat.size_bytes as f64 / 1024.0,
80                stat.compression_ratio,
81            );
82        }
83
84        if let Some(created_by) = &metadata.created_by {
85            println!("\nCreated by: {created_by}");
86        }
87    }
88
89    /// Return statistics as a formatted string (for non-stdout contexts).
90    pub fn summary_string(metadata: &ParquetMetadata) -> String {
91        let stats = Self::from_metadata(metadata);
92        let mut out = String::new();
93
94        out.push_str(&format!("Rows: {}, Columns: {}\n", metadata.num_rows, metadata.num_columns));
95
96        for stat in &stats {
97            out.push_str(&format!(
98                "  {} ({:?}): {} values, {:.2} KB, ratio {:.3}\n",
99                stat.column_name,
100                stat.physical_type,
101                stat.num_values,
102                stat.size_bytes as f64 / 1024.0,
103                stat.compression_ratio,
104            ));
105        }
106
107        out
108    }
109}
110
/// Truncate a string to at most `max_len` characters, appending "…" if
/// anything was cut off.
///
/// Length is measured in Unicode scalar values (`char`s), not bytes, so the
/// cut never lands inside a multi-byte UTF-8 sequence and the result lines
/// up with the character-based padding of `{:<N}` format specifiers.
///
/// * If `s` has `max_len` characters or fewer it is returned unchanged.
/// * Otherwise the first `max_len - 1` characters are kept and "…" is
///   appended, giving exactly `max_len` characters.
/// * With `max_len == 0` there is no room even for the ellipsis, so the
///   result is the empty string.
fn truncate(s: &str, max_len: usize) -> String {
    // `nth(max_len)` yields a char only when there are more than `max_len`
    // of them; this stops early instead of counting the whole string.
    if s.chars().nth(max_len).is_none() {
        return s.to_string();
    }

    // Guard the `max_len - 1` below against underflow.
    if max_len == 0 {
        return String::new();
    }

    let kept: String = s.chars().take(max_len - 1).collect();
    format!("{kept}…")
}