// parquet_lite/statistics.rs
use crate::types::*;
2
3#[derive(Debug, Clone)]
5pub struct ColumnStatistics {
6 pub column_name: String,
7 pub physical_type: ParquetType,
8 pub num_values: i64,
9 pub null_count: i64,
10 pub distinct_values: Option<i64>,
11 pub min_value: Option<String>,
12 pub max_value: Option<String>,
13 pub size_bytes: i64,
14 pub compression_ratio: f64,
15}
16
17pub struct StatisticsCollector;
19
20impl StatisticsCollector {
21 pub fn from_metadata(metadata: &ParquetMetadata) -> Vec<ColumnStatistics> {
23 metadata
24 .columns
25 .iter()
26 .map(|col| ColumnStatistics {
27 column_name: col.name.clone(),
28 physical_type: col.physical_type,
29 num_values: col.num_values,
30 null_count: 0,
31 distinct_values: None,
32 min_value: None,
33 max_value: None,
34 size_bytes: col.total_uncompressed_size,
35 compression_ratio: if col.total_uncompressed_size > 0 {
36 col.total_compressed_size as f64 / col.total_uncompressed_size as f64
37 } else {
38 1.0
39 },
40 })
41 .collect()
42 }
43
44 pub fn print_summary(metadata: &ParquetMetadata) {
46 let stats = Self::from_metadata(metadata);
47
48 println!("\n=== Parquet File Statistics ===");
49 println!("Version: {}", metadata.version);
50 println!("Total Rows: {}", metadata.num_rows);
51 println!("Total Columns: {}", metadata.num_columns);
52 println!("Row Groups: {}", metadata.row_groups.len());
53
54 let total_size: i64 = stats.iter().map(|s| s.size_bytes).sum();
55 let total_compressed: i64 = metadata
56 .columns
57 .iter()
58 .map(|c| c.total_compressed_size)
59 .sum();
60
61 println!(
62 "Total Size: {:.2} MB (compressed: {:.2} MB)\n",
63 total_size as f64 / 1024.0 / 1024.0,
64 total_compressed as f64 / 1024.0 / 1024.0,
65 );
66
67 println!(
68 "{:<25} {:<15} {:<12} {:<12} {:<10}",
69 "Column", "Type", "Values", "Size (KB)", "Ratio"
70 );
71 println!("{:-<74}", "");
72
73 for stat in &stats {
74 println!(
75 "{:<25} {:<15} {:<12} {:<12.2} {:<10.3}",
76 truncate(&stat.column_name, 24),
77 format!("{:?}", stat.physical_type),
78 stat.num_values,
79 stat.size_bytes as f64 / 1024.0,
80 stat.compression_ratio,
81 );
82 }
83
84 if let Some(created_by) = &metadata.created_by {
85 println!("\nCreated by: {created_by}");
86 }
87 }
88
89 pub fn summary_string(metadata: &ParquetMetadata) -> String {
91 let stats = Self::from_metadata(metadata);
92 let mut out = String::new();
93
94 out.push_str(&format!("Rows: {}, Columns: {}\n", metadata.num_rows, metadata.num_columns));
95
96 for stat in &stats {
97 out.push_str(&format!(
98 " {} ({:?}): {} values, {:.2} KB, ratio {:.3}\n",
99 stat.column_name,
100 stat.physical_type,
101 stat.num_values,
102 stat.size_bytes as f64 / 1024.0,
103 stat.compression_ratio,
104 ));
105 }
106
107 out
108 }
109}
110
/// Shortens `s` to at most `max_len` characters for fixed-width display.
///
/// Strings that already fit are returned unchanged. Longer strings keep their first
/// `max_len - 1` characters followed by a single `…`, so the result is exactly
/// `max_len` characters long. A `max_len` of `0` yields an empty string.
///
/// Lengths are measured in `char`s rather than bytes: format width specifiers such as
/// `{:<25}` pad by character count, and cutting on a byte offset would panic whenever
/// the offset lands inside a multi-byte UTF-8 sequence.
fn truncate(s: &str, max_len: usize) -> String {
    if s.chars().count() <= max_len {
        return s.to_string();
    }

    // Only reachable with a non-empty `s`; there is no room even for the ellipsis.
    if max_len == 0 {
        return String::new();
    }

    // Reserve one character of the budget for the ellipsis marker.
    let mut shortened: String = s.chars().take(max_len - 1).collect();
    shortened.push('…');
    shortened
}