parquet_flamegraph/
lib.rs

1pub mod args;
2
3use parquet::file::metadata::ParquetMetaData;
4
5// https://parquet.apache.org/docs/file-format/metadata/
6pub fn parquet_column_size_to_flamegraph_format(
7    parquet_metadata: &ParquetMetaData,
8    unit: &args::Unit,
9) -> Vec<String> {
10    let converter_value = match *unit {
11        args::Unit::Bytes => 1,
12        args::Unit::KiloBytes => 1024,
13        args::Unit::MegaBytes => 1024 * 1024,
14        args::Unit::GigaBytes => 1024 * 1024 * 1024,
15    };
16
17    parquet_metadata
18        .row_groups()
19        .iter()
20        .flat_map(|row_group_metadata| {
21            row_group_metadata.columns().iter().map(|column_metadata| {
22                format!(
23                    "{} {}",
24                    column_metadata.column_path().string().replace(".", ";"),
25                    column_metadata.compressed_size() / converter_value
26                )
27            })
28        })
29        .collect()
30}
31
32// Data comes from: https://github.com/apache/parquet-testing/tree/master/data
// Testing against real files is easier than building dummy metadata by hand, cf: https://github.com/apache/arrow-rs/blob/3ee5048c8ea3aa531d111afe33d0a3551eabcd84/parquet/src/file/metadata/reader.rs#L891
#[cfg(test)]
mod tests {
    use std::fs::File;

    use parquet::file::reader::{FileReader, SerializedFileReader};

    use super::*;

    #[test]
    fn test_parquet_column_size_to_flamegraph_format_with_nested_paths() {
        let result = flamegraph_lines("nested_maps.snappy.parquet", &args::Unit::Bytes);

        // Nested column paths become `;`-separated frames.
        assert_eq!(
            result,
            [
                "a;key_value;key 69",
                "a;key_value;value;key_value;key 95",
                "a;key_value;value;key_value;value 50",
                "b 56",
                "c 68",
            ]
        );
    }

    #[test]
    fn test_parquet_column_size_to_flamegraph_format_with_multiple_row_groups() {
        let result = flamegraph_lines("sort_columns.parquet", &args::Unit::Bytes);

        // Every row group contributes its own line for each column.
        assert_eq!(result, ["a 104", "b 70", "a 104", "b 70"]);
    }

    #[test]
    fn test_parquet_column_size_to_flamegraph_format_with_kilobytes() {
        let result = flamegraph_lines(
            "delta_encoding_required_column.parquet",
            &args::Unit::KiloBytes,
        );

        // Sizes are truncated to whole kilobytes, hence the many zeros.
        // NOTE(review): the formatter only inserts a space between path and
        // size, so the trailing `:` is presumably part of the fixture's column
        // names - confirm against the file schema.
        assert_eq!(
            result,
            [
                "c_customer_sk: 0",
                "c_current_cdemo_sk: 0",
                "c_current_hdemo_sk: 0",
                "c_current_addr_sk: 0",
                "c_first_shipto_date_sk: 0",
                "c_first_sales_date_sk: 0",
                "c_birth_day: 0",
                "c_birth_month: 0",
                "c_birth_year: 0",
                "c_customer_id: 0",
                "c_salutation: 0",
                "c_first_name: 0",
                "c_last_name: 0",
                "c_preferred_cust_flag: 0",
                "c_birth_country: 1",
                "c_email_address: 2",
                "c_last_review_date: 0",
            ]
        );
    }

    /// Reads the metadata of the fixture `file_name` and converts it with `unit`.
    fn flamegraph_lines(file_name: &str, unit: &args::Unit) -> Vec<String> {
        parquet_column_size_to_flamegraph_format(metadata_reader(file_name).metadata(), unit)
    }

    /// Opens the fixture `file_name` located in the `./resources` directory.
    ///
    /// Panics with the offending path when the file cannot be opened or cannot
    /// be read as Parquet, so a missing fixture is easy to diagnose.
    fn metadata_reader(file_name: &str) -> SerializedFileReader<File> {
        let path = format!("./resources/{}", file_name);
        let file = File::open(&path)
            .unwrap_or_else(|err| panic!("cannot open fixture {}: {}", path, err));

        SerializedFileReader::new(file)
            .unwrap_or_else(|err| panic!("cannot read fixture {} as parquet: {}", path, err))
    }
}
107}