struct_compression_analyzer/csv/mod.rs

use crate::results::analysis_results::AnalysisResults;
use crate::results::merged_analysis_results::MergedAnalysisResults;
use csv::Writer;
use std::fs;
use std::path::{Path, PathBuf};

/// Writes all CSVs related to analysis results.
///
/// This function orchestrates the writing of multiple CSV files:
/// - Per-field statistics.
/// - Split comparison statistics.
/// - Custom comparison statistics.
/// - Per-field value statistics.
/// - Per-field bit statistics.
///
/// # Arguments
///
/// * `results` - A slice of [`AnalysisResults`], one for each analyzed file.
/// * `merged_results` - A [`MergedAnalysisResults`] object representing the merged results of all files.
/// * `output_dir` - The directory where the CSV files will be written.
/// * `file_paths` - A slice of [`PathBuf`]s representing the original file paths for each result.
///
/// # Returns
///
/// * `std::io::Result<()>` - Ok if successful, otherwise an error.
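///
/// # Examples
///
/// A minimal usage sketch (marked `ignore` since it assumes `results`, `merged`
/// and `file_paths` were already produced by the analysis pipeline):
///
/// ```ignore
/// use std::path::Path;
///
/// // One subdirectory per statistic type is created under "output".
/// write_all_csvs(&results, &merged, Path::new("output"), &file_paths)?;
/// ```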
pub fn write_all_csvs(
    results: &[AnalysisResults],
    merged_results: &MergedAnalysisResults,
    output_dir: &Path,
    file_paths: &[PathBuf],
) -> std::io::Result<()> {
    // Create subdirectories for each stat type
    let field_stats_dir = output_dir.join("field_stats");
    let split_comparison_dir = output_dir.join("split_comparison");
    let custom_comparison_dir = output_dir.join("custom_comparison");
    let value_stats_dir = output_dir.join("value_stats");
    let bit_stats_dir = output_dir.join("bit_stats");

    fs::create_dir_all(&field_stats_dir)?;
    fs::create_dir_all(&split_comparison_dir)?;
    fs::create_dir_all(&custom_comparison_dir)?;
    fs::create_dir_all(&value_stats_dir)?;
    fs::create_dir_all(&bit_stats_dir)?;

    write_field_csvs(results, &field_stats_dir, file_paths)?;
    write_split_comparison_csv(results, &split_comparison_dir, file_paths)?;
    write_custom_comparison_csv(results, &custom_comparison_dir, file_paths)?;
    write_field_value_stats_csv(merged_results, &value_stats_dir)?;
    write_field_bit_stats_csv(merged_results, &bit_stats_dir)?;
    Ok(())
}

/// Writes individual CSV files for each field, containing statistics across all input files.
///
/// Creates one CSV file per field. Each row in a field's CSV represents the
/// field's metrics from one of the input files.
///
/// # Arguments
///
/// * `results` - A slice of [`AnalysisResults`], one for each analyzed file.
/// * `output_dir` - The directory where the CSV files will be written.
/// * `file_paths` - A slice of [`PathBuf`]s representing the original file paths for each result.
///
/// # Returns
///
/// * `std::io::Result<()>` - Ok if successful, otherwise an error.
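///
/// # Output
///
/// One file per field path, named `<sanitized_field_path>.csv`; for example, a
/// hypothetical field path `header.block_size` would be written to
/// `header_block_size.csv`.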
pub fn write_field_csvs(
    results: &[AnalysisResults],
    output_dir: &Path,
    file_paths: &[PathBuf],
) -> std::io::Result<()> {
    const CSV_HEADERS: &[&str] = &[
        "name",
        "full_path",
        "depth",
        "entropy",
        "lz_matches",
        "lz_matches_pct",
        "zstd_size",
        "original_size",
        "zstd_size_pct",
        "original_size_pct",
        "zstd_ratio",
        "lenbits",
        "unique_values",
        "bit_order",
        "file_name",
    ];

    // Get field paths from first result (all results have same fields)
    let field_paths = results[0].per_field.keys();
    for field_path in field_paths {
        let mut wtr = Writer::from_path(output_dir.join(sanitize_filename(field_path) + ".csv"))?;
        wtr.write_record(CSV_HEADERS)?;

        // Write all individual field and group records
        for x in 0..results.len() {
            let result = &results[x];
            let file_path = &file_paths[x];
            let file_metrics = result.as_field_metrics();
            if let Some(field) = result.per_field.get(field_path) {
                let parent_stats = field.parent_metrics_or(result, &file_metrics);
                wtr.write_record(vec![
                    field.name.clone(),
                    field.full_path.clone(),
                    field.depth.to_string(),
                    field.entropy.to_string(),
                    field.lz_matches.to_string(),
                    calc_ratio(field.lz_matches, parent_stats.lz_matches),
                    field.zstd_size.to_string(),
                    field.original_size.to_string(),
                    calc_ratio(field.zstd_size, parent_stats.zstd_size),
                    calc_ratio(field.original_size, parent_stats.original_size),
                    calc_ratio(field.zstd_size, field.original_size),
                    field.lenbits.to_string(),
                    field.value_counts.len().to_string(),
                    format!("{:?}", field.bit_order),
                    file_path
                        .file_name()
                        .and_then(|os_str| os_str.to_str())
                        .unwrap_or_default()
                        .to_string(),
                ])?;
            }
        }
        wtr.flush()?;
    }

    Ok(())
}

/// Writes CSV files comparing two groups of fields (split comparisons) within each file.
///
/// This function generates CSV files that compare two groups of fields
/// (defined in the schema) within each analyzed file. It reports differences
/// in size, LZ77 matches, estimated size, and Zstd compressed size.
///
/// # Arguments
///
/// * `results` - A slice of [`AnalysisResults`], one for each analyzed file.
/// * `output_dir` - The directory where the CSV files will be written.
/// * `file_paths` - A slice of [`PathBuf`]s representing the original file paths for each result.
///
/// # Returns
///
/// * `std::io::Result<()>` - Ok if successful, otherwise an error.
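///
/// # Output
///
/// One `<sanitized_comparison_name>_comparison.csv` per split comparison, with
/// one row per input file. The `base group lz`, `comp group lz`,
/// `base group entropy` and `comp group entropy` columns hold one value per
/// field in the group, joined with `|`.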
pub fn write_split_comparison_csv(
    results: &[AnalysisResults],
    output_dir: &Path,
    file_paths: &[PathBuf],
) -> std::io::Result<()> {
    // Add group comparison CSVs
    const GROUP_HEADERS: &[&str] = &[
        "name",
        "file_name",
        "size",
        "base lz",
        "comp lz",
        "base est",
        "base zstd",
        "comp est",
        "comp zstd",
        "ratio est",
        "ratio zstd",
        "diff est",
        "diff zstd",
        "base group lz",
        "comp group lz",
        "base group entropy",
        "comp group entropy",
        "max comp lz diff",
        "max comp entropy diff",
    ];

    for (comp_idx, comparison) in results[0].split_comparisons.iter().enumerate() {
        let mut wtr = Writer::from_path(
            output_dir.join(sanitize_filename(&comparison.name) + "_comparison.csv"),
        )?;
        wtr.write_record(GROUP_HEADERS)?;

        for (file_idx, result) in results.iter().enumerate() {
            // Get equivalent comparison for this result.
            let comparison = &result.split_comparisons[comp_idx];
            let base_group_lz: Vec<_> = comparison
                .baseline_comparison_metrics
                .iter()
                .map(|m| m.lz_matches.to_string())
                .collect();
            let comp_group_lz: Vec<_> = comparison
                .split_comparison_metrics
                .iter()
                .map(|m| m.lz_matches.to_string())
                .collect();
            let comp_group_entropy: Vec<_> = comparison
                .split_comparison_metrics
                .iter()
                .map(|m| format!("{:.2}", m.entropy))
                .collect();
            let base_group_entropy: Vec<_> = comparison
                .baseline_comparison_metrics
                .iter()
                .map(|m| format!("{:.2}", m.entropy))
                .collect();

            let group2_lz_values: Vec<u64> = comparison
                .split_comparison_metrics
                .iter()
                .map(|m| m.lz_matches)
                .collect();

            let max_intra_comp_lz_diff_ratio = if group2_lz_values.len() < 2 {
                0.0
            } else {
                let max = *group2_lz_values.iter().max().unwrap() as f64;
                let min = *group2_lz_values.iter().min().unwrap() as f64;
                max / min
            };

            wtr.write_record(vec![
                comparison.name.clone(), // name
                file_paths[file_idx]
                    .file_name()
                    .map(|s| s.to_string_lossy().into_owned())
                    .unwrap(), // file name
                comparison.group1_metrics.original_size.to_string(), // size
                comparison.group1_metrics.lz_matches.to_string(), // base lz
                comparison.group2_metrics.lz_matches.to_string(), // comp lz
                comparison.group1_metrics.estimated_size.to_string(), // base est
                comparison.group1_metrics.zstd_size.to_string(), // base zstd
                comparison.group2_metrics.estimated_size.to_string(), // comp est
                comparison.group2_metrics.zstd_size.to_string(), // comp zstd
                calc_ratio(
                    comparison.group2_metrics.estimated_size,
                    comparison.group1_metrics.estimated_size,
                ), // ratio est
                calc_ratio(
                    comparison.group2_metrics.zstd_size,
                    comparison.group1_metrics.zstd_size,
                ), // ratio zstd
                comparison.difference.estimated_size.to_string(), // diff est
                comparison.difference.zstd_size.to_string(), // diff zstd
                base_group_lz.join("|"),
                comp_group_lz.join("|"),
                base_group_entropy.join("|"),
                comp_group_entropy.join("|"),
                format!("{:.2}", max_intra_comp_lz_diff_ratio),
                format!("{:.2}", comparison.split_max_entropy_diff()),
            ])?;

            wtr.flush()?;
        }
    }

    Ok(())
}

/// Writes CSV files comparing groups of fields within each file, for custom comparisons.
///
/// This function is analogous to [`write_split_comparison_csv`], but handles
/// `custom_comparisons` instead and supports an arbitrary number of comparison
/// groups.
///
/// # Arguments
///
/// * `results` - A slice of [`AnalysisResults`], one for each analyzed file.
/// * `output_dir` - The directory where the CSV files will be written.
/// * `file_paths` - A slice of [`PathBuf`]s representing the original file paths for each result.
///
/// # Returns
///
/// * `std::io::Result<()>` - Ok if successful, otherwise an error.
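///
/// # Output
///
/// One `<sanitized_comparison_name>_comparison.csv` per custom comparison.
/// Headers are built dynamically from the comparison's `group_names`; for
/// example, a hypothetical group named `split` would contribute the columns
/// `split_lz`, `split_est`, `split_ratio_est`, `split_diff_est`, `split_zstd`,
/// `split_ratio_zstd` and `split_diff_zstd`.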
pub fn write_custom_comparison_csv(
    results: &[AnalysisResults],
    output_dir: &Path,
    file_paths: &[PathBuf],
) -> std::io::Result<()> {
    for (comp_idx, comparison) in results[0].custom_comparisons.iter().enumerate() {
        let mut wtr = Writer::from_path(
            output_dir.join(sanitize_filename(&comparison.name) + "_comparison.csv"),
        )?;

        // Dynamically build headers based on the number of comparison groups
        let mut headers = vec![
            "name".to_string(),
            "file_name".to_string(),
            "base_size".to_string(),
        ];

        // LZ stats
        headers.push("base_lz".to_string());
        for group_name in &comparison.group_names {
            headers.push(format!("{}_lz", group_name));
        }

        // Estimated Size stats
        headers.push("base_est".to_string());
        for group_name in &comparison.group_names {
            headers.push(format!("{}_est", group_name));
        }

        // Estimated Ratio stats
        for group_name in &comparison.group_names {
            headers.push(format!("{}_ratio_est", group_name));
        }

        // Estimated Diff stats
        for group_name in &comparison.group_names {
            headers.push(format!("{}_diff_est", group_name));
        }

        // Zstd Size stats
        headers.push("base_zstd".to_string());
        for group_name in &comparison.group_names {
            headers.push(format!("{}_zstd", group_name));
        }

        // Zstd Ratio stats
        for group_name in &comparison.group_names {
            headers.push(format!("{}_ratio_zstd", group_name));
        }

        // Zstd Diff stats
        for group_name in &comparison.group_names {
            headers.push(format!("{}_diff_zstd", group_name));
        }

        wtr.write_record(&headers)?;

        for (file_idx, result) in results.iter().enumerate() {
            // Get equivalent comparison for this result.
            let comparison = &result.custom_comparisons[comp_idx];

            // Write reference, baseline metrics.
            let mut record = vec![
                comparison.name.clone(),
                file_paths[file_idx]
                    .file_name()
                    .map(|s| s.to_string_lossy().into_owned())
                    .unwrap(),
                comparison.baseline_metrics.original_size.to_string(),
            ];

            // Write LZ values
            record.push(comparison.baseline_metrics.lz_matches.to_string());
            for group_metrics in comparison.group_metrics.iter() {
                record.push(group_metrics.lz_matches.to_string());
            }

            // Write Estimated Size values
            record.push(comparison.baseline_metrics.estimated_size.to_string());
            for group_metrics in comparison.group_metrics.iter() {
                record.push(group_metrics.estimated_size.to_string());
            }

            // Write Estimated Ratio values
            for group_metrics in comparison.group_metrics.iter() {
                record.push(calc_ratio(
                    group_metrics.estimated_size,
                    comparison.baseline_metrics.estimated_size,
                ));
            }

            // Write Estimated Diff values
            for difference in &comparison.differences {
                record.push(difference.estimated_size.to_string());
            }

            // Write Zstd Size values
            record.push(comparison.baseline_metrics.zstd_size.to_string());
            for group_metrics in comparison.group_metrics.iter() {
                record.push(group_metrics.zstd_size.to_string());
            }

            // Write Zstd Ratio values
            for group_metrics in comparison.group_metrics.iter() {
                record.push(calc_ratio(
                    group_metrics.zstd_size,
                    comparison.baseline_metrics.zstd_size,
                ));
            }

            // Write Zstd Diff values
            for difference in &comparison.differences {
                record.push(difference.zstd_size.to_string());
            }

            wtr.write_record(&record)?;
        }
        wtr.flush()?;
    }

    Ok(())
}

/// Writes CSV files containing value statistics for each field.
///
/// This function generates a CSV file for each field, listing the unique values
/// encountered in the merged data, along with their counts and ratios.
///
/// # Arguments
///
/// * `results` - The [`MergedAnalysisResults`] object containing the merged results of all files.
/// * `output_dir` - The directory where the CSV files will be written.
///
/// # Returns
///
/// * `std::io::Result<()>` - Ok if successful, otherwise an error.
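///
/// # Output
///
/// One `<sanitized_field_path>_value_stats.csv` per field with the columns
/// `value`, `count` and `ratio`, where `ratio` is the value's count divided by
/// the total number of observed values for that field.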
pub fn write_field_value_stats_csv(
    results: &MergedAnalysisResults,
    output_dir: &Path,
) -> std::io::Result<()> {
    // Get field paths from the merged results
    let field_paths = results.per_field.keys();
    for field_path in field_paths {
        let mut wtr =
            Writer::from_path(output_dir.join(sanitize_filename(field_path) + "_value_stats.csv"))?;
        wtr.write_record(["value", "count", "ratio"])?;

        // Write merged value counts for this field
        if let Some(field) = results.per_field.get(field_path) {
            // Get sorted value counts
            let value_counts = field.sorted_value_counts();

            // Calculate total count for ratio
            let total_values: u64 = value_counts.iter().map(|(_, count)| **count).sum();

            // Write sorted values with ratios
            for (value, count) in value_counts {
                wtr.write_record(&[
                    value.to_string(),
                    count.to_string(),
                    calc_ratio(*count, total_values),
                ])?;
            }
        }
        wtr.flush()?;
    }
    Ok(())
}

/// Writes CSV files containing bit-level statistics for each field.
///
/// This function generates a CSV file for each field, showing the counts of 0s
/// and 1s at each bit offset within the field, along with the ratio of 0s to
/// the total number of bits at that offset.
///
/// # Arguments
///
/// * `results` - The [`MergedAnalysisResults`] object containing the merged results of all files.
/// * `output_dir` - The directory where the CSV files will be written.
///
/// # Returns
///
/// * `std::io::Result<()>` - Ok if successful, otherwise an error.
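///
/// # Output
///
/// One `<sanitized_field_path>_bit_stats.csv` per field with the columns
/// `bit_offset`, `zero_count`, `one_count` and `ratio`, where `ratio` is
/// `zero_count / (zero_count + one_count)` at that offset.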
pub fn write_field_bit_stats_csv(
    results: &MergedAnalysisResults,
    output_dir: &Path,
) -> std::io::Result<()> {
    // Get field paths from the merged results
    let field_paths = results.per_field.keys();
    for field_path in field_paths {
        let mut wtr =
            Writer::from_path(output_dir.join(sanitize_filename(field_path) + "_bit_stats.csv"))?;
        wtr.write_record(["bit_offset", "zero_count", "one_count", "ratio"])?;

        // Write merged bit stats for this field
        if let Some(field) = results.per_field.get(field_path) {
            for (i, stats) in field.bit_counts.iter().enumerate() {
                wtr.write_record(&[
                    i.to_string(),
                    stats.zeros.to_string(),
                    stats.ones.to_string(),
                    calc_ratio(stats.zeros, stats.zeros + stats.ones),
                ])?;
            }
        }
        wtr.flush()?;
    }
    Ok(())
}

/// Calculates a ratio between two numbers, handling division by zero.
///
/// # Arguments
///
/// * `child` - The numerator (the comparison value).
/// * `parent` - The denominator (the base value).
///
/// # Returns
///
/// A string representing the ratio, or "0.0" if the denominator is zero.
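///
/// # Examples
///
/// A couple of illustrative values (marked `ignore` to avoid assuming the
/// crate's public path in a doctest):
///
/// ```ignore
/// assert_eq!(calc_ratio(1, 2), "0.5");
/// assert_eq!(calc_ratio(5, 0), "0.0");
/// ```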
pub fn calc_ratio(child: u64, parent: u64) -> String {
    if parent == 0 {
        "0.0".into()
    } else {
        format!("{}", child as f64 / parent as f64)
    }
}

/// Sanitizes a string to be used as a filename by replacing non-alphanumeric characters with underscores.
///
/// # Arguments
///
/// * `name` - The input string.
///
/// # Returns
///
/// A sanitized version of the string suitable for use as a filename.
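///
/// For example, a hypothetical input `"header.block size"` would become
/// `"header_block_size"`.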
fn sanitize_filename(name: &str) -> String {
    name.replace(|c: char| !c.is_alphanumeric(), "_")
}