use crate::results::analysis_results::AnalysisResults;
use crate::results::merged_analysis_results::MergedAnalysisResults;
use csv::Writer;
use std::fs;
use std::path::{Path, PathBuf};
/// Writes every CSV report family into its own sub-directory of `output_dir`.
///
/// The sub-directories `field_stats`, `split_comparison`, `custom_comparison`,
/// `value_stats` and `bit_stats` are all created (including missing parents)
/// before any report is written.
///
/// # Arguments
///
/// * `results` - Per-file analysis results, passed to the per-file writers.
/// * `merged_results` - Merged results, passed to the value and bit statistics writers.
/// * `output_dir` - Root directory under which the sub-directories are created.
/// * `file_paths` - Input file paths, passed alongside `results`.
///
/// # Errors
///
/// Returns the first error produced while creating a directory or by any of
/// the individual writers.
pub fn write_all_csvs(
    results: &[AnalysisResults],
    merged_results: &MergedAnalysisResults,
    output_dir: &Path,
    file_paths: &[PathBuf],
) -> std::io::Result<()> {
    let subdir = |name: &str| output_dir.join(name);

    let field_stats = subdir("field_stats");
    let split_comparison = subdir("split_comparison");
    let custom_comparison = subdir("custom_comparison");
    let value_stats = subdir("value_stats");
    let bit_stats = subdir("bit_stats");

    // Create every target directory up front, in a fixed order, so no writer
    // runs before all directories exist.
    for dir in [
        &field_stats,
        &split_comparison,
        &custom_comparison,
        &value_stats,
        &bit_stats,
    ] {
        fs::create_dir_all(dir)?;
    }

    write_field_csvs(results, &field_stats, file_paths)?;
    write_split_comparison_csv(results, &split_comparison, file_paths)?;
    write_custom_comparison_csv(results, &custom_comparison, file_paths)?;
    write_field_value_stats_csv(merged_results, &value_stats)?;
    write_field_bit_stats_csv(merged_results, &bit_stats)
}
/// Writes one CSV file per field, each holding one row per analyzed file.
///
/// The list of fields is taken from the first entry of `results`. For each of
/// those field paths a `<sanitized field path>.csv` file is created in
/// `output_dir`. A result that does not contain a given field contributes no
/// row to that field's file.
///
/// # Arguments
///
/// * `results` - Per-file analysis results; `results[i]` is paired with `file_paths[i]`.
/// * `output_dir` - Directory that receives the CSV files.
/// * `file_paths` - Path of the input file behind each entry of `results`.
///
/// # Errors
///
/// Returns an error if a CSV file cannot be created, written or flushed.
/// An empty `results` slice is not an error: nothing is written.
///
/// # Panics
///
/// Panics if a row has to be written for a result that has no corresponding
/// entry in `file_paths`.
pub fn write_field_csvs(
    results: &[AnalysisResults],
    output_dir: &Path,
    file_paths: &[PathBuf],
) -> std::io::Result<()> {
    const CSV_HEADERS: &[&str] = &[
        "name",
        "full_path",
        "depth",
        "entropy",
        "lz_matches",
        "lz_matches_pct",
        "zstd_size",
        "original_size",
        "zstd_size_pct",
        "original_size_pct",
        "zstd_ratio",
        "lenbits",
        "unique_values",
        "bit_order",
        "file_name",
    ];

    // Without any analyzed file there is no field list to iterate over.
    let first = match results.first() {
        Some(first) => first,
        None => return Ok(()),
    };

    for field_path in first.per_field.keys() {
        let mut wtr = Writer::from_path(output_dir.join(sanitize_filename(field_path) + ".csv"))?;
        wtr.write_record(CSV_HEADERS)?;

        for (file_idx, result) in results.iter().enumerate() {
            let file_metrics = result.as_field_metrics();
            if let Some(field) = result.per_field.get(field_path) {
                let parent_stats = field.parent_metrics_or(result, &file_metrics);

                // Lossy conversion keeps non-UTF-8 file names recognisable
                // instead of collapsing them to an empty column; a path
                // without a final component still yields an empty string.
                let file_name = file_paths[file_idx]
                    .file_name()
                    .map(|name| name.to_string_lossy().into_owned())
                    .unwrap_or_default();

                // Column order must match CSV_HEADERS.
                wtr.write_record(&[
                    field.name.clone(),
                    field.full_path.clone(),
                    field.depth.to_string(),
                    field.entropy.to_string(),
                    field.lz_matches.to_string(),
                    calc_ratio(field.lz_matches, parent_stats.lz_matches),
                    field.zstd_size.to_string(),
                    field.original_size.to_string(),
                    calc_ratio(field.zstd_size, parent_stats.zstd_size),
                    calc_ratio(field.original_size, parent_stats.original_size),
                    calc_ratio(field.zstd_size, field.original_size),
                    field.lenbits.to_string(),
                    field.value_counts.len().to_string(),
                    format!("{:?}", field.bit_order),
                    file_name,
                ])?;
            }
        }
        wtr.flush()?;
    }
    Ok(())
}
/// Writes one `<sanitized comparison name>_comparison.csv` file per split
/// comparison, each holding one row per analyzed file.
///
/// The list of comparisons is taken from the first entry of `results`; the
/// comparison at the same index is then read from every result.
///
/// # Arguments
///
/// * `results` - Per-file analysis results; `results[i]` is paired with `file_paths[i]`.
/// * `output_dir` - Directory that receives the CSV files.
/// * `file_paths` - Path of the input file behind each entry of `results`.
///
/// # Errors
///
/// Returns an error if a CSV file cannot be created, written or flushed.
/// An empty `results` slice is not an error: nothing is written.
///
/// # Panics
///
/// Panics if `file_paths` has fewer entries than `results`, or if a result has
/// fewer split comparisons than the first result.
pub fn write_split_comparison_csv(
    results: &[AnalysisResults],
    output_dir: &Path,
    file_paths: &[PathBuf],
) -> std::io::Result<()> {
    const GROUP_HEADERS: &[&str] = &[
        "name",
        "file_name",
        "size",
        "base lz",
        "comp lz",
        "base est",
        "base zstd",
        "comp est",
        "comp zstd",
        "ratio est",
        "ratio zstd",
        "diff est",
        "diff zstd",
        "base group lz",
        "comp group lz",
        "base group entropy",
        "comp group entropy",
        "max comp lz diff",
        "max comp entropy diff",
    ];

    // Without any analyzed file there is no comparison list to iterate over.
    let first = match results.first() {
        Some(first) => first,
        None => return Ok(()),
    };

    for (comp_idx, first_comparison) in first.split_comparisons.iter().enumerate() {
        let mut wtr = Writer::from_path(
            output_dir.join(sanitize_filename(&first_comparison.name) + "_comparison.csv"),
        )?;
        wtr.write_record(GROUP_HEADERS)?;

        for (file_idx, result) in results.iter().enumerate() {
            let comparison = &result.split_comparisons[comp_idx];
            let base = &comparison.group1_metrics;
            let comp = &comparison.group2_metrics;

            let base_group_lz: Vec<_> = comparison
                .baseline_comparison_metrics
                .iter()
                .map(|m| m.lz_matches.to_string())
                .collect();
            let comp_group_lz: Vec<_> = comparison
                .split_comparison_metrics
                .iter()
                .map(|m| m.lz_matches.to_string())
                .collect();
            let base_group_entropy: Vec<_> = comparison
                .baseline_comparison_metrics
                .iter()
                .map(|m| format!("{:.2}", m.entropy))
                .collect();
            let comp_group_entropy: Vec<_> = comparison
                .split_comparison_metrics
                .iter()
                .map(|m| format!("{:.2}", m.entropy))
                .collect();

            // Ratio between the largest and smallest LZ match count inside the
            // comparison group; reported as 0.0 when fewer than two entries exist.
            // NOTE(review): a minimum of zero makes this division produce `inf`
            // (or `NaN` when the maximum is zero too), which is written as-is.
            let comp_lz_values: Vec<u64> = comparison
                .split_comparison_metrics
                .iter()
                .map(|m| m.lz_matches)
                .collect();
            let max_intra_comp_lz_diff_ratio =
                match (comp_lz_values.iter().max(), comp_lz_values.iter().min()) {
                    (Some(&max), Some(&min)) if comp_lz_values.len() >= 2 => {
                        max as f64 / min as f64
                    }
                    _ => 0.0,
                };

            // A path without a final component yields an empty column rather
            // than aborting the whole export.
            let file_name = file_paths[file_idx]
                .file_name()
                .map(|name| name.to_string_lossy().into_owned())
                .unwrap_or_default();

            // Column order must match GROUP_HEADERS.
            wtr.write_record(&[
                comparison.name.clone(),
                file_name,
                base.original_size.to_string(),
                base.lz_matches.to_string(),
                comp.lz_matches.to_string(),
                base.estimated_size.to_string(),
                base.zstd_size.to_string(),
                comp.estimated_size.to_string(),
                comp.zstd_size.to_string(),
                calc_ratio(comp.estimated_size, base.estimated_size),
                calc_ratio(comp.zstd_size, base.zstd_size),
                comparison.difference.estimated_size.to_string(),
                comparison.difference.zstd_size.to_string(),
                base_group_lz.join("|"),
                comp_group_lz.join("|"),
                base_group_entropy.join("|"),
                comp_group_entropy.join("|"),
                format!("{:.2}", max_intra_comp_lz_diff_ratio),
                format!("{:.2}", comparison.split_max_entropy_diff()),
            ])?;
        }

        // Flush once per file instead of after every row.
        wtr.flush()?;
    }
    Ok(())
}
/// Writes one `<sanitized comparison name>_comparison.csv` file per custom
/// comparison, each holding one row per analyzed file.
///
/// The list of comparisons and the group names used for the header row are
/// taken from the first entry of `results`; the comparison at the same index
/// is then read from every result to build its row.
///
/// # Arguments
///
/// * `results` - Per-file analysis results; `results[i]` is paired with `file_paths[i]`.
/// * `output_dir` - Directory that receives the CSV files.
/// * `file_paths` - Path of the input file behind each entry of `results`.
///
/// # Errors
///
/// Returns an error if a CSV file cannot be created, written or flushed.
/// An empty `results` slice is not an error: nothing is written.
///
/// # Panics
///
/// Panics if `file_paths` has fewer entries than `results`, or if a result has
/// fewer custom comparisons than the first result.
pub fn write_custom_comparison_csv(
    results: &[AnalysisResults],
    output_dir: &Path,
    file_paths: &[PathBuf],
) -> std::io::Result<()> {
    // Without any analyzed file there is no comparison list to iterate over.
    let first = match results.first() {
        Some(first) => first,
        None => return Ok(()),
    };

    for (comp_idx, template) in first.custom_comparisons.iter().enumerate() {
        let mut wtr = Writer::from_path(
            output_dir.join(sanitize_filename(&template.name) + "_comparison.csv"),
        )?;

        // Appends one `<group>_<suffix>` column per group, in group order.
        let push_group_columns = |headers: &mut Vec<String>, suffix: &str| {
            for group_name in &template.group_names {
                headers.push(format!("{}_{}", group_name, suffix));
            }
        };

        let mut headers = vec![
            "name".to_string(),
            "file_name".to_string(),
            "base_size".to_string(),
        ];
        headers.push("base_lz".to_string());
        push_group_columns(&mut headers, "lz");
        headers.push("base_est".to_string());
        push_group_columns(&mut headers, "est");
        push_group_columns(&mut headers, "ratio_est");
        push_group_columns(&mut headers, "diff_est");
        headers.push("base_zstd".to_string());
        push_group_columns(&mut headers, "zstd");
        push_group_columns(&mut headers, "ratio_zstd");
        push_group_columns(&mut headers, "diff_zstd");
        wtr.write_record(&headers)?;

        for (file_idx, result) in results.iter().enumerate() {
            let comparison = &result.custom_comparisons[comp_idx];
            let baseline = &comparison.baseline_metrics;

            // A path without a final component yields an empty column rather
            // than aborting the whole export.
            let file_name = file_paths[file_idx]
                .file_name()
                .map(|name| name.to_string_lossy().into_owned())
                .unwrap_or_default();

            // The sections below follow the same order as the header row.
            let mut record = vec![
                comparison.name.clone(),
                file_name,
                baseline.original_size.to_string(),
            ];

            // LZ match counts.
            record.push(baseline.lz_matches.to_string());
            for group_metrics in comparison.group_metrics.iter() {
                record.push(group_metrics.lz_matches.to_string());
            }

            // Estimated sizes, their ratios to the baseline, and differences.
            record.push(baseline.estimated_size.to_string());
            for group_metrics in comparison.group_metrics.iter() {
                record.push(group_metrics.estimated_size.to_string());
            }
            for group_metrics in comparison.group_metrics.iter() {
                record.push(calc_ratio(
                    group_metrics.estimated_size,
                    baseline.estimated_size,
                ));
            }
            for difference in &comparison.differences {
                record.push(difference.estimated_size.to_string());
            }

            // Zstd sizes, their ratios to the baseline, and differences.
            record.push(baseline.zstd_size.to_string());
            for group_metrics in comparison.group_metrics.iter() {
                record.push(group_metrics.zstd_size.to_string());
            }
            for group_metrics in comparison.group_metrics.iter() {
                record.push(calc_ratio(group_metrics.zstd_size, baseline.zstd_size));
            }
            for difference in &comparison.differences {
                record.push(difference.zstd_size.to_string());
            }

            wtr.write_record(&record)?;
        }
        wtr.flush()?;
    }
    Ok(())
}
/// Writes a `<sanitized field path>_value_stats.csv` file into `output_dir`
/// for every entry of `results.per_field`.
///
/// Each file has the columns `value`, `count` and `ratio`, with rows in the
/// order returned by the field's `sorted_value_counts()`. `ratio` is the
/// count divided by the sum of all counts of that field.
///
/// # Errors
///
/// Returns an error if a CSV file cannot be created, written or flushed.
pub fn write_field_value_stats_csv(
    results: &MergedAnalysisResults,
    output_dir: &Path,
) -> std::io::Result<()> {
    for (field_path, field) in results.per_field.iter() {
        let csv_path = output_dir.join(sanitize_filename(field_path) + "_value_stats.csv");
        let mut writer = Writer::from_path(csv_path)?;
        writer.write_record(["value", "count", "ratio"])?;

        let sorted_counts = field.sorted_value_counts();
        // Denominator for the per-value share.
        let grand_total: u64 = sorted_counts.iter().map(|(_, count)| **count).sum();

        for (value, count) in sorted_counts {
            let row = [
                value.to_string(),
                count.to_string(),
                calc_ratio(*count, grand_total),
            ];
            writer.write_record(&row)?;
        }

        writer.flush()?;
    }
    Ok(())
}
/// Writes a `<sanitized field path>_bit_stats.csv` file into `output_dir`
/// for every entry of `results.per_field`.
///
/// Each file has the columns `bit_offset`, `zero_count`, `one_count` and
/// `ratio`, with one row per element of the field's `bit_counts`. `ratio` is
/// the zero count divided by the sum of zero and one counts.
///
/// # Errors
///
/// Returns an error if a CSV file cannot be created, written or flushed.
pub fn write_field_bit_stats_csv(
    results: &MergedAnalysisResults,
    output_dir: &Path,
) -> std::io::Result<()> {
    for (field_path, field) in results.per_field.iter() {
        let csv_path = output_dir.join(sanitize_filename(field_path) + "_bit_stats.csv");
        let mut writer = Writer::from_path(csv_path)?;
        writer.write_record(["bit_offset", "zero_count", "one_count", "ratio"])?;

        for (bit_offset, counts) in field.bit_counts.iter().enumerate() {
            let row = [
                bit_offset.to_string(),
                counts.zeros.to_string(),
                counts.ones.to_string(),
                calc_ratio(counts.zeros, counts.zeros + counts.ones),
            ];
            writer.write_record(&row)?;
        }

        writer.flush()?;
    }
    Ok(())
}
/// Returns `child / parent` as a string, computed in `f64`.
///
/// A zero `parent` yields the literal `"0.0"` instead of dividing. Otherwise
/// the quotient is rendered with `f64`'s default `Display` formatting.
pub fn calc_ratio(child: u64, parent: u64) -> String {
    match parent {
        0 => String::from("0.0"),
        denominator => (child as f64 / denominator as f64).to_string(),
    }
}
/// Returns `name` with every non-alphanumeric character replaced by `_`.
///
/// Alphanumeric is decided by `char::is_alphanumeric`, so non-ASCII letters
/// and digits are kept unchanged.
fn sanitize_filename(name: &str) -> String {
    name.chars()
        .map(|ch| if ch.is_alphanumeric() { ch } else { '_' })
        .collect()
}