struct_compression_analyzer/csv/
mod.rs1use crate::results::analysis_results::AnalysisResults;
2use crate::results::merged_analysis_results::MergedAnalysisResults;
3use csv::Writer;
4use std::fs;
5use std::path::{Path, PathBuf};
6
7pub fn write_all_csvs(
27 results: &[AnalysisResults],
28 merged_results: &MergedAnalysisResults,
29 output_dir: &Path,
30 file_paths: &[PathBuf],
31) -> std::io::Result<()> {
32 let field_stats_dir = output_dir.join("field_stats");
34 let split_comparison_dir = output_dir.join("split_comparison");
35 let custom_comparison_dir = output_dir.join("custom_comparison");
36 let value_stats_dir = output_dir.join("value_stats");
37 let bit_stats_dir = output_dir.join("bit_stats");
38
39 fs::create_dir_all(&field_stats_dir)?;
40 fs::create_dir_all(&split_comparison_dir)?;
41 fs::create_dir_all(&custom_comparison_dir)?;
42 fs::create_dir_all(&value_stats_dir)?;
43 fs::create_dir_all(&bit_stats_dir)?;
44
45 write_field_csvs(results, &field_stats_dir, file_paths)?;
46 write_split_comparison_csv(results, &split_comparison_dir, file_paths)?;
47 write_custom_comparison_csv(results, &custom_comparison_dir, file_paths)?;
48 write_field_value_stats_csv(merged_results, &value_stats_dir)?;
49 write_field_bit_stats_csv(merged_results, &bit_stats_dir)?;
50 Ok(())
51}
52
53pub fn write_field_csvs(
68 results: &[AnalysisResults],
69 output_dir: &Path,
70 file_paths: &[PathBuf],
71) -> std::io::Result<()> {
72 const CSV_HEADERS: &[&str] = &[
73 "name",
74 "full_path",
75 "depth",
76 "entropy",
77 "lz_matches",
78 "lz_matches_pct",
79 "zstd_size",
80 "original_size",
81 "zstd_size_pct",
82 "original_size_pct",
83 "zstd_ratio",
84 "lenbits",
85 "unique_values",
86 "bit_order",
87 "file_name",
88 ];
89
90 let field_paths = results[0].per_field.keys();
92 for field_path in field_paths {
93 let mut wtr = Writer::from_path(output_dir.join(sanitize_filename(field_path) + ".csv"))?;
94 wtr.write_record(CSV_HEADERS)?;
95
96 for x in 0..results.len() {
98 let result = &results[x];
99 let file_path = &file_paths[x];
100 let file_metrics = result.as_field_metrics();
101 if let Some(field) = result.per_field.get(field_path) {
102 let parent_stats = field.parent_metrics_or(result, &file_metrics);
103 wtr.write_record(vec![
104 field.name.clone(),
105 field.full_path.clone(),
106 field.depth.to_string(),
107 field.entropy.to_string(),
108 field.lz_matches.to_string(),
109 calc_ratio(field.lz_matches, parent_stats.lz_matches),
110 field.zstd_size.to_string(),
111 field.original_size.to_string(),
112 calc_ratio(field.zstd_size, parent_stats.zstd_size),
113 calc_ratio(field.original_size, parent_stats.original_size),
114 calc_ratio(field.zstd_size, field.original_size),
115 field.lenbits.to_string(),
116 field.value_counts.len().to_string(),
117 format!("{:?}", field.bit_order),
118 file_path
119 .file_name()
120 .and_then(|os_str| os_str.to_str())
121 .unwrap_or_default()
122 .to_string(),
123 ])?;
124 }
125 }
126 wtr.flush()?;
127 }
128
129 Ok(())
130}
131
132pub fn write_split_comparison_csv(
148 results: &[AnalysisResults],
149 output_dir: &Path,
150 file_paths: &[PathBuf],
151) -> std::io::Result<()> {
152 const GROUP_HEADERS: &[&str] = &[
154 "name",
155 "file_name",
156 "size",
157 "base lz",
158 "comp lz",
159 "base est",
160 "base zstd",
161 "comp est",
162 "comp zstd",
163 "ratio est",
164 "ratio zstd",
165 "diff est",
166 "diff zstd",
167 "base group lz",
168 "comp group lz",
169 "base group entropy",
170 "comp group entropy",
171 "max comp lz diff",
172 "max comp entropy diff",
173 ];
174
175 for (comp_idx, comparison) in results[0].split_comparisons.iter().enumerate() {
176 let mut wtr = Writer::from_path(
177 output_dir.join(sanitize_filename(&comparison.name) + "_comparison.csv"),
178 )?;
179 wtr.write_record(GROUP_HEADERS)?;
180
181 for (file_idx, result) in results.iter().enumerate() {
182 let comparison = &result.split_comparisons[comp_idx];
184 let base_group_lz: Vec<_> = comparison
185 .baseline_comparison_metrics
186 .iter()
187 .map(|m| m.lz_matches.to_string())
188 .collect();
189 let comp_group_lz: Vec<_> = comparison
190 .split_comparison_metrics
191 .iter()
192 .map(|m| m.lz_matches.to_string())
193 .collect();
194 let comp_group_entropy: Vec<_> = comparison
195 .split_comparison_metrics
196 .iter()
197 .map(|m| format!("{:.2}", m.entropy))
198 .collect();
199 let base_group_entropy: Vec<_> = comparison
200 .baseline_comparison_metrics
201 .iter()
202 .map(|m| format!("{:.2}", m.entropy))
203 .collect();
204
205 let group2_lz_values: Vec<u64> = comparison
206 .split_comparison_metrics
207 .iter()
208 .map(|m| m.lz_matches)
209 .collect();
210
211 let max_intra_comp_lz_diff_ratio = if group2_lz_values.len() < 2 {
212 0.0
213 } else {
214 let max = *group2_lz_values.iter().max().unwrap() as f64;
215 let min = *group2_lz_values.iter().min().unwrap() as f64;
216 max / min
217 };
218
219 wtr.write_record(vec![
220 comparison.name.clone(), file_paths[file_idx]
222 .file_name()
223 .map(|s| s.to_string_lossy().into_owned())
224 .unwrap(), comparison.group1_metrics.original_size.to_string(), comparison.group1_metrics.lz_matches.to_string(), comparison.group2_metrics.lz_matches.to_string(), comparison.group1_metrics.estimated_size.to_string(), comparison.group1_metrics.zstd_size.to_string(), comparison.group2_metrics.estimated_size.to_string(), comparison.group2_metrics.zstd_size.to_string(), calc_ratio(
233 comparison.group2_metrics.estimated_size,
234 comparison.group1_metrics.estimated_size,
235 ), calc_ratio(
237 comparison.group2_metrics.zstd_size,
238 comparison.group1_metrics.zstd_size,
239 ), comparison.difference.estimated_size.to_string(), comparison.difference.zstd_size.to_string(), base_group_lz.join("|"),
243 comp_group_lz.join("|"),
244 base_group_entropy.join("|"),
245 comp_group_entropy.join("|"),
246 format!("{:.2}", max_intra_comp_lz_diff_ratio),
247 format!("{:.2}", comparison.split_max_entropy_diff()),
248 ])?;
249
250 wtr.flush()?;
251 }
252 }
253
254 Ok(())
255}
256
257pub fn write_custom_comparison_csv(
272 results: &[AnalysisResults],
273 output_dir: &Path,
274 file_paths: &[PathBuf],
275) -> std::io::Result<()> {
276 for (comp_idx, comparison) in results[0].custom_comparisons.iter().enumerate() {
277 let mut wtr = Writer::from_path(
278 output_dir.join(sanitize_filename(&comparison.name) + "_comparison.csv"),
279 )?;
280
281 let mut headers = vec![
283 "name".to_string(),
284 "file_name".to_string(),
285 "base_size".to_string(),
286 ];
287
288 headers.push("base_lz".to_string());
290 for group_name in &comparison.group_names {
291 headers.push(format!("{}_lz", group_name));
292 }
293
294 headers.push("base_est".to_string());
296 for group_name in &comparison.group_names {
297 headers.push(format!("{}_est", group_name));
298 }
299
300 for group_name in &comparison.group_names {
302 headers.push(format!("{}_ratio_est", group_name));
303 }
304
305 for group_name in &comparison.group_names {
307 headers.push(format!("{}_diff_est", group_name));
308 }
309
310 headers.push("base_zstd".to_string());
312 for group_name in &comparison.group_names {
313 headers.push(format!("{}_zstd", group_name));
314 }
315
316 for group_name in &comparison.group_names {
318 headers.push(format!("{}_ratio_zstd", group_name));
319 }
320
321 for group_name in &comparison.group_names {
323 headers.push(format!("{}_diff_zstd", group_name));
324 }
325
326 wtr.write_record(&headers)?;
327
328 for (file_idx, result) in results.iter().enumerate() {
329 let comparison = &result.custom_comparisons[comp_idx];
331
332 let mut record = vec![
334 comparison.name.clone(),
335 file_paths[file_idx]
336 .file_name()
337 .map(|s| s.to_string_lossy().into_owned())
338 .unwrap(),
339 comparison.baseline_metrics.original_size.to_string(),
340 ];
341
342 record.push(comparison.baseline_metrics.lz_matches.to_string());
344 for group_metrics in comparison.group_metrics.iter() {
345 record.push(group_metrics.lz_matches.to_string());
346 }
347
348 record.push(comparison.baseline_metrics.estimated_size.to_string());
350 for group_metrics in comparison.group_metrics.iter() {
351 record.push(group_metrics.estimated_size.to_string());
352 }
353
354 for group_metrics in comparison.group_metrics.iter() {
356 record.push(calc_ratio(
357 group_metrics.estimated_size,
358 comparison.baseline_metrics.estimated_size,
359 ));
360 }
361
362 for difference in &comparison.differences {
364 record.push(difference.estimated_size.to_string());
365 }
366
367 record.push(comparison.baseline_metrics.zstd_size.to_string());
369 for group_metrics in comparison.group_metrics.iter() {
370 record.push(group_metrics.zstd_size.to_string());
371 }
372
373 for group_metrics in comparison.group_metrics.iter() {
375 record.push(calc_ratio(
376 group_metrics.zstd_size,
377 comparison.baseline_metrics.zstd_size,
378 ));
379 }
380
381 for difference in &comparison.differences {
383 record.extend([difference.zstd_size.to_string()]);
384 }
385
386 wtr.write_record(&record)?;
387 }
388 wtr.flush()?;
389 }
390
391 Ok(())
392}
393
394pub fn write_field_value_stats_csv(
408 results: &MergedAnalysisResults,
409 output_dir: &Path,
410) -> std::io::Result<()> {
411 let field_paths = results.per_field.keys();
413 for field_path in field_paths {
414 let mut wtr =
415 Writer::from_path(output_dir.join(sanitize_filename(field_path) + "_value_stats.csv"))?;
416 wtr.write_record(["value", "count", "ratio"])?;
417
418 if let Some(field) = results.per_field.get(field_path) {
420 let value_counts = field.sorted_value_counts();
422
423 let total_values: u64 = value_counts.iter().map(|(_, count)| **count).sum();
425
426 for (value, count) in value_counts {
428 wtr.write_record(&[
429 value.to_string(),
430 count.to_string(),
431 calc_ratio(*count, total_values),
432 ])?;
433 }
434 }
435 wtr.flush()?;
436 }
437 Ok(())
438}
439
440pub fn write_field_bit_stats_csv(
455 results: &MergedAnalysisResults,
456 output_dir: &Path,
457) -> std::io::Result<()> {
458 let field_paths = results.per_field.keys();
460 for field_path in field_paths {
461 let mut wtr =
462 Writer::from_path(output_dir.join(sanitize_filename(field_path) + "_bit_stats.csv"))?;
463 wtr.write_record(["bit_offset", "zero_count", "one_count", "ratio"])?;
464
465 if let Some(field) = results.per_field.get(field_path) {
467 for (i, stats) in field.bit_counts.iter().enumerate() {
468 wtr.write_record(&[
469 i.to_string(),
470 stats.zeros.to_string(),
471 stats.ones.to_string(),
472 calc_ratio(stats.zeros, stats.zeros + stats.ones),
473 ])?;
474 }
475 }
476 wtr.flush()?;
477 }
478 Ok(())
479}
480
/// Formats `child / parent` as a decimal string for CSV output.
///
/// Both operands are converted to `f64` before dividing and the quotient is rendered
/// with the default `f64` `Display` formatting (so `1 / 2` is `"0.5"` and a whole
/// quotient such as `7 / 7` is `"1"`).
///
/// # Arguments
///
/// * `child` - Numerator.
/// * `parent` - Denominator.
///
/// # Returns
///
/// The formatted ratio, or the literal `"0.0"` when `parent` is zero so that no
/// division by zero takes place.
pub fn calc_ratio(child: u64, parent: u64) -> String {
    if parent == 0 {
        return "0.0".to_owned();
    }

    (child as f64 / parent as f64).to_string()
}
498
/// Returns a copy of `name` in which every character that is not alphanumeric
/// (as decided by `char::is_alphanumeric`, so non-ASCII letters and digits are kept)
/// is replaced with `_`.
fn sanitize_filename(name: &str) -> String {
    name.chars()
        .map(|ch| if ch.is_alphanumeric() { ch } else { '_' })
        .collect()
}