1use super::{
2 print_field_metrics_bit_stats, print_field_metrics_value_stats, ComputeAnalysisResultsError,
3 FieldMetrics, PrintFormat,
4};
5use crate::{
6 analyzer::{AnalyzerFieldState, CompressionOptions, SchemaAnalyzer},
7 comparison::{
8 compare_groups::{analyze_custom_comparisons, GroupComparisonResult},
9 split_comparison::{
10 make_split_comparison_result, FieldComparisonMetrics, SplitComparisonResult,
11 },
12 },
13 results::calculate_percentage,
14 schema::{BitOrder, Metadata, Schema, SplitComparison},
15 utils::analyze_utils::{calculate_file_entropy, get_writer_buffer, get_zstd_compressed_size},
16};
17use ahash::{AHashMap, HashMapExt};
18use lossless_transform_utils::match_estimator::estimate_num_lz_matches_fast;
19use rustc_hash::FxHashMap;
20use std::io::{self, Write};
21
22#[derive(Clone, Default)]
24pub struct AnalysisResults {
25 pub schema_metadata: Metadata,
27
28 pub file_entropy: f64,
30
31 pub file_lz_matches: u64,
33
34 pub zstd_file_size: u64,
36
37 pub original_size: u64,
39
40 pub per_field: AHashMap<String, FieldMetrics>,
44
45 pub split_comparisons: Vec<SplitComparisonResult>,
47
48 pub custom_comparisons: Vec<GroupComparisonResult>,
50}
51
52pub fn compute_analysis_results(
58 analyzer: &mut SchemaAnalyzer,
59) -> Result<AnalysisResults, ComputeAnalysisResultsError> {
60 let file_entropy = calculate_file_entropy(&analyzer.entries);
62 let file_lz_matches = estimate_num_lz_matches_fast(&analyzer.entries);
63
64 let mut field_metrics: AHashMap<String, FieldMetrics> = AHashMap::new();
66
67 for stats in &mut analyzer.field_states.values_mut() {
68 let writer_buffer = get_writer_buffer(&mut stats.writer);
69 let entropy = calculate_file_entropy(writer_buffer);
70 let lz_matches = estimate_num_lz_matches_fast(writer_buffer);
71 let actual_size = get_zstd_compressed_size(
72 writer_buffer,
73 analyzer.compression_options.zstd_compression_level,
74 );
75
76 stats.value_counts.shrink_to_fit();
78 field_metrics.insert(
79 stats.full_path.clone(),
80 FieldMetrics {
81 name: stats.name.clone(),
82 full_path: stats.full_path.clone(),
83 entropy,
84 lz_matches: lz_matches as u64,
85 bit_counts: stats.bit_counts.clone(),
86 value_counts: stats.value_counts.clone(),
87 depth: stats.depth,
88 count: stats.count,
89 lenbits: stats.lenbits,
90 bit_order: stats.bit_order,
91 zstd_size: actual_size,
92 original_size: writer_buffer.len() as u64,
93 },
94 );
95 }
96
97 let split_comparisons = calc_split_comparisons(
99 &mut analyzer.field_states,
100 &analyzer.schema.analysis.split_groups,
101 &field_metrics,
102 analyzer.compression_options,
103 );
104
105 let custom_comparisons = analyze_custom_comparisons(
107 analyzer.schema,
108 &mut analyzer.field_states,
109 analyzer.compression_options,
110 )?;
111
112 Ok(AnalysisResults {
113 file_entropy,
114 file_lz_matches: file_lz_matches as u64,
115 per_field: field_metrics,
116 schema_metadata: analyzer.schema.metadata.clone(),
117 zstd_file_size: get_zstd_compressed_size(
118 &analyzer.entries,
119 analyzer.compression_options.zstd_compression_level,
120 ),
121 original_size: analyzer.entries.len() as u64,
122 split_comparisons,
123 custom_comparisons,
124 })
125}
126
127fn calc_split_comparisons(
149 field_stats: &mut AHashMap<String, AnalyzerFieldState>,
150 comparisons: &[SplitComparison],
151 field_metrics: &AHashMap<String, FieldMetrics>,
152 compression_options: CompressionOptions,
153) -> Vec<SplitComparisonResult> {
154 let mut split_comparisons = Vec::new();
155 for comparison in comparisons {
156 let mut group1_bytes: Vec<u8> = Vec::new();
157 let mut group2_bytes: Vec<u8> = Vec::new();
158
159 for name in &comparison.group_1 {
161 if let Some(stats) = field_stats.get_mut(name) {
162 group1_bytes.extend_from_slice(get_writer_buffer(&mut stats.writer));
163 }
164 }
165
166 for name in &comparison.group_2 {
168 if let Some(stats) = field_stats.get_mut(name) {
169 group2_bytes.extend_from_slice(get_writer_buffer(&mut stats.writer));
170 }
171 }
172
173 let mut group1_field_metrics: Vec<FieldComparisonMetrics> = Vec::new();
174 let mut group2_field_metrics: Vec<FieldComparisonMetrics> = Vec::new();
175 for path in &comparison.group_1 {
176 if let Some(metrics) = field_metrics.iter().find(|(_k, v)| v.name == *path) {
177 group1_field_metrics.push(metrics.1.clone().into());
178 }
179 }
180 for path in &comparison.group_2 {
181 if let Some(metrics) = field_metrics.iter().find(|(_k, v)| v.name == *path) {
182 group2_field_metrics.push(metrics.1.clone().into());
183 }
184 }
185
186 let custom_compression_options = CompressionOptions {
188 zstd_compression_level: compression_options.zstd_compression_level,
189 size_estimator_fn: compression_options.size_estimator_fn,
190 lz_match_multiplier: compression_options.lz_match_multiplier,
191 entropy_multiplier: compression_options.entropy_multiplier,
192 };
193
194 split_comparisons.push(make_split_comparison_result(
195 comparison.name.clone(),
196 comparison.description.clone(),
197 &group1_bytes,
198 &group2_bytes,
199 group1_field_metrics,
200 group2_field_metrics,
201 custom_compression_options,
202 comparison.compression_estimation_group_1.clone(),
203 comparison.compression_estimation_group_2.clone(),
204 ));
205 }
206 split_comparisons
207}
208
209impl AnalysisResults {
210 pub fn as_field_metrics(&self) -> FieldMetrics {
214 FieldMetrics {
215 name: String::new(),
216 full_path: String::new(),
217 depth: 0,
218 zstd_size: self.zstd_file_size,
219 original_size: self.original_size,
220 count: 0,
221 lenbits: 0,
222 entropy: self.file_entropy,
223 lz_matches: self.file_lz_matches,
224 bit_counts: Vec::new(),
225 bit_order: BitOrder::Default,
226 value_counts: FxHashMap::new(),
227 }
228 }
229
230 pub fn print<W: Write>(
231 &self,
232 writer: &mut W,
233 schema: &Schema,
234 format: PrintFormat,
235 skip_misc_stats: bool,
236 ) -> io::Result<()> {
237 match format {
238 PrintFormat::Detailed => {
239 self.print_detailed(writer, schema, &self.as_field_metrics(), skip_misc_stats)
240 }
241 PrintFormat::Concise => {
242 self.print_concise(writer, schema, &self.as_field_metrics(), skip_misc_stats)
243 }
244 }
245 }
246
247 fn print_detailed<W: Write>(
248 &self,
249 writer: &mut W,
250 schema: &Schema,
251 file_metrics: &FieldMetrics,
252 skip_misc_stats: bool,
253 ) -> io::Result<()> {
254 writeln!(writer, "Schema: {}", self.schema_metadata.name)?;
255 writeln!(writer, "Description: {}", self.schema_metadata.description)?;
256 writeln!(writer, "File Entropy: {:.2} bits", self.file_entropy)?;
257 writeln!(writer, "File LZ Matches: {}", self.file_lz_matches)?;
258 writeln!(writer, "File Original Size: {}", self.original_size)?;
259 writeln!(writer, "File Compressed Size: {}", self.zstd_file_size)?;
260 writeln!(writer, "\nPer-field Metrics (in schema order):")?;
261
262 for field_path in schema.ordered_field_and_group_paths() {
264 self.detailed_print_field(writer, file_metrics, &field_path)?;
265 }
266
267 writeln!(writer, "\nSplit Group Comparisons:")?;
268 for comparison in &self.split_comparisons {
269 detailed_print_comparison(writer, comparison)?;
270 }
271
272 writeln!(writer, "\nCustom Group Comparisons:")?;
273 for comparison in &self.custom_comparisons {
274 concise_print_custom_comparison(writer, comparison)?;
275 }
276
277 if !skip_misc_stats {
278 writeln!(writer, "\nField Value Stats: [as `value: probability %`]")?;
279 for field_path in schema.ordered_field_and_group_paths() {
280 self.concise_print_field_value_stats(writer, &field_path)?;
281 }
282
283 writeln!(writer, "\nField Bit Stats: [as `(zeros/ones) (percentage %)`]")?;
284 for field_path in schema.ordered_field_and_group_paths() {
285 self.concise_print_field_bit_stats(writer, &field_path)?;
286 }
287 }
288
289 Ok(())
290 }
291
292 fn detailed_print_field<W: Write>(
293 &self,
294 writer: &mut W,
295 file_metrics: &FieldMetrics,
296 field_path: &str,
297 ) -> io::Result<()> {
298 if let Some(field) = self.per_field.get(field_path) {
299 let indent = " ".repeat(field.depth);
301 let parent_stats = field.parent_metrics_or(self, file_metrics);
302
303 writeln!(
305 writer,
306 "{}{}: {:.2} bit entropy, {} LZ 3 Byte matches ({:.2}%)",
307 indent,
308 field.name,
309 field.entropy,
310 field.lz_matches,
311 calculate_percentage(field.lz_matches as f64, parent_stats.lz_matches as f64)
312 )?;
313 let padding = format!("{}{}", indent, field.name).len() + 2; writeln!(
315 writer,
316 "{:padding$}Sizes: ZStandard/Original: {}/{} ({:.2}%/{:.2}%)",
317 "",
318 field.zstd_size,
319 field.original_size,
320 calculate_percentage(field.zstd_size as f64, parent_stats.zstd_size as f64),
321 calculate_percentage(
322 field.original_size as f64,
323 parent_stats.original_size as f64
324 )
325 )?;
326 writeln!(
327 writer,
328 "{:padding$}{} bit, {} unique values, {:?}",
329 "",
330 field.lenbits,
331 field.value_counts.len(),
332 field.bit_order
333 )?;
334 }
335
336 Ok(())
337 }
338
339 fn print_concise<W: Write>(
340 &self,
341 writer: &mut W,
342 schema: &Schema,
343 file_metrics: &FieldMetrics,
344 skip_misc_stats: bool,
345 ) -> io::Result<()> {
346 writeln!(writer, "Schema: {}", self.schema_metadata.name)?;
347 writeln!(
348 writer,
349 "File: {:.2}bpb, {} LZ, {}/{} ({:.2}%/{:.2}%) (zstd/orig)",
350 self.file_entropy,
351 self.file_lz_matches,
352 self.zstd_file_size,
353 self.original_size,
354 calculate_percentage(self.zstd_file_size as f64, self.original_size as f64),
355 100.0
356 )?;
357
358 writeln!(writer, "\nField Metrics:")?;
359 for field_path in schema.ordered_field_and_group_paths() {
360 self.concise_print_field(writer, file_metrics, &field_path)?;
361 }
362
363 writeln!(writer, "\nSplit Group Comparisons:")?;
364 for comparison in &self.split_comparisons {
365 concise_print_split_comparison(writer, comparison)?;
366 }
367
368 writeln!(writer, "\nCustom Group Comparisons:")?;
369 for comparison in &self.custom_comparisons {
370 concise_print_custom_comparison(writer, comparison)?;
371 }
372
373 if !skip_misc_stats {
374 writeln!(writer, "\nField Value Stats: [as `value: probability %`]")?;
375 for field_path in schema.ordered_field_and_group_paths() {
376 self.concise_print_field_value_stats(writer, &field_path)?;
377 }
378
379 writeln!(writer, "\nField Bit Stats: [as `(zeros/ones) (percentage %)`]")?;
380 for field_path in schema.ordered_field_and_group_paths() {
381 self.concise_print_field_bit_stats(writer, &field_path)?;
382 }
383 }
384
385 Ok(())
386 }
387
388 fn concise_print_field<W: Write>(
389 &self,
390 writer: &mut W,
391 file_metrics: &FieldMetrics,
392 field_path: &str,
393 ) -> io::Result<()> {
394 if let Some(field) = self.per_field.get(field_path) {
395 let indent = " ".repeat(field.depth);
396 let parent_stats = field.parent_metrics_or(self, file_metrics);
397
398 writeln!(
399 writer,
400 "{}{}: {:.2}bpb, {} LZ ({:.2}%), {}/{} ({:.2}%/{:.2}%) (zstd/orig), {}bit",
401 indent,
402 field.name,
403 field.entropy,
404 field.lz_matches,
405 calculate_percentage(field.lz_matches as f64, parent_stats.lz_matches as f64),
406 field.zstd_size,
407 field.original_size,
408 calculate_percentage(field.zstd_size as f64, parent_stats.zstd_size as f64),
409 calculate_percentage(
410 field.original_size as f64,
411 parent_stats.original_size as f64
412 ),
413 field.lenbits
414 )?;
415 }
416
417 Ok(())
418 }
419
420 fn concise_print_field_value_stats<W: Write>(
421 &self,
422 writer: &mut W,
423 field_path: &str,
424 ) -> io::Result<()> {
425 if let Some(field) = self.per_field.get(field_path) {
426 print_field_metrics_value_stats(writer, field)?;
427 }
428
429 Ok(())
430 }
431
432 fn concise_print_field_bit_stats<W: Write>(
433 &self,
434 writer: &mut W,
435 field_path: &str,
436 ) -> io::Result<()> {
437 if let Some(field) = self.per_field.get(field_path) {
438 print_field_metrics_bit_stats(writer, field)?;
439 }
440
441 Ok(())
442 }
443}
444
445fn detailed_print_comparison<W: Write>(
446 writer: &mut W,
447 comparison: &SplitComparisonResult,
448) -> io::Result<()> {
449 concise_print_split_comparison(writer, comparison)
450}
451
452fn concise_print_custom_comparison<W: Write>(
453 writer: &mut W,
454 comparison: &GroupComparisonResult,
455) -> io::Result<()> {
456 let base_lz = comparison.baseline_metrics.lz_matches;
457 let base_entropy = comparison.baseline_metrics.entropy;
458 let base_zstd = comparison.baseline_metrics.zstd_size;
459 let base_estimated = comparison.baseline_metrics.estimated_size;
460 let base_size = comparison.baseline_metrics.original_size;
461
462 writeln!(writer, " {}: {}", comparison.name, comparison.description)?;
463 writeln!(writer, " Base Group:")?;
464 writeln!(writer, " Size: {}", base_size)?;
465 writeln!(writer, " LZ, Entropy: ({}, {:.2})", base_lz, base_entropy)?;
466 if base_estimated != 0 {
467 writeln!(writer, " Estimate/Zstd: {}/{}", base_estimated, base_zstd)?;
468 } else {
469 writeln!(writer, " Zstd: {}", base_zstd)?;
470 }
471
472 for (i, (group_name, metrics)) in comparison
473 .group_names
474 .iter()
475 .zip(&comparison.group_metrics)
476 .enumerate()
477 {
478 let comp_lz = metrics.lz_matches;
479 let comp_entropy = metrics.entropy;
480 let comp_zstd = metrics.zstd_size;
481 let comp_estimated = metrics.estimated_size;
482 let comp_size = metrics.original_size;
483
484 let ratio_zstd = calculate_percentage(comp_zstd as f64, base_zstd as f64);
485 let diff_zstd = comparison.differences[i].zstd_size;
486
487 writeln!(writer, "\n {} Group:", group_name)?;
488 writeln!(writer, " Size: {}", comp_size)?;
489 writeln!(writer, " LZ, Entropy: ({}, {:.2})", comp_lz, comp_entropy)?;
490 if comp_estimated != 0 {
491 writeln!(writer, " Estimate/Zstd: {}/{}", comp_zstd, comp_estimated)?;
492 } else {
493 writeln!(writer, " Zstd: {}", comp_zstd)?;
494 }
495 writeln!(writer, " Ratio zstd: {:.1}%", ratio_zstd)?;
496 writeln!(writer, " Diff zstd: {}", diff_zstd)?;
497
498 if base_size != comp_size {
499 writeln!(writer, " [WARNING!!] Sizes of base and comparison groups don't match!! They may vary by a few bytes due to padding.")?;
500 writeln!(writer, " [WARNING!!] However if they vary extremely, your groups may be incorrect. base: {}, {}: {}", base_size, group_name, comp_size)?;
501 }
502 }
503
504 Ok(())
505}
506
507fn concise_print_split_comparison<W: Write>(
508 writer: &mut W,
509 comparison: &SplitComparisonResult,
510) -> io::Result<()> {
511 let base_lz = comparison.group1_metrics.lz_matches;
512 let size_orig = comparison.group1_metrics.original_size;
513 let size_comp = comparison.group2_metrics.original_size;
514 let base_entropy = comparison.group1_metrics.entropy;
515
516 let base_zstd = comparison.group1_metrics.zstd_size;
517 let base_estimated = comparison.group1_metrics.estimated_size;
518
519 let comp_lz = comparison.group2_metrics.lz_matches;
520 let comp_entropy = comparison.group2_metrics.entropy;
521
522 let comp_zstd = comparison.group2_metrics.zstd_size;
523 let comp_estimated = comparison.group2_metrics.estimated_size;
524 let ratio_zstd = calculate_percentage(comp_zstd as f64, base_zstd as f64);
525 let diff_zstd = comparison.difference.zstd_size;
526
527 writeln!(writer, " {}: {}", comparison.name, comparison.description)?;
528 writeln!(writer, " Original Size: {}", size_orig)?;
529 writeln!(writer, " Base LZ, Entropy: ({}, {:.2}):", base_lz, base_entropy)?;
530 writeln!(writer, " Comp LZ, Entropy: ({}, {:.2}):", comp_lz, comp_entropy)?;
531 writeln!(
532 writer,
533 " Base Group LZ, Entropy: ({:?}, {:?})",
534 comparison
535 .baseline_comparison_metrics
536 .iter()
537 .map(|m| m.lz_matches)
538 .collect::<Vec<_>>(),
539 comparison
540 .baseline_comparison_metrics
541 .iter()
542 .map(|m| format!("{:.2}", m.entropy))
543 .collect::<Vec<_>>()
544 )?;
545 writeln!(
546 writer,
547 " Comp Group LZ, Entropy: ({:?}, {:?})",
548 comparison
549 .split_comparison_metrics
550 .iter()
551 .map(|m| m.lz_matches)
552 .collect::<Vec<_>>(),
553 comparison
554 .split_comparison_metrics
555 .iter()
556 .map(|m| format!("{:.2}", m.entropy))
557 .collect::<Vec<_>>()
558 )?;
559
560 if base_estimated != 0 {
561 writeln!(writer, " Base (est/zstd): {}/{}", base_estimated, base_zstd)?;
562 } else {
563 writeln!(writer, " Base (zstd): {}", base_zstd)?;
564 }
565
566 if comp_estimated != 0 {
567 writeln!(writer, " Comp (est/zstd): {}/{}", comp_estimated, comp_zstd)?;
568 } else {
569 writeln!(writer, " Comp (zstd): {}", comp_zstd)?;
570 }
571
572 writeln!(writer, " Ratio (zstd): {}", ratio_zstd)?;
573 writeln!(writer, " Diff (zstd): {}", diff_zstd)?;
574
575 if size_orig != size_comp {
576 writeln!(writer, " [WARNING!!] Sizes of both groups in bytes don't match!! They may vary by a few bytes due to padding.")?;
577 writeln!(writer, " [WARNING!!] However if they vary extremely, your groups may be incorrect. group1: {}, group2: {}", size_orig, size_comp)?;
578 }
579
580 Ok(())
581}