struct_compression_analyzer/results/
analysis_results.rs

1use super::{
2    print_field_metrics_bit_stats, print_field_metrics_value_stats, ComputeAnalysisResultsError,
3    FieldMetrics, PrintFormat,
4};
5use crate::{
6    analyzer::{AnalyzerFieldState, CompressionOptions, SchemaAnalyzer},
7    comparison::{
8        compare_groups::{analyze_custom_comparisons, GroupComparisonResult},
9        split_comparison::{
10            make_split_comparison_result, FieldComparisonMetrics, SplitComparisonResult,
11        },
12    },
13    results::calculate_percentage,
14    schema::{BitOrder, Metadata, Schema, SplitComparison},
15    utils::analyze_utils::{calculate_file_entropy, get_writer_buffer, get_zstd_compressed_size},
16};
17use ahash::{AHashMap, HashMapExt};
18use lossless_transform_utils::match_estimator::estimate_num_lz_matches_fast;
19use rustc_hash::FxHashMap;
20use std::io::{self, Write};
21
22/// Final computed metrics for output
23#[derive(Clone, Default)]
24pub struct AnalysisResults {
25    /// Schema name
26    pub schema_metadata: Metadata,
27
28    /// Entropy of the whole file
29    pub file_entropy: f64,
30
31    /// LZ compression matches in the file
32    pub file_lz_matches: u64,
33
34    /// Actual size of the compressed data when compressed with zstandard
35    pub zstd_file_size: u64,
36
37    /// Original size of the uncompressed data
38    pub original_size: u64,
39
40    /// Field path → computed metrics
41    /// This is a map of `full_path` to [`FieldMetrics`], such that we
42    /// can easily merge the results of different fields down the road.
43    pub per_field: AHashMap<String, FieldMetrics>,
44
45    /// Split comparison results
46    pub split_comparisons: Vec<SplitComparisonResult>,
47
48    /// Custom group comparison results from schema-defined comparisons
49    pub custom_comparisons: Vec<GroupComparisonResult>,
50}
51
52/// Given a [`SchemaAnalyzer`] which has ingested all of the data to be calculated, via
53/// the [`SchemaAnalyzer::add_entry`] function, compute the analysis results.
54///
55/// This returns the results for all of the per-field metrics, as well as computing the
56/// various schema defined groups, such as 'split' groups and 'compare' groups.
57pub fn compute_analysis_results(
58    analyzer: &mut SchemaAnalyzer,
59) -> Result<AnalysisResults, ComputeAnalysisResultsError> {
60    // First calculate file entropy
61    let file_entropy = calculate_file_entropy(&analyzer.entries);
62    let file_lz_matches = estimate_num_lz_matches_fast(&analyzer.entries);
63
64    // Then calculate per-field entropy and lz matches
65    let mut field_metrics: AHashMap<String, FieldMetrics> = AHashMap::new();
66
67    for stats in &mut analyzer.field_states.values_mut() {
68        let writer_buffer = get_writer_buffer(&mut stats.writer);
69        let entropy = calculate_file_entropy(writer_buffer);
70        let lz_matches = estimate_num_lz_matches_fast(writer_buffer);
71        let actual_size = get_zstd_compressed_size(
72            writer_buffer,
73            analyzer.compression_options.zstd_compression_level,
74        );
75
76        // reduce memory usage from leftover analyzer.
77        stats.value_counts.shrink_to_fit();
78        field_metrics.insert(
79            stats.full_path.clone(),
80            FieldMetrics {
81                name: stats.name.clone(),
82                full_path: stats.full_path.clone(),
83                entropy,
84                lz_matches: lz_matches as u64,
85                bit_counts: stats.bit_counts.clone(),
86                value_counts: stats.value_counts.clone(),
87                depth: stats.depth,
88                count: stats.count,
89                lenbits: stats.lenbits,
90                bit_order: stats.bit_order,
91                zstd_size: actual_size,
92                original_size: writer_buffer.len() as u64,
93            },
94        );
95    }
96
97    // Process split group comparisons
98    let split_comparisons = calc_split_comparisons(
99        &mut analyzer.field_states,
100        &analyzer.schema.analysis.split_groups,
101        &field_metrics,
102        analyzer.compression_options,
103    );
104
105    // Process custom group comparisons
106    let custom_comparisons = analyze_custom_comparisons(
107        analyzer.schema,
108        &mut analyzer.field_states,
109        analyzer.compression_options,
110    )?;
111
112    Ok(AnalysisResults {
113        file_entropy,
114        file_lz_matches: file_lz_matches as u64,
115        per_field: field_metrics,
116        schema_metadata: analyzer.schema.metadata.clone(),
117        zstd_file_size: get_zstd_compressed_size(
118            &analyzer.entries,
119            analyzer.compression_options.zstd_compression_level,
120        ),
121        original_size: analyzer.entries.len() as u64,
122        split_comparisons,
123        custom_comparisons,
124    })
125}
126
127/// Calculates the comparison results between a series of field splits.
128///
129/// This function takes the [`SchemaAnalyzer`]'s intermediate state, that is, the
130/// state of each field (containing the data for each field), a list of split comparisons
131/// to make, and the individual metrics (results) for each field.
132///
133/// This then computes the comparison results for each split.
134///
135/// # Remarks
136/// This API is for internal use. It may change without notice.
137///
138/// # Arguments
139/// * `field_stats` - The current field states (analyzer working state)
140/// * `comparisons` - A slice of [`SplitComparison`] objects defining the splits to compare.
141/// * `field_metrics` - A reference to a hash map of field metrics.
142/// * `compression_options` - The compression options (zstd compression level, etc).
143///
144/// # Returns
145/// A vector of [`SplitComparisonResult`] objects containing the comparison results.
146///
147/// [`SchemaAnalyzer`]: crate::analyzer::SchemaAnalyzer
148fn calc_split_comparisons(
149    field_stats: &mut AHashMap<String, AnalyzerFieldState>,
150    comparisons: &[SplitComparison],
151    field_metrics: &AHashMap<String, FieldMetrics>,
152    compression_options: CompressionOptions,
153) -> Vec<SplitComparisonResult> {
154    let mut split_comparisons = Vec::new();
155    for comparison in comparisons {
156        let mut group1_bytes: Vec<u8> = Vec::new();
157        let mut group2_bytes: Vec<u8> = Vec::new();
158
159        // Sum up bytes for group 1
160        for name in &comparison.group_1 {
161            if let Some(stats) = field_stats.get_mut(name) {
162                group1_bytes.extend_from_slice(get_writer_buffer(&mut stats.writer));
163            }
164        }
165
166        // Sum up bytes for group 2
167        for name in &comparison.group_2 {
168            if let Some(stats) = field_stats.get_mut(name) {
169                group2_bytes.extend_from_slice(get_writer_buffer(&mut stats.writer));
170            }
171        }
172
173        let mut group1_field_metrics: Vec<FieldComparisonMetrics> = Vec::new();
174        let mut group2_field_metrics: Vec<FieldComparisonMetrics> = Vec::new();
175        for path in &comparison.group_1 {
176            if let Some(metrics) = field_metrics.iter().find(|(_k, v)| v.name == *path) {
177                group1_field_metrics.push(metrics.1.clone().into());
178            }
179        }
180        for path in &comparison.group_2 {
181            if let Some(metrics) = field_metrics.iter().find(|(_k, v)| v.name == *path) {
182                group2_field_metrics.push(metrics.1.clone().into());
183            }
184        }
185
186        // Create custom compression options for this comparison using its multipliers
187        let custom_compression_options = CompressionOptions {
188            zstd_compression_level: compression_options.zstd_compression_level,
189            size_estimator_fn: compression_options.size_estimator_fn,
190            lz_match_multiplier: compression_options.lz_match_multiplier,
191            entropy_multiplier: compression_options.entropy_multiplier,
192        };
193
194        split_comparisons.push(make_split_comparison_result(
195            comparison.name.clone(),
196            comparison.description.clone(),
197            &group1_bytes,
198            &group2_bytes,
199            group1_field_metrics,
200            group2_field_metrics,
201            custom_compression_options,
202            comparison.compression_estimation_group_1.clone(),
203            comparison.compression_estimation_group_2.clone(),
204        ));
205    }
206    split_comparisons
207}
208
209impl AnalysisResults {
210    /// Converts the file level statistics into a [`FieldMetrics`] object
211    /// which can be used for comparison with parent in places such as the
212    /// print function.
213    pub fn as_field_metrics(&self) -> FieldMetrics {
214        FieldMetrics {
215            name: String::new(),
216            full_path: String::new(),
217            depth: 0,
218            zstd_size: self.zstd_file_size,
219            original_size: self.original_size,
220            count: 0,
221            lenbits: 0,
222            entropy: self.file_entropy,
223            lz_matches: self.file_lz_matches,
224            bit_counts: Vec::new(),
225            bit_order: BitOrder::Default,
226            value_counts: FxHashMap::new(),
227        }
228    }
229
230    pub fn print<W: Write>(
231        &self,
232        writer: &mut W,
233        schema: &Schema,
234        format: PrintFormat,
235        skip_misc_stats: bool,
236    ) -> io::Result<()> {
237        match format {
238            PrintFormat::Detailed => {
239                self.print_detailed(writer, schema, &self.as_field_metrics(), skip_misc_stats)
240            }
241            PrintFormat::Concise => {
242                self.print_concise(writer, schema, &self.as_field_metrics(), skip_misc_stats)
243            }
244        }
245    }
246
247    fn print_detailed<W: Write>(
248        &self,
249        writer: &mut W,
250        schema: &Schema,
251        file_metrics: &FieldMetrics,
252        skip_misc_stats: bool,
253    ) -> io::Result<()> {
254        writeln!(writer, "Schema: {}", self.schema_metadata.name)?;
255        writeln!(writer, "Description: {}", self.schema_metadata.description)?;
256        writeln!(writer, "File Entropy: {:.2} bits", self.file_entropy)?;
257        writeln!(writer, "File LZ Matches: {}", self.file_lz_matches)?;
258        writeln!(writer, "File Original Size: {}", self.original_size)?;
259        writeln!(writer, "File Compressed Size: {}", self.zstd_file_size)?;
260        writeln!(writer, "\nPer-field Metrics (in schema order):")?;
261
262        // Iterate through schema-defined fields in order
263        for field_path in schema.ordered_field_and_group_paths() {
264            self.detailed_print_field(writer, file_metrics, &field_path)?;
265        }
266
267        writeln!(writer, "\nSplit Group Comparisons:")?;
268        for comparison in &self.split_comparisons {
269            detailed_print_comparison(writer, comparison)?;
270        }
271
272        writeln!(writer, "\nCustom Group Comparisons:")?;
273        for comparison in &self.custom_comparisons {
274            concise_print_custom_comparison(writer, comparison)?;
275        }
276
277        if !skip_misc_stats {
278            writeln!(writer, "\nField Value Stats: [as `value: probability %`]")?;
279            for field_path in schema.ordered_field_and_group_paths() {
280                self.concise_print_field_value_stats(writer, &field_path)?;
281            }
282
283            writeln!(writer, "\nField Bit Stats: [as `(zeros/ones) (percentage %)`]")?;
284            for field_path in schema.ordered_field_and_group_paths() {
285                self.concise_print_field_bit_stats(writer, &field_path)?;
286            }
287        }
288        
289        Ok(())
290    }
291
292    fn detailed_print_field<W: Write>(
293        &self,
294        writer: &mut W,
295        file_metrics: &FieldMetrics,
296        field_path: &str,
297    ) -> io::Result<()> {
298        if let Some(field) = self.per_field.get(field_path) {
299            // Indent based on field depth to show hierarchy
300            let indent = "  ".repeat(field.depth);
301            let parent_stats = field.parent_metrics_or(self, file_metrics);
302
303            // Calculate percentages
304            writeln!(
305                writer,
306                "{}{}: {:.2} bit entropy, {} LZ 3 Byte matches ({:.2}%)",
307                indent,
308                field.name,
309                field.entropy,
310                field.lz_matches,
311                calculate_percentage(field.lz_matches as f64, parent_stats.lz_matches as f64)
312            )?;
313            let padding = format!("{}{}", indent, field.name).len() + 2; // +2 for ": "
314            writeln!(
315                writer,
316                "{:padding$}Sizes: ZStandard/Original: {}/{} ({:.2}%/{:.2}%)",
317                "",
318                field.zstd_size,
319                field.original_size,
320                calculate_percentage(field.zstd_size as f64, parent_stats.zstd_size as f64),
321                calculate_percentage(
322                    field.original_size as f64,
323                    parent_stats.original_size as f64
324                )
325            )?;
326            writeln!(
327                writer,
328                "{:padding$}{} bit, {} unique values, {:?}",
329                "",
330                field.lenbits,
331                field.value_counts.len(),
332                field.bit_order
333            )?;
334        }
335        
336        Ok(())
337    }
338
339    fn print_concise<W: Write>(
340        &self,
341        writer: &mut W,
342        schema: &Schema,
343        file_metrics: &FieldMetrics,
344        skip_misc_stats: bool,
345    ) -> io::Result<()> {
346        writeln!(writer, "Schema: {}", self.schema_metadata.name)?;
347        writeln!(
348            writer,
349            "File: {:.2}bpb, {} LZ, {}/{} ({:.2}%/{:.2}%) (zstd/orig)",
350            self.file_entropy,
351            self.file_lz_matches,
352            self.zstd_file_size,
353            self.original_size,
354            calculate_percentage(self.zstd_file_size as f64, self.original_size as f64),
355            100.0
356        )?;
357
358        writeln!(writer, "\nField Metrics:")?;
359        for field_path in schema.ordered_field_and_group_paths() {
360            self.concise_print_field(writer, file_metrics, &field_path)?;
361        }
362
363        writeln!(writer, "\nSplit Group Comparisons:")?;
364        for comparison in &self.split_comparisons {
365            concise_print_split_comparison(writer, comparison)?;
366        }
367
368        writeln!(writer, "\nCustom Group Comparisons:")?;
369        for comparison in &self.custom_comparisons {
370            concise_print_custom_comparison(writer, comparison)?;
371        }
372
373        if !skip_misc_stats {
374            writeln!(writer, "\nField Value Stats: [as `value: probability %`]")?;
375            for field_path in schema.ordered_field_and_group_paths() {
376                self.concise_print_field_value_stats(writer, &field_path)?;
377            }
378
379            writeln!(writer, "\nField Bit Stats: [as `(zeros/ones) (percentage %)`]")?;
380            for field_path in schema.ordered_field_and_group_paths() {
381                self.concise_print_field_bit_stats(writer, &field_path)?;
382            }
383        }
384        
385        Ok(())
386    }
387
388    fn concise_print_field<W: Write>(
389        &self,
390        writer: &mut W,
391        file_metrics: &FieldMetrics,
392        field_path: &str,
393    ) -> io::Result<()> {
394        if let Some(field) = self.per_field.get(field_path) {
395            let indent = "  ".repeat(field.depth);
396            let parent_stats = field.parent_metrics_or(self, file_metrics);
397
398            writeln!(
399                writer,
400                "{}{}: {:.2}bpb, {} LZ ({:.2}%), {}/{} ({:.2}%/{:.2}%) (zstd/orig), {}bit",
401                indent,
402                field.name,
403                field.entropy,
404                field.lz_matches,
405                calculate_percentage(field.lz_matches as f64, parent_stats.lz_matches as f64),
406                field.zstd_size,
407                field.original_size,
408                calculate_percentage(field.zstd_size as f64, parent_stats.zstd_size as f64),
409                calculate_percentage(
410                    field.original_size as f64,
411                    parent_stats.original_size as f64
412                ),
413                field.lenbits
414            )?;
415        }
416        
417        Ok(())
418    }
419
420    fn concise_print_field_value_stats<W: Write>(
421        &self,
422        writer: &mut W,
423        field_path: &str,
424    ) -> io::Result<()> {
425        if let Some(field) = self.per_field.get(field_path) {
426            print_field_metrics_value_stats(writer, field)?;
427        }
428        
429        Ok(())
430    }
431
432    fn concise_print_field_bit_stats<W: Write>(
433        &self,
434        writer: &mut W,
435        field_path: &str,
436    ) -> io::Result<()> {
437        if let Some(field) = self.per_field.get(field_path) {
438            print_field_metrics_bit_stats(writer, field)?;
439        }
440        
441        Ok(())
442    }
443}
444
445fn detailed_print_comparison<W: Write>(
446    writer: &mut W,
447    comparison: &SplitComparisonResult,
448) -> io::Result<()> {
449    concise_print_split_comparison(writer, comparison)
450}
451
452fn concise_print_custom_comparison<W: Write>(
453    writer: &mut W,
454    comparison: &GroupComparisonResult,
455) -> io::Result<()> {
456    let base_lz = comparison.baseline_metrics.lz_matches;
457    let base_entropy = comparison.baseline_metrics.entropy;
458    let base_zstd = comparison.baseline_metrics.zstd_size;
459    let base_estimated = comparison.baseline_metrics.estimated_size;
460    let base_size = comparison.baseline_metrics.original_size;
461
462    writeln!(writer, "  {}: {}", comparison.name, comparison.description)?;
463    writeln!(writer, "    Base Group:")?;
464    writeln!(writer, "      Size: {}", base_size)?;
465    writeln!(writer, "      LZ, Entropy: ({}, {:.2})", base_lz, base_entropy)?;
466    if base_estimated != 0 {
467        writeln!(writer, "      Estimate/Zstd: {}/{}", base_estimated, base_zstd)?;
468    } else {
469        writeln!(writer, "      Zstd: {}", base_zstd)?;
470    }
471
472    for (i, (group_name, metrics)) in comparison
473        .group_names
474        .iter()
475        .zip(&comparison.group_metrics)
476        .enumerate()
477    {
478        let comp_lz = metrics.lz_matches;
479        let comp_entropy = metrics.entropy;
480        let comp_zstd = metrics.zstd_size;
481        let comp_estimated = metrics.estimated_size;
482        let comp_size = metrics.original_size;
483
484        let ratio_zstd = calculate_percentage(comp_zstd as f64, base_zstd as f64);
485        let diff_zstd = comparison.differences[i].zstd_size;
486
487        writeln!(writer, "\n    {} Group:", group_name)?;
488        writeln!(writer, "      Size: {}", comp_size)?;
489        writeln!(writer, "      LZ, Entropy: ({}, {:.2})", comp_lz, comp_entropy)?;
490        if comp_estimated != 0 {
491            writeln!(writer, "      Estimate/Zstd: {}/{}", comp_zstd, comp_estimated)?;
492        } else {
493            writeln!(writer, "      Zstd: {}", comp_zstd)?;
494        }
495        writeln!(writer, "      Ratio zstd: {:.1}%", ratio_zstd)?;
496        writeln!(writer, "      Diff zstd: {}", diff_zstd)?;
497
498        if base_size != comp_size {
499            writeln!(writer, "      [WARNING!!] Sizes of base and comparison groups don't match!! They may vary by a few bytes due to padding.")?;
500            writeln!(writer, "      [WARNING!!] However if they vary extremely, your groups may be incorrect. base: {}, {}: {}", base_size, group_name, comp_size)?;
501        }
502    }
503    
504    Ok(())
505}
506
507fn concise_print_split_comparison<W: Write>(
508    writer: &mut W,
509    comparison: &SplitComparisonResult,
510) -> io::Result<()> {
511    let base_lz = comparison.group1_metrics.lz_matches;
512    let size_orig = comparison.group1_metrics.original_size;
513    let size_comp = comparison.group2_metrics.original_size;
514    let base_entropy = comparison.group1_metrics.entropy;
515
516    let base_zstd = comparison.group1_metrics.zstd_size;
517    let base_estimated = comparison.group1_metrics.estimated_size;
518
519    let comp_lz = comparison.group2_metrics.lz_matches;
520    let comp_entropy = comparison.group2_metrics.entropy;
521
522    let comp_zstd = comparison.group2_metrics.zstd_size;
523    let comp_estimated = comparison.group2_metrics.estimated_size;
524    let ratio_zstd = calculate_percentage(comp_zstd as f64, base_zstd as f64);
525    let diff_zstd = comparison.difference.zstd_size;
526
527    writeln!(writer, "  {}: {}", comparison.name, comparison.description)?;
528    writeln!(writer, "    Original Size: {}", size_orig)?;
529    writeln!(writer, "    Base LZ, Entropy: ({}, {:.2}):", base_lz, base_entropy)?;
530    writeln!(writer, "    Comp LZ, Entropy: ({}, {:.2}):", comp_lz, comp_entropy)?;
531    writeln!(
532        writer,
533        "    Base Group LZ, Entropy: ({:?}, {:?})",
534        comparison
535            .baseline_comparison_metrics
536            .iter()
537            .map(|m| m.lz_matches)
538            .collect::<Vec<_>>(),
539        comparison
540            .baseline_comparison_metrics
541            .iter()
542            .map(|m| format!("{:.2}", m.entropy))
543            .collect::<Vec<_>>()
544    )?;
545    writeln!(
546        writer,
547        "    Comp Group LZ, Entropy: ({:?}, {:?})",
548        comparison
549            .split_comparison_metrics
550            .iter()
551            .map(|m| m.lz_matches)
552            .collect::<Vec<_>>(),
553        comparison
554            .split_comparison_metrics
555            .iter()
556            .map(|m| format!("{:.2}", m.entropy))
557            .collect::<Vec<_>>()
558    )?;
559
560    if base_estimated != 0 {
561        writeln!(writer, "    Base (est/zstd): {}/{}", base_estimated, base_zstd)?;
562    } else {
563        writeln!(writer, "    Base (zstd): {}", base_zstd)?;
564    }
565
566    if comp_estimated != 0 {
567        writeln!(writer, "    Comp (est/zstd): {}/{}", comp_estimated, comp_zstd)?;
568    } else {
569        writeln!(writer, "    Comp (zstd): {}", comp_zstd)?;
570    }
571
572    writeln!(writer, "    Ratio (zstd): {}", ratio_zstd)?;
573    writeln!(writer, "    Diff (zstd): {}", diff_zstd)?;
574
575    if size_orig != size_comp {
576        writeln!(writer, "    [WARNING!!] Sizes of both groups in bytes don't match!! They may vary by a few bytes due to padding.")?;
577        writeln!(writer, "    [WARNING!!] However if they vary extremely, your groups may be incorrect. group1: {}, group2: {}", size_orig, size_comp)?;
578    }
579    
580    Ok(())
581}