struct_compression_analyzer/results/
mod.rs

1//! Analyzes and processes final analysis results for bit-packed data structures.
2//!
3//! This module handles the final stage of analysis, computing metrics and statistics
4//! from processed bit-packed data. It provides comprehensive analysis capabilities
5//! including entropy calculations, LZ compression analysis, and field-level statistics.
6//!
7//! # Core Types
8//!
9//! - [`AnalysisResults`]: Top-level container for all analysis results
10//! - [`FieldMetrics`]: Detailed metrics for individual fields
11//! - [`PrintFormat`]: Output formatting options for result presentation
12//!
13//! # Key Features
14//!
15//! - Field-level and file-level entropy analysis
16//! - LZ compression match detection
17//! - Size estimation and actual compression metrics
18//! - Bit distribution statistics
19//! - Value frequency analysis
20//! - Split comparison results
21//!
22//! # Public APIs
23//!
24//! Key types and functions for users of this module:
25//!
26//! ## Types
27//!
28//! - [`AnalysisResults`]: Primary container for analysis output
29//!   - [`AnalysisResults::print()`]: Display results in console
30//!   - [`AnalysisResults::as_field_metrics()`]: Convert file statistics to field metrics
31//!
32//! - [`MergedAnalysisResults`]: Specialization of analysis results for aggregating multiple files
33//!   - [`MergedAnalysisResults::from_results()`]: Create from multiple analysis results
34//!   - [`MergedAnalysisResults::print()`]: Display merged results
35//!   - [`MergedAnalysisResults::as_field_metrics()`]: Convert file statistics to field metrics
36//!
37//! - [`FieldMetrics`]: Per-field analysis data
38//!   - [`FieldMetrics::parent_path()`]: Get path of parent field
39//!   - [`FieldMetrics::parent_metrics_or()`]: Get metrics of parent field
40//!   - [`FieldMetrics::sorted_value_counts()`]: Get sorted value frequencies
41//!
42//! ## Functions
43//!
44//! - [`compute_analysis_results()`]: Generate analysis from [`SchemaAnalyzer`]
45//!
46//! # Example
47//!
48//! ```no_run
49//! use struct_compression_analyzer::{analyzer::SchemaAnalyzer, schema::Schema};
50//! use struct_compression_analyzer::results::analysis_results::AnalysisResults;
51//! use struct_compression_analyzer::analyzer::CompressionOptions;
52//!
53//! fn analyze_data(schema: &Schema, data: &[u8]) -> AnalysisResults {
54//!     let options = CompressionOptions::default();
55//!     let mut analyzer = SchemaAnalyzer::new(schema, options);
56//!     analyzer.add_entry(data);
57//!     analyzer.generate_results().unwrap()
58//! }
59//! ```
60//!
61//! # Output Formats
62//!
63//! Results can be displayed in two formats (console):
64//!
65//! - [`Detailed`]: Comprehensive analysis with full metrics
66//! - [`Concise`]: Condensed summary of key statistics
67//!
68//! Groups of results (multiple files) can also be displayed via one of the
69//! other modules.
70//!
71//! - [`CSV`]: CSV representation of results. Export to spreadsheets.
72//! - [`Plot`]: Generate plots of results.
73//!
74//! # Field Metrics
75//!
76//! For each field, the analysis computes:
77//!
78//! - Shannon entropy in bits
79//! - LZ compression matches
80//! - Bit-level distribution
81//! - Value frequency counts
82//! - Size estimates (original, compressed, estimated)
83//!
84//! Fields can be analyzed individually or merged for group analysis.
85//!
86//! # Implementation Notes
87//!
88//! - Handles both MSB and LSB bit ordering
89//! - Supports nested field hierarchies
90//! - Provides parent/child relationship tracking
91//! - Implements efficient metric merging for group analysis
92//!
93//! [`AnalysisResults`]: crate::results::analysis_results::AnalysisResults
94//! [`FieldMetrics`]: crate::results::FieldMetrics
95//! [`PrintFormat`]: crate::results::PrintFormat
96//! [`Detailed`]: crate::results::PrintFormat::Detailed
97//! [`Concise`]: crate::results::PrintFormat::Concise
98//! [`CSV`]: crate::csv
99//! [`Plot`]: crate::plot
100//! [`SchemaAnalyzer`]: crate::analyzer::SchemaAnalyzer
101//! [`compute_analysis_results()`]: crate::results::analysis_results::compute_analysis_results
102//! [`MergedAnalysisResults`]: crate::results::merged_analysis_results::MergedAnalysisResults
103//! [`MergedAnalysisResults::from_results()`]: crate::results::merged_analysis_results::MergedAnalysisResults::from_results
104//! [`MergedAnalysisResults::print()`]: crate::results::merged_analysis_results::MergedAnalysisResults::print
105//! [`MergedAnalysisResults::as_field_metrics()`]: crate::results::merged_analysis_results::MergedAnalysisResults::as_field_metrics
106
107pub mod analysis_results;
108pub mod merged_analysis_results;
109
110use crate::analyzer::BitStats;
111use crate::comparison::compare_groups::GroupComparisonError;
112use crate::results::analysis_results::AnalysisResults;
113use crate::schema::BitOrder;
114use crate::utils::constants::CHILD_MARKER;
115use derive_more::FromStr;
116use merged_analysis_results::MergedAnalysisResults;
117use rustc_hash::FxHashMap;
118use std::io::{self, Write};
119use thiserror::Error;
120
/// Error type for when merging analysis results fails.
///
/// Produced by [`FieldMetrics::try_merge_many`] when the inputs being merged
/// were not computed from compatible field configurations.
#[derive(Debug, Error)]
pub enum AnalysisMergeError {
    /// The `bit_counts` vectors of two merged metrics had different lengths.
    #[error(
        "Number of bit counts did not match while merging `bit_counts`.
This indicates inconsistent input data, or merging of results that were computed differently."
    )]
    BitCountsDontMatch,

    /// Two merged fields reported different bit lengths (`lenbits`).
    /// Payload is `(expected, actual)` in bits.
    #[error("Field length mismatch: {0} != {1}. This indicates inconsistent, different or incorrect input data.")]
    FieldLengthMismatch(u32, u32),
}
133
/// Error type for when something goes wrong when computing the final analysis results.
///
/// Currently this only wraps [`GroupComparisonError`]; the `#[from]` impl lets
/// callers propagate comparison failures with `?`.
#[derive(Debug, Error)]
pub enum ComputeAnalysisResultsError {
    /// A group comparison failed while computing the results.
    #[error(transparent)]
    GroupComparisonError(#[from] GroupComparisonError),
}
140
/// Complete analysis metrics for a single field
///
/// One instance is produced per schema field (or group); instances from
/// multiple files can be aggregated with [`FieldMetrics::try_merge_many`].
#[derive(Clone, Default)]
pub struct FieldMetrics {
    /// Name of the field or group
    pub name: String,
    /// Name of the full path to the field or group
    pub full_path: String,
    /// The depth of the field in the group/field chain.
    pub depth: usize,
    /// Total number of observed values
    pub count: u64,
    /// Length of the field or group in bits.
    pub lenbits: u32,
    /// Shannon entropy in bits
    pub entropy: f64,
    /// LZ compression matches in the field
    pub lz_matches: u64,
    /// Bit-level statistics. Index of tuple is bit offset.
    pub bit_counts: Vec<BitStats>,
    /// The order of the bits within the field
    pub bit_order: BitOrder,
    /// Value → occurrence count
    /// Count of occurrences for each observed value.
    pub value_counts: FxHashMap<u64, u64>,
    /// Actual size of the compressed data when compressed with zstandard
    pub zstd_size: u64,
    /// Original size of the data before compression
    pub original_size: u64,
}
170
171impl FieldMetrics {
172    /// Merge multiple [`FieldMetrics`] objects into one.
173    /// This gives you an 'aggregate' result over a large data set.
174    ///
175    /// # Arguments
176    ///
177    /// * `items` - The items to merge into a new instance.
178    pub fn try_merge_many(items: &[&Self]) -> Result<FieldMetrics, AnalysisMergeError> {
179        if items.is_empty() {
180            return Ok(FieldMetrics::default());
181        }
182
183        let first = items[0];
184
185        // Validate compatible field configurations
186        for other in items {
187            if first.lenbits != other.lenbits {
188                return Err(AnalysisMergeError::FieldLengthMismatch(
189                    first.lenbits,
190                    other.lenbits,
191                ));
192            }
193        }
194
195        // Average over all items
196        let total_items = items.len();
197        let mut total_count = 0;
198        let mut total_entropy = 0.0;
199        let mut total_lz_matches = 0;
200        let mut total_zstd_size = 0;
201        let mut total_original_size = 0;
202
203        for metrics in items {
204            total_count += metrics.count;
205            total_entropy += metrics.entropy;
206            total_lz_matches += metrics.lz_matches;
207            total_zstd_size += metrics.zstd_size;
208            total_original_size += metrics.original_size;
209        }
210
211        let mut this = FieldMetrics {
212            name: first.name.clone(),
213            full_path: first.full_path.clone(),
214            depth: first.depth,
215            lenbits: first.lenbits,
216            bit_order: first.bit_order,
217            ..Default::default()
218        };
219        this.count = total_count;
220        this.entropy = total_entropy / total_items as f64;
221        this.lz_matches = total_lz_matches / total_items as u64;
222        this.zstd_size = total_zstd_size / total_items as u64;
223        this.original_size = total_original_size / total_items as u64;
224        this.merge_bit_stats_and_value_counts(items)?;
225        Ok(this)
226    }
227
228    fn merge_bit_stats_and_value_counts(
229        &mut self,
230        items: &[&Self],
231    ) -> Result<(), AnalysisMergeError> {
232        let mut bit_counts = items[0].bit_counts.clone();
233        let mut value_counts = items[0].value_counts.clone();
234
235        for other in items {
236            // Validate bit counts length
237            if bit_counts.len() != other.bit_counts.len() {
238                return Err(AnalysisMergeError::BitCountsDontMatch);
239            }
240
241            for (bit_offset, bit_stats) in other.bit_counts.iter().enumerate() {
242                let current = bit_counts
243                    .get_mut(bit_offset)
244                    .ok_or(AnalysisMergeError::BitCountsDontMatch)?;
245                current.ones += bit_stats.ones;
246                current.zeros += bit_stats.zeros;
247            }
248
249            // Add value counts from others into self
250            for (value, count) in &other.value_counts {
251                *value_counts.entry(*value).or_insert(0) += count;
252            }
253        }
254
255        self.bit_counts = bit_counts;
256        self.value_counts = value_counts;
257        Ok(())
258    }
259
260    /// Returns the parent path of the current field.
261    /// The parent path is the part of the full path before the last dot.
262    pub fn parent_path(&self) -> Option<&str> {
263        self.full_path.rsplit_once(CHILD_MARKER).map(|(p, _)| p)
264    }
265
266    /// Returns the [`FieldMetrics`] object for the parent of the current field.
267    /// Returns `None` if there is no parent.
268    pub fn parent_metrics_or<'a>(
269        &self,
270        results: &'a AnalysisResults,
271        optb: &'a FieldMetrics,
272    ) -> &'a FieldMetrics {
273        let parent_path = self.parent_path();
274        let parent_stats = parent_path
275            .and_then(|p| results.per_field.get(p))
276            .unwrap_or(optb);
277        parent_stats
278    }
279
280    /// Returns the [`FieldMetrics`] object for the parent of the current field in a merged result.
281    pub fn parent_metrics_in_merged_or<'a>(
282        &self,
283        results: &'a MergedAnalysisResults,
284        optb: &'a FieldMetrics,
285    ) -> &'a FieldMetrics {
286        let parent_path = self.parent_path();
287        let parent_stats = parent_path
288            .and_then(|p| results.per_field.get(p))
289            .unwrap_or(optb);
290        parent_stats
291    }
292
293    /// Get sorted value counts descending (value, count)
294    pub fn sorted_value_counts(&self) -> Vec<(&u64, &u64)> {
295        let mut counts: Vec<_> = self.value_counts.iter().collect();
296        counts.sort_by(|a, b| b.1.cmp(a.1));
297        counts
298    }
299}
300
/// Output format for printing analysis results to the console.
///
/// Parsed from strings via the `derive_more::FromStr` derive.
#[derive(Debug, Clone, Copy, Default, FromStr)]
pub enum PrintFormat {
    /// Comprehensive analysis with full per-field metrics (the default).
    #[default]
    Detailed,
    /// Condensed summary of key statistics.
    Concise,
}
307
/// Returns `child` as a percentage of `parent` (`child / parent * 100`).
/// A zero `parent` yields `0.0` rather than dividing by zero.
pub(crate) fn calculate_percentage(child: f64, parent: f64) -> f64 {
    match parent {
        p if p == 0.0 => 0.0,
        p => (child / p) * 100.0,
    }
}
316
317pub(crate) fn print_field_metrics_value_stats<W: Write>(
318    writer: &mut W,
319    field: &FieldMetrics,
320) -> io::Result<()> {
321    // Print field name with indent
322    let indent = "  ".repeat(field.depth);
323    writeln!(writer, "{}{} ({} bits)", indent, field.name, field.lenbits)?;
324
325    // Print value statistics
326    let counts = field.sorted_value_counts();
327    if !counts.is_empty() {
328        let total_values: u64 = counts.iter().map(|(_, &c)| c).sum();
329        for (val, &count) in counts.iter().take(5) {
330            let pct = (count as f32 / total_values as f32) * 100.0;
331            writeln!(writer, "{}    {}: {:.1}%", indent, val, pct)?;
332        }
333    }
334
335    Ok(())
336}
337
338pub(crate) fn print_field_metrics_bit_stats<W: Write>(
339    writer: &mut W,
340    field: &FieldMetrics,
341) -> io::Result<()> {
342    let indent = "  ".repeat(field.depth);
343    writeln!(writer, "{}{} ({} bits)", indent, field.name, field.lenbits)?;
344
345    // If we didn't collect the bits, skip printing.
346    if field.bit_counts.len() != field.lenbits as usize {
347        return Ok(());
348    }
349
350    for i in 0..field.lenbits {
351        let bit_stats = &field.bit_counts[i as usize];
352        let total = bit_stats.zeros + bit_stats.ones;
353        let percentage = if total > 0 {
354            (bit_stats.ones as f64 / total as f64) * 100.0
355        } else {
356            0.0
357        };
358        writeln!(
359            writer,
360            "{}  Bit {}: ({}/{}) ({:.1}%)",
361            indent, i, bit_stats.zeros, bit_stats.ones, percentage
362        )?;
363    }
364
365    Ok(())
366}