struct_compression_analyzer/results/mod.rs
//! Analyzes and processes final analysis results for bit-packed data structures.
//!
//! This module handles the final stage of analysis, computing metrics and statistics
//! from processed bit-packed data. It provides comprehensive analysis capabilities
//! including entropy calculations, LZ compression analysis, and field-level statistics.
//!
//! # Core Types
//!
//! - [`AnalysisResults`]: Top-level container for all analysis results
//! - [`FieldMetrics`]: Detailed metrics for individual fields
//! - [`PrintFormat`]: Output formatting options for result presentation
//!
//! # Key Features
//!
//! - Field-level and file-level entropy analysis
//! - LZ compression match detection
//! - Size estimation and actual compression metrics
//! - Bit distribution statistics
//! - Value frequency analysis
//! - Split comparison results
//!
//! # Public APIs
//!
//! Key types and functions for users of this module:
//!
//! ## Types
//!
//! - [`AnalysisResults`]: Primary container for analysis output
//! - [`AnalysisResults::print()`]: Display results in console
//! - [`AnalysisResults::as_field_metrics()`]: Convert file statistics to field metrics
//!
//! - [`MergedAnalysisResults`]: Specialization of analysis results for aggregating multiple files
//! - [`MergedAnalysisResults::from_results()`]: Create from multiple analysis results
//! - [`MergedAnalysisResults::print()`]: Display merged results
//! - [`MergedAnalysisResults::as_field_metrics()`]: Convert file statistics to field metrics
//!
//! - [`FieldMetrics`]: Per-field analysis data
//! - [`FieldMetrics::parent_path()`]: Get path of parent field
//! - [`FieldMetrics::parent_metrics_or()`]: Get metrics of parent field
//! - [`FieldMetrics::sorted_value_counts()`]: Get sorted value frequencies
//!
//! ## Functions
//!
//! - [`compute_analysis_results()`]: Generate analysis from [`SchemaAnalyzer`]
//!
//! # Example
//! ```no_run
//! use struct_compression_analyzer::{analyzer::SchemaAnalyzer, schema::Schema};
//! use struct_compression_analyzer::results::analysis_results::AnalysisResults;
//! use struct_compression_analyzer::analyzer::CompressionOptions;
//!
//! fn analyze_data(schema: &Schema, data: &[u8]) -> AnalysisResults {
//!     let options = CompressionOptions::default();
//!     let mut analyzer = SchemaAnalyzer::new(schema, options);
//!     analyzer.add_entry(data);
//!     analyzer.generate_results().unwrap()
//! }
//! ```
//!
//! # Output Formats
//!
//! Results can be displayed in two console formats, as sketched below:
//!
//! - [`Detailed`]: Comprehensive analysis with full metrics
//! - [`Concise`]: Condensed summary of key statistics
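//!
//! Selecting a format amounts to passing the desired [`PrintFormat`] variant to
//! the print routine. The exact `print()` signature is not shown in this module,
//! so the call below is a hypothetical sketch only:
//!
//! ```ignore
//! use struct_compression_analyzer::results::PrintFormat;
//!
//! // `results` is an `AnalysisResults`; the argument list of `print()` is assumed.
//! results.print(PrintFormat::Concise);
//! ```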
//!
//! Groups of results (from multiple files) can also be output via the
//! following modules:
//!
//! - [`CSV`]: CSV representation of the results, for export to spreadsheets.
//! - [`Plot`]: Generate plots of the results.
//!
//! # Field Metrics
//!
//! For each field, the analysis computes:
//!
//! - Shannon entropy in bits
//! - LZ compression matches
//! - Bit-level distribution
//! - Value frequency counts
//! - Size metrics (original, zstd-compressed, and estimated sizes)
//!
//! Fields can be analyzed individually or merged for group analysis.
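//!
//! A minimal sketch of merging per-file results into one summary; the exact
//! [`MergedAnalysisResults::from_results()`] signature is assumed here, so treat
//! this as illustrative rather than definitive:
//!
//! ```ignore
//! use struct_compression_analyzer::results::merged_analysis_results::MergedAnalysisResults;
//!
//! // `file_results` is a Vec<AnalysisResults>, one entry per analyzed file.
//! let merged = MergedAnalysisResults::from_results(&file_results).unwrap();
//! // Hypothetical call; the real `print()` signature may differ.
//! merged.print();
//! ```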
//!
//! # Implementation Notes
//!
//! - Handles both MSB and LSB bit ordering
//! - Supports nested field hierarchies
//! - Provides parent/child relationship tracking
//! - Implements efficient metric merging for group analysis
//!
//! [`AnalysisResults`]: crate::results::analysis_results::AnalysisResults
//! [`FieldMetrics`]: crate::results::FieldMetrics
//! [`PrintFormat`]: crate::results::PrintFormat
//! [`Detailed`]: crate::results::PrintFormat::Detailed
//! [`Concise`]: crate::results::PrintFormat::Concise
//! [`CSV`]: crate::csv
//! [`Plot`]: crate::plot
//! [`SchemaAnalyzer`]: crate::analyzer::SchemaAnalyzer
//! [`compute_analysis_results()`]: crate::results::analysis_results::compute_analysis_results
//! [`MergedAnalysisResults`]: crate::results::merged_analysis_results::MergedAnalysisResults
//! [`MergedAnalysisResults::from_results()`]: crate::results::merged_analysis_results::MergedAnalysisResults::from_results
//! [`MergedAnalysisResults::print()`]: crate::results::merged_analysis_results::MergedAnalysisResults::print
//! [`MergedAnalysisResults::as_field_metrics()`]: crate::results::merged_analysis_results::MergedAnalysisResults::as_field_metrics

pub mod analysis_results;
pub mod merged_analysis_results;

use crate::analyzer::BitStats;
use crate::comparison::compare_groups::GroupComparisonError;
use crate::results::analysis_results::AnalysisResults;
use crate::schema::BitOrder;
use crate::utils::constants::CHILD_MARKER;
use derive_more::FromStr;
use merged_analysis_results::MergedAnalysisResults;
use rustc_hash::FxHashMap;
use std::io::{self, Write};
use thiserror::Error;

/// Error type for when merging analysis results fails.
#[derive(Debug, Error)]
pub enum AnalysisMergeError {
    #[error(
        "Number of bit counts did not match while merging `bit_counts`.
This indicates inconsistent input data, or merging of results that were computed differently."
    )]
    BitCountsDontMatch,

    #[error("Field length mismatch: {0} != {1}. This indicates inconsistent, different or incorrect input data.")]
    FieldLengthMismatch(u32, u32),
}

/// Error type for failures that occur while computing the final analysis results.
#[derive(Debug, Error)]
pub enum ComputeAnalysisResultsError {
    #[error(transparent)]
    GroupComparisonError(#[from] GroupComparisonError),
}

/// Complete analysis metrics for a single field
#[derive(Clone, Default)]
pub struct FieldMetrics {
    /// Name of the field or group
    pub name: String,
    /// Full path to the field or group
    pub full_path: String,
    /// The depth of the field in the group/field chain.
    pub depth: usize,
    /// Total number of observed values
    pub count: u64,
    /// Length of the field or group in bits.
    pub lenbits: u32,
    /// Shannon entropy in bits
    pub entropy: f64,
    /// LZ compression matches in the field
    pub lz_matches: u64,
    /// Bit-level statistics. The index into the vector is the bit offset.
    pub bit_counts: Vec<BitStats>,
    /// The order of the bits within the field
    pub bit_order: BitOrder,
    /// Count of occurrences for each observed value (value → count).
    pub value_counts: FxHashMap<u64, u64>,
    /// Actual size of the data when compressed with Zstandard
    pub zstd_size: u64,
    /// Original size of the data before compression
    pub original_size: u64,
}

impl FieldMetrics {
    /// Merge multiple [`FieldMetrics`] objects into one.
    /// This gives you an 'aggregate' result over a large data set.
    ///
    /// # Arguments
    ///
    /// * `items` - The items to merge into a new instance.
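    ///
    /// # Example
    ///
    /// A small, self-contained sketch; counts are summed, while entropy and the
    /// size metrics are averaged across the merged items:
    ///
    /// ```
    /// use struct_compression_analyzer::results::FieldMetrics;
    ///
    /// let a = FieldMetrics { count: 10, lenbits: 4, entropy: 2.0, ..Default::default() };
    /// let b = FieldMetrics { count: 20, lenbits: 4, entropy: 3.0, ..Default::default() };
    ///
    /// let merged = FieldMetrics::try_merge_many(&[&a, &b]).unwrap();
    /// assert_eq!(merged.count, 30);    // counts are summed
    /// assert_eq!(merged.entropy, 2.5); // entropy is averaged
    /// ```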
    pub fn try_merge_many(items: &[&Self]) -> Result<FieldMetrics, AnalysisMergeError> {
        if items.is_empty() {
            return Ok(FieldMetrics::default());
        }

        let first = items[0];

        // Validate compatible field configurations
        for other in items {
            if first.lenbits != other.lenbits {
                return Err(AnalysisMergeError::FieldLengthMismatch(
                    first.lenbits,
                    other.lenbits,
                ));
            }
        }

        // Sum the counts, and average the remaining metrics over all items.
        let total_items = items.len();
        let mut total_count = 0;
        let mut total_entropy = 0.0;
        let mut total_lz_matches = 0;
        let mut total_zstd_size = 0;
        let mut total_original_size = 0;

        for metrics in items {
            total_count += metrics.count;
            total_entropy += metrics.entropy;
            total_lz_matches += metrics.lz_matches;
            total_zstd_size += metrics.zstd_size;
            total_original_size += metrics.original_size;
        }

        let mut this = FieldMetrics {
            name: first.name.clone(),
            full_path: first.full_path.clone(),
            depth: first.depth,
            lenbits: first.lenbits,
            bit_order: first.bit_order,
            ..Default::default()
        };
        this.count = total_count;
        this.entropy = total_entropy / total_items as f64;
        this.lz_matches = total_lz_matches / total_items as u64;
        this.zstd_size = total_zstd_size / total_items as u64;
        this.original_size = total_original_size / total_items as u64;
        this.merge_bit_stats_and_value_counts(items)?;
        Ok(this)
    }

    fn merge_bit_stats_and_value_counts(
        &mut self,
        items: &[&Self],
    ) -> Result<(), AnalysisMergeError> {
        // Seed the totals with the first item, then fold in the remaining items.
        // The first item is skipped below so that it is not counted twice.
        let mut bit_counts = items[0].bit_counts.clone();
        let mut value_counts = items[0].value_counts.clone();

        for other in &items[1..] {
            // Validate bit counts length
            if bit_counts.len() != other.bit_counts.len() {
                return Err(AnalysisMergeError::BitCountsDontMatch);
            }

            for (bit_offset, bit_stats) in other.bit_counts.iter().enumerate() {
                let current = bit_counts
                    .get_mut(bit_offset)
                    .ok_or(AnalysisMergeError::BitCountsDontMatch)?;
                current.ones += bit_stats.ones;
                current.zeros += bit_stats.zeros;
            }

            // Add value counts from others into self
            for (value, count) in &other.value_counts {
                *value_counts.entry(*value).or_insert(0) += count;
            }
        }

        self.bit_counts = bit_counts;
        self.value_counts = value_counts;
        Ok(())
    }

    /// Returns the parent path of the current field.
    /// The parent path is the part of the full path before the last child
    /// separator (`CHILD_MARKER`).
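    ///
    /// # Example
    ///
    /// A sketch assuming the child separator (`CHILD_MARKER`) is `.`:
    ///
    /// ```ignore
    /// let field = FieldMetrics {
    ///     full_path: "header.flags".to_string(),
    ///     ..Default::default()
    /// };
    /// assert_eq!(field.parent_path(), Some("header"));
    /// ```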
    pub fn parent_path(&self) -> Option<&str> {
        self.full_path.rsplit_once(CHILD_MARKER).map(|(p, _)| p)
    }

    /// Returns the [`FieldMetrics`] object for the parent of the current field,
    /// or `optb` if the parent cannot be found in `results`.
    pub fn parent_metrics_or<'a>(
        &self,
        results: &'a AnalysisResults,
        optb: &'a FieldMetrics,
    ) -> &'a FieldMetrics {
        self.parent_path()
            .and_then(|p| results.per_field.get(p))
            .unwrap_or(optb)
    }

    /// Returns the [`FieldMetrics`] object for the parent of the current field in a
    /// merged result, or `optb` if the parent cannot be found in `results`.
    pub fn parent_metrics_in_merged_or<'a>(
        &self,
        results: &'a MergedAnalysisResults,
        optb: &'a FieldMetrics,
    ) -> &'a FieldMetrics {
        self.parent_path()
            .and_then(|p| results.per_field.get(p))
            .unwrap_or(optb)
    }

    /// Get value counts sorted by count in descending order, as `(value, count)` pairs.
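    ///
    /// # Example
    ///
    /// A small sketch showing the descending sort by occurrence count:
    ///
    /// ```
    /// use struct_compression_analyzer::results::FieldMetrics;
    ///
    /// let mut field = FieldMetrics::default();
    /// field.value_counts.insert(7, 100);
    /// field.value_counts.insert(3, 25);
    ///
    /// let sorted = field.sorted_value_counts();
    /// assert_eq!(sorted[0], (&7, &100)); // most frequent value first
    /// ```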
    pub fn sorted_value_counts(&self) -> Vec<(&u64, &u64)> {
        let mut counts: Vec<_> = self.value_counts.iter().collect();
        counts.sort_by(|a, b| b.1.cmp(a.1));
        counts
    }
}

/// Output format used when printing analysis results to the console.
#[derive(Debug, Clone, Copy, Default, FromStr)]
pub enum PrintFormat {
    /// Comprehensive analysis with full metrics.
    #[default]
    Detailed,
    /// Condensed summary of the key statistics.
    Concise,
}

/// Calculates what percentage `child` is of `parent`, returning 0.0 when `parent` is zero.
pub(crate) fn calculate_percentage(child: f64, parent: f64) -> f64 {
    if parent == 0.0 {
        0.0
    } else {
        (child / parent) * 100.0
    }
}

/// Writes the field header and up to the top 5 most frequent values of `field`,
/// with their relative frequencies, to `writer`.
pub(crate) fn print_field_metrics_value_stats<W: Write>(
    writer: &mut W,
    field: &FieldMetrics,
) -> io::Result<()> {
    // Print field name with indent
    let indent = " ".repeat(field.depth);
    writeln!(writer, "{}{} ({} bits)", indent, field.name, field.lenbits)?;

    // Print value statistics
    let counts = field.sorted_value_counts();
    if !counts.is_empty() {
        let total_values: u64 = counts.iter().map(|(_, &c)| c).sum();
        for (val, &count) in counts.iter().take(5) {
            let pct = (count as f32 / total_values as f32) * 100.0;
            writeln!(writer, "{} {}: {:.1}%", indent, val, pct)?;
        }
    }

    Ok(())
}

/// Writes per-bit zero/one counts for `field` to `writer`, along with the
/// percentage of times each bit was set.
pub(crate) fn print_field_metrics_bit_stats<W: Write>(
    writer: &mut W,
    field: &FieldMetrics,
) -> io::Result<()> {
    let indent = " ".repeat(field.depth);
    writeln!(writer, "{}{} ({} bits)", indent, field.name, field.lenbits)?;

    // If we didn't collect the bits, skip printing.
    if field.bit_counts.len() != field.lenbits as usize {
        return Ok(());
    }

    for i in 0..field.lenbits {
        let bit_stats = &field.bit_counts[i as usize];
        let total = bit_stats.zeros + bit_stats.ones;
        let percentage = if total > 0 {
            (bit_stats.ones as f64 / total as f64) * 100.0
        } else {
            0.0
        };
        writeln!(
            writer,
            "{} Bit {}: ({}/{}) ({:.1}%)",
            indent, i, bit_stats.zeros, bit_stats.ones, percentage
        )?;
    }

    Ok(())
}