struct_compression_analyzer/comparison/
split_comparison.rs

1//! Analyzes compression efficiency of different field arrangements in bit-packed structures.
2//!
3//! Compares compression metrics between different field groupings, primarily focusing on
4//! interleaved vs. separated layouts (e.g., RGBRGBRGB vs. RRRGGGBBB).
5//!
6//! # Core Types
7//!
8//! - [`SplitComparisonResult`]: Results from comparing field arrangements
9//! - [`FieldComparisonMetrics`]: Field-level compression statistics
10//!
11//! # Example
12//!
13//! ```yaml
14//! split_groups:
15//!   - name: colors
16//!     group_1: [colors]                    # RGBRGBRGB
17//!     group_2: [color_r, color_g, color_b] # RRRGGGBBB
18//! ```
19//!
20//! Use [`make_split_comparison_result`] to generate comparison metrics for two field arrangements.
21//!
22//! Each comparison tracks:
23//! - Entropy and LZ matches (data redundancy measures)
24//! - Sizes (original, estimated compression, actual zstd compression)
25//!
26//! # Usage Notes
27//!
28//! - Ensure compared groups have equal total bits
29//! - Field ordering can significantly impact compression
30//! - zstd compression time dominates performance
31//!
32//! [`SplitComparisonResult`]: crate::comparison::split_comparison::SplitComparisonResult
33//! [`FieldComparisonMetrics`]: crate::comparison::split_comparison::FieldComparisonMetrics
34//! [`make_split_comparison_result`]: crate::comparison::split_comparison::make_split_comparison_result
35
36use super::{GroupComparisonMetrics, GroupDifference};
37use crate::{
38    analyzer::{CompressionOptions, SizeEstimationParameters},
39    results::FieldMetrics,
40    schema::CompressionEstimationParams,
41    utils::analyze_utils::{calculate_file_entropy, get_zstd_compressed_size},
42};
43use lossless_transform_utils::match_estimator::estimate_num_lz_matches_fast;
44
45/// Calculates the compression statistics of two splits (of the same data) and
46/// returns them as a [`SplitComparisonResult`] object. This can also be used for
47/// generic two-way compares.
48///
49/// This function aggregates the comparison results for individual fields and
50/// calculates overall statistics for the split comparison.
51///
52/// # Arguments
53///
54/// * `name` - The name of the group comparison.
55/// * `description` - A description of the group comparison.
56/// * `baseline_bytes` - The bytes of the baseline (original/reference) group.
57/// * `split_bytes` - The bytes of the second (comparison) group.
58/// * `baseline_comparison_metrics` - The metrics for the individual fields in the baseline (original/reference) group.
59/// * `split_comparison_metrics` - The metrics for the individual fields in the second (comparison) group.
60/// * `compression_options` - Compression options, zstd compression level, etc.
61///
62/// # Returns
63///
64/// A [`SplitComparisonResult`] struct containing the aggregated comparison results
65/// and overall statistics.
66#[allow(clippy::too_many_arguments)]
67pub fn make_split_comparison_result(
68    name: String,
69    description: String,
70    baseline_bytes: &[u8],
71    split_bytes: &[u8],
72    baseline_comparison_metrics: Vec<FieldComparisonMetrics>,
73    split_comparison_metrics: Vec<FieldComparisonMetrics>,
74    compression_options: CompressionOptions,
75    compression_estimation_group_1: Option<CompressionEstimationParams>,
76    compression_estimation_group_2: Option<CompressionEstimationParams>,
77) -> SplitComparisonResult {
78    let comp_est_1 = compression_estimation_group_1
79        .unwrap_or(CompressionEstimationParams::new(&compression_options));
80    let comp_est_2 = compression_estimation_group_2
81        .unwrap_or(CompressionEstimationParams::new(&compression_options));
82
83    // Calculate entropy and LZ matches for both group sets.
84    let entropy1 = calculate_file_entropy(baseline_bytes);
85    let entropy2 = calculate_file_entropy(split_bytes);
86    let lz_matches1 = estimate_num_lz_matches_fast(baseline_bytes);
87    let lz_matches2 = estimate_num_lz_matches_fast(split_bytes);
88    let name_1 = format!("{}-1", name);
89    let name_2 = format!("{}-2", name);
90    let estimated_size_1 = (compression_options.size_estimator_fn)(SizeEstimationParameters {
91        name: &name_1,
92        data_len: baseline_bytes.len(),
93        data: Some(baseline_bytes),
94        num_lz_matches: lz_matches1,
95        entropy: entropy1,
96        lz_match_multiplier: comp_est_1.lz_match_multiplier,
97        entropy_multiplier: comp_est_1.entropy_multiplier,
98    });
99    let estimated_size_2 = (compression_options.size_estimator_fn)(SizeEstimationParameters {
100        name: &name_2,
101        data_len: split_bytes.len(),
102        data: Some(split_bytes),
103        num_lz_matches: lz_matches2,
104        entropy: entropy2,
105        lz_match_multiplier: comp_est_2.lz_match_multiplier,
106        entropy_multiplier: comp_est_2.entropy_multiplier,
107    });
108    let actual_size_1 =
109        get_zstd_compressed_size(baseline_bytes, compression_options.zstd_compression_level);
110    let actual_size_2 =
111        get_zstd_compressed_size(split_bytes, compression_options.zstd_compression_level);
112
113    let group1_metrics = GroupComparisonMetrics {
114        lz_matches: lz_matches1 as u64,
115        entropy: entropy1,
116        estimated_size: estimated_size_1 as u64,
117        zstd_size: actual_size_1,
118        original_size: baseline_bytes.len() as u64,
119    };
120
121    let group2_metrics = GroupComparisonMetrics {
122        lz_matches: lz_matches2 as u64,
123        entropy: entropy2,
124        estimated_size: estimated_size_2 as u64,
125        zstd_size: actual_size_2,
126        original_size: split_bytes.len() as u64,
127    };
128
129    SplitComparisonResult {
130        name,
131        description,
132        difference: GroupDifference::from_metrics(&group1_metrics, &group2_metrics),
133        group1_metrics,
134        group2_metrics,
135        baseline_comparison_metrics,
136        split_comparison_metrics,
137    }
138}
139
140/// The result of comparing 2 arbitrary groups of fields based on the schema.
141#[derive(Clone, Default)]
142pub struct SplitComparisonResult {
143    /// The name of the group comparison. (Copied from schema)
144    pub name: String,
145    /// A description of the group comparison. (Copied from schema)
146    pub description: String,
147    /// The metrics for the first group.
148    pub group1_metrics: GroupComparisonMetrics,
149    /// The metrics for the second group.
150    pub group2_metrics: GroupComparisonMetrics,
151    /// Comparison between group 2 and group 1.
152    pub difference: GroupDifference,
153    /// The statistics for the individual fields of the baseline group.
154    pub baseline_comparison_metrics: Vec<FieldComparisonMetrics>,
155    /// The statistics for the individual fields of the split group.
156    pub split_comparison_metrics: Vec<FieldComparisonMetrics>,
157}
158
159/// Helper functions around [`SplitComparisonResult`]
160impl SplitComparisonResult {
161    /// Ratio between the max and min entropy of the baseline fields.
162    pub fn baseline_max_entropy_diff_ratio(&self) -> f64 {
163        calculate_max_entropy_diff_ratio(&self.baseline_comparison_metrics)
164    }
165
166    /// Maximum difference between the entropy of the baseline fields.
167    pub fn baseline_max_entropy_diff(&self) -> f64 {
168        calculate_max_entropy_diff(&self.baseline_comparison_metrics)
169    }
170
171    /// Maximum difference between the entropy of the split fields.
172    pub fn split_max_entropy_diff(&self) -> f64 {
173        calculate_max_entropy_diff(&self.split_comparison_metrics)
174    }
175
176    /// Ratio between the max and min entropy of the split fields.
177    pub fn split_max_entropy_diff_ratio(&self) -> f64 {
178        calculate_max_entropy_diff_ratio(&self.split_comparison_metrics)
179    }
180}
181
/// Statistics for a single field that contributed to a combined group or split.
///
/// One of these is kept per source field, which makes it possible to dump
/// extra per-field detail alongside the group-level comparison.
#[derive(Debug, Clone, Copy, PartialEq, Default)]
pub struct FieldComparisonMetrics {
    /// Number of LZ compression matches found in the field.
    pub lz_matches: u64,
    /// Shannon entropy of the field, in bits.
    pub entropy: f64,
}
197
198/// Converts a [`FieldMetrics`] object into a [`FieldComparisonMetrics`] object.
199impl From<FieldMetrics> for FieldComparisonMetrics {
200    fn from(value: FieldMetrics) -> Self {
201        Self {
202            entropy: value.entropy,
203            lz_matches: value.lz_matches,
204        }
205    }
206}
207
208pub(crate) fn calculate_max_entropy_diff(results: &[FieldComparisonMetrics]) -> f64 {
209    let entropy_values: Vec<f64> = results.iter().map(|m| m.entropy).collect();
210    if entropy_values.len() < 2 {
211        0.0
212    } else {
213        let max = entropy_values
214            .iter()
215            .max_by(|a, b| a.partial_cmp(b).unwrap())
216            .unwrap();
217        let min = entropy_values
218            .iter()
219            .min_by(|a, b| a.partial_cmp(b).unwrap())
220            .unwrap();
221        max - min
222    }
223}
224
225pub(crate) fn calculate_max_entropy_diff_ratio(results: &[FieldComparisonMetrics]) -> f64 {
226    let entropy_values: Vec<f64> = results.iter().map(|m| m.entropy).collect();
227    if entropy_values.len() < 2 {
228        0.0
229    } else {
230        let max = entropy_values
231            .iter()
232            .max_by(|a, b| a.partial_cmp(b).unwrap())
233            .unwrap();
234        let min = entropy_values
235            .iter()
236            .min_by(|a, b| a.partial_cmp(b).unwrap())
237            .unwrap();
238        if *min == 0.0 {
239            return 0.0;
240        }
241        max / min
242    }
243}