struct_compression_analyzer/comparison/
split_comparison.rs1use super::{GroupComparisonMetrics, GroupDifference};
37use crate::{
38 analyzer::{CompressionOptions, SizeEstimationParameters},
39 results::FieldMetrics,
40 schema::CompressionEstimationParams,
41 utils::analyze_utils::{calculate_file_entropy, get_zstd_compressed_size},
42};
43use lossless_transform_utils::match_estimator::estimate_num_lz_matches_fast;
44
45#[allow(clippy::too_many_arguments)]
67pub fn make_split_comparison_result(
68 name: String,
69 description: String,
70 baseline_bytes: &[u8],
71 split_bytes: &[u8],
72 baseline_comparison_metrics: Vec<FieldComparisonMetrics>,
73 split_comparison_metrics: Vec<FieldComparisonMetrics>,
74 compression_options: CompressionOptions,
75 compression_estimation_group_1: Option<CompressionEstimationParams>,
76 compression_estimation_group_2: Option<CompressionEstimationParams>,
77) -> SplitComparisonResult {
78 let comp_est_1 = compression_estimation_group_1
79 .unwrap_or(CompressionEstimationParams::new(&compression_options));
80 let comp_est_2 = compression_estimation_group_2
81 .unwrap_or(CompressionEstimationParams::new(&compression_options));
82
83 let entropy1 = calculate_file_entropy(baseline_bytes);
85 let entropy2 = calculate_file_entropy(split_bytes);
86 let lz_matches1 = estimate_num_lz_matches_fast(baseline_bytes);
87 let lz_matches2 = estimate_num_lz_matches_fast(split_bytes);
88 let name_1 = format!("{}-1", name);
89 let name_2 = format!("{}-2", name);
90 let estimated_size_1 = (compression_options.size_estimator_fn)(SizeEstimationParameters {
91 name: &name_1,
92 data_len: baseline_bytes.len(),
93 data: Some(baseline_bytes),
94 num_lz_matches: lz_matches1,
95 entropy: entropy1,
96 lz_match_multiplier: comp_est_1.lz_match_multiplier,
97 entropy_multiplier: comp_est_1.entropy_multiplier,
98 });
99 let estimated_size_2 = (compression_options.size_estimator_fn)(SizeEstimationParameters {
100 name: &name_2,
101 data_len: split_bytes.len(),
102 data: Some(split_bytes),
103 num_lz_matches: lz_matches2,
104 entropy: entropy2,
105 lz_match_multiplier: comp_est_2.lz_match_multiplier,
106 entropy_multiplier: comp_est_2.entropy_multiplier,
107 });
108 let actual_size_1 =
109 get_zstd_compressed_size(baseline_bytes, compression_options.zstd_compression_level);
110 let actual_size_2 =
111 get_zstd_compressed_size(split_bytes, compression_options.zstd_compression_level);
112
113 let group1_metrics = GroupComparisonMetrics {
114 lz_matches: lz_matches1 as u64,
115 entropy: entropy1,
116 estimated_size: estimated_size_1 as u64,
117 zstd_size: actual_size_1,
118 original_size: baseline_bytes.len() as u64,
119 };
120
121 let group2_metrics = GroupComparisonMetrics {
122 lz_matches: lz_matches2 as u64,
123 entropy: entropy2,
124 estimated_size: estimated_size_2 as u64,
125 zstd_size: actual_size_2,
126 original_size: split_bytes.len() as u64,
127 };
128
129 SplitComparisonResult {
130 name,
131 description,
132 difference: GroupDifference::from_metrics(&group1_metrics, &group2_metrics),
133 group1_metrics,
134 group2_metrics,
135 baseline_comparison_metrics,
136 split_comparison_metrics,
137 }
138}
139
140#[derive(Clone, Default)]
142pub struct SplitComparisonResult {
143 pub name: String,
145 pub description: String,
147 pub group1_metrics: GroupComparisonMetrics,
149 pub group2_metrics: GroupComparisonMetrics,
151 pub difference: GroupDifference,
153 pub baseline_comparison_metrics: Vec<FieldComparisonMetrics>,
155 pub split_comparison_metrics: Vec<FieldComparisonMetrics>,
157}
158
159impl SplitComparisonResult {
161 pub fn baseline_max_entropy_diff_ratio(&self) -> f64 {
163 calculate_max_entropy_diff_ratio(&self.baseline_comparison_metrics)
164 }
165
166 pub fn baseline_max_entropy_diff(&self) -> f64 {
168 calculate_max_entropy_diff(&self.baseline_comparison_metrics)
169 }
170
171 pub fn split_max_entropy_diff(&self) -> f64 {
173 calculate_max_entropy_diff(&self.split_comparison_metrics)
174 }
175
176 pub fn split_max_entropy_diff_ratio(&self) -> f64 {
178 calculate_max_entropy_diff_ratio(&self.split_comparison_metrics)
179 }
180}
181
/// Per-field measurements used when comparing fields against each other.
#[derive(Debug, Default, Clone, Copy, PartialEq)]
pub struct FieldComparisonMetrics {
    /// LZ match count recorded for the field.
    pub lz_matches: u64,
    /// Entropy value recorded for the field.
    pub entropy: f64,
}
197
198impl From<FieldMetrics> for FieldComparisonMetrics {
200 fn from(value: FieldMetrics) -> Self {
201 Self {
202 entropy: value.entropy,
203 lz_matches: value.lz_matches,
204 }
205 }
206}
207
208pub(crate) fn calculate_max_entropy_diff(results: &[FieldComparisonMetrics]) -> f64 {
209 let entropy_values: Vec<f64> = results.iter().map(|m| m.entropy).collect();
210 if entropy_values.len() < 2 {
211 0.0
212 } else {
213 let max = entropy_values
214 .iter()
215 .max_by(|a, b| a.partial_cmp(b).unwrap())
216 .unwrap();
217 let min = entropy_values
218 .iter()
219 .min_by(|a, b| a.partial_cmp(b).unwrap())
220 .unwrap();
221 max - min
222 }
223}
224
225pub(crate) fn calculate_max_entropy_diff_ratio(results: &[FieldComparisonMetrics]) -> f64 {
226 let entropy_values: Vec<f64> = results.iter().map(|m| m.entropy).collect();
227 if entropy_values.len() < 2 {
228 0.0
229 } else {
230 let max = entropy_values
231 .iter()
232 .max_by(|a, b| a.partial_cmp(b).unwrap())
233 .unwrap();
234 let min = entropy_values
235 .iter()
236 .min_by(|a, b| a.partial_cmp(b).unwrap())
237 .unwrap();
238 if *min == 0.0 {
239 return 0.0;
240 }
241 max / min
242 }
243}