use super::{GroupComparisonMetrics, GroupDifference};
use crate::{
analyzer::{CompressionOptions, SizeEstimationParameters},
results::FieldMetrics,
schema::CompressionEstimationParams,
utils::analyze_utils::{calculate_file_entropy, get_zstd_compressed_size},
};
use lossless_transform_utils::match_estimator::estimate_num_lz_matches_fast;
/// Builds a [`SplitComparisonResult`] for a baseline byte group and a split byte group.
///
/// For each group the function computes the entropy (`calculate_file_entropy`), the
/// estimated number of LZ matches (`estimate_num_lz_matches_fast`), an estimated size
/// from `compression_options.size_estimator_fn`, and the size returned by
/// `get_zstd_compressed_size` at `compression_options.zstd_compression_level`.
/// The baseline metrics are stored as `group1_metrics`, the split metrics as
/// `group2_metrics`, and `difference` is derived from the two via
/// [`GroupDifference::from_metrics`].
///
/// # Arguments
///
/// * `name` - Name of the comparison. The size estimator receives `"{name}-1"` for the
///   baseline group and `"{name}-2"` for the split group.
/// * `description` - Description stored unchanged in the result.
/// * `baseline_bytes` - Bytes of the baseline (group 1) data.
/// * `split_bytes` - Bytes of the split (group 2) data.
/// * `baseline_comparison_metrics` - Per-field metrics of the baseline group, stored
///   unchanged in the result.
/// * `split_comparison_metrics` - Per-field metrics of the split group, stored unchanged
///   in the result.
/// * `compression_options` - Supplies the size estimator, the zstd compression level and
///   the fallback estimation parameters.
/// * `compression_estimation_group_1` - Estimation multipliers for group 1; when `None`,
///   they are built from `compression_options`.
/// * `compression_estimation_group_2` - Estimation multipliers for group 2; when `None`,
///   they are built from `compression_options`.
// Kept as a flat argument list so existing callers stay unchanged.
#[allow(clippy::too_many_arguments)]
pub fn make_split_comparison_result(
    name: String,
    description: String,
    baseline_bytes: &[u8],
    split_bytes: &[u8],
    baseline_comparison_metrics: Vec<FieldComparisonMetrics>,
    split_comparison_metrics: Vec<FieldComparisonMetrics>,
    compression_options: CompressionOptions,
    compression_estimation_group_1: Option<CompressionEstimationParams>,
    compression_estimation_group_2: Option<CompressionEstimationParams>,
) -> SplitComparisonResult {
    // Build the fallback parameters lazily: they are only needed when the caller did not
    // provide explicit ones for the group.
    let comp_est_1 = compression_estimation_group_1
        .unwrap_or_else(|| CompressionEstimationParams::new(&compression_options));
    let comp_est_2 = compression_estimation_group_2
        .unwrap_or_else(|| CompressionEstimationParams::new(&compression_options));

    // Raw statistics of both groups.
    let entropy1 = calculate_file_entropy(baseline_bytes);
    let entropy2 = calculate_file_entropy(split_bytes);
    let lz_matches1 = estimate_num_lz_matches_fast(baseline_bytes);
    let lz_matches2 = estimate_num_lz_matches_fast(split_bytes);

    // Size estimates; each group is labelled with its own suffix.
    let name_1 = format!("{}-1", name);
    let name_2 = format!("{}-2", name);
    let estimated_size_1 = (compression_options.size_estimator_fn)(SizeEstimationParameters {
        name: &name_1,
        data_len: baseline_bytes.len(),
        data: Some(baseline_bytes),
        num_lz_matches: lz_matches1,
        entropy: entropy1,
        lz_match_multiplier: comp_est_1.lz_match_multiplier,
        entropy_multiplier: comp_est_1.entropy_multiplier,
    });
    let estimated_size_2 = (compression_options.size_estimator_fn)(SizeEstimationParameters {
        name: &name_2,
        data_len: split_bytes.len(),
        data: Some(split_bytes),
        num_lz_matches: lz_matches2,
        entropy: entropy2,
        lz_match_multiplier: comp_est_2.lz_match_multiplier,
        entropy_multiplier: comp_est_2.entropy_multiplier,
    });

    // Sizes reported by the zstd helper, used alongside the estimates.
    let actual_size_1 =
        get_zstd_compressed_size(baseline_bytes, compression_options.zstd_compression_level);
    let actual_size_2 =
        get_zstd_compressed_size(split_bytes, compression_options.zstd_compression_level);

    let group1_metrics = GroupComparisonMetrics {
        lz_matches: lz_matches1 as u64,
        entropy: entropy1,
        estimated_size: estimated_size_1 as u64,
        zstd_size: actual_size_1,
        original_size: baseline_bytes.len() as u64,
    };
    let group2_metrics = GroupComparisonMetrics {
        lz_matches: lz_matches2 as u64,
        entropy: entropy2,
        estimated_size: estimated_size_2 as u64,
        zstd_size: actual_size_2,
        original_size: split_bytes.len() as u64,
    };

    SplitComparisonResult {
        name,
        description,
        // Computed before the metrics are moved into the result.
        difference: GroupDifference::from_metrics(&group1_metrics, &group2_metrics),
        group1_metrics,
        group2_metrics,
        baseline_comparison_metrics,
        split_comparison_metrics,
    }
}
/// Outcome of comparing a baseline byte group against a split byte group.
///
/// Built by `make_split_comparison_result`, which fills `group1_metrics` from the
/// baseline bytes and `group2_metrics` from the split bytes.
#[derive(Clone, Default)]
pub struct SplitComparisonResult {
/// Name of the comparison.
pub name: String,
/// Description of the comparison.
pub description: String,
/// Metrics of the first group (the baseline bytes).
pub group1_metrics: GroupComparisonMetrics,
/// Metrics of the second group (the split bytes).
pub group2_metrics: GroupComparisonMetrics,
/// Difference derived from `group1_metrics` and `group2_metrics`.
pub difference: GroupDifference,
/// Per-field metrics supplied for the baseline group.
pub baseline_comparison_metrics: Vec<FieldComparisonMetrics>,
/// Per-field metrics supplied for the split group.
pub split_comparison_metrics: Vec<FieldComparisonMetrics>,
}
impl SplitComparisonResult {
    // --- baseline group ---------------------------------------------------------------

    /// Entropy spread of the baseline per-field metrics, as computed by
    /// `calculate_max_entropy_diff`.
    pub fn baseline_max_entropy_diff(&self) -> f64 {
        let fields = self.baseline_comparison_metrics.as_slice();
        calculate_max_entropy_diff(fields)
    }

    /// Entropy ratio of the baseline per-field metrics, as computed by
    /// `calculate_max_entropy_diff_ratio`.
    pub fn baseline_max_entropy_diff_ratio(&self) -> f64 {
        let fields = self.baseline_comparison_metrics.as_slice();
        calculate_max_entropy_diff_ratio(fields)
    }

    // --- split group ------------------------------------------------------------------

    /// Entropy spread of the split per-field metrics, as computed by
    /// `calculate_max_entropy_diff`.
    pub fn split_max_entropy_diff(&self) -> f64 {
        let fields = self.split_comparison_metrics.as_slice();
        calculate_max_entropy_diff(fields)
    }

    /// Entropy ratio of the split per-field metrics, as computed by
    /// `calculate_max_entropy_diff_ratio`.
    pub fn split_max_entropy_diff_ratio(&self) -> f64 {
        let fields = self.split_comparison_metrics.as_slice();
        calculate_max_entropy_diff_ratio(fields)
    }
}
/// Per-field metrics used in split comparisons: an LZ match count and an entropy value.
#[derive(PartialEq, Debug, Clone, Copy, Default)]
pub struct FieldComparisonMetrics {
/// LZ match count recorded for the field.
pub lz_matches: u64,
/// Entropy recorded for the field.
pub entropy: f64,
}
impl From<FieldMetrics> for FieldComparisonMetrics {
fn from(value: FieldMetrics) -> Self {
Self {
entropy: value.entropy,
lz_matches: value.lz_matches,
}
}
}
/// Returns the spread (largest minus smallest) of the entropy values in `results`.
///
/// Returns `0.0` when `results` holds fewer than two entries. `NaN` entropies are
/// skipped instead of causing a panic; if every entropy is `NaN`, the result is `0.0`.
pub(crate) fn calculate_max_entropy_diff(results: &[FieldComparisonMetrics]) -> f64 {
    if results.len() < 2 {
        return 0.0;
    }

    // One pass without an intermediate `Vec`. `f64::min`/`f64::max` cannot panic,
    // unlike `partial_cmp(..).unwrap()`, which aborts on the first `NaN`.
    let (min, max) = results
        .iter()
        .map(|m| m.entropy)
        .filter(|entropy| !entropy.is_nan())
        .fold(
            (f64::INFINITY, f64::NEG_INFINITY),
            |(lo, hi), entropy| (lo.min(entropy), hi.max(entropy)),
        );

    // `min > max` can only hold when every value was filtered out and the seeds survived.
    if min > max {
        0.0
    } else {
        max - min
    }
}
/// Returns the ratio (largest divided by smallest) of the entropy values in `results`.
///
/// Returns `0.0` when `results` holds fewer than two entries or when the smallest
/// entropy is zero (avoiding a division by zero). `NaN` entropies are skipped instead of
/// causing a panic; if every entropy is `NaN`, the result is `0.0`.
pub(crate) fn calculate_max_entropy_diff_ratio(results: &[FieldComparisonMetrics]) -> f64 {
    if results.len() < 2 {
        return 0.0;
    }

    // One pass without an intermediate `Vec`. `f64::min`/`f64::max` cannot panic,
    // unlike `partial_cmp(..).unwrap()`, which aborts on the first `NaN`.
    let (min, max) = results
        .iter()
        .map(|m| m.entropy)
        .filter(|entropy| !entropy.is_nan())
        .fold(
            (f64::INFINITY, f64::NEG_INFINITY),
            |(lo, hi), entropy| (lo.min(entropy), hi.max(entropy)),
        );

    // `min > max` can only hold when every value was filtered out and the seeds survived;
    // a zero minimum would make the ratio meaningless.
    if min > max || min == 0.0 {
        0.0
    } else {
        max / min
    }
}