pub mod brute_force_custom;
pub mod brute_force_split;
use crate::analyzer::SizeEstimationParameters;
use crate::comparison::{GroupComparisonMetrics, GroupDifference};
use crate::results::analysis_results::AnalysisResults;
use crate::utils::analyze_utils::size_estimate;
use brute_force_custom::{
find_optimal_custom_result_coefficients, CustomComparisonOptimizationResult,
};
use brute_force_split::{
find_optimal_split_result_coefficients, SplitComparisonOptimizationResult,
};
use rayon::prelude::*;
/// Search grid for the brute-force coefficient optimizer.
///
/// The optimizer samples LZ match multipliers from `min_lz_multiplier` up to
/// and including `max_lz_multiplier`, advancing by `lz_step_size`, and for each
/// of those samples entropy multipliers from `min_entropy_multiplier` up to and
/// including `max_entropy_multiplier`, advancing by `entropy_step_size`.
#[derive(Debug, Clone)]
pub struct BruteForceConfig {
    /// Lower bound (inclusive) of the LZ match multiplier range.
    pub min_lz_multiplier: f64,
    /// Upper bound (inclusive) of the LZ match multiplier range.
    pub max_lz_multiplier: f64,
    /// Increment between consecutive LZ match multiplier samples.
    pub lz_step_size: f64,
    /// Lower bound (inclusive) of the entropy multiplier range.
    pub min_entropy_multiplier: f64,
    /// Upper bound (inclusive) of the entropy multiplier range.
    pub max_entropy_multiplier: f64,
    /// Increment between consecutive entropy multiplier samples.
    pub entropy_step_size: f64,
}

impl Default for BruteForceConfig {
    /// Returns the default grid: LZ multipliers in `0.0001..=1.0` with a step
    /// of `0.0001`, entropy multipliers in `1.0..=1.75` with a step of `0.001`.
    fn default() -> Self {
        BruteForceConfig {
            // LZ match multiplier axis.
            lz_step_size: 0.0001,
            min_lz_multiplier: 0.0001,
            max_lz_multiplier: 1.0,
            // Entropy multiplier axis.
            entropy_step_size: 0.001,
            min_entropy_multiplier: 1.0,
            max_entropy_multiplier: 1.75,
        }
    }
}
/// A coefficient pair chosen by the brute-force search.
///
/// The derived `Default` yields `0.0` for both fields; the search functions in
/// this module return that value when no sampled pair improves on the initial
/// error bound.
#[derive(Debug, Clone, Copy, Default)]
pub struct OptimizationResult {
    /// Multiplier passed to the size estimator as `lz_match_multiplier`.
    pub lz_match_multiplier: f64,
    /// Multiplier passed to the size estimator as `entropy_multiplier`.
    pub entropy_multiplier: f64,
}
/// Scores one coefficient pair against a single measured sample.
///
/// The sample's compressed size is estimated with [`size_estimate`] using the
/// supplied multipliers and then compared with the size zstd produced.
///
/// # Arguments
///
/// * `num_lz_matches` - Number of LZ matches counted in the sample.
/// * `entropy` - Entropy value of the sample, forwarded to the estimator.
/// * `zstd_size` - Size of the sample after zstd compression.
/// * `original_size` - Uncompressed size of the sample.
/// * `lz_match_multiplier` - Candidate LZ match multiplier.
/// * `entropy_multiplier` - Candidate entropy multiplier.
///
/// # Returns
///
/// The absolute difference between the estimated size and `zstd_size`.
/// When exactly one of the two is larger than `original_size` (i.e. they
/// disagree on whether the data grew), `f32::MAX` widened to `f64` is returned
/// as a penalty instead.
#[inline(always)]
pub(crate) fn calculate_error(
    num_lz_matches: u64,
    entropy: f64,
    zstd_size: u64,
    original_size: u64,
    lz_match_multiplier: f64,
    entropy_multiplier: f64,
) -> f64 {
    let params = SizeEstimationParameters {
        name: "",
        data_len: original_size as usize,
        data: None,
        num_lz_matches: num_lz_matches as usize,
        entropy,
        lz_match_multiplier,
        entropy_multiplier,
    };
    let estimate = size_estimate(params);

    let zstd_expands = zstd_size > original_size;
    let estimate_expands = estimate as u64 > original_size;

    if zstd_expands == estimate_expands {
        (estimate as f64 - zstd_size as f64).abs()
    } else {
        // NOTE(review): presumably `f32::MAX` (not `f64::MAX`) so several
        // penalties can be summed without overflowing to infinity - confirm.
        f32::MAX as f64
    }
}
/// Finds the best coefficients for every split and custom comparison and
/// writes the re-estimated sizes back into `merged_results`.
///
/// # Arguments
///
/// * `merged_results` - Analysis results to optimize against and to update.
/// * `config` - Optional search grid, forwarded unchanged to both optimizers.
///
/// # Returns
///
/// A tuple of `(split results, custom results)`, each a list of
/// `(name, optimization result)` pairs as produced by the split and custom
/// optimizers respectively.
// The tuple-of-vectors return type trips clippy's complexity lint; it is kept
// as-is because it is part of the public signature.
#[allow(clippy::type_complexity)]
pub fn optimize_and_apply_coefficients(
    merged_results: &mut [AnalysisResults],
    config: Option<&BruteForceConfig>,
) -> (
    Vec<(String, SplitComparisonOptimizationResult)>,
    Vec<(String, CustomComparisonOptimizationResult)>,
) {
    // Both searches run against the untouched input before anything is applied.
    let split_coefficients = find_optimal_split_result_coefficients(merged_results, config);
    let custom_coefficients = find_optimal_custom_result_coefficients(merged_results, config);

    apply_optimized_coefficients(merged_results, &split_coefficients, &custom_coefficients);

    (split_coefficients, custom_coefficients)
}
/// Re-estimates every comparison in `individual_results` using the optimized
/// coefficients.
///
/// For each result, the N-th split comparison is updated with the N-th entry of
/// `split_optimization_results`, and the N-th custom comparison with the N-th
/// entry of `custom_optimization_results`. After the group estimates are
/// refreshed, the `estimated_size` of each stored difference is recomputed from
/// the refreshed metrics.
///
/// Every element of the slice is processed exactly once; an empty slice is a
/// no-op.
///
/// # Arguments
///
/// * `individual_results` - Results whose comparisons are updated in place.
/// * `split_optimization_results` - Coefficients per split comparison, by index.
/// * `custom_optimization_results` - Coefficients per custom comparison, by index.
///
/// # Panics
///
/// Panics if a result has more split (or custom) comparisons than there are
/// entries in the matching optimization slice, if a custom comparison has more
/// groups than its optimization result has `comparisons`, or if it has more
/// differences than groups.
pub fn apply_optimized_coefficients(
    individual_results: &mut [AnalysisResults],
    split_optimization_results: &[(String, SplitComparisonOptimizationResult)],
    custom_optimization_results: &[(String, CustomComparisonOptimizationResult)],
) {
    for result in individual_results {
        for (split_idx, comparison) in result.split_comparisons.iter_mut().enumerate() {
            let optimization_result = &split_optimization_results[split_idx].1;

            update_group_metrics(
                &mut comparison.group1_metrics,
                optimization_result.group_1.lz_match_multiplier,
                optimization_result.group_1.entropy_multiplier,
            );
            update_group_metrics(
                &mut comparison.group2_metrics,
                optimization_result.group_2.lz_match_multiplier,
                optimization_result.group_2.entropy_multiplier,
            );

            // Must run after both groups were refreshed above.
            update_group_difference(
                &comparison.group1_metrics,
                &comparison.group2_metrics,
                &mut comparison.difference,
            );
        }

        for (custom_idx, comparison) in result.custom_comparisons.iter_mut().enumerate() {
            let optimization_result = &custom_optimization_results[custom_idx].1;

            update_group_metrics(
                &mut comparison.baseline_metrics,
                optimization_result.baseline.lz_match_multiplier,
                optimization_result.baseline.entropy_multiplier,
            );

            for (group_idx, group_metrics) in comparison.group_metrics.iter_mut().enumerate() {
                let coefficients = &optimization_result.comparisons[group_idx];
                update_group_metrics(
                    group_metrics,
                    coefficients.lz_match_multiplier,
                    coefficients.entropy_multiplier,
                );
            }

            // Differences are taken against the baseline, so they can only be
            // refreshed once the baseline and all groups are up to date.
            for (group_idx, difference) in comparison.differences.iter_mut().enumerate() {
                update_group_difference(
                    &comparison.baseline_metrics,
                    &comparison.group_metrics[group_idx],
                    difference,
                );
            }
        }
    }
}
/// Recomputes `metrics.estimated_size` for the given coefficient pair.
///
/// The estimate is derived from the group's `original_size`, `lz_matches` and
/// `entropy`; no other field of `metrics` is modified.
fn update_group_metrics(
    metrics: &mut GroupComparisonMetrics,
    lz_match_multiplier: f64,
    entropy_multiplier: f64,
) {
    let params = SizeEstimationParameters {
        name: "",
        data_len: metrics.original_size as usize,
        data: None,
        num_lz_matches: metrics.lz_matches as usize,
        entropy: metrics.entropy,
        lz_match_multiplier,
        entropy_multiplier,
    };
    metrics.estimated_size = size_estimate(params) as u64;
}
/// Stores the signed change in estimated size from `group1_metrics` to
/// `group2_metrics` (group 2 minus group 1) in `difference.estimated_size`.
///
/// Only the `estimated_size` field of `difference` is written.
fn update_group_difference(
    group1_metrics: &GroupComparisonMetrics,
    group2_metrics: &GroupComparisonMetrics,
    difference: &mut GroupDifference,
) {
    let first = group1_metrics.estimated_size as i64;
    let second = group2_metrics.estimated_size as i64;
    difference.estimated_size = second - first;
}
/// Writes the split-comparison optimization results, followed by the
/// custom-comparison optimization results, to `writer`.
///
/// # Arguments
///
/// * `writer` - Destination for the formatted output.
/// * `split_results` - `(name, result)` pairs for the split comparisons.
/// * `custom_results` - `(name, result)` pairs for the custom comparisons.
///
/// # Errors
///
/// Returns the first error reported by either printer. If printing the split
/// results fails, the custom results are not printed.
pub fn print_all_optimization_results<W: std::io::Write>(
    writer: &mut W,
    split_results: &[(String, SplitComparisonOptimizationResult)],
    custom_results: &[(String, CustomComparisonOptimizationResult)],
) -> std::io::Result<()> {
    brute_force_split::print_optimization_results(writer, split_results)?;
    brute_force_custom::print_optimization_results(writer, custom_results)?;
    Ok(())
}
/// The subset of a group's measurements that the brute-force search reads
/// when scoring a coefficient pair.
#[derive(Clone, Default, Debug, PartialEq, Copy)]
pub(crate) struct BruteForceComparisonMetrics {
    /// Number of LZ matches counted in the data.
    pub lz_matches: u64,
    /// Entropy value of the data, forwarded to the size estimator.
    pub entropy: f64,
    /// Size of the data after zstd compression; the target the estimate is scored against.
    pub zstd_size: u64,
    /// Size of the data before compression.
    pub original_size: u64,
}
impl From<GroupComparisonMetrics> for BruteForceComparisonMetrics {
fn from(value: GroupComparisonMetrics) -> Self {
BruteForceComparisonMetrics {
lz_matches: value.lz_matches,
entropy: value.entropy,
zstd_size: value.zstd_size,
original_size: value.original_size,
}
}
}
/// Parallel variant of [`find_optimal_coefficients_for_metrics`].
///
/// The LZ multiplier range of `config` is divided into one equally wide
/// sub-range per rayon worker thread. Each sub-range is searched independently
/// (with the full entropy range and the original step sizes), and the
/// candidate with the lowest error wins. On ties the candidate from the
/// lowest sub-range is kept.
///
/// # Arguments
///
/// * `metrics` - Samples the candidate coefficients are scored against.
/// * `config` - Search grid to split across threads.
///
/// # Returns
///
/// The best coefficient pair found, or `OptimizationResult::default()` when no
/// sub-range produced an error below `f64::MAX`.
pub(crate) fn find_optimal_coefficients_for_metrics_parallel(
    metrics: &[BruteForceComparisonMetrics],
    config: &BruteForceConfig,
) -> OptimizationResult {
    let num_chunks = rayon::current_num_threads();
    let lz_range = config.max_lz_multiplier - config.min_lz_multiplier;
    let chunk_size = lz_range / num_chunks as f64;

    let chunks: Vec<(f64, f64)> = (0..num_chunks)
        .map(|idx| {
            let lower = config.min_lz_multiplier + (idx as f64 * chunk_size);
            // Pin the final upper bound to the configured maximum so rounding
            // in the multiplication cannot cut the range short.
            let upper = if idx + 1 == num_chunks {
                config.max_lz_multiplier
            } else {
                config.min_lz_multiplier + ((idx + 1) as f64 * chunk_size)
            };
            (lower, upper)
        })
        .collect();

    let candidates: Vec<(OptimizationResult, f64)> = chunks
        .par_iter()
        .map(|&(lower, upper)| {
            let sub_config = BruteForceConfig {
                min_lz_multiplier: lower,
                max_lz_multiplier: upper,
                ..config.clone()
            };
            find_optimal_coefficients_for_metrics(metrics, &sub_config)
        })
        .collect();

    // Strict `<` keeps the earliest candidate when errors are equal.
    candidates
        .into_iter()
        .fold(
            (OptimizationResult::default(), f64::MAX),
            |best, candidate| {
                if candidate.1 < best.1 {
                    candidate
                } else {
                    best
                }
            },
        )
        .0
}
/// Exhaustively searches the grid described by `config` for the coefficient
/// pair with the lowest total error over `metrics`.
///
/// LZ multipliers are sampled from `min_lz_multiplier` while they are
/// `<= max_lz_multiplier`, advancing by `lz_step_size`; for each of them the
/// entropy multipliers are sampled the same way from the entropy bounds and
/// step. The first pair that reaches the lowest error is kept.
///
/// A step that does not advance the running value (zero, negative, or too
/// small to change it) ends that axis after the current sample instead of
/// looping forever.
///
/// # Arguments
///
/// * `metrics` - Samples the candidate coefficients are scored against.
/// * `config` - Bounds and step sizes of the grid.
///
/// # Returns
///
/// `(best coefficients, total error of that pair)`. When nothing scores below
/// `f64::MAX` (for example because a minimum exceeds its maximum), the result
/// is `(OptimizationResult::default(), f64::MAX)`.
pub(crate) fn find_optimal_coefficients_for_metrics(
    metrics: &[BruteForceComparisonMetrics],
    config: &BruteForceConfig,
) -> (OptimizationResult, f64) {
    let mut best_result = OptimizationResult::default();
    let mut min_error = f64::MAX;

    let mut lz_multiplier = config.min_lz_multiplier;
    while lz_multiplier <= config.max_lz_multiplier {
        let mut entropy_multiplier = config.min_entropy_multiplier;
        while entropy_multiplier <= config.max_entropy_multiplier {
            let error =
                calculate_error_for_bruteforce_metrics(metrics, lz_multiplier, entropy_multiplier);
            if error < min_error {
                best_result = OptimizationResult {
                    lz_match_multiplier: lz_multiplier,
                    entropy_multiplier,
                };
                min_error = error;
            }

            // Stop if the step makes no forward progress; otherwise the loop
            // condition would stay true forever.
            let next_entropy = entropy_multiplier + config.entropy_step_size;
            if next_entropy <= entropy_multiplier {
                break;
            }
            entropy_multiplier = next_entropy;
        }

        // Same termination guard for the outer axis.
        let next_lz = lz_multiplier + config.lz_step_size;
        if next_lz <= lz_multiplier {
            break;
        }
        lz_multiplier = next_lz;
    }

    (best_result, min_error)
}
/// Sums [`calculate_error`] over every sample in `metrics` for one coefficient
/// pair.
///
/// # Returns
///
/// The accumulated error, or `0.0` when `metrics` is empty.
#[inline(always)]
pub(crate) fn calculate_error_for_bruteforce_metrics(
    metrics: &[BruteForceComparisonMetrics],
    lz_match_multiplier: f64,
    entropy_multiplier: f64,
) -> f64 {
    // Explicit `0.0` seed keeps the accumulation order and the empty-input
    // result exactly as a plain running total would produce.
    metrics.iter().fold(0.0f64, |running_total, sample| {
        running_total
            + calculate_error(
                sample.lz_matches,
                sample.entropy,
                sample.zstd_size,
                sample.original_size,
                lz_match_multiplier,
                entropy_multiplier,
            )
    })
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{
        comparison::{
            compare_groups::GroupComparisonResult, split_comparison::SplitComparisonResult,
        },
        results::analysis_results::AnalysisResults,
        schema::Metadata,
    };
    use ahash::AHashMap;

    // Names and descriptions used by the mock results.
    const TEST_NAME_SPLIT: &str = "Test Split";
    const TEST_DESC_SPLIT: &str = "Test Split Description";
    const TEST_NAME_CUSTOM: &str = "Test Custom";
    const TEST_DESC_CUSTOM: &str = "Test Custom Description";
    const TEST_GROUP_NAME: &str = "Test Group";
    const TEST_SCHEMA_NAME: &str = "Test Schema";
    const TEST_SCHEMA_DESC: &str = "Test Schema Description";

    // Measurements of the first group (also used as the custom baseline).
    const GROUP1_LZ_MATCHES: u64 = 100;
    const GROUP1_ENTROPY: f64 = 5.0;
    const GROUP1_ESTIMATED_SIZE: u64 = 1000;
    const GROUP1_ZSTD_SIZE: u64 = 800;
    const GROUP1_ORIGINAL_SIZE: u64 = 2000;

    // Measurements of the second group.
    const GROUP2_LZ_MATCHES: u64 = 150;
    const GROUP2_ENTROPY: f64 = 4.0;
    const GROUP2_ESTIMATED_SIZE: u64 = 900;
    const GROUP2_ZSTD_SIZE: u64 = 700;
    const GROUP2_ORIGINAL_SIZE: u64 = 1800;

    // Group 2 minus group 1.
    const DIFF_LZ_MATCHES: i64 = 50;
    const DIFF_ENTROPY: f64 = -1.0;
    const DIFF_ESTIMATED_SIZE: i64 = -100;
    const DIFF_ZSTD_SIZE: i64 = -100;
    const DIFF_ORIGINAL_SIZE: i64 = -200;

    // Deliberately coarse grid so the brute-force search stays fast.
    const TEST_MIN_LZ: f64 = 0.01;
    const TEST_MAX_LZ: f64 = 0.05;
    const TEST_LZ_STEP: f64 = 0.02;
    const TEST_MIN_ENTROPY: f64 = 1.0;
    const TEST_MAX_ENTROPY: f64 = 1.1;
    const TEST_ENTROPY_STEP: f64 = 0.05;

    /// Builds the metrics of the first group from the `GROUP1_*` constants.
    fn mock_group1_metrics() -> GroupComparisonMetrics {
        GroupComparisonMetrics {
            lz_matches: GROUP1_LZ_MATCHES,
            entropy: GROUP1_ENTROPY,
            estimated_size: GROUP1_ESTIMATED_SIZE,
            zstd_size: GROUP1_ZSTD_SIZE,
            original_size: GROUP1_ORIGINAL_SIZE,
        }
    }

    /// Builds the metrics of the second group from the `GROUP2_*` constants.
    fn mock_group2_metrics() -> GroupComparisonMetrics {
        GroupComparisonMetrics {
            lz_matches: GROUP2_LZ_MATCHES,
            entropy: GROUP2_ENTROPY,
            estimated_size: GROUP2_ESTIMATED_SIZE,
            zstd_size: GROUP2_ZSTD_SIZE,
            original_size: GROUP2_ORIGINAL_SIZE,
        }
    }

    /// Builds the difference between the two mock groups from the `DIFF_*` constants.
    fn mock_difference() -> GroupDifference {
        GroupDifference {
            lz_matches: DIFF_LZ_MATCHES,
            entropy: DIFF_ENTROPY,
            estimated_size: DIFF_ESTIMATED_SIZE,
            zstd_size: DIFF_ZSTD_SIZE,
            original_size: DIFF_ORIGINAL_SIZE,
        }
    }

    /// Builds an `AnalysisResults` with one split comparison and one custom
    /// comparison, both populated from the mock groups.
    fn create_mock_results() -> AnalysisResults {
        AnalysisResults {
            schema_metadata: Metadata {
                name: TEST_SCHEMA_NAME.to_string(),
                description: TEST_SCHEMA_DESC.to_string(),
            },
            file_entropy: GROUP1_ENTROPY,
            file_lz_matches: GROUP1_LZ_MATCHES,
            zstd_file_size: GROUP1_ZSTD_SIZE,
            original_size: GROUP1_ORIGINAL_SIZE,
            per_field: AHashMap::new(),
            split_comparisons: vec![SplitComparisonResult {
                name: TEST_NAME_SPLIT.to_string(),
                description: TEST_DESC_SPLIT.to_string(),
                group1_metrics: mock_group1_metrics(),
                group2_metrics: mock_group2_metrics(),
                difference: mock_difference(),
                baseline_comparison_metrics: Vec::new(),
                split_comparison_metrics: Vec::new(),
            }],
            custom_comparisons: vec![GroupComparisonResult {
                name: TEST_NAME_CUSTOM.to_string(),
                description: TEST_DESC_CUSTOM.to_string(),
                baseline_metrics: mock_group1_metrics(),
                group_metrics: vec![mock_group2_metrics()],
                group_names: vec![TEST_GROUP_NAME.to_string()],
                differences: vec![mock_difference()],
            }],
        }
    }

    #[test]
    fn can_optimize_and_apply_coefficients() {
        let config = BruteForceConfig {
            min_lz_multiplier: TEST_MIN_LZ,
            max_lz_multiplier: TEST_MAX_LZ,
            lz_step_size: TEST_LZ_STEP,
            min_entropy_multiplier: TEST_MIN_ENTROPY,
            max_entropy_multiplier: TEST_MAX_ENTROPY,
            entropy_step_size: TEST_ENTROPY_STEP,
        };
        let mut results = vec![create_mock_results()];

        // Snapshot the estimates before optimization so changes can be detected.
        let split_comparison = &results[0].split_comparisons[0];
        let custom_comparison = &results[0].custom_comparisons[0];
        let original_split_estimated_size_g1 = split_comparison.group1_metrics.estimated_size;
        let original_split_estimated_size_g2 = split_comparison.group2_metrics.estimated_size;
        let original_custom_estimated_size_baseline =
            custom_comparison.baseline_metrics.estimated_size;
        let original_custom_estimated_size_group =
            custom_comparison.group_metrics[0].estimated_size;

        let (split_results, custom_results) =
            optimize_and_apply_coefficients(&mut results, Some(&config));

        let split_comparison = &results[0].split_comparisons[0];
        let custom_comparison = &results[0].custom_comparisons[0];

        // Split comparison: a named result exists and both estimates changed.
        assert!(!split_results.is_empty());
        assert_eq!(split_results[0].0, TEST_NAME_SPLIT);
        assert_ne!(
            split_comparison.group1_metrics.estimated_size,
            original_split_estimated_size_g1
        );
        assert_ne!(
            split_comparison.group2_metrics.estimated_size,
            original_split_estimated_size_g2
        );

        // Custom comparison: a named result exists and both estimates changed.
        assert!(!custom_results.is_empty());
        assert_eq!(custom_results[0].0, TEST_NAME_CUSTOM);
        assert_ne!(
            custom_comparison.baseline_metrics.estimated_size,
            original_custom_estimated_size_baseline
        );
        assert_ne!(
            custom_comparison.group_metrics[0].estimated_size,
            original_custom_estimated_size_group
        );
    }

    #[test]
    fn can_update_group_metrics() {
        let mut metrics = mock_group1_metrics();
        let original_estimated_size = metrics.estimated_size;

        update_group_metrics(&mut metrics, TEST_MIN_LZ * 2.0, TEST_MIN_ENTROPY + 0.05);

        // Only the estimate may change; the measured inputs must be untouched.
        assert_ne!(metrics.estimated_size, original_estimated_size);
        assert_eq!(metrics.lz_matches, GROUP1_LZ_MATCHES);
        assert_eq!(metrics.entropy, GROUP1_ENTROPY);
        assert_eq!(metrics.zstd_size, GROUP1_ZSTD_SIZE);
        assert_eq!(metrics.original_size, GROUP1_ORIGINAL_SIZE);
    }

    #[test]
    fn can_calculate_group_difference() {
        let first = mock_group1_metrics();
        let second = mock_group2_metrics();
        let mut difference = GroupDifference {
            lz_matches: 0,
            entropy: 0.0,
            estimated_size: 0,
            zstd_size: 0,
            original_size: 0,
        };

        update_group_difference(&first, &second, &mut difference);

        assert_eq!(difference.estimated_size, DIFF_ESTIMATED_SIZE);
    }
}