1pub mod brute_force_custom;
32pub mod brute_force_split;
33use crate::analyzer::SizeEstimationParameters;
34use crate::comparison::{GroupComparisonMetrics, GroupDifference};
35use crate::results::analysis_results::AnalysisResults;
36use crate::utils::analyze_utils::size_estimate;
37use brute_force_custom::{
38 find_optimal_custom_result_coefficients, CustomComparisonOptimizationResult,
39};
40use brute_force_split::{
41 find_optimal_split_result_coefficients, SplitComparisonOptimizationResult,
42};
43use rayon::prelude::*;
44
/// Search-space description for the brute-force coefficient sweep.
///
/// Each multiplier is swept from its `min_*` value towards its `max_*` value,
/// advancing by the matching `*_step_size` after every evaluation.
#[derive(Clone, Debug)]
pub struct BruteForceConfig {
    /// First LZ match multiplier that is evaluated.
    pub min_lz_multiplier: f64,
    /// Upper bound for the LZ match multiplier; values above it are not evaluated.
    pub max_lz_multiplier: f64,
    /// Increment added to the LZ match multiplier between evaluations.
    pub lz_step_size: f64,
    /// First entropy multiplier that is evaluated.
    pub min_entropy_multiplier: f64,
    /// Upper bound for the entropy multiplier; values above it are not evaluated.
    pub max_entropy_multiplier: f64,
    /// Increment added to the entropy multiplier between evaluations.
    pub entropy_step_size: f64,
}
61
62impl Default for BruteForceConfig {
63 fn default() -> Self {
64 Self {
65 min_lz_multiplier: 0.0001,
66 max_lz_multiplier: 1.0,
67 lz_step_size: 0.0001,
68 min_entropy_multiplier: 1.0,
69 max_entropy_multiplier: 1.75,
70 entropy_step_size: 0.001,
71 }
72 }
73}
74
/// A pair of coefficients selected by the brute-force search.
///
/// The `Default` value has both multipliers set to `0.0`; the search functions
/// in this module return it when no candidate was evaluated or none improved
/// on the initial error bound.
#[derive(Clone, Copy, Debug, Default)]
pub struct OptimizationResult {
    /// Multiplier applied to the number of LZ matches in the size estimate.
    pub lz_match_multiplier: f64,
    /// Multiplier applied to the entropy in the size estimate.
    pub entropy_multiplier: f64,
}
83
84#[inline(always)]
99pub(crate) fn calculate_error(
100 num_lz_matches: u64,
102 entropy: f64,
103 zstd_size: u64,
105 original_size: u64,
106 lz_match_multiplier: f64,
108 entropy_multiplier: f64,
109) -> f64 {
110 let estimated_size = size_estimate(SizeEstimationParameters {
112 name: "",
113 data_len: original_size as usize,
114 data: None,
115 num_lz_matches: num_lz_matches as usize,
116 entropy,
117 lz_match_multiplier,
118 entropy_multiplier,
119 });
120
121 let error = ((estimated_size as f64) - (zstd_size as f64)).abs();
123
124 let zstd_is_bigger = zstd_size > original_size;
128 let estimate_is_bigger = estimated_size as u64 > original_size;
129 if zstd_is_bigger != estimate_is_bigger {
130 return f32::MAX as f64;
131 }
132
133 error
134}
135
136#[allow(clippy::type_complexity)]
152pub fn optimize_and_apply_coefficients(
153 merged_results: &mut [AnalysisResults],
154 config: Option<&BruteForceConfig>,
155) -> (
156 Vec<(String, SplitComparisonOptimizationResult)>,
157 Vec<(String, CustomComparisonOptimizationResult)>,
158) {
159 let split_optimization_results = find_optimal_split_result_coefficients(merged_results, config);
161
162 let custom_optimization_results =
164 find_optimal_custom_result_coefficients(merged_results, config);
165
166 apply_optimized_coefficients(
168 merged_results,
169 &split_optimization_results,
170 &custom_optimization_results,
171 );
172
173 (split_optimization_results, custom_optimization_results)
174}
175
176pub fn apply_optimized_coefficients(
184 individual_results: &mut [AnalysisResults],
185 split_optimization_results: &[(String, SplitComparisonOptimizationResult)],
186 custom_optimization_results: &[(String, CustomComparisonOptimizationResult)],
187) {
188 for (split_idx, comparison) in individual_results[0]
190 .split_comparisons
191 .iter_mut()
192 .enumerate()
193 {
194 let optimization_result = &split_optimization_results[split_idx].1;
195
196 update_group_metrics(
198 &mut comparison.group1_metrics,
199 optimization_result.group_1.lz_match_multiplier,
200 optimization_result.group_1.entropy_multiplier,
201 );
202
203 update_group_metrics(
205 &mut comparison.group2_metrics,
206 optimization_result.group_2.lz_match_multiplier,
207 optimization_result.group_2.entropy_multiplier,
208 );
209
210 update_group_difference(
212 &comparison.group1_metrics,
213 &comparison.group2_metrics,
214 &mut comparison.difference,
215 );
216 }
217
218 for (custom_idx, comparison) in individual_results[0]
220 .custom_comparisons
221 .iter_mut()
222 .enumerate()
223 {
224 let optimization_result = &custom_optimization_results[custom_idx].1;
225
226 update_group_metrics(
228 &mut comparison.baseline_metrics,
229 optimization_result.baseline.lz_match_multiplier,
230 optimization_result.baseline.entropy_multiplier,
231 );
232
233 for (group_idx, group_metrics) in comparison.group_metrics.iter_mut().enumerate() {
235 update_group_metrics(
236 group_metrics,
237 optimization_result.comparisons[group_idx].lz_match_multiplier,
238 optimization_result.comparisons[group_idx].entropy_multiplier,
239 );
240 }
241
242 for (group_idx, difference) in comparison.differences.iter_mut().enumerate() {
244 update_group_difference(
245 &comparison.baseline_metrics,
246 &comparison.group_metrics[group_idx],
247 difference,
248 );
249 }
250 }
251
252 for result in individual_results {
254 for (split_idx, comparison) in result.split_comparisons.iter_mut().enumerate() {
256 let optimization_result = &split_optimization_results[split_idx].1;
257
258 update_group_metrics(
260 &mut comparison.group1_metrics,
261 optimization_result.group_1.lz_match_multiplier,
262 optimization_result.group_1.entropy_multiplier,
263 );
264
265 update_group_metrics(
267 &mut comparison.group2_metrics,
268 optimization_result.group_2.lz_match_multiplier,
269 optimization_result.group_2.entropy_multiplier,
270 );
271
272 update_group_difference(
274 &comparison.group1_metrics,
275 &comparison.group2_metrics,
276 &mut comparison.difference,
277 );
278 }
279
280 for (custom_idx, comparison) in result.custom_comparisons.iter_mut().enumerate() {
282 let optimization_result = &custom_optimization_results[custom_idx].1;
283
284 update_group_metrics(
286 &mut comparison.baseline_metrics,
287 optimization_result.baseline.lz_match_multiplier,
288 optimization_result.baseline.entropy_multiplier,
289 );
290
291 for (group_idx, group_metrics) in comparison.group_metrics.iter_mut().enumerate() {
293 update_group_metrics(
294 group_metrics,
295 optimization_result.comparisons[group_idx].lz_match_multiplier,
296 optimization_result.comparisons[group_idx].entropy_multiplier,
297 );
298 }
299
300 for (group_idx, difference) in comparison.differences.iter_mut().enumerate() {
302 update_group_difference(
303 &comparison.baseline_metrics,
304 &comparison.group_metrics[group_idx],
305 difference,
306 );
307 }
308 }
309 }
310}
311
312fn update_group_metrics(
320 metrics: &mut GroupComparisonMetrics,
321 lz_match_multiplier: f64,
322 entropy_multiplier: f64,
323) {
324 let estimated_size = size_estimate(SizeEstimationParameters {
326 name: "",
327 data_len: metrics.original_size as usize,
328 data: None,
329 num_lz_matches: metrics.lz_matches as usize,
330 entropy: metrics.entropy,
331 lz_match_multiplier,
332 entropy_multiplier,
333 });
334
335 metrics.estimated_size = estimated_size as u64;
337}
338
339fn update_group_difference(
341 group1_metrics: &GroupComparisonMetrics,
342 group2_metrics: &GroupComparisonMetrics,
343 difference: &mut GroupDifference,
344) {
345 difference.estimated_size =
346 group2_metrics.estimated_size as i64 - group1_metrics.estimated_size as i64;
347}
348
349pub fn print_all_optimization_results<W: std::io::Write>(
357 writer: &mut W,
358 split_results: &[(String, SplitComparisonOptimizationResult)],
359 custom_results: &[(String, CustomComparisonOptimizationResult)],
360) -> std::io::Result<()> {
361 brute_force_split::print_optimization_results(writer, split_results)?;
362 brute_force_custom::print_optimization_results(writer, custom_results)?;
363 Ok(())
364}
365
/// The subset of a group's measurements that the brute-force search needs.
///
/// Unlike `GroupComparisonMetrics` it carries no estimated size: that value
/// is what the search recomputes for every candidate coefficient pair.
#[derive(Debug, Clone, Copy, Default, PartialEq)]
pub(crate) struct BruteForceComparisonMetrics {
    /// Number of LZ matches found in the data.
    pub lz_matches: u64,
    /// Entropy of the data.
    pub entropy: f64,
    /// Size after zstd compression; the target the estimate is fitted to.
    pub zstd_size: u64,
    /// Size of the uncompressed data.
    pub original_size: u64,
}
379
380impl From<GroupComparisonMetrics> for BruteForceComparisonMetrics {
381 fn from(value: GroupComparisonMetrics) -> Self {
382 BruteForceComparisonMetrics {
383 lz_matches: value.lz_matches,
384 entropy: value.entropy,
385 zstd_size: value.zstd_size,
386 original_size: value.original_size,
387 }
388 }
389}
390
391pub(crate) fn find_optimal_coefficients_for_metrics_parallel(
403 metrics: &[BruteForceComparisonMetrics],
404 config: &BruteForceConfig,
405) -> OptimizationResult {
406 let num_chunks = rayon::current_num_threads();
408 let lz_range = config.max_lz_multiplier - config.min_lz_multiplier;
409 let chunk_size = lz_range / num_chunks as f64;
410
411 let mut chunks = Vec::with_capacity(num_chunks);
413 for x in 0..num_chunks {
414 let start = config.min_lz_multiplier + (x as f64 * chunk_size);
415 let end = if x == num_chunks - 1 {
416 config.max_lz_multiplier
417 } else {
418 config.min_lz_multiplier + ((x + 1) as f64 * chunk_size)
419 };
420
421 chunks.push((start, end));
422 }
423
424 let results: Vec<_> = chunks
426 .par_iter()
427 .map(|(start, end)| {
428 find_optimal_coefficients_for_metrics(
429 metrics,
430 &BruteForceConfig {
431 min_lz_multiplier: *start,
432 max_lz_multiplier: *end,
433 min_entropy_multiplier: config.min_entropy_multiplier,
434 max_entropy_multiplier: config.max_entropy_multiplier,
435 entropy_step_size: config.entropy_step_size,
436 lz_step_size: config.lz_step_size,
437 },
438 )
439 })
440 .collect();
441
442 let mut best_result = OptimizationResult::default();
444 let mut min_error = f64::MAX;
445 for (result, error) in results {
446 if error < min_error {
447 min_error = error;
448 best_result = result;
449 }
450 }
451
452 best_result
453}
454
455pub(crate) fn find_optimal_coefficients_for_metrics(
468 metrics: &[BruteForceComparisonMetrics],
469 config: &BruteForceConfig,
470) -> (OptimizationResult, f64) {
471 let mut best_result = OptimizationResult::default();
472 let mut min_error = f64::MAX;
473
474 let mut lz_multiplier = config.min_lz_multiplier;
475 while lz_multiplier <= config.max_lz_multiplier {
476 let mut entropy_multiplier = config.min_entropy_multiplier;
477 while entropy_multiplier <= config.max_entropy_multiplier {
478 let error =
480 calculate_error_for_bruteforce_metrics(metrics, lz_multiplier, entropy_multiplier);
481
482 if error < min_error {
484 best_result = OptimizationResult {
485 lz_match_multiplier: lz_multiplier,
486 entropy_multiplier,
487 };
488
489 min_error = error;
490 }
491
492 entropy_multiplier += config.entropy_step_size;
493 }
494
495 lz_multiplier += config.lz_step_size;
496 }
497
498 (best_result, min_error)
499}
500
501#[inline(always)]
514pub(crate) fn calculate_error_for_bruteforce_metrics(
515 metrics: &[BruteForceComparisonMetrics],
516 lz_match_multiplier: f64,
517 entropy_multiplier: f64,
518) -> f64 {
519 let mut total_error = 0.0f64;
520
521 for result in metrics {
522 total_error += calculate_error(
523 result.lz_matches,
524 result.entropy,
525 result.zstd_size,
526 result.original_size,
527 lz_match_multiplier,
528 entropy_multiplier,
529 );
530 }
531
532 total_error
533}
534
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{
        comparison::{
            compare_groups::GroupComparisonResult, split_comparison::SplitComparisonResult,
        },
        results::analysis_results::AnalysisResults,
        schema::Metadata,
    };
    use ahash::AHashMap;

    // Names and descriptions of the mock comparisons and schema.
    const TEST_NAME_SPLIT: &str = "Test Split";
    const TEST_DESC_SPLIT: &str = "Test Split Description";
    const TEST_NAME_CUSTOM: &str = "Test Custom";
    const TEST_DESC_CUSTOM: &str = "Test Custom Description";
    const TEST_GROUP_NAME: &str = "Test Group";
    const TEST_SCHEMA_NAME: &str = "Test Schema";
    const TEST_SCHEMA_DESC: &str = "Test Schema Description";

    // Measurements of the first mock group (also used as the custom baseline).
    const GROUP1_LZ_MATCHES: u64 = 100;
    const GROUP1_ENTROPY: f64 = 5.0;
    const GROUP1_ESTIMATED_SIZE: u64 = 1000;
    const GROUP1_ZSTD_SIZE: u64 = 800;
    const GROUP1_ORIGINAL_SIZE: u64 = 2000;

    // Measurements of the second mock group.
    const GROUP2_LZ_MATCHES: u64 = 150;
    const GROUP2_ENTROPY: f64 = 4.0;
    const GROUP2_ESTIMATED_SIZE: u64 = 900;
    const GROUP2_ZSTD_SIZE: u64 = 700;
    const GROUP2_ORIGINAL_SIZE: u64 = 1800;

    // Group 2 minus group 1 for every measurement.
    const DIFF_LZ_MATCHES: i64 = 50;
    const DIFF_ENTROPY: f64 = -1.0;
    const DIFF_ESTIMATED_SIZE: i64 = -100;
    const DIFF_ZSTD_SIZE: i64 = -100;
    const DIFF_ORIGINAL_SIZE: i64 = -200;

    // A deliberately tiny search grid so the tests run quickly.
    const TEST_MIN_LZ: f64 = 0.01;
    const TEST_MAX_LZ: f64 = 0.05;
    const TEST_LZ_STEP: f64 = 0.02;
    const TEST_MIN_ENTROPY: f64 = 1.0;
    const TEST_MAX_ENTROPY: f64 = 1.1;
    const TEST_ENTROPY_STEP: f64 = 0.05;

    /// Metrics built from the `GROUP1_*` constants.
    fn group1_metrics() -> GroupComparisonMetrics {
        GroupComparisonMetrics {
            lz_matches: GROUP1_LZ_MATCHES,
            entropy: GROUP1_ENTROPY,
            estimated_size: GROUP1_ESTIMATED_SIZE,
            zstd_size: GROUP1_ZSTD_SIZE,
            original_size: GROUP1_ORIGINAL_SIZE,
        }
    }

    /// Metrics built from the `GROUP2_*` constants.
    fn group2_metrics() -> GroupComparisonMetrics {
        GroupComparisonMetrics {
            lz_matches: GROUP2_LZ_MATCHES,
            entropy: GROUP2_ENTROPY,
            estimated_size: GROUP2_ESTIMATED_SIZE,
            zstd_size: GROUP2_ZSTD_SIZE,
            original_size: GROUP2_ORIGINAL_SIZE,
        }
    }

    /// Difference built from the `DIFF_*` constants.
    fn mock_difference() -> GroupDifference {
        GroupDifference {
            lz_matches: DIFF_LZ_MATCHES,
            entropy: DIFF_ENTROPY,
            estimated_size: DIFF_ESTIMATED_SIZE,
            zstd_size: DIFF_ZSTD_SIZE,
            original_size: DIFF_ORIGINAL_SIZE,
        }
    }

    /// Builds an `AnalysisResults` with one split comparison (group 1 vs
    /// group 2) and one custom comparison (group 1 as baseline, group 2 as
    /// the only compared group).
    fn create_mock_results() -> AnalysisResults {
        let split_comparison = SplitComparisonResult {
            name: TEST_NAME_SPLIT.to_string(),
            description: TEST_DESC_SPLIT.to_string(),
            group1_metrics: group1_metrics(),
            group2_metrics: group2_metrics(),
            difference: mock_difference(),
            baseline_comparison_metrics: Vec::new(),
            split_comparison_metrics: Vec::new(),
        };

        let custom_comparison = GroupComparisonResult {
            name: TEST_NAME_CUSTOM.to_string(),
            description: TEST_DESC_CUSTOM.to_string(),
            baseline_metrics: group1_metrics(),
            group_metrics: vec![group2_metrics()],
            group_names: vec![TEST_GROUP_NAME.to_string()],
            differences: vec![mock_difference()],
        };

        AnalysisResults {
            schema_metadata: Metadata {
                name: TEST_SCHEMA_NAME.to_string(),
                description: TEST_SCHEMA_DESC.to_string(),
            },
            file_entropy: GROUP1_ENTROPY,
            file_lz_matches: GROUP1_LZ_MATCHES,
            zstd_file_size: GROUP1_ZSTD_SIZE,
            original_size: GROUP1_ORIGINAL_SIZE,
            per_field: AHashMap::new(),
            split_comparisons: vec![split_comparison],
            custom_comparisons: vec![custom_comparison],
        }
    }

    #[test]
    fn can_optimize_and_apply_coefficients() {
        let config = BruteForceConfig {
            min_lz_multiplier: TEST_MIN_LZ,
            max_lz_multiplier: TEST_MAX_LZ,
            lz_step_size: TEST_LZ_STEP,
            min_entropy_multiplier: TEST_MIN_ENTROPY,
            max_entropy_multiplier: TEST_MAX_ENTROPY,
            entropy_step_size: TEST_ENTROPY_STEP,
        };

        let mut results = vec![create_mock_results()];

        // Snapshot the estimates before optimization overwrites them.
        let (
            original_split_estimated_size_g1,
            original_split_estimated_size_g2,
            original_custom_estimated_size_baseline,
            original_custom_estimated_size_group,
        ) = {
            let split_before = &results[0].split_comparisons[0];
            let custom_before = &results[0].custom_comparisons[0];
            (
                split_before.group1_metrics.estimated_size,
                split_before.group2_metrics.estimated_size,
                custom_before.baseline_metrics.estimated_size,
                custom_before.group_metrics[0].estimated_size,
            )
        };

        let (split_results, custom_results) =
            optimize_and_apply_coefficients(&mut results, Some(&config));

        let split_after = &results[0].split_comparisons[0];
        let custom_after = &results[0].custom_comparisons[0];

        // Split comparison: results exist and both estimates changed.
        assert!(!split_results.is_empty());
        assert_eq!(split_results[0].0, TEST_NAME_SPLIT);
        assert_ne!(
            split_after.group1_metrics.estimated_size,
            original_split_estimated_size_g1
        );
        assert_ne!(
            split_after.group2_metrics.estimated_size,
            original_split_estimated_size_g2
        );

        // Custom comparison: results exist and both estimates changed.
        assert!(!custom_results.is_empty());
        assert_eq!(custom_results[0].0, TEST_NAME_CUSTOM);
        assert_ne!(
            custom_after.baseline_metrics.estimated_size,
            original_custom_estimated_size_baseline
        );
        assert_ne!(
            custom_after.group_metrics[0].estimated_size,
            original_custom_estimated_size_group
        );

        // The same holds when reading straight from the results vector.
        assert_ne!(
            results[0].split_comparisons[0]
                .group1_metrics
                .estimated_size,
            original_split_estimated_size_g1
        );
        assert_ne!(
            results[0].custom_comparisons[0]
                .baseline_metrics
                .estimated_size,
            original_custom_estimated_size_baseline
        );
    }

    #[test]
    fn can_update_group_metrics() {
        let mut metrics = group1_metrics();
        let original_estimated_size = metrics.estimated_size;

        update_group_metrics(&mut metrics, TEST_MIN_LZ * 2.0, TEST_MIN_ENTROPY + 0.05);

        // Only the estimate may change.
        assert_ne!(metrics.estimated_size, original_estimated_size);

        assert_eq!(metrics.lz_matches, GROUP1_LZ_MATCHES);
        assert_eq!(metrics.entropy, GROUP1_ENTROPY);
        assert_eq!(metrics.zstd_size, GROUP1_ZSTD_SIZE);
        assert_eq!(metrics.original_size, GROUP1_ORIGINAL_SIZE);
    }

    #[test]
    fn can_calculate_group_difference() {
        let first = group1_metrics();
        let second = group2_metrics();

        let mut difference = GroupDifference {
            lz_matches: 0,
            entropy: 0.0,
            estimated_size: 0,
            zstd_size: 0,
            original_size: 0,
        };

        update_group_difference(&first, &second, &mut difference);

        assert_eq!(difference.estimated_size, DIFF_ESTIMATED_SIZE);
    }
}