struct_compression_analyzer/comparison/stats.rs
1//! Statistical functions for analyzing compression metrics.
2//!
3//! This module provides functionality for calculating and analyzing statistical
4//! measures related to compression ratios and other metrics.
5//!
6//! # Types
7//!
8//! - [`Stats`]: Container for a complete set of statistical measures including
9//! quartiles, mean, median, IQR, min/max, and sample count.
10//!
11//! # Functions
12//!
13//! ## Core Statistics
14//!
15//! - [`calculate_stats`]: Calculate comprehensive statistics for an array of values
16//! - [`calculate_percentile`]: Helper function to calculate a specific percentile
17//! - [`format_stats`]: Format statistics as a human-readable string
18//!
19//! ## ZSTD Compression Ratio Statistics
20//!
21//! - [`calculate_zstd_ratio_stats`]: Statistics for ZSTD ratios in split comparisons
22//! - [`calculate_custom_zstd_ratio_stats`]: Statistics for ZSTD ratios in custom comparisons
23//!
24//! # Statistical Measures
25//!
26//! The module provides calculation of:
27//! - Interquartile Range (IQR)
28//! - Percentile ranges (Q1, median, Q3)
29//! - Minimum and maximum values
30//! - Mean (average)
31//! - Sample count
32
33use crate::{plot::calc_ratio_f64, results::analysis_results::AnalysisResults};
34use core::cmp::Ordering;
35
/// Summary statistics describing one sample of `f64` values.
///
/// Instances are produced by [`calculate_stats`]; every field is public so
/// callers can read individual measures directly.
#[derive(Clone, Copy, Debug)]
pub struct Stats {
    /// Smallest value in the sample.
    pub min: f64,
    /// 25th percentile (first quartile).
    pub q1: f64,
    /// 50th percentile (median).
    pub median: f64,
    /// 75th percentile (third quartile).
    pub q3: f64,
    /// Largest value in the sample.
    pub max: f64,
    /// Interquartile range, i.e. `q3 - q1`.
    pub iqr: f64,
    /// Arithmetic mean of the sample.
    pub mean: f64,
    /// Number of values the statistics were computed from.
    pub count: usize,
}
56
57/// Calculate statistics for an array of values.
58///
59/// This function calculates various statistics including min, max, quartiles,
60/// interquartile range (IQR), and mean.
61///
62/// # Arguments
63///
64/// * `values` - Slice of values to analyze
65///
66/// # Returns
67///
68/// A [`Stats`] struct containing the calculated statistics
69pub fn calculate_stats(values: &[f64]) -> Option<Stats> {
70 let count = values.len();
71 if count == 0 {
72 return None;
73 }
74
75 let mut sorted_values = values.to_vec();
76 sorted_values.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
77
78 let min = sorted_values[0];
79 let max = sorted_values[count - 1];
80
81 // Calculate mean
82 let sum: f64 = sorted_values.iter().sum();
83 let mean = sum / count as f64;
84
85 // Calculate median and quartiles
86 let median = calculate_percentile(&sorted_values, 0.5);
87 let q1 = calculate_percentile(&sorted_values, 0.25);
88 let q3 = calculate_percentile(&sorted_values, 0.75);
89 let iqr = q3 - q1;
90
91 Some(Stats {
92 min,
93 q1,
94 median,
95 q3,
96 max,
97 iqr,
98 mean,
99 count,
100 })
101}
102
/// Calculate a specific percentile of an already-sorted slice.
///
/// The fractional rank is `percentile * (len - 1)`. When the rank falls
/// exactly on an element that element is returned; otherwise the two
/// neighbouring elements are linearly interpolated.
///
/// # Arguments
///
/// * `sorted_values` - Values in ascending order; the slice is neither
///   re-sorted nor checked for order
/// * `percentile` - Percentile to calculate as a fraction (0.0 to 1.0);
///   values outside that range are clamped to it
///
/// # Returns
///
/// The value at the specified percentile, or `0.0` if `sorted_values` is empty
pub fn calculate_percentile(sorted_values: &[f64], percentile: f64) -> f64 {
    if sorted_values.is_empty() {
        return 0.0;
    }
    let last_idx = sorted_values.len() - 1;

    // Clamp first: a percentile above 1.0 would otherwise produce a rank past
    // the last element and panic on the index below.
    let index = percentile.clamp(0.0, 1.0) * last_idx as f64;
    let lower_idx = index.floor() as usize;
    let upper_idx = index.ceil() as usize;

    if lower_idx == upper_idx {
        sorted_values[lower_idx]
    } else {
        // Fractional part of the rank = weight of the upper neighbour.
        let weight = index - lower_idx as f64;
        sorted_values[lower_idx] * (1.0 - weight) + sorted_values[upper_idx] * weight
    }
}
130
131/// Calculate ZSTD ratio statistics between two groups in split comparison.
132///
133/// This function calculates the ZSTD compression ratio statistics between
134/// group1_metrics and group2_metrics using the results array.
135///
136/// # Arguments
137///
138/// * `results` - Slice of analysis results
139/// * `comparison_index` - Index of the comparison to analyze
140///
141/// # Returns
142///
143/// Optional [`Stats`] struct containing the ratio statistics, or [`None`] if there are no results
144pub fn calculate_zstd_ratio_stats(
145 results: &[AnalysisResults],
146 comparison_index: usize,
147) -> Option<Stats> {
148 let ratios: Vec<f64> = results
149 .iter()
150 .filter_map(|result| {
151 result
152 .split_comparisons
153 .get(comparison_index)
154 .map(|comparison| {
155 calc_ratio_f64(
156 comparison.group2_metrics.zstd_size,
157 comparison.group1_metrics.zstd_size,
158 )
159 })
160 })
161 .collect();
162
163 calculate_stats(&ratios)
164}
165
166/// Calculate ZSTD ratio statistics between two groups in custom comparison.
167///
168/// This function calculates the ZSTD compression ratio statistics between
169/// a specific group in group_metrics and the baseline metrics.
170///
171/// # Arguments
172///
173/// * `results` - Slice of analysis results
174/// * `comparison_index` - Index of the custom comparison to analyze
175/// * `group_index` - Index of the group within group_metrics to compare with baseline
176///
177/// # Returns
178///
179/// Optional [`Stats`] struct containing the ratio statistics, or [`None`] if there are no results
180pub fn calculate_custom_zstd_ratio_stats(
181 results: &[AnalysisResults],
182 comparison_index: usize,
183 group_index: usize,
184) -> Option<Stats> {
185 let ratios: Vec<f64> = results
186 .iter()
187 .filter_map(|result| {
188 if let Some(comparison) = result.custom_comparisons.get(comparison_index) {
189 // Only include results where the group_index is valid
190 comparison
191 .group_metrics
192 .get(group_index)
193 .map(|group_metrics| {
194 calc_ratio_f64(
195 group_metrics.zstd_size,
196 comparison.baseline_metrics.zstd_size,
197 )
198 })
199 } else {
200 None
201 }
202 })
203 .collect();
204
205 calculate_stats(&ratios)
206}
207
208/// Format statistics as a string.
209///
210/// # Arguments
211///
212/// * `stats` - The statistics to format
213///
214/// # Returns
215///
216/// A formatted string representation of the statistics
217pub fn format_stats(stats: &Stats) -> String {
218 format!(
219 "min: {:.3}, Q1: {:.3}, median: {:.3}, Q3: {:.3}, max: {:.3}, IQR: {:.3}, mean: {:.3} (n={})",
220 stats.min, stats.q1, stats.median, stats.q3, stats.max, stats.iqr, stats.mean, stats.count
221 )
222}