struct_compression_analyzer/comparison/
stats.rs

1//! Statistical functions for analyzing compression metrics.
2//!
3//! This module provides functionality for calculating and analyzing statistical
4//! measures related to compression ratios and other metrics.
5//!
6//! # Types
7//!
8//! - [`Stats`]: Container for a complete set of statistical measures including
9//!   quartiles, mean, median, IQR, min/max, and sample count.
10//!
11//! # Functions
12//!
13//! ## Core Statistics
14//!
15//! - [`calculate_stats`]: Calculate comprehensive statistics for an array of values
16//! - [`calculate_percentile`]: Helper function to calculate a specific percentile
17//! - [`format_stats`]: Format statistics as a human-readable string
18//!
19//! ## ZSTD Compression Ratio Statistics
20//!
21//! - [`calculate_zstd_ratio_stats`]: Statistics for ZSTD ratios in split comparisons
22//! - [`calculate_custom_zstd_ratio_stats`]: Statistics for ZSTD ratios in custom comparisons
23//!
24//! # Statistical Measures
25//!
26//! The module provides calculation of:
27//! - Interquartile Range (IQR)
28//! - Percentile ranges (Q1, median, Q3)
29//! - Minimum and maximum values
30//! - Mean (average)
31//! - Sample count
32
33use crate::{plot::calc_ratio_f64, results::analysis_results::AnalysisResults};
34use core::cmp::Ordering;
35
/// Summary statistics describing a sample of `f64` values.
///
/// Bundles the five-number summary (min, Q1, median, Q3, max) together with
/// the interquartile range, the arithmetic mean, and the sample size.
#[derive(Debug, Clone, Copy)]
pub struct Stats {
    /// Smallest value in the sample
    pub min: f64,
    /// 25th percentile (first quartile)
    pub q1: f64,
    /// 50th percentile (median)
    pub median: f64,
    /// 75th percentile (third quartile)
    pub q3: f64,
    /// Largest value in the sample
    pub max: f64,
    /// Interquartile range, i.e. `q3 - q1`
    pub iqr: f64,
    /// Arithmetic mean of the sample
    pub mean: f64,
    /// Number of values in the sample
    pub count: usize,
}
56
57/// Calculate statistics for an array of values.
58///
59/// This function calculates various statistics including min, max, quartiles,
60/// interquartile range (IQR), and mean.
61///
62/// # Arguments
63///
64/// * `values` - Slice of values to analyze
65///
66/// # Returns
67///
68/// A [`Stats`] struct containing the calculated statistics
69pub fn calculate_stats(values: &[f64]) -> Option<Stats> {
70    let count = values.len();
71    if count == 0 {
72        return None;
73    }
74
75    let mut sorted_values = values.to_vec();
76    sorted_values.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
77
78    let min = sorted_values[0];
79    let max = sorted_values[count - 1];
80
81    // Calculate mean
82    let sum: f64 = sorted_values.iter().sum();
83    let mean = sum / count as f64;
84
85    // Calculate median and quartiles
86    let median = calculate_percentile(&sorted_values, 0.5);
87    let q1 = calculate_percentile(&sorted_values, 0.25);
88    let q3 = calculate_percentile(&sorted_values, 0.75);
89    let iqr = q3 - q1;
90
91    Some(Stats {
92        min,
93        q1,
94        median,
95        q3,
96        max,
97        iqr,
98        mean,
99        count,
100    })
101}
102
/// Calculate a specific percentile of pre-sorted values.
///
/// The fractional rank is `percentile * (len - 1)`. When the rank lands
/// exactly on an index that element is returned; otherwise the result is a
/// linear interpolation between the two neighbouring elements.
///
/// # Arguments
///
/// * `sorted_values` - Slice of values, expected to be sorted in ascending order
/// * `percentile` - Percentile to calculate, as a fraction (0.0 to 1.0).
///   Fractions outside that range are clamped to the nearest bound, so they
///   yield the first or last element rather than indexing out of bounds.
///
/// # Returns
///
/// The value at the specified percentile, or `0.0` if `sorted_values` is empty.
pub fn calculate_percentile(sorted_values: &[f64], percentile: f64) -> f64 {
    let count = sorted_values.len();
    if count == 0 {
        return 0.0;
    }

    // Without the clamp, a fraction above 1.0 pushes the rank past the last
    // element and the indexing below panics.
    let fraction = percentile.clamp(0.0, 1.0);
    let rank = fraction * (count - 1) as f64;

    // Float-to-integer `as` casts saturate, and NaN casts to 0, so a NaN
    // fraction resolves to the first element.
    let lower_idx = rank.floor() as usize;
    let upper_idx = rank.ceil() as usize;

    if lower_idx == upper_idx {
        sorted_values[lower_idx]
    } else {
        // Distance of the rank from the lower neighbour, in [0, 1).
        let weight = rank - lower_idx as f64;
        sorted_values[lower_idx] * (1.0 - weight) + sorted_values[upper_idx] * weight
    }
}
130
131/// Calculate ZSTD ratio statistics between two groups in split comparison.
132///
133/// This function calculates the ZSTD compression ratio statistics between
134/// group1_metrics and group2_metrics using the results array.
135///
136/// # Arguments
137///
138/// * `results` - Slice of analysis results
139/// * `comparison_index` - Index of the comparison to analyze
140///
141/// # Returns
142///
143/// Optional [`Stats`] struct containing the ratio statistics, or [`None`] if there are no results
144pub fn calculate_zstd_ratio_stats(
145    results: &[AnalysisResults],
146    comparison_index: usize,
147) -> Option<Stats> {
148    let ratios: Vec<f64> = results
149        .iter()
150        .filter_map(|result| {
151            result
152                .split_comparisons
153                .get(comparison_index)
154                .map(|comparison| {
155                    calc_ratio_f64(
156                        comparison.group2_metrics.zstd_size,
157                        comparison.group1_metrics.zstd_size,
158                    )
159                })
160        })
161        .collect();
162
163    calculate_stats(&ratios)
164}
165
166/// Calculate ZSTD ratio statistics between two groups in custom comparison.
167///
168/// This function calculates the ZSTD compression ratio statistics between
169/// a specific group in group_metrics and the baseline metrics.
170///
171/// # Arguments
172///
173/// * `results` - Slice of analysis results
174/// * `comparison_index` - Index of the custom comparison to analyze
175/// * `group_index` - Index of the group within group_metrics to compare with baseline
176///
177/// # Returns
178///
179/// Optional [`Stats`] struct containing the ratio statistics, or [`None`] if there are no results
180pub fn calculate_custom_zstd_ratio_stats(
181    results: &[AnalysisResults],
182    comparison_index: usize,
183    group_index: usize,
184) -> Option<Stats> {
185    let ratios: Vec<f64> = results
186        .iter()
187        .filter_map(|result| {
188            if let Some(comparison) = result.custom_comparisons.get(comparison_index) {
189                // Only include results where the group_index is valid
190                comparison
191                    .group_metrics
192                    .get(group_index)
193                    .map(|group_metrics| {
194                        calc_ratio_f64(
195                            group_metrics.zstd_size,
196                            comparison.baseline_metrics.zstd_size,
197                        )
198                    })
199            } else {
200                None
201            }
202        })
203        .collect();
204
205    calculate_stats(&ratios)
206}
207
208/// Format statistics as a string.
209///
210/// # Arguments
211///
212/// * `stats` - The statistics to format
213///
214/// # Returns
215///
216/// A formatted string representation of the statistics
217pub fn format_stats(stats: &Stats) -> String {
218    format!(
219        "min: {:.3}, Q1: {:.3}, median: {:.3}, Q3: {:.3}, max: {:.3}, IQR: {:.3}, mean: {:.3} (n={})",
220        stats.min, stats.q1, stats.median, stats.q3, stats.max, stats.iqr, stats.mean, stats.count
221    )
222}