Skip to main content

fluxbench_stats/
outliers.rs

1//! Outlier Detection
2//!
3//! Uses IQR (Interquartile Range) method to identify outliers.
4//!
5//! **Critical Design Decision**: Outliers are detected but NOT removed from
6//! percentile/min/max calculations. For tail latency metrics, outliers ARE the signal.
7//! Only mean/median/stddev use cleaned data.
8
9use crate::percentiles::compute_percentile;
10
/// Strategy used to decide which samples count as outliers.
///
/// The numeric parameters are stored as integers in *half units* so the enum
/// can derive `Eq`; `detect_outliers` multiplies them by `0.5` before use.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum OutlierMethod {
    /// Interquartile-range fences: a sample is an outlier when it lies outside
    /// `[Q1 - m * IQR, Q3 + m * IQR]`, where `m = k * 0.5`.
    Iqr {
        /// Fence multiplier in half units (`k = 3` means `1.5 * IQR`).
        k: u32,
    },
    /// Z-score cut-off: a sample is an outlier when it is more than
    /// `threshold * 0.5` standard deviations away from the mean.
    ZScore {
        /// Cut-off in half units (`threshold = 6` means 3 standard deviations).
        threshold: u32,
    },
    /// Outlier detection disabled; every sample is kept.
    None,
}
27
28impl Default for OutlierMethod {
29    fn default() -> Self {
30        // Standard IQR with k=1.5
31        OutlierMethod::Iqr { k: 3 }
32    }
33}
34
35/// Result of outlier analysis
36#[derive(Debug, Clone)]
37pub struct OutlierAnalysis {
38    /// Original samples (ALL data preserved)
39    pub all_samples: Vec<f64>,
40    /// Samples with outliers removed (for mean/stddev computation)
41    pub cleaned_samples: Vec<f64>,
42    /// Indices of outlier samples
43    pub outlier_indices: Vec<usize>,
44    /// Number of low outliers (below lower bound)
45    pub low_outlier_count: usize,
46    /// Number of high outliers (above upper bound)
47    pub high_outlier_count: usize,
48    /// Lower bound used for detection
49    pub lower_bound: f64,
50    /// Upper bound used for detection
51    pub upper_bound: f64,
52    /// Detection method used
53    pub method: OutlierMethod,
54}
55
56impl OutlierAnalysis {
57    /// Percentage of samples that are outliers
58    pub fn outlier_percentage(&self) -> f64 {
59        if self.all_samples.is_empty() {
60            return 0.0;
61        }
62        (self.outlier_indices.len() as f64 / self.all_samples.len() as f64) * 100.0
63    }
64
65    /// Check if outlier percentage exceeds threshold (indicates noisy environment)
66    pub fn is_noisy(&self, threshold_pct: f64) -> bool {
67        self.outlier_percentage() > threshold_pct
68    }
69}
70
71/// Detect outliers in samples using specified method
72///
73/// # Examples
74///
75/// ```ignore
76/// # use fluxbench_stats::{detect_outliers, OutlierMethod};
77/// let samples = vec![1.0, 2.0, 3.0, 4.0, 5.0, 100.0];
78/// let analysis = detect_outliers(&samples, OutlierMethod::default());
79/// println!("Outliers found: {}", analysis.outlier_count);
80/// println!("Outlier percentage: {:.1}%", analysis.outlier_percentage());
81/// ```
82pub fn detect_outliers(samples: &[f64], method: OutlierMethod) -> OutlierAnalysis {
83    if samples.is_empty() {
84        return OutlierAnalysis {
85            all_samples: Vec::new(),
86            cleaned_samples: Vec::new(),
87            outlier_indices: Vec::new(),
88            low_outlier_count: 0,
89            high_outlier_count: 0,
90            lower_bound: 0.0,
91            upper_bound: 0.0,
92            method,
93        };
94    }
95
96    match method {
97        OutlierMethod::None => OutlierAnalysis {
98            all_samples: samples.to_vec(),
99            cleaned_samples: samples.to_vec(),
100            outlier_indices: Vec::new(),
101            low_outlier_count: 0,
102            high_outlier_count: 0,
103            lower_bound: f64::NEG_INFINITY,
104            upper_bound: f64::INFINITY,
105            method,
106        },
107        OutlierMethod::Iqr { k } => detect_iqr_outliers(samples, k as f64 * 0.5),
108        OutlierMethod::ZScore { threshold } => {
109            detect_zscore_outliers(samples, threshold as f64 * 0.5)
110        }
111    }
112}
113
114/// IQR-based outlier detection
115fn detect_iqr_outliers(samples: &[f64], k: f64) -> OutlierAnalysis {
116    let q1 = compute_percentile(samples, 25.0);
117    let q3 = compute_percentile(samples, 75.0);
118    let iqr = q3 - q1;
119
120    let lower_bound = q1 - k * iqr;
121    let upper_bound = q3 + k * iqr;
122
123    let mut outlier_indices = Vec::new();
124    let mut low_count = 0;
125    let mut high_count = 0;
126    let mut cleaned = Vec::with_capacity(samples.len());
127
128    for (i, &sample) in samples.iter().enumerate() {
129        if sample < lower_bound {
130            outlier_indices.push(i);
131            low_count += 1;
132        } else if sample > upper_bound {
133            outlier_indices.push(i);
134            high_count += 1;
135        } else {
136            cleaned.push(sample);
137        }
138    }
139
140    OutlierAnalysis {
141        all_samples: samples.to_vec(),
142        cleaned_samples: cleaned,
143        outlier_indices,
144        low_outlier_count: low_count,
145        high_outlier_count: high_count,
146        lower_bound,
147        upper_bound,
148        method: OutlierMethod::Iqr {
149            k: (k * 2.0) as u32,
150        },
151    }
152}
153
154/// Z-score based outlier detection
155fn detect_zscore_outliers(samples: &[f64], threshold: f64) -> OutlierAnalysis {
156    let n = samples.len() as f64;
157    let mean: f64 = samples.iter().sum::<f64>() / n;
158    let variance: f64 = samples.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / n;
159    let std_dev = variance.sqrt();
160
161    if std_dev == 0.0 {
162        // No variance, no outliers
163        return OutlierAnalysis {
164            all_samples: samples.to_vec(),
165            cleaned_samples: samples.to_vec(),
166            outlier_indices: Vec::new(),
167            low_outlier_count: 0,
168            high_outlier_count: 0,
169            lower_bound: mean,
170            upper_bound: mean,
171            method: OutlierMethod::ZScore {
172                threshold: (threshold * 2.0) as u32,
173            },
174        };
175    }
176
177    let lower_bound = mean - threshold * std_dev;
178    let upper_bound = mean + threshold * std_dev;
179
180    let mut outlier_indices = Vec::new();
181    let mut low_count = 0;
182    let mut high_count = 0;
183    let mut cleaned = Vec::with_capacity(samples.len());
184
185    for (i, &sample) in samples.iter().enumerate() {
186        let z_score = (sample - mean) / std_dev;
187        if z_score < -threshold {
188            outlier_indices.push(i);
189            low_count += 1;
190        } else if z_score > threshold {
191            outlier_indices.push(i);
192            high_count += 1;
193        } else {
194            cleaned.push(sample);
195        }
196    }
197
198    OutlierAnalysis {
199        all_samples: samples.to_vec(),
200        cleaned_samples: cleaned,
201        outlier_indices,
202        low_outlier_count: low_count,
203        high_outlier_count: high_count,
204        lower_bound,
205        upper_bound,
206        method: OutlierMethod::ZScore {
207            threshold: (threshold * 2.0) as u32,
208        },
209    }
210}
211
#[cfg(test)]
mod tests {
    use super::*;

    /// Five closely spaced samples followed by one extreme value (100.0).
    const WITH_SPIKE: [f64; 6] = [1.0, 2.0, 3.0, 4.0, 5.0, 100.0];

    /// Evenly spaced data should produce no outliers with the default method.
    #[test]
    fn test_no_outliers() {
        let analysis = detect_outliers(&[1.0, 2.0, 3.0, 4.0, 5.0], OutlierMethod::default());

        assert!(analysis.outlier_indices.is_empty());
        assert_eq!(analysis.cleaned_samples.len(), 5);
    }

    /// The extreme value is flagged as a high outlier and removed only from
    /// the cleaned samples.
    #[test]
    fn test_with_outliers() {
        let analysis = detect_outliers(&WITH_SPIKE, OutlierMethod::default());

        assert!(!analysis.outlier_indices.is_empty());
        assert_eq!(analysis.high_outlier_count, 1);
        assert_eq!(analysis.all_samples.len(), 6); // Original preserved
        assert_eq!(analysis.cleaned_samples.len(), 5); // Outlier removed
    }

    /// One outlier out of six samples is roughly 16.7%.
    #[test]
    fn test_outlier_percentage() {
        let pct = detect_outliers(&WITH_SPIKE, OutlierMethod::default()).outlier_percentage();

        assert!(pct > 15.0);
        assert!(pct < 20.0);
    }

    /// With detection disabled even an extreme value is kept.
    #[test]
    fn test_no_detection() {
        let analysis = detect_outliers(&[1.0, 2.0, 100.0], OutlierMethod::None);

        assert!(analysis.outlier_indices.is_empty());
        assert_eq!(analysis.cleaned_samples.len(), 3);
    }

    /// Empty input yields an empty analysis.
    #[test]
    fn test_empty_samples() {
        let analysis = detect_outliers(&[], OutlierMethod::default());

        assert!(analysis.outlier_indices.is_empty());
        assert!(analysis.all_samples.is_empty());
    }
}