#[derive(Debug, Clone, Copy)]
pub struct OutlierStats {
pub total_samples: usize,
pub retained_samples: usize,
pub outliers_removed: usize,
pub outlier_fraction: f64,
pub threshold: u64,
pub total_fixed: usize,
pub trimmed_fixed: usize,
pub total_random: usize,
pub trimmed_random: usize,
}
impl OutlierStats {
pub fn no_filtering(total_samples: usize) -> Self {
Self {
total_samples,
retained_samples: total_samples,
outliers_removed: 0,
outlier_fraction: 0.0,
threshold: u64::MAX,
total_fixed: total_samples / 2,
trimmed_fixed: 0,
total_random: total_samples / 2,
trimmed_random: 0,
}
}
pub fn rate_fixed(&self) -> f64 {
if self.total_fixed == 0 {
0.0
} else {
self.trimmed_fixed as f64 / self.total_fixed as f64
}
}
pub fn rate_random(&self) -> f64 {
if self.total_random == 0 {
0.0
} else {
self.trimmed_random as f64 / self.total_random as f64
}
}
}
pub fn filter_outliers(
fixed: &[u64],
random: &[u64],
percentile: f64,
) -> (Vec<u64>, Vec<u64>, OutlierStats) {
let total_samples = fixed.len() + random.len();
if percentile >= 1.0 {
return (
fixed.to_vec(),
random.to_vec(),
OutlierStats {
total_samples,
retained_samples: total_samples,
outliers_removed: 0,
outlier_fraction: 0.0,
threshold: u64::MAX,
total_fixed: fixed.len(),
trimmed_fixed: 0,
total_random: random.len(),
trimmed_random: 0,
},
);
}
let mut pooled: Vec<u64> = Vec::with_capacity(total_samples);
pooled.extend_from_slice(fixed);
pooled.extend_from_slice(random);
let threshold = compute_percentile(&mut pooled, percentile);
let mut filtered_fixed: Vec<u64> = Vec::with_capacity(fixed.len());
let mut filtered_random: Vec<u64> = Vec::with_capacity(random.len());
let mut trimmed_fixed = 0;
let mut trimmed_random = 0;
for &x in fixed {
if x > threshold {
trimmed_fixed += 1;
filtered_fixed.push(threshold);
} else {
filtered_fixed.push(x);
}
}
for &x in random {
if x > threshold {
trimmed_random += 1;
filtered_random.push(threshold);
} else {
filtered_random.push(x);
}
}
let retained = filtered_fixed.len() + filtered_random.len();
let removed = trimmed_fixed + trimmed_random;
let stats = OutlierStats {
total_samples,
retained_samples: retained,
outliers_removed: removed,
outlier_fraction: removed as f64 / total_samples as f64,
threshold,
total_fixed: fixed.len(),
trimmed_fixed,
total_random: random.len(),
trimmed_random,
};
(filtered_fixed, filtered_random, stats)
}
fn compute_percentile(data: &mut [u64], percentile: f64) -> u64 {
if data.is_empty() {
return 0;
}
if data.len() == 1 {
return data[0];
}
data.sort_unstable();
let p = percentile.clamp(0.0, 1.0);
let n = data.len();
let idx = p * (n - 1) as f64;
let lower = idx.floor() as usize;
let upper = idx.ceil() as usize;
if lower == upper {
data[lower]
} else {
let frac = idx - lower as f64;
let lower_val = data[lower] as f64;
let upper_val = data[upper] as f64;
(lower_val + frac * (upper_val - lower_val)).round() as u64
}
}
pub fn winsorize_f64(baseline: &mut [f64], sample: &mut [f64], percentile: f64) -> usize {
if percentile >= 1.0 {
return 0;
}
let total_len = baseline.len() + sample.len();
if total_len == 0 {
return 0;
}
let mut pooled: Vec<f64> = Vec::with_capacity(total_len);
pooled.extend_from_slice(baseline);
pooled.extend_from_slice(sample);
pooled.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let p = percentile.clamp(0.0, 1.0);
let idx = (p * (total_len - 1) as f64).round() as usize;
let threshold = pooled[idx.min(total_len - 1)];
let mut capped = 0;
for x in baseline.iter_mut() {
if *x > threshold {
*x = threshold;
capped += 1;
}
}
for x in sample.iter_mut() {
if *x > threshold {
*x = threshold;
capped += 1;
}
}
capped
}
#[cfg(test)]
pub fn compute_outlier_threshold(fixed: &[u64], random: &[u64], percentile: f64) -> u64 {
if percentile >= 1.0 {
return u64::MAX;
}
let mut pooled: Vec<u64> = Vec::with_capacity(fixed.len() + random.len());
pooled.extend_from_slice(fixed);
pooled.extend_from_slice(random);
compute_percentile(&mut pooled, percentile)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_compute_percentile() {
let data = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
assert_eq!(compute_percentile(&mut data.clone(), 0.0), 1);
let median = compute_percentile(&mut data.clone(), 0.5);
assert!(
median == 5 || median == 6,
"Median should be 5 or 6, got {}",
median
);
assert_eq!(compute_percentile(&mut data.clone(), 1.0), 10);
}
#[test]
fn test_filter_outliers_basic() {
let fixed = vec![100, 110, 105, 1000, 108]; let random = vec![102, 107, 103, 109, 2000];
let (f, r, stats) = filter_outliers(&fixed, &random, 0.8);
assert!(stats.outliers_removed > 0);
assert_eq!(f.len(), fixed.len());
assert_eq!(r.len(), random.len());
assert_eq!(f.len() + r.len(), stats.retained_samples);
}
#[test]
fn test_filter_outliers_no_filtering() {
let fixed = vec![100, 110, 105];
let random = vec![102, 107, 103];
let (f, r, stats) = filter_outliers(&fixed, &random, 1.0);
assert_eq!(f.len(), 3);
assert_eq!(r.len(), 3);
assert_eq!(stats.outliers_removed, 0);
assert_eq!(stats.outlier_fraction, 0.0);
}
#[test]
fn test_symmetric_filtering() {
let fixed = vec![100, 200, 300];
let random = vec![150, 250, 350];
let threshold = compute_outlier_threshold(&fixed, &random, 0.5);
assert!((200..=250).contains(&threshold));
}
}