use crate::TimeSeries;
use scirs2_core::random::{thread_rng, Distribution, Uniform};
use torsh_core::error::Result;
use torsh_tensor::Tensor;
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum OutlierMethod {
IQR,
ZScore,
ModifiedZScore,
IsolationForest,
}
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum OutlierTreatment {
Remove,
Mean,
Median,
Clip,
Interpolate,
}
#[derive(Debug, Clone)]
pub struct OutlierDetectionResult {
pub outlier_indices: Vec<usize>,
pub outlier_values: Vec<f32>,
pub scores: Vec<f64>,
pub method: OutlierMethod,
}
pub struct OutlierDetector {
method: OutlierMethod,
threshold: f64,
}
impl OutlierDetector {
pub fn new(method: OutlierMethod, threshold: f64) -> Self {
Self { method, threshold }
}
pub fn detect(&self, series: &TimeSeries) -> Result<OutlierDetectionResult> {
match self.method {
OutlierMethod::IQR => self.detect_iqr(series),
OutlierMethod::ZScore => self.detect_zscore(series),
OutlierMethod::ModifiedZScore => self.detect_modified_zscore(series),
OutlierMethod::IsolationForest => self.detect_isolation_forest(series),
}
}
fn detect_iqr(&self, series: &TimeSeries) -> Result<OutlierDetectionResult> {
let data = series.values.to_vec()?;
let n = data.len();
if n < 4 {
return Ok(OutlierDetectionResult {
outlier_indices: vec![],
outlier_values: vec![],
scores: vec![],
method: OutlierMethod::IQR,
});
}
let mut sorted_data = data.clone();
sorted_data.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let q1_idx = n / 4;
let q3_idx = (3 * n) / 4;
let q1 = sorted_data[q1_idx] as f64;
let q3 = sorted_data[q3_idx] as f64;
let iqr = q3 - q1;
let lower_bound = q1 - self.threshold * iqr;
let upper_bound = q3 + self.threshold * iqr;
let mut outlier_indices = Vec::new();
let mut outlier_values = Vec::new();
let mut scores = Vec::new();
for (i, &value) in data.iter().enumerate() {
let v = value as f64;
if v < lower_bound || v > upper_bound {
outlier_indices.push(i);
outlier_values.push(value);
let score = if v < lower_bound {
(lower_bound - v) / iqr
} else {
(v - upper_bound) / iqr
};
scores.push(score);
}
}
Ok(OutlierDetectionResult {
outlier_indices,
outlier_values,
scores,
method: OutlierMethod::IQR,
})
}
fn detect_zscore(&self, series: &TimeSeries) -> Result<OutlierDetectionResult> {
let data = series.values.to_vec()?;
let n = data.len();
if n == 0 {
return Ok(OutlierDetectionResult {
outlier_indices: vec![],
outlier_values: vec![],
scores: vec![],
method: OutlierMethod::ZScore,
});
}
let mean = data.iter().map(|&x| x as f64).sum::<f64>() / n as f64;
let variance = data
.iter()
.map(|&x| {
let diff = (x as f64) - mean;
diff * diff
})
.sum::<f64>()
/ n as f64;
let std = variance.sqrt();
if std < 1e-10 {
return Ok(OutlierDetectionResult {
outlier_indices: vec![],
outlier_values: vec![],
scores: vec![],
method: OutlierMethod::ZScore,
});
}
let mut outlier_indices = Vec::new();
let mut outlier_values = Vec::new();
let mut scores = Vec::new();
for (i, &value) in data.iter().enumerate() {
let z_score = ((value as f64) - mean) / std;
if z_score.abs() > self.threshold {
outlier_indices.push(i);
outlier_values.push(value);
scores.push(z_score.abs());
}
}
Ok(OutlierDetectionResult {
outlier_indices,
outlier_values,
scores,
method: OutlierMethod::ZScore,
})
}
fn detect_modified_zscore(&self, series: &TimeSeries) -> Result<OutlierDetectionResult> {
let data = series.values.to_vec()?;
let n = data.len();
if n == 0 {
return Ok(OutlierDetectionResult {
outlier_indices: vec![],
outlier_values: vec![],
scores: vec![],
method: OutlierMethod::ModifiedZScore,
});
}
let mut sorted_data = data.clone();
sorted_data.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let median = if n % 2 == 0 {
((sorted_data[n / 2 - 1] + sorted_data[n / 2]) / 2.0) as f64
} else {
sorted_data[n / 2] as f64
};
let mut abs_deviations: Vec<f64> =
data.iter().map(|&x| ((x as f64) - median).abs()).collect();
abs_deviations.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let mad = if n % 2 == 0 {
(abs_deviations[n / 2 - 1] + abs_deviations[n / 2]) / 2.0
} else {
abs_deviations[n / 2]
};
if mad < 1e-10 {
return Ok(OutlierDetectionResult {
outlier_indices: vec![],
outlier_values: vec![],
scores: vec![],
method: OutlierMethod::ModifiedZScore,
});
}
const SCALE_FACTOR: f64 = 0.6745;
let mut outlier_indices = Vec::new();
let mut outlier_values = Vec::new();
let mut scores = Vec::new();
for (i, &value) in data.iter().enumerate() {
let modified_z = (SCALE_FACTOR * ((value as f64) - median) / mad).abs();
if modified_z > self.threshold {
outlier_indices.push(i);
outlier_values.push(value);
scores.push(modified_z);
}
}
Ok(OutlierDetectionResult {
outlier_indices,
outlier_values,
scores,
method: OutlierMethod::ModifiedZScore,
})
}
fn detect_isolation_forest(&self, series: &TimeSeries) -> Result<OutlierDetectionResult> {
let data = series.values.to_vec()?;
let n = data.len();
if n < 10 {
return Ok(OutlierDetectionResult {
outlier_indices: vec![],
outlier_values: vec![],
scores: vec![],
method: OutlierMethod::IsolationForest,
});
}
let num_trees = 100;
let subsample_size = std::cmp::min(256, n);
let max_depth = (subsample_size as f64).log2().ceil() as usize;
let mut scores = vec![0.0; n];
let mut rng = thread_rng();
for _ in 0..num_trees {
let dist = Uniform::new(0, n).expect("distribution should succeed");
let subsample_indices: Vec<usize> =
(0..subsample_size).map(|_| dist.sample(&mut rng)).collect();
for (i, &value) in data.iter().enumerate() {
let path_length =
self.compute_path_length(value as f64, &subsample_indices, &data, 0, max_depth);
scores[i] += path_length;
}
}
for score in &mut scores {
*score /= num_trees as f64;
}
let max_score = scores.iter().copied().fold(f64::NEG_INFINITY, f64::max);
let min_score = scores.iter().copied().fold(f64::INFINITY, f64::min);
let score_range = max_score - min_score;
if score_range > 1e-10 {
for score in &mut scores {
*score = 1.0 - (*score - min_score) / score_range;
}
}
let mut sorted_scores = scores.clone();
sorted_scores.sort_by(|a, b| b.partial_cmp(a).unwrap_or(std::cmp::Ordering::Equal));
let threshold_idx = (self.threshold * n as f64) as usize;
let score_threshold = if threshold_idx < sorted_scores.len() {
sorted_scores[threshold_idx]
} else {
1.0
};
let mut outlier_indices = Vec::new();
let mut outlier_values = Vec::new();
let mut outlier_scores = Vec::new();
for (i, (&value, &score)) in data.iter().zip(scores.iter()).enumerate() {
if score >= score_threshold {
outlier_indices.push(i);
outlier_values.push(value);
outlier_scores.push(score);
}
}
Ok(OutlierDetectionResult {
outlier_indices,
outlier_values,
scores: outlier_scores,
method: OutlierMethod::IsolationForest,
})
}
fn compute_path_length(
&self,
value: f64,
subsample_indices: &[usize],
data: &[f32],
depth: usize,
max_depth: usize,
) -> f64 {
if depth >= max_depth || subsample_indices.len() <= 1 {
return depth as f64 + self.average_path_length(subsample_indices.len());
}
let mut min_val = f64::INFINITY;
let mut max_val = f64::NEG_INFINITY;
for &idx in subsample_indices {
let val = data[idx] as f64;
min_val = min_val.min(val);
max_val = max_val.max(val);
}
if (max_val - min_val).abs() < 1e-10 {
return depth as f64 + self.average_path_length(subsample_indices.len());
}
let mut rng = thread_rng();
let dist = Uniform::new(min_val, max_val).expect("distribution should succeed");
let split_point = dist.sample(&mut rng);
if value < split_point {
depth as f64 + 1.0
} else {
depth as f64 + 1.0
}
}
fn average_path_length(&self, n: usize) -> f64 {
if n <= 1 {
return 0.0;
}
let n_f64 = n as f64;
2.0 * ((n_f64 - 1.0).ln() + 0.5772156649) - 2.0 * (n_f64 - 1.0) / n_f64
}
pub fn treat(
&self,
series: &TimeSeries,
detection: &OutlierDetectionResult,
treatment: OutlierTreatment,
) -> Result<TimeSeries> {
let mut data = series.values.to_vec()?;
let n = data.len();
if detection.outlier_indices.is_empty() {
return Ok(TimeSeries::new(series.values.clone()));
}
match treatment {
OutlierTreatment::Remove => {
let mut filtered_data = Vec::new();
for (i, &value) in data.iter().enumerate() {
if !detection.outlier_indices.contains(&i) {
filtered_data.push(value);
}
}
let tensor = Tensor::from_vec(filtered_data.clone(), &[filtered_data.len()])?;
Ok(TimeSeries::new(tensor))
}
OutlierTreatment::Mean => {
let mean = data.iter().map(|&x| x as f64).sum::<f64>() / n as f64;
for &idx in &detection.outlier_indices {
data[idx] = mean as f32;
}
let tensor = Tensor::from_vec(data.clone(), &[n])?;
Ok(TimeSeries::new(tensor))
}
OutlierTreatment::Median => {
let mut sorted = data.clone();
sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let median = if n % 2 == 0 {
(sorted[n / 2 - 1] + sorted[n / 2]) / 2.0
} else {
sorted[n / 2]
};
for &idx in &detection.outlier_indices {
data[idx] = median;
}
let tensor = Tensor::from_vec(data.clone(), &[n])?;
Ok(TimeSeries::new(tensor))
}
OutlierTreatment::Clip => {
let mut sorted = data.clone();
sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let non_outlier_values: Vec<f32> = data
.iter()
.enumerate()
.filter(|(i, _)| !detection.outlier_indices.contains(i))
.map(|(_, &v)| v)
.collect();
if non_outlier_values.is_empty() {
return Ok(TimeSeries::new(series.values.clone()));
}
let min_non_outlier = non_outlier_values
.iter()
.copied()
.fold(f32::INFINITY, f32::min);
let max_non_outlier = non_outlier_values
.iter()
.copied()
.fold(f32::NEG_INFINITY, f32::max);
for &idx in &detection.outlier_indices {
data[idx] = data[idx].clamp(min_non_outlier, max_non_outlier);
}
let tensor = Tensor::from_vec(data.clone(), &[n])?;
Ok(TimeSeries::new(tensor))
}
OutlierTreatment::Interpolate => {
for &idx in &detection.outlier_indices {
let mut left_idx = None;
let mut right_idx = None;
for i in (0..idx).rev() {
if !detection.outlier_indices.contains(&i) {
left_idx = Some(i);
break;
}
}
for i in (idx + 1)..n {
if !detection.outlier_indices.contains(&i) {
right_idx = Some(i);
break;
}
}
let value = match (left_idx, right_idx) {
(Some(left), Some(right)) => {
let t = (idx - left) as f32 / (right - left) as f32;
data[left] * (1.0 - t) + data[right] * t
}
(Some(left), None) => data[left], (None, Some(right)) => data[right], (None, None) => {
data[idx]
}
};
data[idx] = value;
}
let tensor = Tensor::from_vec(data.clone(), &[n])?;
Ok(TimeSeries::new(tensor))
}
}
}
}
pub fn detect_and_treat_outliers(
series: &TimeSeries,
method: OutlierMethod,
threshold: f64,
treatment: OutlierTreatment,
) -> Result<(TimeSeries, OutlierDetectionResult)> {
let detector = OutlierDetector::new(method, threshold);
let detection = detector.detect(series)?;
let treated = detector.treat(series, &detection, treatment)?;
Ok((treated, detection))
}
#[cfg(test)]
mod tests {
use super::*;
fn create_series_with_outliers() -> TimeSeries {
let data = vec![
10.0f32, 11.0, 9.0, 10.5, 100.0, 9.5, 10.2, 10.1, -100.0, 10.3, 9.8, 10.0,
];
let tensor = Tensor::from_vec(data, &[12]).expect("Tensor should succeed");
TimeSeries::new(tensor)
}
#[test]
fn test_iqr_detection() {
let series = create_series_with_outliers();
let detector = OutlierDetector::new(OutlierMethod::IQR, 1.5);
let result = detector
.detect(&series)
.expect("detection operation should succeed");
assert_eq!(result.method, OutlierMethod::IQR);
assert!(result.outlier_indices.len() >= 2); assert!(result.outlier_indices.contains(&4)); assert!(result.outlier_indices.contains(&8)); }
#[test]
fn test_zscore_detection() {
let series = create_series_with_outliers();
let detector = OutlierDetector::new(OutlierMethod::ZScore, 2.0);
let result = detector
.detect(&series)
.expect("detection operation should succeed");
assert_eq!(result.method, OutlierMethod::ZScore);
assert!(
!result.outlier_indices.is_empty(),
"Expected to detect at least 1 outlier"
);
assert!(result.outlier_indices.len() >= 1);
}
#[test]
fn test_modified_zscore_detection() {
let series = create_series_with_outliers();
let detector = OutlierDetector::new(OutlierMethod::ModifiedZScore, 3.5);
let result = detector
.detect(&series)
.expect("detection operation should succeed");
assert_eq!(result.method, OutlierMethod::ModifiedZScore);
assert!(result.outlier_indices.len() >= 2);
}
#[test]
fn test_isolation_forest_detection() {
let series = create_series_with_outliers();
let detector = OutlierDetector::new(OutlierMethod::IsolationForest, 0.2);
let result = detector
.detect(&series)
.expect("detection operation should succeed");
assert_eq!(result.method, OutlierMethod::IsolationForest);
assert!(!result.outlier_indices.is_empty());
}
#[test]
fn test_outlier_treatment_mean() {
let series = create_series_with_outliers();
let detector = OutlierDetector::new(OutlierMethod::ZScore, 3.0);
let detection = detector
.detect(&series)
.expect("detection operation should succeed");
let treated = detector
.treat(&series, &detection, OutlierTreatment::Mean)
.expect("operation should succeed");
let treated_data = treated
.values
.to_vec()
.expect("tensor to_vec conversion should succeed");
assert_eq!(treated_data.len(), series.len());
}
#[test]
fn test_outlier_treatment_remove() {
let series = create_series_with_outliers();
let detector = OutlierDetector::new(OutlierMethod::IQR, 1.5);
let detection = detector
.detect(&series)
.expect("detection operation should succeed");
let treated = detector
.treat(&series, &detection, OutlierTreatment::Remove)
.expect("operation should succeed");
assert!(treated.len() < series.len());
}
#[test]
fn test_outlier_treatment_interpolate() {
let series = create_series_with_outliers();
let detector = OutlierDetector::new(OutlierMethod::ZScore, 3.0);
let detection = detector
.detect(&series)
.expect("detection operation should succeed");
let treated = detector
.treat(&series, &detection, OutlierTreatment::Interpolate)
.expect("operation should succeed");
assert_eq!(treated.len(), series.len());
let original_data = series
.values
.to_vec()
.expect("tensor to_vec conversion should succeed");
let treated_data = treated
.values
.to_vec()
.expect("tensor to_vec conversion should succeed");
for &idx in &detection.outlier_indices {
assert_ne!(original_data[idx], treated_data[idx]);
}
}
#[test]
fn test_detect_and_treat_convenience() {
let series = create_series_with_outliers();
let (treated, detection) =
detect_and_treat_outliers(&series, OutlierMethod::IQR, 1.5, OutlierTreatment::Median)
.expect("operation should succeed");
assert!(!detection.outlier_indices.is_empty());
assert_eq!(treated.len(), series.len());
}
#[test]
fn test_no_outliers() {
let data = vec![10.0f32, 10.1, 9.9, 10.2, 9.8, 10.0];
let tensor = Tensor::from_vec(data, &[6]).expect("Tensor should succeed");
let series = TimeSeries::new(tensor);
let detector = OutlierDetector::new(OutlierMethod::ZScore, 3.0);
let result = detector
.detect(&series)
.expect("detection operation should succeed");
assert!(result.outlier_indices.is_empty());
}
#[test]
fn test_constant_series() {
let data = vec![10.0f32; 10];
let tensor = Tensor::from_vec(data, &[10]).expect("Tensor should succeed");
let series = TimeSeries::new(tensor);
let detector = OutlierDetector::new(OutlierMethod::ZScore, 3.0);
let result = detector
.detect(&series)
.expect("detection operation should succeed");
assert!(result.outlier_indices.is_empty());
}
}