use crate::error::Result;
use std::collections::HashMap;
/// Selector for one of the advanced outlier-detection strategies.
///
/// Variants that carry data hold the tuning parameters of the corresponding
/// detector. Nothing in this part of the file consumes the enum, so it is
/// presumably matched on by callers elsewhere (to be confirmed).
#[derive(Debug, Clone)]
pub enum AdvancedOutlierMethod {
    /// Local Outlier Factor using the `k` nearest neighbours of each point.
    LOF {
        /// Neighbourhood size.
        k: usize,
    },
    /// Isolation-style scoring with at most `max_depth` splits per point.
    IsolationScore {
        /// Upper bound on the number of splits.
        max_depth: usize,
    },
    /// Density-based detection: points without enough neighbours are outliers.
    DBSCANOutlier {
        /// Neighbourhood radius.
        eps: f64,
        /// Number of neighbours required for a dense point.
        min_pts: usize,
    },
    /// Mahalanobis-distance based detection (no parameters).
    Mahalanobis,
    /// Combination of several detectors (no parameters).
    Ensemble,
}
/// Description of a single data point reported by a detector.
#[derive(Debug, Clone)]
pub struct AdvancedOutlierInfo {
    /// Position of the point in the input slice.
    pub index: usize,
    /// The data value found at `index`.
    pub value: f64,
    /// Detector-specific score; its scale depends on the method that produced it.
    pub outlier_score: f64,
    /// Confidence attached to the report by the detector.
    pub confidence: f64,
    /// Named diagnostic values explaining the score (for example `"lof"` or
    /// `"path_length"`); the set of keys depends on the detector.
    pub method_scores: HashMap<String, f64>,
    /// Whether the point is classified as an outlier.
    pub is_outlier: bool,
}
/// Outcome of running one detector over a data set.
#[derive(Debug, Clone)]
pub struct AdvancedOutlierResult {
    /// Human-readable name of the method, including its main parameters.
    pub method_name: String,
    /// The points that were reported as outliers.
    pub outliers: Vec<AdvancedOutlierInfo>,
    /// Threshold value reported by the detector.
    pub threshold: f64,
    /// Number of reported outliers divided by the number of input values.
    pub detection_rate: f64,
    /// Parameters the detector ran with, keyed by parameter name.
    pub method_params: HashMap<String, f64>,
}
pub fn detect_outliers_lof(numbers: &[f64], k: usize) -> Result<AdvancedOutlierResult> {
if numbers.len() < k + 1 {
return Err(crate::error::BenfError::InsufficientData(numbers.len()));
}
let mut outliers = Vec::new();
let mut lof_scores = Vec::new();
for (i, &value) in numbers.iter().enumerate() {
let mut distances: Vec<f64> = numbers
.iter()
.enumerate()
.filter(|(j, _)| *j != i)
.map(|(_, &other)| (value - other).abs())
.collect();
distances.sort_by(|a, b| a.partial_cmp(b).unwrap());
if distances.len() >= k {
let k_distance = distances[k - 1];
let reachability_distances: Vec<f64> =
distances[..k].iter().map(|&d| d.max(k_distance)).collect();
let lrd = k as f64 / reachability_distances.iter().sum::<f64>();
let lof_score = if lrd > 0.0 {
let neighbor_lrds: f64 = distances[..k]
.iter()
.map(|_| lrd) .sum();
(neighbor_lrds / (k as f64)) / lrd
} else {
1.0
};
lof_scores.push(lof_score);
if lof_score > 1.5 {
outliers.push(AdvancedOutlierInfo {
index: i,
value,
outlier_score: lof_score,
confidence: ((lof_score - 1.0).min(2.0) / 2.0).clamp(0.0, 1.0),
method_scores: {
let mut scores = HashMap::new();
scores.insert("lof".to_string(), lof_score);
scores.insert("k_distance".to_string(), k_distance);
scores
},
is_outlier: true,
});
}
}
}
let detection_rate = outliers.len() as f64 / numbers.len() as f64;
Ok(AdvancedOutlierResult {
method_name: format!("LOF (k={k})"),
outliers,
threshold: 1.5,
detection_rate,
method_params: {
let mut params = HashMap::new();
params.insert("k".to_string(), k as f64);
params.insert("threshold".to_string(), 1.5);
params
},
})
}
pub fn detect_outliers_isolation(
numbers: &[f64],
max_depth: usize,
) -> Result<AdvancedOutlierResult> {
let mut outliers = Vec::new();
let avg_path_length = calculate_average_path_length(numbers.len());
for (i, &value) in numbers.iter().enumerate() {
let path_length = calculate_isolation_path_length(value, numbers, max_depth);
let anomaly_score = 2.0_f64.powf(-path_length / avg_path_length);
if anomaly_score > 0.6 {
outliers.push(AdvancedOutlierInfo {
index: i,
value,
outlier_score: anomaly_score,
confidence: ((anomaly_score - 0.5) * 2.0).clamp(0.0, 1.0),
method_scores: {
let mut scores = HashMap::new();
scores.insert("anomaly_score".to_string(), anomaly_score);
scores.insert("path_length".to_string(), path_length);
scores
},
is_outlier: true,
});
}
}
let detection_rate = outliers.len() as f64 / numbers.len() as f64;
Ok(AdvancedOutlierResult {
method_name: format!("Isolation Score (depth={max_depth})"),
outliers,
threshold: 0.6,
detection_rate,
method_params: {
let mut params = HashMap::new();
params.insert("max_depth".to_string(), max_depth as f64);
params.insert("threshold".to_string(), 0.6);
params
},
})
}
/// Detects outliers as the noise points of a DBSCAN-style density analysis.
///
/// Two values are neighbours when their absolute difference is at most `eps`;
/// a point is never its own neighbour. A point with at least `min_pts`
/// neighbours is a *core* point. A point is reported as an outlier only when
/// it is neither a core point nor a neighbour of one, so the result does not
/// depend on the order of `numbers`.
///
/// Each reported point carries `"density_score"` (`neighbours / min_pts`) and
/// `"neighbor_count"`; its outlier score and confidence are
/// `1 - density_score`. Outliers are listed in input order.
///
/// The `threshold` field of the result is `1 - min_pts / 10`. It is only
/// reported; this function does not use it to decide which points are
/// outliers.
///
/// An empty input yields an empty result with a detection rate of 0.
///
/// # Errors
///
/// This function currently always returns `Ok`.
pub fn detect_outliers_dbscan(
    numbers: &[f64],
    eps: f64,
    min_pts: usize,
) -> Result<AdvancedOutlierResult> {
    let within_eps = |a: f64, b: f64| (a - b).abs() <= eps;

    // Number of other points inside the eps-neighbourhood of every point.
    let neighbor_counts: Vec<usize> = numbers
        .iter()
        .enumerate()
        .map(|(i, &value)| {
            numbers
                .iter()
                .enumerate()
                .filter(|&(j, &other)| j != i && within_eps(value, other))
                .count()
        })
        .collect();

    // Values of all core points, used to recognise border points below.
    let core_values: Vec<f64> = numbers
        .iter()
        .zip(&neighbor_counts)
        .filter(|&(_, &count)| count >= min_pts)
        .map(|(&value, _)| value)
        .collect();

    let mut outliers = Vec::new();
    for (i, (&value, &neighbor_count)) in numbers.iter().zip(&neighbor_counts).enumerate() {
        let is_core = neighbor_count >= min_pts;
        // Border points (non-core but within eps of a core point) belong to
        // that core point's cluster and therefore are not noise.
        if is_core || core_values.iter().any(|&core| within_eps(value, core)) {
            continue;
        }
        // Reaching this point implies `neighbor_count < min_pts`, hence
        // `min_pts >= 1` and the division is well defined.
        let density_score = neighbor_count as f64 / min_pts as f64;
        outliers.push(AdvancedOutlierInfo {
            index: i,
            value,
            outlier_score: 1.0 - density_score,
            confidence: (1.0 - density_score).clamp(0.0, 1.0),
            method_scores: HashMap::from([
                ("density_score".to_string(), density_score),
                ("neighbor_count".to_string(), neighbor_count as f64),
            ]),
            is_outlier: true,
        });
    }

    // Avoid 0/0 = NaN for an empty input.
    let detection_rate = if numbers.is_empty() {
        0.0
    } else {
        outliers.len() as f64 / numbers.len() as f64
    };
    Ok(AdvancedOutlierResult {
        method_name: format!("DBSCAN Outlier (eps={eps:.2}, min_pts={min_pts})"),
        outliers,
        threshold: 1.0 - (min_pts as f64 / 10.0),
        detection_rate,
        method_params: HashMap::from([
            ("eps".to_string(), eps),
            ("min_pts".to_string(), min_pts as f64),
        ]),
    })
}
pub fn detect_outliers_ensemble(numbers: &[f64]) -> Result<AdvancedOutlierResult> {
let lof_result = detect_outliers_lof(numbers, 5)?;
let isolation_result = detect_outliers_isolation(numbers, 8)?;
let std_dev = {
let mean = numbers.iter().sum::<f64>() / numbers.len() as f64;
let variance =
numbers.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / numbers.len() as f64;
variance.sqrt()
};
let eps = std_dev * 0.5;
let min_pts = (numbers.len() as f64).sqrt() as usize;
let dbscan_result = detect_outliers_dbscan(numbers, eps, min_pts)?;
let mut ensemble_scores: HashMap<usize, (f64, f64, usize)> = HashMap::new();
for outlier in &lof_result.outliers {
let entry = ensemble_scores
.entry(outlier.index)
.or_insert((0.0, 0.0, 0));
entry.0 += outlier.outlier_score;
entry.1 += outlier.confidence;
entry.2 += 1;
}
for outlier in &isolation_result.outliers {
let entry = ensemble_scores
.entry(outlier.index)
.or_insert((0.0, 0.0, 0));
entry.0 += outlier.outlier_score;
entry.1 += outlier.confidence;
entry.2 += 1;
}
for outlier in &dbscan_result.outliers {
let entry = ensemble_scores
.entry(outlier.index)
.or_insert((0.0, 0.0, 0));
entry.0 += outlier.outlier_score;
entry.1 += outlier.confidence;
entry.2 += 1;
}
let mut outliers = Vec::new();
for (&index, &(total_score, total_confidence, method_count)) in &ensemble_scores {
let avg_score = total_score / method_count as f64;
let avg_confidence = total_confidence / method_count as f64;
let consensus_strength = method_count as f64 / 3.0;
if method_count >= 2 {
outliers.push(AdvancedOutlierInfo {
index,
value: numbers[index],
outlier_score: avg_score * consensus_strength,
confidence: avg_confidence * consensus_strength,
method_scores: {
let mut scores = HashMap::new();
scores.insert("ensemble_score".to_string(), avg_score);
scores.insert("consensus_strength".to_string(), consensus_strength);
scores.insert("method_count".to_string(), method_count as f64);
scores
},
is_outlier: true,
});
}
}
outliers.sort_by(|a, b| b.outlier_score.partial_cmp(&a.outlier_score).unwrap());
let detection_rate = outliers.len() as f64 / numbers.len() as f64;
Ok(AdvancedOutlierResult {
method_name: "Ensemble (LOF + Isolation + DBSCAN)".to_string(),
outliers,
threshold: 0.5,
detection_rate,
method_params: {
let mut params = HashMap::new();
params.insert("min_consensus".to_string(), 2.0);
params.insert("lof_k".to_string(), 5.0);
params.insert("isolation_depth".to_string(), 8.0);
params.insert("dbscan_eps".to_string(), eps);
params.insert("dbscan_min_pts".to_string(), min_pts as f64);
params
},
})
}
/// Returns the normalising constant `c(n)` used by isolation-based scoring:
/// the average path length of an unsuccessful search in a binary search tree
/// built from `n` values.
///
/// * `n <= 1` gives `0.0` (nothing to isolate).
/// * `n == 2` gives `1.0` (exactly one split separates two values).
/// * otherwise `c(n) = 2 * H(n - 1) - 2 * (n - 1) / n`, with the harmonic
///   number approximated as `H(i) ~= ln(i) + gamma`, where `gamma` is the
///   Euler-Mascheroni constant.
///
/// The result is non-negative for every `n`, so it is safe to use as a
/// divisor whenever it is non-zero.
fn calculate_average_path_length(n: usize) -> f64 {
    /// Euler-Mascheroni constant; the standard library has no stable constant for it.
    const EULER_MASCHERONI: f64 = 0.577_215_664_901_532_9;

    match n {
        0 | 1 => 0.0,
        // The logarithmic approximation is poor for tiny n; use the exact value.
        2 => 1.0,
        _ => {
            let n = n as f64;
            2.0 * ((n - 1.0).ln() + EULER_MASCHERONI) - 2.0 * (n - 1.0) / n
        }
    }
}
/// Counts how many midpoint splits are needed to isolate `value` within
/// `numbers`, performing at most `max_depth` splits.
///
/// Starting from a copy of all values, each step splits the current partition
/// at the midpoint of its minimum and maximum and keeps only the side on
/// which `value` falls (values equal to the midpoint go to the upper side).
/// Splitting stops when the partition has at most one element, when all
/// remaining elements are equal, or when the depth limit is reached.
///
/// Returns the number of splits performed, as an `f64`.
fn calculate_isolation_path_length(value: f64, numbers: &[f64], max_depth: usize) -> f64 {
    let mut partition: Vec<f64> = numbers.to_vec();
    let mut splits_done = 0.0;
    let mut splits_left = max_depth;

    while splits_left > 0 && partition.len() > 1 {
        splits_left -= 1;

        // Smallest and largest value of the current partition in one pass.
        let (lowest, highest) = partition.iter().fold(
            (f64::INFINITY, f64::NEG_INFINITY),
            |(lo, hi), &x| (lo.min(x), hi.max(x)),
        );
        if lowest == highest {
            // Only duplicates remain; they cannot be separated any further.
            break;
        }

        let midpoint = (lowest + highest) / 2.0;
        if value < midpoint {
            partition.retain(|&x| x < midpoint);
        } else {
            partition.retain(|&x| x >= midpoint);
        }
        splits_done += 1.0;
    }

    splits_done
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Five evenly spaced values followed by one value far away from them.
    fn sample_with_extreme_value() -> Vec<f64> {
        vec![1.0, 2.0, 3.0, 4.0, 5.0, 100.0]
    }

    /// LOF reports its name with the chosen `k` and a consistent detection rate.
    #[test]
    fn test_lof_outlier_detection() {
        let samples = sample_with_extreme_value();
        let report = detect_outliers_lof(&samples, 3).unwrap();

        assert_eq!(report.method_name, "LOF (k=3)");
        assert!(report.detection_rate >= 0.0);
        // Any reported outlier must be reflected in the detection rate.
        if !report.outliers.is_empty() {
            assert!(report.detection_rate > 0.0);
        }
    }

    /// The isolation detector reports its name with the depth limit.
    #[test]
    fn test_isolation_outlier_detection() {
        let samples = sample_with_extreme_value();
        let report = detect_outliers_isolation(&samples, 8).unwrap();

        assert!(report.detection_rate >= 0.0);
        assert_eq!(report.method_name, "Isolation Score (depth=8)");
    }

    /// The DBSCAN detector identifies itself in the method name.
    #[test]
    fn test_dbscan_outlier_detection() {
        let samples = sample_with_extreme_value();
        let report = detect_outliers_dbscan(&samples, 2.0, 2).unwrap();

        assert!(report.detection_rate >= 0.0);
        assert!(report.method_name.contains("DBSCAN"));
    }

    /// The ensemble runs all three detectors and reports a fixed name.
    #[test]
    fn test_ensemble_outlier_detection() {
        let samples = sample_with_extreme_value();
        let report = detect_outliers_ensemble(&samples).unwrap();

        assert_eq!(report.method_name, "Ensemble (LOF + Isolation + DBSCAN)");
        assert!(report.detection_rate >= 0.0);
    }

    /// LOF needs more values than neighbours; two values with k = 5 is an error.
    #[test]
    fn test_insufficient_data_error() {
        let samples = vec![1.0, 2.0];
        let outcome = detect_outliers_lof(&samples, 5);

        assert!(outcome.is_err());
    }

    /// Tightly clustered data must not be flagged wholesale.
    #[test]
    fn test_normal_data_low_detection_rate() {
        let samples = vec![1.0, 1.1, 0.9, 1.05, 0.95, 1.02, 0.98];
        let report = detect_outliers_ensemble(&samples).unwrap();

        assert!(report.detection_rate < 0.5);
    }
}