use crate::error::InsightError;
#[derive(Debug, Clone)]
pub struct LofConfig {
pub k: usize,
pub threshold: f64,
}
impl Default for LofConfig {
fn default() -> Self {
Self {
k: 20,
threshold: 1.5,
}
}
}
impl LofConfig {
pub fn k(mut self, k: usize) -> Self {
self.k = k;
self
}
pub fn threshold(mut self, t: f64) -> Self {
self.threshold = t;
self
}
}
#[derive(Debug, Clone)]
pub struct LofResult {
pub scores: Vec<f64>,
pub anomalies: Vec<bool>,
pub threshold: f64,
pub anomaly_count: usize,
pub anomaly_fraction: f64,
}
pub fn lof(data: &[Vec<f64>], config: &LofConfig) -> Result<LofResult, InsightError> {
let n = data.len();
if n == 0 {
return Err(InsightError::InsufficientData {
min_required: 2,
actual: 0,
});
}
let dim = data[0].len();
if dim == 0 {
return Err(InsightError::DimensionMismatch {
expected: 1,
actual: 0,
});
}
for (row_idx, point) in data.iter().enumerate() {
if point.len() != dim {
return Err(InsightError::DimensionMismatch {
expected: dim,
actual: point.len(),
});
}
if let Some(col_idx) = point.iter().position(|v| !v.is_finite()) {
return Err(InsightError::DegenerateData {
reason: format!(
"non-finite value at row {row_idx}, column {col_idx}"
),
});
}
}
let k = config.k.min(n - 1).max(1);
let mut k_distances = vec![0.0_f64; n];
let mut k_neighbors: Vec<Vec<usize>> = vec![Vec::new(); n];
for i in 0..n {
let mut dists: Vec<(usize, f64)> = (0..n)
.filter(|&j| j != i)
.map(|j| (j, euclidean_distance(&data[i], &data[j])))
.collect();
dists.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
let k_dist = dists[k - 1].1;
k_distances[i] = k_dist;
k_neighbors[i] = dists
.iter()
.take_while(|(_, d)| *d <= k_dist + 1e-15)
.map(|(j, _)| *j)
.collect();
if k_neighbors[i].len() < k {
k_neighbors[i] = dists[..k].iter().map(|(j, _)| *j).collect();
}
}
let mut lrd = vec![0.0_f64; n];
for i in 0..n {
let neighbors = &k_neighbors[i];
let nk = neighbors.len() as f64;
let sum_reach_dist: f64 = neighbors
.iter()
.map(|&j| {
let d = euclidean_distance(&data[i], &data[j]);
k_distances[j].max(d) })
.sum();
lrd[i] = if sum_reach_dist > 0.0 {
nk / sum_reach_dist
} else {
f64::MAX };
}
let mut scores = vec![0.0_f64; n];
for i in 0..n {
let neighbors = &k_neighbors[i];
let nk = neighbors.len() as f64;
let avg_neighbor_lrd: f64 = neighbors.iter().map(|&j| lrd[j]).sum::<f64>() / nk;
scores[i] = if lrd[i] > 0.0 && lrd[i] < f64::MAX {
avg_neighbor_lrd / lrd[i]
} else {
1.0 };
}
let threshold = config.threshold;
let anomalies: Vec<bool> = scores.iter().map(|&s| s > threshold).collect();
let anomaly_count = anomalies.iter().filter(|&&a| a).count();
let anomaly_fraction = anomaly_count as f64 / n as f64;
Ok(LofResult {
scores,
anomalies,
threshold,
anomaly_count,
anomaly_fraction,
})
}
fn euclidean_distance(a: &[f64], b: &[f64]) -> f64 {
a.iter()
.zip(b.iter())
.map(|(x, y)| (x - y).powi(2))
.sum::<f64>()
.sqrt()
}
#[cfg(test)]
mod tests {
use super::*;
fn cluster_with_outlier() -> Vec<Vec<f64>> {
let mut data = Vec::new();
for i in 0..20 {
data.push(vec![i as f64 * 0.1, i as f64 * 0.05]);
}
data.push(vec![100.0, 100.0]);
data
}
#[test]
fn lof_detects_single_outlier() {
let data = cluster_with_outlier();
let config = LofConfig::default().k(5);
let result = lof(&data, &config).expect("should compute");
let max_idx = result
.scores
.iter()
.enumerate()
.max_by(|a, b| a.1.partial_cmp(b.1).unwrap())
.unwrap()
.0;
assert_eq!(max_idx, 20, "outlier at index 20 should have max LOF");
assert!(result.scores[20] > 2.0, "outlier LOF should be high");
}
#[test]
fn lof_inliers_near_one() {
let data = cluster_with_outlier();
let config = LofConfig::default().k(5);
let result = lof(&data, &config).expect("should compute");
for i in 3..17 {
assert!(
result.scores[i] < 1.5,
"inlier {} has LOF = {}, expected < 1.5",
i,
result.scores[i]
);
}
}
#[test]
fn lof_anomaly_classification() {
let data = cluster_with_outlier();
let config = LofConfig::default().k(5).threshold(2.0);
let result = lof(&data, &config).expect("should compute");
assert!(
result.anomalies[20],
"outlier should be classified as anomaly"
);
assert!(result.anomaly_count >= 1, "at least one anomaly");
assert!(result.anomaly_fraction > 0.0);
}
#[test]
fn lof_two_clusters() {
let mut data = Vec::new();
for i in 0..15 {
data.push(vec![i as f64 * 0.1, 0.0]);
}
for i in 0..15 {
data.push(vec![10.0 + i as f64 * 0.1, 10.0]);
}
data.push(vec![5.0, 5.0]);
let config = LofConfig::default().k(5);
let result = lof(&data, &config).expect("should compute");
let max_idx = result
.scores
.iter()
.enumerate()
.max_by(|a, b| a.1.partial_cmp(b.1).unwrap())
.unwrap()
.0;
assert_eq!(max_idx, 30, "between-cluster point should have max LOF");
}
#[test]
fn lof_uniform_density() {
let data: Vec<Vec<f64>> = (0..20).map(|i| vec![i as f64]).collect();
let config = LofConfig::default().k(3);
let result = lof(&data, &config).expect("should compute");
for i in 3..17 {
assert!(
(result.scores[i] - 1.0).abs() < 0.5,
"uniform point {} has LOF = {}, expected ~1.0",
i,
result.scores[i]
);
}
}
#[test]
fn lof_k_clamped() {
let data = vec![vec![0.0], vec![1.0], vec![2.0]];
let config = LofConfig::default().k(20);
let result = lof(&data, &config).expect("should compute");
assert_eq!(result.scores.len(), 3);
}
#[test]
fn lof_error_empty() {
let data: Vec<Vec<f64>> = vec![];
assert!(lof(&data, &LofConfig::default()).is_err());
}
#[test]
fn lof_error_dimension_mismatch() {
let data = vec![vec![1.0, 2.0], vec![3.0]]; assert!(lof(&data, &LofConfig::default()).is_err());
}
#[test]
fn lof_error_nan() {
let data = vec![vec![1.0, f64::NAN], vec![3.0, 4.0]];
let err = lof(&data, &LofConfig::default()).unwrap_err();
match err {
InsightError::DegenerateData { reason } => {
assert!(
reason.contains("row 0") && reason.contains("column 1"),
"reason should pinpoint the offending cell, got: {reason}"
);
}
other => panic!("expected DegenerateData, got {other:?}"),
}
}
#[test]
fn lof_multidimensional() {
let mut data = Vec::new();
for i in 0..15 {
data.push(vec![i as f64 * 0.1, i as f64 * 0.1, i as f64 * 0.1]);
}
data.push(vec![50.0, 50.0, 50.0]);
let config = LofConfig::default().k(5);
let result = lof(&data, &config).expect("should compute");
assert!(result.scores[15] > 1.5, "3D outlier should be detected");
}
}