use crate::error::Result;
use crate::primitives::Matrix;
use serde::{Deserialize, Serialize};
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LocalOutlierFactor {
n_neighbors: usize,
contamination: f32,
lof_scores: Option<Vec<f32>>,
negative_outlier_factor: Option<Vec<f32>>,
training_data: Option<Matrix<f32>>,
knn_distances: Option<Vec<Vec<f32>>>,
knn_indices: Option<Vec<Vec<usize>>>,
lrd: Option<Vec<f32>>,
threshold: Option<f32>,
}
impl LocalOutlierFactor {
#[must_use]
pub fn new() -> Self {
Self {
n_neighbors: 20,
contamination: 0.1,
lof_scores: None,
negative_outlier_factor: None,
training_data: None,
knn_distances: None,
knn_indices: None,
lrd: None,
threshold: None,
}
}
#[must_use]
pub fn with_n_neighbors(mut self, n_neighbors: usize) -> Self {
self.n_neighbors = n_neighbors;
self
}
#[must_use]
pub fn with_contamination(mut self, contamination: f32) -> Self {
self.contamination = contamination.clamp(0.0, 0.5);
self
}
#[must_use]
pub fn is_fitted(&self) -> bool {
self.lof_scores.is_some()
}
pub fn fit(&mut self, x: &Matrix<f32>) -> Result<()> {
let (n_samples, _) = x.shape();
if self.n_neighbors >= n_samples {
return Err("n_neighbors must be less than number of samples".into());
}
self.training_data = Some(x.clone());
let (knn_distances, knn_indices) = self.compute_knn(x, x);
self.knn_distances = Some(knn_distances.clone());
self.knn_indices = Some(knn_indices.clone());
let lrd = self.compute_lrd(&knn_distances, &knn_indices);
self.lrd = Some(lrd.clone());
let lof_scores = self.compute_lof_scores(&lrd, &knn_indices);
self.lof_scores = Some(lof_scores.clone());
let nof: Vec<f32> = lof_scores.iter().map(|&score| -score).collect();
self.negative_outlier_factor = Some(nof.clone());
let mut sorted_scores = lof_scores.clone();
sorted_scores.sort_by(|a, b| {
b.partial_cmp(a)
.expect("LOF scores must be valid floats for comparison")
});
let threshold_idx = (self.contamination * n_samples as f32) as usize;
self.threshold = Some(sorted_scores[threshold_idx.min(n_samples - 1)]);
Ok(())
}
fn compute_knn(
&self,
query: &Matrix<f32>,
data: &Matrix<f32>,
) -> (Vec<Vec<f32>>, Vec<Vec<usize>>) {
let (n_query, n_features) = query.shape();
let (n_data, _) = data.shape();
let mut knn_distances = Vec::with_capacity(n_query);
let mut knn_indices = Vec::with_capacity(n_query);
for i in 0..n_query {
let mut distances: Vec<(f32, usize)> = Vec::with_capacity(n_data);
for j in 0..n_data {
let mut dist_sq = 0.0;
for k in 0..n_features {
let diff = query.get(i, k) - data.get(j, k);
dist_sq += diff * diff;
}
let dist = dist_sq.sqrt();
distances.push((dist, j));
}
distances.sort_by(|a, b| {
a.0.partial_cmp(&b.0)
.expect("Distances must be valid floats for comparison")
});
let skip_self = i < n_data;
let k_start = usize::from(skip_self);
let k_end = k_start + self.n_neighbors;
let dists: Vec<f32> = distances[k_start..k_end.min(distances.len())]
.iter()
.map(|(d, _)| *d)
.collect();
let indices: Vec<usize> = distances[k_start..k_end.min(distances.len())]
.iter()
.map(|(_, idx)| *idx)
.collect();
knn_distances.push(dists);
knn_indices.push(indices);
}
(knn_distances, knn_indices)
}
#[allow(clippy::unused_self)]
fn reachability_distance(&self, dist: f32, k_distance: f32) -> f32 {
dist.max(k_distance)
}
fn compute_lrd(&self, knn_distances: &[Vec<f32>], knn_indices: &[Vec<usize>]) -> Vec<f32> {
let n_samples = knn_indices.len();
let mut lrd = Vec::with_capacity(n_samples);
for i in 0..n_samples {
let neighbors = &knn_indices[i];
let neighbor_dists = &knn_distances[i];
if neighbors.is_empty() {
lrd.push(1.0);
continue;
}
let mut sum_reach_dist = 0.0;
for (j, &neighbor_idx) in neighbors.iter().enumerate() {
let dist_to_neighbor = neighbor_dists[j];
let k_distance = if neighbor_idx < knn_distances.len() {
knn_distances[neighbor_idx].last().copied().unwrap_or(0.0)
} else {
0.0
};
let reach_dist = self.reachability_distance(dist_to_neighbor, k_distance);
sum_reach_dist += reach_dist;
}
let lrd_value = if sum_reach_dist > 0.0 {
neighbors.len() as f32 / sum_reach_dist
} else {
1.0 };
lrd.push(lrd_value);
}
lrd
}
#[allow(clippy::unused_self)]
fn compute_lof_scores(&self, lrd: &[f32], knn_indices: &[Vec<usize>]) -> Vec<f32> {
let n_samples = knn_indices.len();
let mut lof_scores = Vec::with_capacity(n_samples);
for i in 0..n_samples {
let neighbors = &knn_indices[i];
if neighbors.is_empty() || lrd[i] == 0.0 {
lof_scores.push(1.0);
continue;
}
let sum_neighbor_lrd: f32 = neighbors.iter().map(|&idx| lrd[idx]).sum();
let avg_neighbor_lrd = sum_neighbor_lrd / neighbors.len() as f32;
let lof = avg_neighbor_lrd / lrd[i];
lof_scores.push(lof);
}
lof_scores
}
#[must_use]
pub fn score_samples(&self, x: &Matrix<f32>) -> Vec<f32> {
assert!(self.is_fitted(), "Model not fitted. Call fit() first.");
let training_data = self
.training_data
.as_ref()
.expect("Training data must be stored during fit");
let training_lrd = self
.lrd
.as_ref()
.expect("LRD values must be computed during fit");
let (knn_distances, knn_indices) = self.compute_knn(x, training_data);
let query_lrd = self.compute_lrd(&knn_distances, &knn_indices);
let n_query = x.shape().0;
let mut lof_scores = Vec::with_capacity(n_query);
for i in 0..n_query {
let neighbors = &knn_indices[i];
if neighbors.is_empty() || query_lrd[i] == 0.0 {
lof_scores.push(1.0);
continue;
}
let sum_neighbor_lrd: f32 = neighbors
.iter()
.filter_map(|&idx| training_lrd.get(idx).copied())
.sum();
let avg_neighbor_lrd = sum_neighbor_lrd / neighbors.len() as f32;
let lof = avg_neighbor_lrd / query_lrd[i];
lof_scores.push(lof);
}
lof_scores
}
#[must_use]
pub fn predict(&self, x: &Matrix<f32>) -> Vec<i32> {
assert!(self.is_fitted(), "Model not fitted. Call fit() first.");
let threshold = self
.threshold
.expect("Threshold must be set during fit phase");
let scores = self.score_samples(x);
scores
.iter()
.map(|&score| if score > threshold { -1 } else { 1 })
.collect()
}
#[must_use]
pub fn negative_outlier_factor(&self) -> &[f32] {
assert!(self.is_fitted(), "Model not fitted. Call fit() first.");
self.negative_outlier_factor
.as_ref()
.expect("Negative outlier factor must be computed during fit")
}
}
impl Default for LocalOutlierFactor {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
#[path = "tests_lof_contract.rs"]
mod tests_lof_contract;