use crate::types::{AnomalyResult, DataMatrix, DistanceMetric};
use rand::prelude::*;
use rustkernel_core::{domain::Domain, kernel::KernelMetadata, traits::GpuKernel};
#[derive(Debug, Clone)]
pub struct IsolationForest {
metadata: KernelMetadata,
}
impl Default for IsolationForest {
fn default() -> Self {
Self::new()
}
}
impl IsolationForest {
#[must_use]
pub fn new() -> Self {
Self {
metadata: KernelMetadata::batch("ml/isolation-forest", Domain::StatisticalML)
.with_description("Isolation Forest ensemble anomaly detection")
.with_throughput(10_000)
.with_latency_us(100.0),
}
}
pub fn compute(
data: &DataMatrix,
n_trees: usize,
sample_size: usize,
contamination: f64,
) -> AnomalyResult {
let n = data.n_samples;
if n == 0 {
return AnomalyResult {
scores: Vec::new(),
labels: Vec::new(),
threshold: 0.5,
};
}
let actual_sample_size = sample_size.min(n);
let max_depth = (actual_sample_size as f64).log2().ceil() as usize;
let trees: Vec<IsolationTree> = (0..n_trees)
.map(|_| IsolationTree::build(data, actual_sample_size, max_depth))
.collect();
let scores: Vec<f64> = (0..n)
.map(|i| {
let point = data.row(i);
let avg_path_length: f64 = trees
.iter()
.map(|tree| tree.path_length(point) as f64)
.sum::<f64>()
/ n_trees as f64;
let c_n = Self::average_path_length(actual_sample_size);
(2.0_f64).powf(-avg_path_length / c_n)
})
.collect();
let mut sorted_scores = scores.clone();
sorted_scores.sort_by(|a, b| b.partial_cmp(a).unwrap_or(std::cmp::Ordering::Equal));
let threshold_idx = ((n as f64 * contamination) as usize).max(1).min(n - 1);
let threshold = sorted_scores[threshold_idx];
let labels: Vec<i32> = scores
.iter()
.map(|&s| if s >= threshold { -1 } else { 1 })
.collect();
AnomalyResult {
scores,
labels,
threshold,
}
}
fn average_path_length(n: usize) -> f64 {
if n <= 1 {
return 0.0;
}
if n == 2 {
return 1.0;
}
let euler = 0.5772156649;
2.0 * ((n as f64 - 1.0).ln() + euler) - 2.0 * (n as f64 - 1.0) / n as f64
}
}
impl GpuKernel for IsolationForest {
fn metadata(&self) -> &KernelMetadata {
&self.metadata
}
}
#[derive(Debug, Clone)]
enum IsolationTreeNode {
Internal {
feature: usize,
threshold: f64,
left: Box<IsolationTreeNode>,
right: Box<IsolationTreeNode>,
},
Leaf {
size: usize,
},
}
#[derive(Debug, Clone)]
#[allow(dead_code)]
struct IsolationTree {
root: IsolationTreeNode,
max_depth: usize,
}
impl IsolationTree {
fn build(data: &DataMatrix, sample_size: usize, max_depth: usize) -> Self {
let mut rng = rand::rng();
let n = data.n_samples;
let d = data.n_features;
let indices: Vec<usize> = (0..n).collect();
let sample_indices: Vec<usize> = indices
.choose_multiple(&mut rng, sample_size.min(n))
.copied()
.collect();
let sample_data: Vec<Vec<f64>> = sample_indices
.iter()
.map(|&i| data.row(i).to_vec())
.collect();
let root = Self::build_node(&sample_data, d, 0, max_depth, &mut rng);
IsolationTree { root, max_depth }
}
fn build_node(
data: &[Vec<f64>],
n_features: usize,
depth: usize,
max_depth: usize,
rng: &mut impl Rng,
) -> IsolationTreeNode {
let n = data.len();
if depth >= max_depth || n <= 1 {
return IsolationTreeNode::Leaf { size: n };
}
let feature = rng.random_range(0..n_features);
let values: Vec<f64> = data.iter().map(|row| row[feature]).collect();
let min_val = values.iter().copied().fold(f64::MAX, f64::min);
let max_val = values.iter().copied().fold(f64::MIN, f64::max);
if (max_val - min_val).abs() < 1e-10 {
return IsolationTreeNode::Leaf { size: n };
}
let threshold = min_val + rng.random::<f64>() * (max_val - min_val);
let (left_data, right_data): (Vec<Vec<f64>>, Vec<Vec<f64>>) = data
.iter()
.cloned()
.partition(|row| row[feature] < threshold);
if left_data.is_empty() || right_data.is_empty() {
return IsolationTreeNode::Leaf { size: n };
}
IsolationTreeNode::Internal {
feature,
threshold,
left: Box::new(Self::build_node(
&left_data,
n_features,
depth + 1,
max_depth,
rng,
)),
right: Box::new(Self::build_node(
&right_data,
n_features,
depth + 1,
max_depth,
rng,
)),
}
}
fn path_length(&self, point: &[f64]) -> usize {
Self::path_length_node(&self.root, point, 0)
}
fn path_length_node(node: &IsolationTreeNode, point: &[f64], depth: usize) -> usize {
match node {
IsolationTreeNode::Leaf { size } => {
depth + IsolationForest::average_path_length(*size) as usize
}
IsolationTreeNode::Internal {
feature,
threshold,
left,
right,
} => {
if point[*feature] < *threshold {
Self::path_length_node(left, point, depth + 1)
} else {
Self::path_length_node(right, point, depth + 1)
}
}
}
}
}
#[derive(Debug, Clone)]
pub struct LocalOutlierFactor {
metadata: KernelMetadata,
}
impl Default for LocalOutlierFactor {
fn default() -> Self {
Self::new()
}
}
impl LocalOutlierFactor {
#[must_use]
pub fn new() -> Self {
Self {
metadata: KernelMetadata::batch("ml/local-outlier-factor", Domain::StatisticalML)
.with_description("Local Outlier Factor (k-NN density estimation)")
.with_throughput(5_000)
.with_latency_us(200.0),
}
}
pub fn compute(
data: &DataMatrix,
k: usize,
contamination: f64,
metric: DistanceMetric,
) -> AnomalyResult {
let n = data.n_samples;
if n == 0 || k == 0 {
return AnomalyResult {
scores: Vec::new(),
labels: Vec::new(),
threshold: 1.0,
};
}
let k = k.min(n - 1);
let (k_distances, k_neighbors) = Self::compute_knn(data, k, metric);
let reach_dists: Vec<Vec<f64>> = (0..n)
.map(|i| {
k_neighbors[i]
.iter()
.map(|&j| {
let dist = metric.compute(data.row(i), data.row(j));
dist.max(k_distances[j])
})
.collect()
})
.collect();
let lrd: Vec<f64> = (0..n)
.map(|i| {
let sum_reach: f64 = reach_dists[i].iter().sum();
if sum_reach == 0.0 {
f64::MAX
} else {
k as f64 / sum_reach
}
})
.collect();
let scores: Vec<f64> = (0..n)
.map(|i| {
if lrd[i] == f64::MAX {
return 1.0;
}
let avg_neighbor_lrd: f64 =
k_neighbors[i].iter().map(|&j| lrd[j]).sum::<f64>() / k as f64;
avg_neighbor_lrd / lrd[i]
})
.collect();
let mut sorted_scores = scores.clone();
sorted_scores.sort_by(|a, b| b.partial_cmp(a).unwrap_or(std::cmp::Ordering::Equal));
let threshold_idx = ((n as f64 * contamination) as usize).max(1).min(n - 1);
let threshold = sorted_scores[threshold_idx];
let labels: Vec<i32> = scores
.iter()
.map(|&s| if s > threshold { -1 } else { 1 })
.collect();
AnomalyResult {
scores,
labels,
threshold,
}
}
fn compute_knn(
data: &DataMatrix,
k: usize,
metric: DistanceMetric,
) -> (Vec<f64>, Vec<Vec<usize>>) {
let n = data.n_samples;
let mut k_distances = vec![0.0f64; n];
let mut k_neighbors = vec![Vec::new(); n];
for i in 0..n {
let mut distances: Vec<(usize, f64)> = (0..n)
.filter(|&j| j != i)
.map(|j| (j, metric.compute(data.row(i), data.row(j))))
.collect();
distances.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
let k_nearest: Vec<(usize, f64)> = distances.into_iter().take(k).collect();
k_distances[i] = k_nearest.last().map(|(_, d)| *d).unwrap_or(0.0);
k_neighbors[i] = k_nearest.iter().map(|(j, _)| *j).collect();
}
(k_distances, k_neighbors)
}
}
impl GpuKernel for LocalOutlierFactor {
fn metadata(&self) -> &KernelMetadata {
&self.metadata
}
}
#[cfg(test)]
mod tests {
use super::*;
fn create_anomaly_data() -> DataMatrix {
DataMatrix::from_rows(&[
&[0.0, 0.0],
&[0.1, 0.1],
&[0.2, 0.0],
&[0.0, 0.2],
&[-0.1, 0.1],
&[0.1, -0.1],
&[100.0, 100.0], ])
}
#[test]
fn test_isolation_forest_metadata() {
let kernel = IsolationForest::new();
assert_eq!(kernel.metadata().id, "ml/isolation-forest");
assert_eq!(kernel.metadata().domain, Domain::StatisticalML);
}
#[test]
fn test_isolation_forest_detects_anomaly() {
let data = create_anomaly_data();
let result = IsolationForest::compute(&data, 100, 256, 0.15);
assert!(result.scores[6] > result.scores[0]);
assert!(result.scores[6] > result.scores[1]);
assert!(result.scores[6] > result.scores[2]);
assert_eq!(result.labels[6], -1);
}
#[test]
fn test_lof_metadata() {
let kernel = LocalOutlierFactor::new();
assert_eq!(kernel.metadata().id, "ml/local-outlier-factor");
assert_eq!(kernel.metadata().domain, Domain::StatisticalML);
}
#[test]
fn test_lof_detects_anomaly() {
let data = create_anomaly_data();
let result = LocalOutlierFactor::compute(&data, 3, 0.15, DistanceMetric::Euclidean);
assert!(result.scores[6] > 1.0);
for i in 0..6 {
assert!(result.scores[i] < result.scores[6]);
}
}
#[test]
fn test_isolation_forest_empty() {
let data = DataMatrix::zeros(0, 2);
let result = IsolationForest::compute(&data, 10, 256, 0.1);
assert!(result.scores.is_empty());
assert!(result.labels.is_empty());
}
#[test]
fn test_lof_empty() {
let data = DataMatrix::zeros(0, 2);
let result = LocalOutlierFactor::compute(&data, 3, 0.1, DistanceMetric::Euclidean);
assert!(result.scores.is_empty());
assert!(result.labels.is_empty());
}
}