rustkernel_ml/
anomaly.rs

1//! Anomaly detection kernels.
2//!
3//! This module provides anomaly detection algorithms:
4//! - Isolation Forest (ensemble of isolation trees)
5//! - Local Outlier Factor (k-NN density-based)
6
7use crate::types::{AnomalyResult, DataMatrix, DistanceMetric};
8use rand::prelude::*;
9use rustkernel_core::{domain::Domain, kernel::KernelMetadata, traits::GpuKernel};
10
11// ============================================================================
12// Isolation Forest Kernel
13// ============================================================================
14
15/// Isolation Forest kernel.
16///
17/// Anomaly detection using ensemble of isolation trees.
18/// Anomalies are isolated quickly (short path lengths) while normal
19/// points require more splits.
20#[derive(Debug, Clone)]
21pub struct IsolationForest {
22    metadata: KernelMetadata,
23}
24
25impl Default for IsolationForest {
26    fn default() -> Self {
27        Self::new()
28    }
29}
30
31impl IsolationForest {
32    /// Create a new Isolation Forest kernel.
33    #[must_use]
34    pub fn new() -> Self {
35        Self {
36            metadata: KernelMetadata::batch("ml/isolation-forest", Domain::StatisticalML)
37                .with_description("Isolation Forest ensemble anomaly detection")
38                .with_throughput(10_000)
39                .with_latency_us(100.0),
40        }
41    }
42
43    /// Train and score using Isolation Forest.
44    ///
45    /// # Arguments
46    /// * `data` - Input data matrix
47    /// * `n_trees` - Number of isolation trees
48    /// * `sample_size` - Subsample size for each tree
49    /// * `contamination` - Expected proportion of anomalies (for threshold)
50    pub fn compute(
51        data: &DataMatrix,
52        n_trees: usize,
53        sample_size: usize,
54        contamination: f64,
55    ) -> AnomalyResult {
56        let n = data.n_samples;
57
58        if n == 0 {
59            return AnomalyResult {
60                scores: Vec::new(),
61                labels: Vec::new(),
62                threshold: 0.5,
63            };
64        }
65
66        let actual_sample_size = sample_size.min(n);
67        let max_depth = (actual_sample_size as f64).log2().ceil() as usize;
68
69        // Build isolation trees
70        let trees: Vec<IsolationTree> = (0..n_trees)
71            .map(|_| IsolationTree::build(data, actual_sample_size, max_depth))
72            .collect();
73
74        // Compute anomaly scores for each point
75        let scores: Vec<f64> = (0..n)
76            .map(|i| {
77                let point = data.row(i);
78                let avg_path_length: f64 = trees
79                    .iter()
80                    .map(|tree| tree.path_length(point) as f64)
81                    .sum::<f64>()
82                    / n_trees as f64;
83
84                // Anomaly score: s(x, n) = 2^(-E[h(x)] / c(n))
85                // where c(n) is the average path length in a random tree
86                let c_n = Self::average_path_length(actual_sample_size);
87                (2.0_f64).powf(-avg_path_length / c_n)
88            })
89            .collect();
90
91        // Determine threshold based on contamination
92        let mut sorted_scores = scores.clone();
93        sorted_scores.sort_by(|a, b| b.partial_cmp(a).unwrap_or(std::cmp::Ordering::Equal));
94        let threshold_idx = ((n as f64 * contamination) as usize).max(1).min(n - 1);
95        let threshold = sorted_scores[threshold_idx];
96
97        // Label anomalies
98        let labels: Vec<i32> = scores
99            .iter()
100            .map(|&s| if s >= threshold { -1 } else { 1 })
101            .collect();
102
103        AnomalyResult {
104            scores,
105            labels,
106            threshold,
107        }
108    }
109
110    /// Average path length in a random binary search tree.
111    fn average_path_length(n: usize) -> f64 {
112        if n <= 1 {
113            return 0.0;
114        }
115        if n == 2 {
116            return 1.0;
117        }
118        // c(n) = 2 * H(n-1) - 2(n-1)/n
119        // H(i) ≈ ln(i) + 0.5772156649 (Euler's constant)
120        let euler = 0.5772156649;
121        2.0 * ((n as f64 - 1.0).ln() + euler) - 2.0 * (n as f64 - 1.0) / n as f64
122    }
123}
124
125impl GpuKernel for IsolationForest {
126    fn metadata(&self) -> &KernelMetadata {
127        &self.metadata
128    }
129}
130
131/// A single isolation tree node.
132#[derive(Debug, Clone)]
133enum IsolationTreeNode {
134    Internal {
135        feature: usize,
136        threshold: f64,
137        left: Box<IsolationTreeNode>,
138        right: Box<IsolationTreeNode>,
139    },
140    Leaf {
141        size: usize,
142    },
143}
144
145/// An isolation tree.
146#[derive(Debug, Clone)]
147#[allow(dead_code)]
148struct IsolationTree {
149    root: IsolationTreeNode,
150    max_depth: usize,
151}
152
153impl IsolationTree {
154    /// Build an isolation tree from a random subsample.
155    fn build(data: &DataMatrix, sample_size: usize, max_depth: usize) -> Self {
156        let mut rng = rand::rng();
157        let n = data.n_samples;
158        let d = data.n_features;
159
160        // Random subsample
161        let indices: Vec<usize> = (0..n).collect();
162        let sample_indices: Vec<usize> = indices
163            .choose_multiple(&mut rng, sample_size.min(n))
164            .copied()
165            .collect();
166
167        // Extract subsample data
168        let sample_data: Vec<Vec<f64>> = sample_indices
169            .iter()
170            .map(|&i| data.row(i).to_vec())
171            .collect();
172
173        let root = Self::build_node(&sample_data, d, 0, max_depth, &mut rng);
174
175        IsolationTree { root, max_depth }
176    }
177
178    fn build_node(
179        data: &[Vec<f64>],
180        n_features: usize,
181        depth: usize,
182        max_depth: usize,
183        rng: &mut impl Rng,
184    ) -> IsolationTreeNode {
185        let n = data.len();
186
187        // Base case: leaf node
188        if depth >= max_depth || n <= 1 {
189            return IsolationTreeNode::Leaf { size: n };
190        }
191
192        // Random feature
193        let feature = rng.random_range(0..n_features);
194
195        // Find min/max for this feature
196        let values: Vec<f64> = data.iter().map(|row| row[feature]).collect();
197        let min_val = values.iter().copied().fold(f64::MAX, f64::min);
198        let max_val = values.iter().copied().fold(f64::MIN, f64::max);
199
200        if (max_val - min_val).abs() < 1e-10 {
201            return IsolationTreeNode::Leaf { size: n };
202        }
203
204        // Random threshold
205        let threshold = min_val + rng.random::<f64>() * (max_val - min_val);
206
207        // Split data
208        let (left_data, right_data): (Vec<Vec<f64>>, Vec<Vec<f64>>) = data
209            .iter()
210            .cloned()
211            .partition(|row| row[feature] < threshold);
212
213        if left_data.is_empty() || right_data.is_empty() {
214            return IsolationTreeNode::Leaf { size: n };
215        }
216
217        IsolationTreeNode::Internal {
218            feature,
219            threshold,
220            left: Box::new(Self::build_node(
221                &left_data,
222                n_features,
223                depth + 1,
224                max_depth,
225                rng,
226            )),
227            right: Box::new(Self::build_node(
228                &right_data,
229                n_features,
230                depth + 1,
231                max_depth,
232                rng,
233            )),
234        }
235    }
236
237    /// Compute path length to isolate a point.
238    fn path_length(&self, point: &[f64]) -> usize {
239        Self::path_length_node(&self.root, point, 0)
240    }
241
242    fn path_length_node(node: &IsolationTreeNode, point: &[f64], depth: usize) -> usize {
243        match node {
244            IsolationTreeNode::Leaf { size } => {
245                // Add expected path length for remaining points
246                depth + IsolationForest::average_path_length(*size) as usize
247            }
248            IsolationTreeNode::Internal {
249                feature,
250                threshold,
251                left,
252                right,
253            } => {
254                if point[*feature] < *threshold {
255                    Self::path_length_node(left, point, depth + 1)
256                } else {
257                    Self::path_length_node(right, point, depth + 1)
258                }
259            }
260        }
261    }
262}
263
264// ============================================================================
265// Local Outlier Factor (LOF) Kernel
266// ============================================================================
267
268/// Local Outlier Factor (LOF) kernel.
269///
270/// Density-based anomaly detection using k-nearest neighbors.
271/// Points with substantially lower density than their neighbors are anomalies.
272#[derive(Debug, Clone)]
273pub struct LocalOutlierFactor {
274    metadata: KernelMetadata,
275}
276
277impl Default for LocalOutlierFactor {
278    fn default() -> Self {
279        Self::new()
280    }
281}
282
283impl LocalOutlierFactor {
284    /// Create a new LOF kernel.
285    #[must_use]
286    pub fn new() -> Self {
287        Self {
288            metadata: KernelMetadata::batch("ml/local-outlier-factor", Domain::StatisticalML)
289                .with_description("Local Outlier Factor (k-NN density estimation)")
290                .with_throughput(5_000)
291                .with_latency_us(200.0),
292        }
293    }
294
295    /// Compute LOF scores.
296    ///
297    /// # Arguments
298    /// * `data` - Input data matrix
299    /// * `k` - Number of neighbors
300    /// * `contamination` - Expected proportion of anomalies (for threshold)
301    /// * `metric` - Distance metric
302    pub fn compute(
303        data: &DataMatrix,
304        k: usize,
305        contamination: f64,
306        metric: DistanceMetric,
307    ) -> AnomalyResult {
308        let n = data.n_samples;
309
310        if n == 0 || k == 0 {
311            return AnomalyResult {
312                scores: Vec::new(),
313                labels: Vec::new(),
314                threshold: 1.0,
315            };
316        }
317
318        let k = k.min(n - 1);
319
320        // Compute k-distances and k-nearest neighbors for all points
321        let (k_distances, k_neighbors) = Self::compute_knn(data, k, metric);
322
323        // Compute reachability distances
324        let reach_dists: Vec<Vec<f64>> = (0..n)
325            .map(|i| {
326                k_neighbors[i]
327                    .iter()
328                    .map(|&j| {
329                        let dist = metric.compute(data.row(i), data.row(j));
330                        dist.max(k_distances[j])
331                    })
332                    .collect()
333            })
334            .collect();
335
336        // Compute local reachability density (LRD)
337        let lrd: Vec<f64> = (0..n)
338            .map(|i| {
339                let sum_reach: f64 = reach_dists[i].iter().sum();
340                if sum_reach == 0.0 {
341                    f64::MAX
342                } else {
343                    k as f64 / sum_reach
344                }
345            })
346            .collect();
347
348        // Compute LOF scores
349        let scores: Vec<f64> = (0..n)
350            .map(|i| {
351                if lrd[i] == f64::MAX {
352                    return 1.0;
353                }
354                let avg_neighbor_lrd: f64 =
355                    k_neighbors[i].iter().map(|&j| lrd[j]).sum::<f64>() / k as f64;
356                avg_neighbor_lrd / lrd[i]
357            })
358            .collect();
359
360        // Determine threshold
361        let mut sorted_scores = scores.clone();
362        sorted_scores.sort_by(|a, b| b.partial_cmp(a).unwrap_or(std::cmp::Ordering::Equal));
363        let threshold_idx = ((n as f64 * contamination) as usize).max(1).min(n - 1);
364        let threshold = sorted_scores[threshold_idx];
365
366        // Label anomalies (LOF > threshold means anomaly)
367        let labels: Vec<i32> = scores
368            .iter()
369            .map(|&s| if s > threshold { -1 } else { 1 })
370            .collect();
371
372        AnomalyResult {
373            scores,
374            labels,
375            threshold,
376        }
377    }
378
379    /// Compute k-nearest neighbors and k-distances for all points.
380    fn compute_knn(
381        data: &DataMatrix,
382        k: usize,
383        metric: DistanceMetric,
384    ) -> (Vec<f64>, Vec<Vec<usize>>) {
385        let n = data.n_samples;
386
387        let mut k_distances = vec![0.0f64; n];
388        let mut k_neighbors = vec![Vec::new(); n];
389
390        for i in 0..n {
391            // Compute distances to all other points
392            let mut distances: Vec<(usize, f64)> = (0..n)
393                .filter(|&j| j != i)
394                .map(|j| (j, metric.compute(data.row(i), data.row(j))))
395                .collect();
396
397            // Sort by distance
398            distances.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
399
400            // Take k nearest
401            let k_nearest: Vec<(usize, f64)> = distances.into_iter().take(k).collect();
402
403            k_distances[i] = k_nearest.last().map(|(_, d)| *d).unwrap_or(0.0);
404            k_neighbors[i] = k_nearest.iter().map(|(j, _)| *j).collect();
405        }
406
407        (k_distances, k_neighbors)
408    }
409}
410
411impl GpuKernel for LocalOutlierFactor {
412    fn metadata(&self) -> &KernelMetadata {
413        &self.metadata
414    }
415}
416
417#[cfg(test)]
418mod tests {
419    use super::*;
420
421    fn create_anomaly_data() -> DataMatrix {
422        // Normal cluster around origin
423        // One clear outlier at (100, 100)
424        DataMatrix::from_rows(&[
425            &[0.0, 0.0],
426            &[0.1, 0.1],
427            &[0.2, 0.0],
428            &[0.0, 0.2],
429            &[-0.1, 0.1],
430            &[0.1, -0.1],
431            &[100.0, 100.0], // Anomaly
432        ])
433    }
434
435    #[test]
436    fn test_isolation_forest_metadata() {
437        let kernel = IsolationForest::new();
438        assert_eq!(kernel.metadata().id, "ml/isolation-forest");
439        assert_eq!(kernel.metadata().domain, Domain::StatisticalML);
440    }
441
442    #[test]
443    fn test_isolation_forest_detects_anomaly() {
444        let data = create_anomaly_data();
445        let result = IsolationForest::compute(&data, 100, 256, 0.15);
446
447        // The outlier (index 6) should have high anomaly score
448        assert!(result.scores[6] > result.scores[0]);
449        assert!(result.scores[6] > result.scores[1]);
450        assert!(result.scores[6] > result.scores[2]);
451
452        // Outlier should be labeled as anomaly (-1)
453        assert_eq!(result.labels[6], -1);
454    }
455
456    #[test]
457    fn test_lof_metadata() {
458        let kernel = LocalOutlierFactor::new();
459        assert_eq!(kernel.metadata().id, "ml/local-outlier-factor");
460        assert_eq!(kernel.metadata().domain, Domain::StatisticalML);
461    }
462
463    #[test]
464    fn test_lof_detects_anomaly() {
465        let data = create_anomaly_data();
466        let result = LocalOutlierFactor::compute(&data, 3, 0.15, DistanceMetric::Euclidean);
467
468        // The outlier (index 6) should have high LOF score
469        // LOF > 1 indicates an outlier
470        assert!(result.scores[6] > 1.0);
471
472        // Normal points should have LOF close to 1
473        for i in 0..6 {
474            assert!(result.scores[i] < result.scores[6]);
475        }
476    }
477
478    #[test]
479    fn test_isolation_forest_empty() {
480        let data = DataMatrix::zeros(0, 2);
481        let result = IsolationForest::compute(&data, 10, 256, 0.1);
482        assert!(result.scores.is_empty());
483        assert!(result.labels.is_empty());
484    }
485
486    #[test]
487    fn test_lof_empty() {
488        let data = DataMatrix::zeros(0, 2);
489        let result = LocalOutlierFactor::compute(&data, 3, 0.1, DistanceMetric::Euclidean);
490        assert!(result.scores.is_empty());
491        assert!(result.labels.is_empty());
492    }
493}