1use crate::types::{AnomalyResult, DataMatrix, DistanceMetric};
8use rand::prelude::*;
9use rustkernel_core::{domain::Domain, kernel::KernelMetadata, traits::GpuKernel};
10
11#[derive(Debug, Clone)]
21pub struct IsolationForest {
22 metadata: KernelMetadata,
23}
24
25impl Default for IsolationForest {
26 fn default() -> Self {
27 Self::new()
28 }
29}
30
31impl IsolationForest {
32 #[must_use]
34 pub fn new() -> Self {
35 Self {
36 metadata: KernelMetadata::batch("ml/isolation-forest", Domain::StatisticalML)
37 .with_description("Isolation Forest ensemble anomaly detection")
38 .with_throughput(10_000)
39 .with_latency_us(100.0),
40 }
41 }
42
43 pub fn compute(
51 data: &DataMatrix,
52 n_trees: usize,
53 sample_size: usize,
54 contamination: f64,
55 ) -> AnomalyResult {
56 let n = data.n_samples;
57
58 if n == 0 {
59 return AnomalyResult {
60 scores: Vec::new(),
61 labels: Vec::new(),
62 threshold: 0.5,
63 };
64 }
65
66 let actual_sample_size = sample_size.min(n);
67 let max_depth = (actual_sample_size as f64).log2().ceil() as usize;
68
69 let trees: Vec<IsolationTree> = (0..n_trees)
71 .map(|_| IsolationTree::build(data, actual_sample_size, max_depth))
72 .collect();
73
74 let scores: Vec<f64> = (0..n)
76 .map(|i| {
77 let point = data.row(i);
78 let avg_path_length: f64 = trees
79 .iter()
80 .map(|tree| tree.path_length(point) as f64)
81 .sum::<f64>()
82 / n_trees as f64;
83
84 let c_n = Self::average_path_length(actual_sample_size);
87 (2.0_f64).powf(-avg_path_length / c_n)
88 })
89 .collect();
90
91 let mut sorted_scores = scores.clone();
93 sorted_scores.sort_by(|a, b| b.partial_cmp(a).unwrap_or(std::cmp::Ordering::Equal));
94 let threshold_idx = ((n as f64 * contamination) as usize).max(1).min(n - 1);
95 let threshold = sorted_scores[threshold_idx];
96
97 let labels: Vec<i32> = scores
99 .iter()
100 .map(|&s| if s >= threshold { -1 } else { 1 })
101 .collect();
102
103 AnomalyResult {
104 scores,
105 labels,
106 threshold,
107 }
108 }
109
110 fn average_path_length(n: usize) -> f64 {
112 if n <= 1 {
113 return 0.0;
114 }
115 if n == 2 {
116 return 1.0;
117 }
118 let euler = 0.5772156649;
121 2.0 * ((n as f64 - 1.0).ln() + euler) - 2.0 * (n as f64 - 1.0) / n as f64
122 }
123}
124
125impl GpuKernel for IsolationForest {
126 fn metadata(&self) -> &KernelMetadata {
127 &self.metadata
128 }
129}
130
131#[derive(Debug, Clone)]
133enum IsolationTreeNode {
134 Internal {
135 feature: usize,
136 threshold: f64,
137 left: Box<IsolationTreeNode>,
138 right: Box<IsolationTreeNode>,
139 },
140 Leaf {
141 size: usize,
142 },
143}
144
145#[derive(Debug, Clone)]
147#[allow(dead_code)]
148struct IsolationTree {
149 root: IsolationTreeNode,
150 max_depth: usize,
151}
152
153impl IsolationTree {
154 fn build(data: &DataMatrix, sample_size: usize, max_depth: usize) -> Self {
156 let mut rng = rand::rng();
157 let n = data.n_samples;
158 let d = data.n_features;
159
160 let indices: Vec<usize> = (0..n).collect();
162 let sample_indices: Vec<usize> = indices
163 .choose_multiple(&mut rng, sample_size.min(n))
164 .copied()
165 .collect();
166
167 let sample_data: Vec<Vec<f64>> = sample_indices
169 .iter()
170 .map(|&i| data.row(i).to_vec())
171 .collect();
172
173 let root = Self::build_node(&sample_data, d, 0, max_depth, &mut rng);
174
175 IsolationTree { root, max_depth }
176 }
177
178 fn build_node(
179 data: &[Vec<f64>],
180 n_features: usize,
181 depth: usize,
182 max_depth: usize,
183 rng: &mut impl Rng,
184 ) -> IsolationTreeNode {
185 let n = data.len();
186
187 if depth >= max_depth || n <= 1 {
189 return IsolationTreeNode::Leaf { size: n };
190 }
191
192 let feature = rng.random_range(0..n_features);
194
195 let values: Vec<f64> = data.iter().map(|row| row[feature]).collect();
197 let min_val = values.iter().copied().fold(f64::MAX, f64::min);
198 let max_val = values.iter().copied().fold(f64::MIN, f64::max);
199
200 if (max_val - min_val).abs() < 1e-10 {
201 return IsolationTreeNode::Leaf { size: n };
202 }
203
204 let threshold = min_val + rng.random::<f64>() * (max_val - min_val);
206
207 let (left_data, right_data): (Vec<Vec<f64>>, Vec<Vec<f64>>) = data
209 .iter()
210 .cloned()
211 .partition(|row| row[feature] < threshold);
212
213 if left_data.is_empty() || right_data.is_empty() {
214 return IsolationTreeNode::Leaf { size: n };
215 }
216
217 IsolationTreeNode::Internal {
218 feature,
219 threshold,
220 left: Box::new(Self::build_node(
221 &left_data,
222 n_features,
223 depth + 1,
224 max_depth,
225 rng,
226 )),
227 right: Box::new(Self::build_node(
228 &right_data,
229 n_features,
230 depth + 1,
231 max_depth,
232 rng,
233 )),
234 }
235 }
236
237 fn path_length(&self, point: &[f64]) -> usize {
239 Self::path_length_node(&self.root, point, 0)
240 }
241
242 fn path_length_node(node: &IsolationTreeNode, point: &[f64], depth: usize) -> usize {
243 match node {
244 IsolationTreeNode::Leaf { size } => {
245 depth + IsolationForest::average_path_length(*size) as usize
247 }
248 IsolationTreeNode::Internal {
249 feature,
250 threshold,
251 left,
252 right,
253 } => {
254 if point[*feature] < *threshold {
255 Self::path_length_node(left, point, depth + 1)
256 } else {
257 Self::path_length_node(right, point, depth + 1)
258 }
259 }
260 }
261 }
262}
263
264#[derive(Debug, Clone)]
273pub struct LocalOutlierFactor {
274 metadata: KernelMetadata,
275}
276
277impl Default for LocalOutlierFactor {
278 fn default() -> Self {
279 Self::new()
280 }
281}
282
283impl LocalOutlierFactor {
284 #[must_use]
286 pub fn new() -> Self {
287 Self {
288 metadata: KernelMetadata::batch("ml/local-outlier-factor", Domain::StatisticalML)
289 .with_description("Local Outlier Factor (k-NN density estimation)")
290 .with_throughput(5_000)
291 .with_latency_us(200.0),
292 }
293 }
294
295 pub fn compute(
303 data: &DataMatrix,
304 k: usize,
305 contamination: f64,
306 metric: DistanceMetric,
307 ) -> AnomalyResult {
308 let n = data.n_samples;
309
310 if n == 0 || k == 0 {
311 return AnomalyResult {
312 scores: Vec::new(),
313 labels: Vec::new(),
314 threshold: 1.0,
315 };
316 }
317
318 let k = k.min(n - 1);
319
320 let (k_distances, k_neighbors) = Self::compute_knn(data, k, metric);
322
323 let reach_dists: Vec<Vec<f64>> = (0..n)
325 .map(|i| {
326 k_neighbors[i]
327 .iter()
328 .map(|&j| {
329 let dist = metric.compute(data.row(i), data.row(j));
330 dist.max(k_distances[j])
331 })
332 .collect()
333 })
334 .collect();
335
336 let lrd: Vec<f64> = (0..n)
338 .map(|i| {
339 let sum_reach: f64 = reach_dists[i].iter().sum();
340 if sum_reach == 0.0 {
341 f64::MAX
342 } else {
343 k as f64 / sum_reach
344 }
345 })
346 .collect();
347
348 let scores: Vec<f64> = (0..n)
350 .map(|i| {
351 if lrd[i] == f64::MAX {
352 return 1.0;
353 }
354 let avg_neighbor_lrd: f64 =
355 k_neighbors[i].iter().map(|&j| lrd[j]).sum::<f64>() / k as f64;
356 avg_neighbor_lrd / lrd[i]
357 })
358 .collect();
359
360 let mut sorted_scores = scores.clone();
362 sorted_scores.sort_by(|a, b| b.partial_cmp(a).unwrap_or(std::cmp::Ordering::Equal));
363 let threshold_idx = ((n as f64 * contamination) as usize).max(1).min(n - 1);
364 let threshold = sorted_scores[threshold_idx];
365
366 let labels: Vec<i32> = scores
368 .iter()
369 .map(|&s| if s > threshold { -1 } else { 1 })
370 .collect();
371
372 AnomalyResult {
373 scores,
374 labels,
375 threshold,
376 }
377 }
378
379 fn compute_knn(
381 data: &DataMatrix,
382 k: usize,
383 metric: DistanceMetric,
384 ) -> (Vec<f64>, Vec<Vec<usize>>) {
385 let n = data.n_samples;
386
387 let mut k_distances = vec![0.0f64; n];
388 let mut k_neighbors = vec![Vec::new(); n];
389
390 for i in 0..n {
391 let mut distances: Vec<(usize, f64)> = (0..n)
393 .filter(|&j| j != i)
394 .map(|j| (j, metric.compute(data.row(i), data.row(j))))
395 .collect();
396
397 distances.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
399
400 let k_nearest: Vec<(usize, f64)> = distances.into_iter().take(k).collect();
402
403 k_distances[i] = k_nearest.last().map(|(_, d)| *d).unwrap_or(0.0);
404 k_neighbors[i] = k_nearest.iter().map(|(j, _)| *j).collect();
405 }
406
407 (k_distances, k_neighbors)
408 }
409}
410
411impl GpuKernel for LocalOutlierFactor {
412 fn metadata(&self) -> &KernelMetadata {
413 &self.metadata
414 }
415}
416
417#[cfg(test)]
418mod tests {
419 use super::*;
420
421 fn create_anomaly_data() -> DataMatrix {
422 DataMatrix::from_rows(&[
425 &[0.0, 0.0],
426 &[0.1, 0.1],
427 &[0.2, 0.0],
428 &[0.0, 0.2],
429 &[-0.1, 0.1],
430 &[0.1, -0.1],
431 &[100.0, 100.0], ])
433 }
434
435 #[test]
436 fn test_isolation_forest_metadata() {
437 let kernel = IsolationForest::new();
438 assert_eq!(kernel.metadata().id, "ml/isolation-forest");
439 assert_eq!(kernel.metadata().domain, Domain::StatisticalML);
440 }
441
442 #[test]
443 fn test_isolation_forest_detects_anomaly() {
444 let data = create_anomaly_data();
445 let result = IsolationForest::compute(&data, 100, 256, 0.15);
446
447 assert!(result.scores[6] > result.scores[0]);
449 assert!(result.scores[6] > result.scores[1]);
450 assert!(result.scores[6] > result.scores[2]);
451
452 assert_eq!(result.labels[6], -1);
454 }
455
456 #[test]
457 fn test_lof_metadata() {
458 let kernel = LocalOutlierFactor::new();
459 assert_eq!(kernel.metadata().id, "ml/local-outlier-factor");
460 assert_eq!(kernel.metadata().domain, Domain::StatisticalML);
461 }
462
463 #[test]
464 fn test_lof_detects_anomaly() {
465 let data = create_anomaly_data();
466 let result = LocalOutlierFactor::compute(&data, 3, 0.15, DistanceMetric::Euclidean);
467
468 assert!(result.scores[6] > 1.0);
471
472 for i in 0..6 {
474 assert!(result.scores[i] < result.scores[6]);
475 }
476 }
477
478 #[test]
479 fn test_isolation_forest_empty() {
480 let data = DataMatrix::zeros(0, 2);
481 let result = IsolationForest::compute(&data, 10, 256, 0.1);
482 assert!(result.scores.is_empty());
483 assert!(result.labels.is_empty());
484 }
485
486 #[test]
487 fn test_lof_empty() {
488 let data = DataMatrix::zeros(0, 2);
489 let result = LocalOutlierFactor::compute(&data, 3, 0.1, DistanceMetric::Euclidean);
490 assert!(result.scores.is_empty());
491 assert!(result.labels.is_empty());
492 }
493}