// scry_learn/anomaly/iforest.rs
1// SPDX-License-Identifier: MIT OR Apache-2.0
2//! Isolation Forest for anomaly/outlier detection.
3//!
4//! Detects anomalies by building an ensemble of random isolation trees.
5//! Anomalies are isolated in fewer splits, yielding shorter average path
6//! lengths and higher anomaly scores.
7//!
8//! # Examples
9//!
10//! ```ignore
11//! use scry_learn::anomaly::IsolationForest;
12//!
13//! let mut ifo = IsolationForest::new()
14//!     .n_estimators(100)
15//!     .contamination(0.1);
16//!
17//! let data = vec![
18//!     vec![1.0, 2.0],
19//!     vec![1.1, 2.1],
20//!     vec![100.0, 200.0], // outlier
21//! ];
22//! ifo.fit(&data).unwrap();
23//!
24//! let scores = ifo.predict(&data);
25//! let labels = ifo.predict_labels(&data);
26//! assert_eq!(labels[2], -1); // outlier
27//! ```
28
29use crate::error::{Result, ScryLearnError};
30
31// ---------------------------------------------------------------------------
32// Isolation Tree internals
33// ---------------------------------------------------------------------------
34
/// A single node in an isolation tree.
///
/// Trees are built top-down by random axis-aligned splits. A `Leaf` records
/// how many training samples were still grouped together when splitting
/// stopped; that count feeds the path-length estimate at scoring time.
#[derive(Clone, Debug)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
enum ITreeNode {
    /// Internal split node.
    Split {
        /// Feature index used for the split.
        feature: usize,
        /// Split threshold (samples with value < threshold go left).
        threshold: f64,
        /// Left child (values < threshold).
        left: Box<ITreeNode>,
        /// Right child (values >= threshold).
        right: Box<ITreeNode>,
    },
    /// Leaf node reached when max depth hit or only one sample remains.
    Leaf {
        /// Number of samples that reached this leaf during training.
        size: usize,
    },
}
56
/// A single isolation tree: one member of the forest ensemble, trained on a
/// random subsample of the data.
#[derive(Clone, Debug)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
struct IsolationTree {
    /// Root of the tree; a degenerate tree is a single `Leaf`.
    root: ITreeNode,
}
63
64impl IsolationTree {
65    /// Build an isolation tree on a subsample.
66    fn build(data: &[Vec<f64>], max_depth: usize, rng: &mut crate::rng::FastRng) -> Self {
67        let root = Self::build_node(data, 0, max_depth, rng);
68        Self { root }
69    }
70
71    fn build_node(
72        data: &[Vec<f64>],
73        depth: usize,
74        max_depth: usize,
75        rng: &mut crate::rng::FastRng,
76    ) -> ITreeNode {
77        let n = data.len();
78        if n <= 1 || depth >= max_depth {
79            return ITreeNode::Leaf { size: n };
80        }
81
82        let n_features = data[0].len();
83        if n_features == 0 {
84            return ITreeNode::Leaf { size: n };
85        }
86
87        // Pick a random feature.
88        let feature = rng.usize(0..n_features);
89
90        // Find min/max for this feature.
91        let mut min_val = f64::INFINITY;
92        let mut max_val = f64::NEG_INFINITY;
93        for sample in data {
94            let v = sample[feature];
95            if v < min_val {
96                min_val = v;
97            }
98            if v > max_val {
99                max_val = v;
100            }
101        }
102
103        // If all values are the same, can't split further.
104        if (max_val - min_val).abs() < f64::EPSILON {
105            return ITreeNode::Leaf { size: n };
106        }
107
108        // Random split value between min and max.
109        let threshold = min_val + rng.f64() * (max_val - min_val);
110
111        // Partition data.
112        let mut left_data = Vec::new();
113        let mut right_data = Vec::new();
114        for sample in data {
115            if sample[feature] < threshold {
116                left_data.push(sample.clone());
117            } else {
118                right_data.push(sample.clone());
119            }
120        }
121
122        // If one side is empty, treat as leaf (shouldn't happen with proper threshold).
123        if left_data.is_empty() || right_data.is_empty() {
124            return ITreeNode::Leaf { size: n };
125        }
126
127        let left = Self::build_node(&left_data, depth + 1, max_depth, rng);
128        let right = Self::build_node(&right_data, depth + 1, max_depth, rng);
129
130        ITreeNode::Split {
131            feature,
132            threshold,
133            left: Box::new(left),
134            right: Box::new(right),
135        }
136    }
137
138    /// Compute the path length for a single sample.
139    fn path_length(&self, sample: &[f64]) -> f64 {
140        Self::path_length_node(&self.root, sample, 0)
141    }
142
143    fn path_length_node(node: &ITreeNode, sample: &[f64], depth: usize) -> f64 {
144        match node {
145            ITreeNode::Leaf { size } => depth as f64 + average_path_length(*size),
146            ITreeNode::Split {
147                feature,
148                threshold,
149                left,
150                right,
151            } => {
152                if sample[*feature] < *threshold {
153                    Self::path_length_node(left, sample, depth + 1)
154                } else {
155                    Self::path_length_node(right, sample, depth + 1)
156                }
157            }
158        }
159    }
160}
161
162// ---------------------------------------------------------------------------
163// Average path length of unsuccessful search in BST
164// ---------------------------------------------------------------------------
165
/// Average path length of an unsuccessful search in a Binary Search Tree
/// with `n` elements: `c(n) = 2 * H(n-1) - 2 * (n-1) / n`,
/// where `H(k) ≈ ln(k) + γ` and `γ ≈ 0.5772156649` is the Euler–Mascheroni
/// constant. Returns `0.0` for `n <= 1` (nothing left to isolate).
fn average_path_length(n: usize) -> f64 {
    // Euler–Mascheroni constant for the harmonic-number approximation.
    const EULER_GAMMA: f64 = 0.577_215_664_9;
    match n {
        0 | 1 => 0.0,
        _ => {
            let m = (n - 1) as f64;
            2.0 * (m.ln() + EULER_GAMMA) - 2.0 * m / (m + 1.0)
        }
    }
}
176
177// ---------------------------------------------------------------------------
178// IsolationForest
179// ---------------------------------------------------------------------------
180
/// Isolation Forest for anomaly detection.
///
/// Builds an ensemble of random isolation trees. Points that are isolated
/// in fewer splits (shorter path lengths) receive higher anomaly scores.
///
/// # Algorithm
///
/// 1. **Fit**: Build `n_estimators` isolation trees, each trained on a random
///    subsample of `max_samples` points.
/// 2. **Score**: For each point, compute the average path length across all
///    trees. Normalize to `[0, 1]` using `score = 2^(-E[h(x)] / c(max_samples))`.
/// 3. **Label**: Scores above the threshold (determined by `contamination`)
///    are labelled as anomalies (`-1`), others as normal (`1`).
///
/// # Examples
///
/// ```ignore
/// use scry_learn::anomaly::IsolationForest;
///
/// let mut ifo = IsolationForest::new()
///     .n_estimators(100)
///     .contamination(0.05);
/// ifo.fit(&data).unwrap();
/// let labels = ifo.predict_labels(&data);
/// ```
#[derive(Clone, Debug)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[non_exhaustive]
pub struct IsolationForest {
    /// Number of isolation trees to build.
    n_estimators: usize,
    /// Number of samples to draw for each tree (capped at the dataset size).
    max_samples: usize,
    /// Expected proportion of outliers in the data (used for threshold).
    contamination: f64,
    /// Random seed; `None` means a fixed internal default is used in `fit`.
    random_state: Option<u64>,
    /// Trained trees (populated after fit; empty means "not fitted").
    trees: Vec<IsolationTree>,
    /// The anomaly score threshold (set after fit; starts at 0.5).
    threshold: f64,
    /// The subsample size used during training (for normalization constant).
    /// Zero until `fit` has run.
    training_sub_size: usize,
    // Schema tag for serialized models; serde(default) lets older payloads
    // without the field deserialize — NOTE(review): presumably used for
    // migration elsewhere in the crate, confirm against crate::version.
    #[cfg_attr(feature = "serde", serde(default))]
    _schema_version: u32,
}
227
228impl IsolationForest {
229    /// Create a new `IsolationForest` with default parameters.
230    ///
231    /// Defaults: 100 estimators, 256 max samples, 0.1 contamination.
232    pub fn new() -> Self {
233        Self {
234            n_estimators: 100,
235            max_samples: 256,
236            contamination: 0.1,
237            random_state: None,
238            trees: Vec::new(),
239            threshold: 0.5,
240            training_sub_size: 0,
241            _schema_version: crate::version::SCHEMA_VERSION,
242        }
243    }
244
245    /// Set the number of isolation trees (default: 100).
246    pub fn n_estimators(mut self, n: usize) -> Self {
247        self.n_estimators = n;
248        self
249    }
250
251    /// Set the subsample size per tree (default: 256).
252    pub fn max_samples(mut self, n: usize) -> Self {
253        self.max_samples = n;
254        self
255    }
256
257    /// Set the expected contamination ratio (default: 0.1).
258    ///
259    /// Must be in `(0, 0.5]`. Used to determine the anomaly score threshold.
260    pub fn contamination(mut self, c: f64) -> Self {
261        self.contamination = c;
262        self
263    }
264
265    /// Set the random seed for reproducibility.
266    pub fn random_state(mut self, seed: u64) -> Self {
267        self.random_state = Some(seed);
268        self
269    }
270
271    /// Alias for [`random_state`](Self::random_state) (consistent with other models).
272    pub fn seed(self, s: u64) -> Self {
273        self.random_state(s)
274    }
275
276    /// Build isolation trees on the provided feature matrix.
277    ///
278    /// `features` is row-major: `features[sample_idx][feature_idx]`.
279    ///
280    /// # Errors
281    ///
282    /// Returns [`ScryLearnError::EmptyDataset`] if `features` is empty.
283    /// Returns [`ScryLearnError::InvalidParameter`] if `contamination` is out of range.
284    pub fn fit(&mut self, features: &[Vec<f64>]) -> Result<()> {
285        for (i, row) in features.iter().enumerate() {
286            for (j, &v) in row.iter().enumerate() {
287                if !v.is_finite() {
288                    return Err(ScryLearnError::InvalidData(format!(
289                        "non-finite value ({v}) in feature[{j}] at sample {i}"
290                    )));
291                }
292            }
293        }
294        if features.is_empty() {
295            return Err(ScryLearnError::EmptyDataset);
296        }
297        if self.contamination <= 0.0 || self.contamination > 0.5 {
298            return Err(ScryLearnError::InvalidParameter(format!(
299                "contamination must be in (0, 0.5], got {}",
300                self.contamination
301            )));
302        }
303
304        let n = features.len();
305        let sub_size = self.max_samples.min(n);
306        let max_depth = (sub_size as f64).log2().ceil() as usize;
307        let seed = self.random_state.unwrap_or(42);
308
309        let mut trees = Vec::with_capacity(self.n_estimators);
310
311        for i in 0..self.n_estimators {
312            let mut rng = crate::rng::FastRng::new(seed.wrapping_add(i as u64));
313
314            // Sample `sub_size` points randomly (with replacement).
315            let subsample: Vec<Vec<f64>> = (0..sub_size)
316                .map(|_| features[rng.usize(0..n)].clone())
317                .collect();
318
319            let tree = IsolationTree::build(&subsample, max_depth, &mut rng);
320            trees.push(tree);
321        }
322
323        self.trees = trees;
324        self.training_sub_size = sub_size;
325
326        // Compute scores on training data to determine the threshold.
327        let mut scores = self.predict(features);
328        // Sort descending (highest score = most anomalous).
329        scores.sort_by(|a, b| b.partial_cmp(a).unwrap_or(std::cmp::Ordering::Equal));
330
331        // Threshold: the score at the `contamination` quantile.
332        let cutoff_idx = ((self.contamination * n as f64).ceil() as usize)
333            .min(n)
334            .max(1);
335        self.threshold = scores[cutoff_idx - 1];
336
337        Ok(())
338    }
339
340    /// Compute anomaly scores for each sample.
341    ///
342    /// Returns a `Vec<f64>` where values closer to 1.0 indicate anomalies
343    /// and values closer to 0.0 indicate normal points.
344    ///
345    /// Score formula: `score = 2^(-E[h(x)] / c(ψ))`
346    /// where `E[h(x)]` is the average path length and `c(ψ)` is the
347    /// average BST path length for the training subsample size `ψ`.
348    pub fn predict(&self, features: &[Vec<f64>]) -> Vec<f64> {
349        let n = features.len();
350        // Use the training subsample size for normalization (Liu et al.).
351        // During fit, training_sub_size is set; before fit, fall back to max_samples.
352        let sub_size = if self.training_sub_size > 0 {
353            self.training_sub_size
354        } else {
355            self.max_samples.min(n.max(1))
356        };
357        let c = average_path_length(sub_size);
358
359        if c.abs() < f64::EPSILON || self.trees.is_empty() {
360            return vec![0.5; n];
361        }
362
363        features
364            .iter()
365            .map(|sample| {
366                let avg_path: f64 = self
367                    .trees
368                    .iter()
369                    .map(|t| t.path_length(sample))
370                    .sum::<f64>()
371                    / self.trees.len() as f64;
372                2.0_f64.powf(-avg_path / c)
373            })
374            .collect()
375    }
376
377    /// Predict anomaly labels: `1` for normal, `-1` for anomaly.
378    ///
379    /// Uses the contamination-based threshold computed during `fit`.
380    /// Returns all `1` (normal) if the model has not been fitted.
381    pub fn predict_labels(&self, features: &[Vec<f64>]) -> Vec<i8> {
382        if self.trees.is_empty() {
383            return vec![1; features.len()];
384        }
385        let scores = self.predict(features);
386        scores
387            .into_iter()
388            .map(|s| if s >= self.threshold { -1 } else { 1 })
389            .collect()
390    }
391
392    /// Returns the anomaly score threshold determined during fit.
393    pub fn score_threshold(&self) -> f64 {
394        self.threshold
395    }
396}
397
398impl Default for IsolationForest {
399    fn default() -> Self {
400        Self::new()
401    }
402}
403
#[cfg(test)]
mod tests {
    use super::*;

    /// Generate `n_normal` normal points centered around origin and
    /// `n_outliers` outliers far from origin.
    ///
    /// Deterministic for a given `seed`; normal points come first, outliers
    /// last, which the tests below rely on when slicing by index.
    fn make_test_data(n_normal: usize, n_outliers: usize, seed: u64) -> Vec<Vec<f64>> {
        let mut rng = crate::rng::FastRng::new(seed);
        let mut data = Vec::with_capacity(n_normal + n_outliers);

        // Normal cluster around (0, 0), uniform in [-1, 1]^2.
        for _ in 0..n_normal {
            data.push(vec![rng.f64() * 2.0 - 1.0, rng.f64() * 2.0 - 1.0]);
        }

        // Outliers far away, uniform in [10, 15]^2.
        for _ in 0..n_outliers {
            data.push(vec![10.0 + rng.f64() * 5.0, 10.0 + rng.f64() * 5.0]);
        }

        data
    }

    /// Mean score of the outlier block should exceed the normal block's.
    #[test]
    fn test_iforest_detects_outliers() {
        let data = make_test_data(90, 10, 42);
        let mut ifo = IsolationForest::new()
            .n_estimators(100)
            .max_samples(64)
            .contamination(0.1)
            .random_state(42);

        ifo.fit(&data).unwrap();
        let scores = ifo.predict(&data);

        // Outliers (last 10 points) should have higher scores than normal points.
        let normal_mean: f64 = scores[..90].iter().sum::<f64>() / 90.0;
        let outlier_mean: f64 = scores[90..].iter().sum::<f64>() / 10.0;

        assert!(
            outlier_mean > normal_mean,
            "outlier mean score ({:.3}) should be higher than normal mean ({:.3})",
            outlier_mean,
            normal_mean,
        );
    }

    /// Labelling with contamination = 0.1 should recover most planted outliers.
    #[test]
    fn test_iforest_labels_recall() {
        let data = make_test_data(90, 10, 123);
        let mut ifo = IsolationForest::new()
            .n_estimators(100)
            .max_samples(64)
            .contamination(0.1)
            .random_state(123);

        ifo.fit(&data).unwrap();
        let labels = ifo.predict_labels(&data);

        // Count how many of the last 10 (outliers) were labeled -1.
        let outlier_detected = labels[90..].iter().filter(|&&l| l == -1).count();
        let recall = outlier_detected as f64 / 10.0;

        assert!(
            recall >= 0.7,
            "expected outlier recall ≥ 0.70, got {:.2}",
            recall,
        );
    }

    /// 1-D data: a single extreme value must receive the top anomaly score.
    #[test]
    fn test_iforest_single_feature() {
        let mut data: Vec<Vec<f64>> = (0..100).map(|i| vec![i as f64 * 0.1]).collect();
        // Add an outlier.
        data.push(vec![1000.0]);

        let mut ifo = IsolationForest::new()
            .n_estimators(50)
            .max_samples(64)
            .contamination(0.05)
            .random_state(7);

        ifo.fit(&data).unwrap();
        let scores = ifo.predict(&data);

        // The outlier (last) should have the highest score.
        let max_score_idx = scores
            .iter()
            .enumerate()
            .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
            .unwrap()
            .0;

        assert_eq!(
            max_score_idx,
            data.len() - 1,
            "outlier should have highest anomaly score"
        );
    }

    /// 4-D data: most of the planted far-away points should be flagged.
    #[test]
    fn test_iforest_multi_feature() {
        let mut rng = crate::rng::FastRng::new(99);
        let mut data: Vec<Vec<f64>> = (0..100)
            .map(|_| {
                vec![
                    rng.f64() * 2.0,
                    rng.f64() * 2.0,
                    rng.f64() * 2.0,
                    rng.f64() * 2.0,
                ]
            })
            .collect();
        // Add outliers in 4D.
        for _ in 0..5 {
            data.push(vec![50.0, 50.0, 50.0, 50.0]);
        }

        let mut ifo = IsolationForest::new()
            .n_estimators(80)
            .max_samples(64)
            .contamination(0.05)
            .random_state(99);

        ifo.fit(&data).unwrap();
        let labels = ifo.predict_labels(&data);

        let outlier_detected = labels[100..].iter().filter(|&&l| l == -1).count();
        assert!(
            outlier_detected >= 3,
            "expected ≥ 3 of 5 outliers detected, got {}",
            outlier_detected,
        );
    }

    /// Fitting on an empty slice must error rather than panic.
    #[test]
    fn test_iforest_empty_input() {
        let mut ifo = IsolationForest::new();
        let result = ifo.fit(&[]);
        assert!(result.is_err());
    }

    /// Contamination outside (0, 0.5] is rejected at fit time.
    #[test]
    fn test_iforest_invalid_contamination() {
        let data = make_test_data(10, 0, 1);
        let mut ifo = IsolationForest::new().contamination(0.0);
        assert!(ifo.fit(&data).is_err());

        let mut ifo2 = IsolationForest::new().contamination(0.6);
        assert!(ifo2.fit(&data).is_err());
    }

    /// `Default` must match the documented `new()` defaults.
    #[test]
    fn test_iforest_default() {
        let ifo = IsolationForest::default();
        assert_eq!(ifo.n_estimators, 100);
        assert_eq!(ifo.max_samples, 256);
        assert!((ifo.contamination - 0.1).abs() < f64::EPSILON);
    }

    /// Spot-check c(n) against hand-computed values of the closed form.
    #[test]
    fn test_average_path_length() {
        assert!((average_path_length(0) - 0.0).abs() < f64::EPSILON);
        assert!((average_path_length(1) - 0.0).abs() < f64::EPSILON);
        // c(2) = 2*(ln(1) + 0.5772) - 2*1/2 = 1.1544 - 1 ≈ 0.1544
        let c2 = average_path_length(2);
        assert!((c2 - 0.1544).abs() < 0.01, "c(2) = {c2}");
        // c(256) ≈ 9.21
        let c256 = average_path_length(256);
        assert!(c256 > 8.0 && c256 < 12.0, "c(256) = {c256}");
    }
}