use crate::error::{Result, ScryLearnError};
/// One node of an isolation tree.
///
/// `Split` partitions samples on a single randomly chosen feature/threshold
/// pair; `Leaf` terminates a path and records how many training samples
/// reached it (used to estimate the unexpanded subtree's depth when scoring).
#[derive(Clone, Debug)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
enum ITreeNode {
    /// Internal node: samples with `sample[feature] < threshold` descend left,
    /// all others descend right.
    Split {
        feature: usize,
        threshold: f64,
        left: Box<ITreeNode>,
        right: Box<ITreeNode>,
    },
    /// Terminal node; `size` is the number of training samples at this leaf.
    Leaf {
        size: usize,
    },
}
/// A single isolation tree — one ensemble member of an isolation forest.
/// Built from a random subsample; shorter root-to-leaf paths indicate
/// samples that are easier to isolate (more anomalous).
#[derive(Clone, Debug)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
struct IsolationTree {
    root: ITreeNode,
}
impl IsolationTree {
    /// Builds one isolation tree over `data`, growing nodes until `max_depth`
    /// or until a partition can no longer be split.
    fn build(data: &[Vec<f64>], max_depth: usize, rng: &mut crate::rng::FastRng) -> Self {
        Self {
            root: Self::build_node(data, 0, max_depth, rng),
        }
    }

    /// Recursively grows a node: picks a random feature, then a random
    /// threshold uniformly inside that feature's observed [min, max] range,
    /// and partitions the samples on it.
    ///
    /// NOTE: RNG call order (feature index, then threshold, then the left
    /// subtree before the right) is part of the reproducibility contract and
    /// must not be reordered.
    fn build_node(
        data: &[Vec<f64>],
        depth: usize,
        max_depth: usize,
        rng: &mut crate::rng::FastRng,
    ) -> ITreeNode {
        let n = data.len();
        // A single sample is fully isolated; depth cap bounds tree size.
        if n <= 1 || depth >= max_depth {
            return ITreeNode::Leaf { size: n };
        }
        let n_features = data[0].len();
        if n_features == 0 {
            return ITreeNode::Leaf { size: n };
        }
        let feature = rng.usize(0..n_features);
        // Observed range of the chosen feature across this partition.
        let (min_val, max_val) = data.iter().fold(
            (f64::INFINITY, f64::NEG_INFINITY),
            |(lo, hi), row| {
                let v = row[feature];
                (if v < lo { v } else { lo }, if v > hi { v } else { hi })
            },
        );
        // A (near-)constant feature cannot separate anything here.
        if (max_val - min_val).abs() < f64::EPSILON {
            return ITreeNode::Leaf { size: n };
        }
        let threshold = min_val + rng.f64() * (max_val - min_val);
        // `partition` keeps relative order, matching a manual push loop.
        let (left_data, right_data): (Vec<Vec<f64>>, Vec<Vec<f64>>) = data
            .iter()
            .cloned()
            .partition(|row| row[feature] < threshold);
        if left_data.is_empty() || right_data.is_empty() {
            return ITreeNode::Leaf { size: n };
        }
        // Struct fields evaluate in written order: left subtree first.
        ITreeNode::Split {
            feature,
            threshold,
            left: Box::new(Self::build_node(&left_data, depth + 1, max_depth, rng)),
            right: Box::new(Self::build_node(&right_data, depth + 1, max_depth, rng)),
        }
    }

    /// Walks `sample` from the root to its leaf and returns the path length,
    /// adjusted by the expected depth of the unexpanded subtree at the leaf.
    fn path_length(&self, sample: &[f64]) -> f64 {
        let mut node = &self.root;
        let mut depth = 0usize;
        loop {
            match node {
                ITreeNode::Leaf { size } => {
                    return depth as f64 + average_path_length(*size);
                }
                ITreeNode::Split {
                    feature,
                    threshold,
                    left,
                    right,
                } => {
                    node = if sample[*feature] < *threshold { left } else { right };
                    depth += 1;
                }
            }
        }
    }
}
/// Expected path length `c(n)` of an unsuccessful search in a binary search
/// tree built over `n` samples: `c(n) = 2 * H(n - 1) - 2 * (n - 1) / n`,
/// with the harmonic number approximated as `H(i) ≈ ln(i) + γ`
/// (γ = Euler–Mascheroni constant). Used to normalize isolation-tree path
/// lengths. Returns 0 for `n <= 1`, where no further splitting is possible.
fn average_path_length(n: usize) -> f64 {
    const EULER_MASCHERONI: f64 = 0.577_215_664_9;
    match n {
        0 | 1 => 0.0,
        _ => {
            let m = (n - 1) as f64;
            2.0 * (m.ln() + EULER_MASCHERONI) - 2.0 * m / (m + 1.0)
        }
    }
}
/// Isolation Forest anomaly detector.
///
/// Scores samples by how quickly random axis-aligned splits isolate them:
/// outliers are separated in few splits and receive higher scores. Configure
/// via the builder methods, then call `fit` followed by
/// `predict`/`predict_labels`.
#[derive(Clone, Debug)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[non_exhaustive]
pub struct IsolationForest {
    // Number of trees in the ensemble (default 100).
    n_estimators: usize,
    // Per-tree subsample size; capped at the dataset size in `fit`.
    max_samples: usize,
    // Expected fraction of outliers, must lie in (0, 0.5]; drives threshold calibration.
    contamination: f64,
    // Optional RNG seed; `fit` falls back to 42 when unset.
    random_state: Option<u64>,
    // Fitted trees; empty until `fit` succeeds.
    trees: Vec<IsolationTree>,
    // Score cutoff: samples scoring >= this are labeled outliers (-1).
    threshold: f64,
    // Subsample size actually used during fitting (0 = not yet fitted).
    training_sub_size: usize,
    // Serialization schema tag, taken from crate::version::SCHEMA_VERSION.
    #[cfg_attr(feature = "serde", serde(default))]
    _schema_version: u32,
}
impl IsolationForest {
    /// Creates a forest with library defaults: 100 trees, subsample size 256,
    /// 10% expected contamination, and no fixed seed.
    pub fn new() -> Self {
        Self {
            n_estimators: 100,
            max_samples: 256,
            contamination: 0.1,
            random_state: None,
            trees: Vec::new(),
            threshold: 0.5,
            training_sub_size: 0,
            _schema_version: crate::version::SCHEMA_VERSION,
        }
    }

    /// Sets the number of trees in the ensemble (builder-style).
    pub fn n_estimators(mut self, n: usize) -> Self {
        self.n_estimators = n;
        self
    }

    /// Sets the per-tree subsample size; capped at the dataset size in `fit`.
    pub fn max_samples(mut self, n: usize) -> Self {
        self.max_samples = n;
        self
    }

    /// Sets the expected fraction of outliers; validated to (0, 0.5] in `fit`.
    pub fn contamination(mut self, c: f64) -> Self {
        self.contamination = c;
        self
    }

    /// Fixes the RNG seed for reproducible forests.
    pub fn random_state(mut self, seed: u64) -> Self {
        self.random_state = Some(seed);
        self
    }

    /// Alias for [`Self::random_state`].
    pub fn seed(self, s: u64) -> Self {
        self.random_state(s)
    }

    /// Fits the ensemble on `features` and calibrates the decision threshold
    /// so that roughly `contamination * n` training samples score at or
    /// above it.
    ///
    /// # Errors
    /// - `InvalidData` if any value is non-finite or rows have inconsistent
    ///   lengths (ragged rows would otherwise panic inside tree building).
    /// - `EmptyDataset` if `features` is empty.
    /// - `InvalidParameter` if `contamination` is outside (0, 0.5] (including
    ///   NaN).
    pub fn fit(&mut self, features: &[Vec<f64>]) -> Result<()> {
        // Reject NaN/inf early: they would silently distort split ranges.
        for (i, row) in features.iter().enumerate() {
            for (j, &v) in row.iter().enumerate() {
                if !v.is_finite() {
                    return Err(ScryLearnError::InvalidData(format!(
                        "non-finite value ({v}) in feature[{j}] at sample {i}"
                    )));
                }
            }
        }
        if features.is_empty() {
            return Err(ScryLearnError::EmptyDataset);
        }
        // Ragged rows would panic later when a randomly chosen feature index
        // exceeds a short row's length; fail with a proper error instead.
        let width = features[0].len();
        if let Some((i, row)) = features
            .iter()
            .enumerate()
            .find(|(_, row)| row.len() != width)
        {
            return Err(ScryLearnError::InvalidData(format!(
                "inconsistent row length at sample {i}: expected {width} features, got {}",
                row.len()
            )));
        }
        // Negated form so a NaN contamination (which fails both `<=` and `>`
        // comparisons) is rejected rather than silently accepted.
        if !(self.contamination > 0.0 && self.contamination <= 0.5) {
            return Err(ScryLearnError::InvalidParameter(format!(
                "contamination must be in (0, 0.5], got {}",
                self.contamination
            )));
        }
        let n = features.len();
        let sub_size = self.max_samples.min(n);
        // Trees deeper than log2(subsample) add no isolation power.
        let max_depth = (sub_size as f64).log2().ceil() as usize;
        let seed = self.random_state.unwrap_or(42);
        let mut trees = Vec::with_capacity(self.n_estimators);
        for i in 0..self.n_estimators {
            // Each tree gets its own deterministic RNG stream.
            let mut rng = crate::rng::FastRng::new(seed.wrapping_add(i as u64));
            // Bootstrap subsample: draw `sub_size` rows with replacement.
            let subsample: Vec<Vec<f64>> = (0..sub_size)
                .map(|_| features[rng.usize(0..n)].clone())
                .collect();
            trees.push(IsolationTree::build(&subsample, max_depth, &mut rng));
        }
        self.trees = trees;
        self.training_sub_size = sub_size;
        // Calibrate the threshold from training scores, sorted descending.
        // Scores are finite (2^-x of finite x), so total_cmp orders them the
        // same as partial_cmp would, without a fallible unwrap.
        let mut scores = self.predict(features);
        scores.sort_by(|a, b| b.total_cmp(a));
        let cutoff_idx = ((self.contamination * n as f64).ceil() as usize)
            .min(n)
            .max(1);
        self.threshold = scores[cutoff_idx - 1];
        Ok(())
    }

    /// Returns an anomaly score in (0, 1] per sample; higher means more
    /// anomalous. An unfitted forest yields the neutral score 0.5 everywhere.
    ///
    /// # Panics
    /// May panic if a row has fewer features than the training data, since
    /// tree traversal indexes into the sample by feature index.
    pub fn predict(&self, features: &[Vec<f64>]) -> Vec<f64> {
        let n = features.len();
        let sub_size = if self.training_sub_size > 0 {
            self.training_sub_size
        } else {
            self.max_samples.min(n.max(1))
        };
        // c(sub_size) normalizes path lengths to the expected depth of an
        // unsuccessful BST search over the training subsample.
        let c = average_path_length(sub_size);
        if c.abs() < f64::EPSILON || self.trees.is_empty() {
            return vec![0.5; n];
        }
        features
            .iter()
            .map(|sample| {
                let avg_path: f64 = self
                    .trees
                    .iter()
                    .map(|t| t.path_length(sample))
                    .sum::<f64>()
                    / self.trees.len() as f64;
                // Standard isolation-forest score: s = 2^(-E[h(x)] / c(n)).
                2.0_f64.powf(-avg_path / c)
            })
            .collect()
    }

    /// Classifies each sample: -1 = outlier (score >= fitted threshold),
    /// 1 = inlier. An unfitted forest labels everything as an inlier.
    pub fn predict_labels(&self, features: &[Vec<f64>]) -> Vec<i8> {
        if self.trees.is_empty() {
            return vec![1; features.len()];
        }
        let scores = self.predict(features);
        scores
            .into_iter()
            .map(|s| if s >= self.threshold { -1 } else { 1 })
            .collect()
    }

    /// Returns the score cutoff calibrated by `fit` (0.5 before fitting).
    pub fn score_threshold(&self) -> f64 {
        self.threshold
    }
}
impl Default for IsolationForest {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
    use super::*;

    // Deterministic fixture: `n_normal` 2-D points in [-1, 1]^2 followed by
    // `n_outliers` points in [10, 15]^2. Outliers occupy the trailing
    // indices, which the tests below rely on when slicing results.
    fn make_test_data(n_normal: usize, n_outliers: usize, seed: u64) -> Vec<Vec<f64>> {
        let mut rng = crate::rng::FastRng::new(seed);
        let mut data = Vec::with_capacity(n_normal + n_outliers);
        for _ in 0..n_normal {
            data.push(vec![rng.f64() * 2.0 - 1.0, rng.f64() * 2.0 - 1.0]);
        }
        for _ in 0..n_outliers {
            data.push(vec![10.0 + rng.f64() * 5.0, 10.0 + rng.f64() * 5.0]);
        }
        data
    }

    // Far-away points should receive a higher mean anomaly score than the
    // normal cluster.
    #[test]
    fn test_iforest_detects_outliers() {
        let data = make_test_data(90, 10, 42);
        let mut ifo = IsolationForest::new()
            .n_estimators(100)
            .max_samples(64)
            .contamination(0.1)
            .random_state(42);
        ifo.fit(&data).unwrap();
        let scores = ifo.predict(&data);
        let normal_mean: f64 = scores[..90].iter().sum::<f64>() / 90.0;
        let outlier_mean: f64 = scores[90..].iter().sum::<f64>() / 10.0;
        assert!(
            outlier_mean > normal_mean,
            "outlier mean score ({:.3}) should be higher than normal mean ({:.3})",
            outlier_mean,
            normal_mean,
        );
    }

    // With contamination matched to the true outlier fraction (10%), the
    // hard labels should recover at least 70% of the planted outliers.
    #[test]
    fn test_iforest_labels_recall() {
        let data = make_test_data(90, 10, 123);
        let mut ifo = IsolationForest::new()
            .n_estimators(100)
            .max_samples(64)
            .contamination(0.1)
            .random_state(123);
        ifo.fit(&data).unwrap();
        let labels = ifo.predict_labels(&data);
        let outlier_detected = labels[90..].iter().filter(|&&l| l == -1).count();
        let recall = outlier_detected as f64 / 10.0;
        assert!(
            recall >= 0.7,
            "expected outlier recall ≥ 0.70, got {:.2}",
            recall,
        );
    }

    // 1-D data: a single extreme value appended at the end must receive the
    // highest anomaly score of all samples.
    #[test]
    fn test_iforest_single_feature() {
        let mut data: Vec<Vec<f64>> = (0..100).map(|i| vec![i as f64 * 0.1]).collect();
        data.push(vec![1000.0]);
        let mut ifo = IsolationForest::new()
            .n_estimators(50)
            .max_samples(64)
            .contamination(0.05)
            .random_state(7);
        ifo.fit(&data).unwrap();
        let scores = ifo.predict(&data);
        let max_score_idx = scores
            .iter()
            .enumerate()
            .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
            .unwrap()
            .0;
        assert_eq!(
            max_score_idx,
            data.len() - 1,
            "outlier should have highest anomaly score"
        );
    }

    // 4-D data with 5 identical extreme points (indices 100..105): most of
    // them should be flagged despite sitting in a tight cluster themselves.
    #[test]
    fn test_iforest_multi_feature() {
        let mut rng = crate::rng::FastRng::new(99);
        let mut data: Vec<Vec<f64>> = (0..100)
            .map(|_| {
                vec![
                    rng.f64() * 2.0,
                    rng.f64() * 2.0,
                    rng.f64() * 2.0,
                    rng.f64() * 2.0,
                ]
            })
            .collect();
        for _ in 0..5 {
            data.push(vec![50.0, 50.0, 50.0, 50.0]);
        }
        let mut ifo = IsolationForest::new()
            .n_estimators(80)
            .max_samples(64)
            .contamination(0.05)
            .random_state(99);
        ifo.fit(&data).unwrap();
        let labels = ifo.predict_labels(&data);
        let outlier_detected = labels[100..].iter().filter(|&&l| l == -1).count();
        assert!(
            outlier_detected >= 3,
            "expected ≥ 3 of 5 outliers detected, got {}",
            outlier_detected,
        );
    }

    // Fitting on an empty slice must fail rather than panic.
    #[test]
    fn test_iforest_empty_input() {
        let mut ifo = IsolationForest::new();
        let result = ifo.fit(&[]);
        assert!(result.is_err());
    }

    // Contamination is validated to (0, 0.5]: both 0.0 and 0.6 are rejected.
    #[test]
    fn test_iforest_invalid_contamination() {
        let data = make_test_data(10, 0, 1);
        let mut ifo = IsolationForest::new().contamination(0.0);
        assert!(ifo.fit(&data).is_err());
        let mut ifo2 = IsolationForest::new().contamination(0.6);
        assert!(ifo2.fit(&data).is_err());
    }

    // Default::default() must match the documented constructor defaults.
    #[test]
    fn test_iforest_default() {
        let ifo = IsolationForest::default();
        assert_eq!(ifo.n_estimators, 100);
        assert_eq!(ifo.max_samples, 256);
        assert!((ifo.contamination - 0.1).abs() < f64::EPSILON);
    }

    // Spot-checks of c(n): c(0) = c(1) = 0, c(2) = 2γ - 1 ≈ 0.1544, and
    // c(256) falls in the expected ~10.24 range.
    #[test]
    fn test_average_path_length() {
        assert!((average_path_length(0) - 0.0).abs() < f64::EPSILON);
        assert!((average_path_length(1) - 0.0).abs() < f64::EPSILON);
        let c2 = average_path_length(2);
        assert!((c2 - 0.1544).abs() < 0.01, "c(2) = {c2}");
        let c256 = average_path_length(256);
        assert!(c256 > 8.0 && c256 < 12.0, "c(256) = {c256}");
    }
}