use crate::error::InsightError;
#[derive(Debug, Clone)]
pub struct IsolationForestConfig {
pub n_estimators: usize,
pub max_samples: usize,
pub contamination: f64,
pub seed: Option<u64>,
}
impl Default for IsolationForestConfig {
fn default() -> Self {
Self {
n_estimators: 100,
max_samples: 0, contamination: 0.1,
seed: Some(42),
}
}
}
impl IsolationForestConfig {
pub fn n_estimators(mut self, n: usize) -> Self {
self.n_estimators = n;
self
}
pub fn contamination(mut self, c: f64) -> Self {
self.contamination = c;
self
}
pub fn seed(mut self, seed: Option<u64>) -> Self {
self.seed = seed;
self
}
pub fn max_samples(mut self, m: usize) -> Self {
self.max_samples = m;
self
}
}
#[derive(Debug, Clone)]
pub struct IsolationForestResult {
pub scores: Vec<f64>,
pub anomalies: Vec<bool>,
pub threshold: f64,
pub anomaly_count: usize,
pub anomaly_fraction: f64,
}
pub fn isolation_forest(
data: &[Vec<f64>],
config: &IsolationForestConfig,
) -> Result<IsolationForestResult, InsightError> {
let n = data.len();
if n < 2 {
return Err(InsightError::InsufficientData {
min_required: 2,
actual: n,
});
}
let d = data[0].len();
if d == 0 {
return Err(InsightError::DegenerateData {
reason: "data has 0 features".into(),
});
}
for (i, point) in data.iter().enumerate() {
if point.len() != d {
return Err(InsightError::DimensionMismatch {
expected: d,
actual: point.len(),
});
}
for (j, &v) in point.iter().enumerate() {
if !v.is_finite() {
return Err(InsightError::DegenerateData {
reason: format!("non-finite value at point {i}, dimension {j}"),
});
}
}
}
if config.n_estimators == 0 {
return Err(InsightError::InvalidParameter {
name: "n_estimators".into(),
message: "must be at least 1".into(),
});
}
if !(0.0..=1.0).contains(&config.contamination) {
return Err(InsightError::InvalidParameter {
name: "contamination".into(),
message: format!("must be in [0.0, 1.0], got {}", config.contamination),
});
}
let max_samples = if config.max_samples == 0 {
n.min(256)
} else {
config.max_samples.min(n)
};
let max_depth = (max_samples as f64).log2().ceil() as usize;
let mut rng_state = config.seed.unwrap_or(12345);
let mut trees = Vec::with_capacity(config.n_estimators);
for _ in 0..config.n_estimators {
let indices = sample_indices(n, max_samples, &mut rng_state);
let subsample: Vec<&[f64]> = indices.iter().map(|&i| data[i].as_slice()).collect();
let tree = build_itree(&subsample, d, max_depth, &mut rng_state);
trees.push(tree);
}
let cn = c_factor(max_samples);
let mut scores = Vec::with_capacity(n);
for point in data {
let avg_path: f64 = trees
.iter()
.map(|tree| path_length(point, tree, 0))
.sum::<f64>()
/ config.n_estimators as f64;
let score = if cn > 0.0 {
2.0f64.powf(-avg_path / cn)
} else {
0.5 };
scores.push(score);
}
let mut sorted_scores: Vec<f64> = scores.clone();
sorted_scores.sort_by(|a, b| b.partial_cmp(a).unwrap_or(std::cmp::Ordering::Equal));
let threshold_idx = ((n as f64 * config.contamination).ceil() as usize)
.min(n)
.max(1)
- 1;
let threshold = sorted_scores[threshold_idx];
let anomalies: Vec<bool> = scores.iter().map(|&s| s >= threshold).collect();
let anomaly_count = anomalies.iter().filter(|&&a| a).count();
Ok(IsolationForestResult {
scores,
anomalies,
threshold,
anomaly_count,
anomaly_fraction: anomaly_count as f64 / n as f64,
})
}
enum ITreeNode {
Internal {
feature: usize,
split_value: f64,
left: Box<ITreeNode>,
right: Box<ITreeNode>,
},
External { size: usize },
}
fn build_itree(data: &[&[f64]], d: usize, max_depth: usize, rng: &mut u64) -> ITreeNode {
let n = data.len();
if n <= 1 || max_depth == 0 {
return ITreeNode::External { size: n };
}
let feature = lcg_next_usize(rng, d);
let mut min_val = f64::INFINITY;
let mut max_val = f64::NEG_INFINITY;
for point in data {
let v = point[feature];
if v < min_val {
min_val = v;
}
if v > max_val {
max_val = v;
}
}
if (max_val - min_val).abs() < 1e-15 {
return ITreeNode::External { size: n };
}
let split_value = min_val + lcg_next_f64(rng) * (max_val - min_val);
let mut left_data = Vec::new();
let mut right_data = Vec::new();
for &point in data {
if point[feature] < split_value {
left_data.push(point);
} else {
right_data.push(point);
}
}
if left_data.is_empty() || right_data.is_empty() {
return ITreeNode::External { size: n };
}
ITreeNode::Internal {
feature,
split_value,
left: Box::new(build_itree(&left_data, d, max_depth - 1, rng)),
right: Box::new(build_itree(&right_data, d, max_depth - 1, rng)),
}
}
fn path_length(point: &[f64], node: &ITreeNode, current_depth: usize) -> f64 {
match node {
ITreeNode::External { size } => current_depth as f64 + c_factor(*size),
ITreeNode::Internal {
feature,
split_value,
left,
right,
} => {
if point[*feature] < *split_value {
path_length(point, left, current_depth + 1)
} else {
path_length(point, right, current_depth + 1)
}
}
}
}
fn c_factor(n: usize) -> f64 {
if n <= 1 {
return 0.0;
}
if n == 2 {
return 1.0;
}
let n_f = n as f64;
let harmonic = (n_f - 1.0).ln() + 0.5772156649;
2.0 * harmonic - 2.0 * (n_f - 1.0) / n_f
}
fn lcg_next_f64(state: &mut u64) -> f64 {
*state = state
.wrapping_mul(6364136223846793005)
.wrapping_add(1442695040888963407);
(*state >> 33) as f64 / (1u64 << 31) as f64
}
fn lcg_next_usize(state: &mut u64, max: usize) -> usize {
(lcg_next_f64(state) * max as f64) as usize % max
}
fn sample_indices(n: usize, k: usize, rng: &mut u64) -> Vec<usize> {
let k = k.min(n);
let mut indices: Vec<usize> = (0..n).collect();
for i in 0..k {
let j = i + lcg_next_usize(rng, n - i);
indices.swap(i, j);
}
indices.truncate(k);
indices
}
#[cfg(test)]
mod tests {
use super::*;
fn make_normal_with_outliers() -> Vec<Vec<f64>> {
let mut data = Vec::new();
for i in 0..40 {
let x = 5.0 + (i % 7) as f64 * 0.2 - 0.6;
let y = 5.0 + (i % 5) as f64 * 0.3 - 0.6;
data.push(vec![x, y]);
}
data.push(vec![50.0, 50.0]);
data.push(vec![-40.0, -40.0]);
data.push(vec![50.0, -40.0]);
data
}
#[test]
fn detects_obvious_outliers() {
let data = make_normal_with_outliers();
let result =
isolation_forest(&data, &IsolationForestConfig::default().contamination(0.1)).unwrap();
let n = data.len();
for i in (n - 3)..n {
assert!(
result.scores[i] > 0.5,
"outlier point {i} score = {} should be > 0.5",
result.scores[i]
);
}
}
#[test]
fn outlier_scores_higher_than_normal() {
let data = make_normal_with_outliers();
let result = isolation_forest(&data, &IsolationForestConfig::default()).unwrap();
let normal_max = data[..40]
.iter()
.enumerate()
.map(|(i, _)| result.scores[i])
.fold(0.0f64, f64::max);
let outlier_min = data[40..]
.iter()
.enumerate()
.map(|(i, _)| result.scores[40 + i])
.fold(1.0f64, f64::min);
assert!(
outlier_min > normal_max,
"outlier min score ({outlier_min}) should exceed normal max ({normal_max})"
);
}
#[test]
fn anomaly_count_matches_contamination() {
let data = make_normal_with_outliers();
let n = data.len();
let contamination = 0.1;
let result = isolation_forest(
&data,
&IsolationForestConfig::default().contamination(contamination),
)
.unwrap();
let expected_max = (n as f64 * contamination).ceil() as usize + 1;
assert!(
result.anomaly_count <= expected_max,
"anomaly_count {} should be <= {}",
result.anomaly_count,
expected_max
);
}
#[test]
fn scores_in_range() {
let data = make_normal_with_outliers();
let result = isolation_forest(&data, &IsolationForestConfig::default()).unwrap();
for (i, &score) in result.scores.iter().enumerate() {
assert!(
(0.0..=1.0).contains(&score),
"score[{i}] = {score} out of [0, 1] range"
);
}
}
#[test]
fn scores_reproducible_with_seed() {
let data = make_normal_with_outliers();
let config = IsolationForestConfig::default().seed(Some(123));
let r1 = isolation_forest(&data, &config).unwrap();
let r2 = isolation_forest(&data, &config).unwrap();
for (i, (&s1, &s2)) in r1.scores.iter().zip(r2.scores.iter()).enumerate() {
assert!((s1 - s2).abs() < 1e-10, "score[{i}] differs: {s1} vs {s2}");
}
}
#[test]
fn uniform_data_low_scores() {
let data: Vec<Vec<f64>> = (0..50)
.map(|i| vec![i as f64 * 0.1, i as f64 * 0.1])
.collect();
let result = isolation_forest(&data, &IsolationForestConfig::default()).unwrap();
let mean_score: f64 = result.scores.iter().sum::<f64>() / result.scores.len() as f64;
assert!(
mean_score < 0.7,
"mean score for uniform data should be moderate: {mean_score}"
);
}
#[test]
fn empty_data() {
let data: Vec<Vec<f64>> = vec![];
assert!(isolation_forest(&data, &IsolationForestConfig::default()).is_err());
}
#[test]
fn single_point() {
let data = vec![vec![1.0, 2.0]];
assert!(isolation_forest(&data, &IsolationForestConfig::default()).is_err());
}
#[test]
fn nan_rejected() {
let data = vec![vec![1.0, f64::NAN], vec![2.0, 3.0]];
assert!(isolation_forest(&data, &IsolationForestConfig::default()).is_err());
}
#[test]
fn dimension_mismatch() {
let data = vec![vec![1.0, 2.0], vec![3.0]];
assert!(isolation_forest(&data, &IsolationForestConfig::default()).is_err());
}
#[test]
fn one_dimensional_outlier() {
let mut data: Vec<Vec<f64>> = (0..30).map(|i| vec![i as f64 * 0.1]).collect();
data.push(vec![100.0]);
let result = isolation_forest(&data, &IsolationForestConfig::default()).unwrap();
let outlier_score = result.scores[30];
let normal_mean: f64 = result.scores[..30].iter().sum::<f64>() / 30.0;
assert!(
outlier_score > normal_mean,
"outlier score ({outlier_score}) should exceed normal mean ({normal_mean})"
);
}
#[test]
fn high_dim_detection() {
let mut data = Vec::new();
for i in 0..30 {
let mut point = vec![0.0; 10];
point[i % 10] = (i % 5) as f64 * 0.2;
data.push(point);
}
data.push(vec![100.0; 10]);
let result =
isolation_forest(&data, &IsolationForestConfig::default().n_estimators(200)).unwrap();
let max_idx = result
.scores
.iter()
.enumerate()
.max_by(|a, b| a.1.partial_cmp(b.1).unwrap_or(std::cmp::Ordering::Equal))
.unwrap()
.0;
assert_eq!(max_idx, 30, "outlier should have highest anomaly score");
}
#[test]
fn c_factor_known_values() {
assert_eq!(c_factor(1), 0.0);
assert_eq!(c_factor(2), 1.0);
let c256 = c_factor(256);
assert!(
(c256 - 10.244).abs() < 0.1,
"c(256) = {c256}, expected ~10.244"
);
}
#[test]
fn different_n_estimators() {
let data = make_normal_with_outliers();
let result =
isolation_forest(&data, &IsolationForestConfig::default().n_estimators(10)).unwrap();
let outlier_score = result.scores[40];
assert!(
outlier_score > 0.5,
"even with 10 trees, outlier score = {outlier_score}"
);
}
}