use crate::error::{AnomalyError, AnomalyResult};
use crate::handle::LcgRng;
const EULER_MASCHERONI: f64 = 0.577_215_664_901_532;
#[derive(Debug, Clone)]
pub struct ExtIforestConfig {
pub n_estimators: usize,
pub max_samples: usize,
pub contamination: f64,
pub extension_level: usize,
pub random_seed: u64,
}
impl Default for ExtIforestConfig {
fn default() -> Self {
Self {
n_estimators: 100,
max_samples: 256,
contamination: 0.1,
extension_level: usize::MAX, random_seed: 42,
}
}
}
#[derive(Debug, Clone)]
pub struct ExtNode {
pub is_leaf: bool,
pub size: usize,
pub depth: usize,
pub normal: Vec<f64>,
pub intercept: f64,
pub left_child: usize,
pub right_child: usize,
}
pub struct ExtIforestModel {
pub trees: Vec<Vec<ExtNode>>,
pub n_features: usize,
pub config: ExtIforestConfig,
pub threshold: f64,
}
#[inline]
fn c_factor(n: usize) -> f64 {
match n {
0 | 1 => 0.0,
2 => 1.0,
_ => {
let nm1 = (n - 1) as f64;
2.0 * (nm1.ln() + EULER_MASCHERONI) - 2.0 * nm1 / n as f64
}
}
}
fn sample_without_replacement(n: usize, k: usize, rng: &mut LcgRng, scratch: &mut Vec<usize>) {
scratch.clear();
scratch.extend(0..n);
for i in 0..k.min(n) {
let j = i + rng.next_usize(n - i);
scratch.swap(i, j);
}
}
#[inline]
fn next_normal_f64(rng: &mut LcgRng) -> f64 {
rng.next_normal() as f64
}
#[inline]
fn uniform_f64(lo: f64, hi: f64, rng: &mut LcgRng) -> f64 {
let u = (rng.next_u32() as f64) / (u32::MAX as f64 + 1.0);
lo + u * (hi - lo)
}
#[inline]
fn dot(a: &[f64], b: &[f64]) -> f64 {
a.iter().zip(b.iter()).map(|(x, y)| x * y).sum()
}
fn build_tree(
data: &[f64],
n: usize,
d: usize,
max_depth: usize,
ext_level: usize,
rng: &mut LcgRng,
) -> Vec<ExtNode> {
let mut nodes: Vec<ExtNode> = Vec::with_capacity(2 * n + 1);
let mut stack: Vec<(usize, Vec<usize>, usize)> = Vec::with_capacity(2 * n);
let root_indices: Vec<usize> = (0..n).collect();
nodes.push(ExtNode {
is_leaf: true,
size: n,
depth: 0,
normal: vec![0.0; d],
intercept: 0.0,
left_child: usize::MAX,
right_child: usize::MAX,
});
stack.push((0, root_indices, 0));
let mut scratch: Vec<usize> = Vec::with_capacity(d);
while let Some((node_idx, indices, depth)) = stack.pop() {
let size = indices.len();
if size <= 1 || depth >= max_depth {
nodes[node_idx].is_leaf = true;
nodes[node_idx].size = size;
nodes[node_idx].depth = depth;
continue;
}
let actual_ext = ext_level.min(d);
sample_without_replacement(d, actual_ext, rng, &mut scratch);
let mut normal = vec![0.0_f64; d];
for &feat_idx in scratch.iter().take(actual_ext) {
normal[feat_idx] = next_normal_f64(rng);
}
let norm_sq: f64 = normal.iter().map(|v| v * v).sum();
if norm_sq < 1e-15 {
normal[0] = 1.0;
} else {
let inv_norm = norm_sq.sqrt().recip();
for v in &mut normal {
*v *= inv_norm;
}
}
let projections: Vec<f64> = indices
.iter()
.map(|&i| {
let row = &data[i * d..(i + 1) * d];
dot(row, &normal)
})
.collect();
let proj_min = projections.iter().cloned().fold(f64::INFINITY, f64::min);
let proj_max = projections
.iter()
.cloned()
.fold(f64::NEG_INFINITY, f64::max);
if (proj_max - proj_min).abs() < 1e-14 {
nodes[node_idx].is_leaf = true;
nodes[node_idx].size = size;
nodes[node_idx].depth = depth;
continue;
}
let intercept = uniform_f64(proj_min, proj_max, rng);
let mut left_indices: Vec<usize> = Vec::with_capacity(size / 2 + 1);
let mut right_indices: Vec<usize> = Vec::with_capacity(size / 2 + 1);
for (local_idx, &global_idx) in indices.iter().enumerate() {
if projections[local_idx] <= intercept {
left_indices.push(global_idx);
} else {
right_indices.push(global_idx);
}
}
if left_indices.is_empty() || right_indices.is_empty() {
nodes[node_idx].is_leaf = true;
nodes[node_idx].size = size;
nodes[node_idx].depth = depth;
continue;
}
let left_idx = nodes.len();
nodes.push(ExtNode {
is_leaf: true,
size: 0,
depth: depth + 1,
normal: vec![0.0; d],
intercept: 0.0,
left_child: usize::MAX,
right_child: usize::MAX,
});
let right_idx = nodes.len();
nodes.push(ExtNode {
is_leaf: true,
size: 0,
depth: depth + 1,
normal: vec![0.0; d],
intercept: 0.0,
left_child: usize::MAX,
right_child: usize::MAX,
});
nodes[node_idx].is_leaf = false;
nodes[node_idx].size = size;
nodes[node_idx].depth = depth;
nodes[node_idx].normal = normal;
nodes[node_idx].intercept = intercept;
nodes[node_idx].left_child = left_idx;
nodes[node_idx].right_child = right_idx;
stack.push((left_idx, left_indices, depth + 1));
stack.push((right_idx, right_indices, depth + 1));
}
nodes
}
fn path_length_in_tree(tree: &[ExtNode], x: &[f64], d: usize) -> f64 {
let mut node_idx = 0usize;
loop {
let node = &tree[node_idx];
if node.is_leaf {
return node.depth as f64 + c_factor(node.size);
}
let proj = dot(x, &node.normal[..d]);
if proj <= node.intercept {
node_idx = node.left_child;
} else {
node_idx = node.right_child;
}
}
}
pub fn ext_iforest_fit(
x: &[f64],
n_samples: usize,
n_features: usize,
cfg: &ExtIforestConfig,
) -> AnomalyResult<ExtIforestModel> {
if n_samples == 0 {
return Err(AnomalyError::EmptyInput);
}
if n_features == 0 {
return Err(AnomalyError::InvalidFeatureCount { n: 0 });
}
if x.len() != n_samples * n_features {
return Err(AnomalyError::DimensionMismatch {
expected: n_samples * n_features,
got: x.len(),
});
}
if cfg.n_estimators == 0 {
return Err(AnomalyError::Internal {
msg: "n_estimators must be > 0".into(),
});
}
if cfg.max_samples == 0 {
return Err(AnomalyError::Internal {
msg: "max_samples must be > 0".into(),
});
}
if !(cfg.contamination > 0.0 && cfg.contamination < 0.5) {
return Err(AnomalyError::Internal {
msg: format!(
"contamination must be in (0, 0.5), got {}",
cfg.contamination
),
});
}
let actual_max_samples = cfg.max_samples.min(n_samples);
let actual_ext = if cfg.extension_level == usize::MAX {
n_features
} else {
cfg.extension_level.clamp(1, n_features)
};
let max_depth = {
let ms = actual_max_samples;
if ms <= 1 {
1
} else {
(ms as f64).log2().ceil() as usize
}
};
let mut rng = LcgRng::new(cfg.random_seed);
let mut trees: Vec<Vec<ExtNode>> = Vec::with_capacity(cfg.n_estimators);
let mut idx_scratch: Vec<usize> = Vec::with_capacity(n_samples);
for _ in 0..cfg.n_estimators {
idx_scratch.clear();
idx_scratch.extend(0..n_samples);
for k in 0..actual_max_samples {
let j = k + rng.next_usize(n_samples - k);
idx_scratch.swap(k, j);
}
let sample_indices: Vec<usize> = idx_scratch[..actual_max_samples].to_vec();
let mut sub: Vec<f64> = Vec::with_capacity(actual_max_samples * n_features);
for &i in &sample_indices {
sub.extend_from_slice(&x[i * n_features..(i + 1) * n_features]);
}
let tree = build_tree(
&sub,
actual_max_samples,
n_features,
max_depth,
actual_ext,
&mut rng,
);
trees.push(tree);
}
let c = c_factor(actual_max_samples);
let scores = compute_scores_inner(&trees, x, n_samples, n_features, c);
let mut sorted_scores = scores.clone();
sorted_scores.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let threshold_rank = ((1.0 - cfg.contamination) * n_samples as f64).floor() as usize;
let threshold_rank = threshold_rank.min(n_samples - 1);
let threshold = sorted_scores[threshold_rank];
Ok(ExtIforestModel {
trees,
n_features,
config: ExtIforestConfig {
extension_level: actual_ext,
..cfg.clone()
},
threshold,
})
}
fn compute_scores_inner(
trees: &[Vec<ExtNode>],
x: &[f64],
n_samples: usize,
n_features: usize,
c: f64,
) -> Vec<f64> {
let n_trees = trees.len() as f64;
(0..n_samples)
.map(|i| {
let xi = &x[i * n_features..(i + 1) * n_features];
let avg_path: f64 = trees
.iter()
.map(|tree| path_length_in_tree(tree, xi, n_features))
.sum::<f64>()
/ n_trees;
if c < 1e-10 {
0.5
} else {
2.0_f64.powf(-avg_path / c)
}
})
.collect()
}
pub fn ext_iforest_score(
model: &ExtIforestModel,
x: &[f64],
n_samples: usize,
) -> AnomalyResult<Vec<f64>> {
if model.trees.is_empty() {
return Err(AnomalyError::NotFitted);
}
if n_samples == 0 {
return Err(AnomalyError::EmptyInput);
}
if x.len() != n_samples * model.n_features {
return Err(AnomalyError::DimensionMismatch {
expected: n_samples * model.n_features,
got: x.len(),
});
}
let actual_max = model.config.max_samples.min(
model.config.max_samples,
);
let c = c_factor(actual_max);
let scores = compute_scores_inner(&model.trees, x, n_samples, model.n_features, c);
Ok(scores)
}
pub fn ext_iforest_predict(
model: &ExtIforestModel,
x: &[f64],
n_samples: usize,
) -> AnomalyResult<Vec<i32>> {
let scores = ext_iforest_score(model, x, n_samples)?;
Ok(scores
.into_iter()
.map(|s| if s >= model.threshold { -1 } else { 1 })
.collect())
}
#[cfg(test)]
mod tests {
use super::*;
fn make_inliers_and_outlier(n_inliers: usize, d: usize, seed: u64) -> (Vec<f64>, usize, usize) {
let mut rng = LcgRng::new(seed);
let mut data: Vec<f64> = Vec::with_capacity((n_inliers + 1) * d);
for _ in 0..n_inliers {
for _ in 0..d {
data.push(rng.next_normal() as f64 * 0.1);
}
}
data.extend(std::iter::repeat_n(100.0, d));
let n = n_inliers + 1;
(data, n, d)
}
#[test]
fn ext_iforest_fit_runs() {
let (data, n, d) = make_inliers_and_outlier(100, 4, 1);
let cfg = ExtIforestConfig {
n_estimators: 50,
max_samples: 64,
contamination: 0.1,
extension_level: usize::MAX,
random_seed: 1,
};
let model = ext_iforest_fit(&data, n, d, &cfg);
assert!(model.is_ok(), "fit failed: {:?}", model.err());
}
#[test]
fn ext_iforest_score_shape() {
let (data, n, d) = make_inliers_and_outlier(99, 4, 2);
let cfg = ExtIforestConfig::default();
let model = ext_iforest_fit(&data, n, d, &cfg).expect("ext_iforest_fit should succeed");
let scores = ext_iforest_score(&model, &data, n).expect("ext_iforest_score should succeed");
assert_eq!(scores.len(), n);
}
#[test]
fn ext_iforest_scores_range() {
let (data, n, d) = make_inliers_and_outlier(99, 4, 3);
let cfg = ExtIforestConfig {
n_estimators: 100,
max_samples: 64,
contamination: 0.1,
extension_level: usize::MAX,
random_seed: 3,
};
let model = ext_iforest_fit(&data, n, d, &cfg).expect("ext_iforest_fit should succeed");
let scores = ext_iforest_score(&model, &data, n).expect("ext_iforest_score should succeed");
for (i, &s) in scores.iter().enumerate() {
assert!(s > 0.0 && s < 1.0, "score[{i}] = {s} not in (0, 1)");
}
}
#[test]
fn ext_iforest_outlier_higher_score() {
let n_inliers = 99usize;
let d = 4usize;
let (data, n, _d) = make_inliers_and_outlier(n_inliers, d, 4);
let cfg = ExtIforestConfig {
n_estimators: 200,
max_samples: 64,
contamination: 0.1,
extension_level: usize::MAX,
random_seed: 4,
};
let model = ext_iforest_fit(&data, n, d, &cfg).expect("ext_iforest_fit should succeed");
let scores = ext_iforest_score(&model, &data, n).expect("ext_iforest_score should succeed");
let outlier_score = scores[n_inliers]; let inlier_mean: f64 = scores[..n_inliers].iter().sum::<f64>() / n_inliers as f64;
assert!(
outlier_score > inlier_mean,
"outlier {outlier_score:.4} should > inlier mean {inlier_mean:.4}"
);
}
#[test]
fn ext_iforest_predict_labels() {
let (data, n, d) = make_inliers_and_outlier(99, 4, 5);
let cfg = ExtIforestConfig::default();
let model = ext_iforest_fit(&data, n, d, &cfg).expect("ext_iforest_fit should succeed");
let labels =
ext_iforest_predict(&model, &data, n).expect("ext_iforest_predict should succeed");
for &l in &labels {
assert!(l == 1 || l == -1, "label={l} not in {{-1, +1}}");
}
}
#[test]
fn ext_iforest_contamination_fraction() {
let n_inliers = 190usize;
let d = 4usize;
let (data, n, _d) = make_inliers_and_outlier(n_inliers, d, 6);
let contamination = 0.1;
let cfg = ExtIforestConfig {
n_estimators: 150,
max_samples: 128,
contamination,
extension_level: usize::MAX,
random_seed: 6,
};
let model = ext_iforest_fit(&data, n, d, &cfg).expect("ext_iforest_fit should succeed");
let labels =
ext_iforest_predict(&model, &data, n).expect("ext_iforest_predict should succeed");
let n_anomalous = labels.iter().filter(|&&l| l == -1).count();
let frac = n_anomalous as f64 / n as f64;
assert!(
(frac - contamination).abs() < 0.05,
"anomaly fraction {frac:.3} not within 0.05 of contamination {contamination}"
);
}
#[test]
fn ext_iforest_extension_level_1() {
let (data, n, d) = make_inliers_and_outlier(99, 4, 7);
let cfg = ExtIforestConfig {
n_estimators: 100,
max_samples: 64,
contamination: 0.1,
extension_level: 1,
random_seed: 7,
};
let model = ext_iforest_fit(&data, n, d, &cfg).expect("ext_iforest_fit should succeed");
let scores = ext_iforest_score(&model, &data, n).expect("ext_iforest_score should succeed");
assert_eq!(scores.len(), n);
assert!(scores.iter().all(|&s| s > 0.0 && s < 1.0));
let outlier_score = scores[n - 1];
let max_inlier = scores[..n - 1]
.iter()
.cloned()
.fold(f64::NEG_INFINITY, f64::max);
assert!(
outlier_score >= max_inlier * 0.8,
"axis-aligned: outlier {outlier_score:.4}, max inlier {max_inlier:.4}"
);
}
#[test]
fn ext_iforest_small_sample() {
let (data, n, d) = make_inliers_and_outlier(49, 4, 8);
let cfg = ExtIforestConfig {
n_estimators: 50,
max_samples: 32,
contamination: 0.1,
extension_level: usize::MAX,
random_seed: 8,
};
let model = ext_iforest_fit(&data, n, d, &cfg).expect("ext_iforest_fit should succeed");
let scores = ext_iforest_score(&model, &data, n).expect("ext_iforest_score should succeed");
assert_eq!(scores.len(), n);
assert!(
scores.iter().all(|s| s.is_finite()),
"all scores must be finite"
);
}
#[test]
fn ext_iforest_single_point_err() {
let data = vec![1.0_f64, 2.0, 3.0, 4.0];
let cfg = ExtIforestConfig {
n_estimators: 10,
max_samples: 256,
contamination: 0.1,
extension_level: usize::MAX,
random_seed: 9,
};
let model = ext_iforest_fit(&data, 1, 4, &cfg).expect("ext_iforest_fit should succeed");
let scores = ext_iforest_score(&model, &data, 1).expect("ext_iforest_score should succeed");
assert_eq!(scores.len(), 1);
assert!(scores[0].is_finite(), "score must be finite for n=1");
}
#[test]
fn ext_iforest_high_dimensional() {
let d = 20usize;
let n_inliers = 199usize;
let (data, n, _d) = make_inliers_and_outlier(n_inliers, d, 10);
let cfg = ExtIforestConfig {
n_estimators: 100,
max_samples: 128,
contamination: 0.1,
extension_level: usize::MAX,
random_seed: 10,
};
let model = ext_iforest_fit(&data, n, d, &cfg).expect("ext_iforest_fit should succeed");
let scores = ext_iforest_score(&model, &data, n).expect("ext_iforest_score should succeed");
assert_eq!(scores.len(), n);
assert!(
scores.iter().all(|s| s.is_finite()),
"all scores must be finite in 20-D"
);
}
#[test]
fn ext_iforest_deterministic() {
let (data, n, d) = make_inliers_and_outlier(99, 4, 11);
let cfg = ExtIforestConfig {
n_estimators: 50,
max_samples: 64,
contamination: 0.1,
extension_level: usize::MAX,
random_seed: 11,
};
let m1 = ext_iforest_fit(&data, n, d, &cfg).expect("ext_iforest_fit should succeed");
let m2 = ext_iforest_fit(&data, n, d, &cfg).expect("ext_iforest_fit should succeed");
let s1 = ext_iforest_score(&m1, &data, n).expect("ext_iforest_score should succeed");
let s2 = ext_iforest_score(&m2, &data, n).expect("ext_iforest_score should succeed");
for (i, (a, b)) in s1.iter().zip(s2.iter()).enumerate() {
assert!(
(a - b).abs() < 1e-12,
"score[{i}]: {a} vs {b} with same seed"
);
}
}
#[test]
fn ext_iforest_ext_level_clamped() {
let (data, n, d) = make_inliers_and_outlier(49, 3, 12);
let cfg = ExtIforestConfig {
n_estimators: 30,
max_samples: 32,
contamination: 0.1,
extension_level: 999, random_seed: 12,
};
let model = ext_iforest_fit(&data, n, d, &cfg).expect("ext_iforest_fit should succeed");
assert_eq!(model.config.extension_level, 3, "should be clamped to d=3");
}
}