use crate::error::{AnomalyError, AnomalyResult};
use crate::handle::LcgRng;
const EULER_MASCHERONI: f64 = 0.577_215_664_901_532_86_f64;
#[must_use]
pub fn ifor_c_factor(n: usize) -> f64 {
match n {
0 | 1 => 0.0,
2 => 1.0,
_ => {
let nm1 = (n - 1) as f64;
2.0 * (nm1.ln() + EULER_MASCHERONI) - 2.0 * nm1 / n as f64
}
}
}
#[derive(Debug, Clone)]
pub struct IforNode {
pub feature: usize,
pub threshold: f64,
pub left: Option<usize>,
pub right: Option<usize>,
pub depth: usize,
pub size: usize,
}
#[derive(Debug, Clone)]
pub struct IforTree {
pub nodes: Vec<IforNode>,
}
impl IforTree {
#[must_use]
pub fn path_length(&self, x: &[f64]) -> f64 {
if self.nodes.is_empty() {
return 0.0;
}
let mut node_idx = 0_usize;
loop {
let node = &self.nodes[node_idx];
match (node.left, node.right) {
(Some(l), Some(r)) => {
if x[node.feature] <= node.threshold {
node_idx = l;
} else {
node_idx = r;
}
}
_ => {
return node.depth as f64 + ifor_c_factor(node.size);
}
}
}
}
}
#[derive(Debug, Clone)]
pub struct IforConfig {
pub n_trees: usize,
pub subsample_size: usize,
pub max_depth: usize,
pub seed: u64,
}
impl IforConfig {
pub fn new(n_trees: usize, subsample_size: usize, seed: u64) -> Self {
let max_depth = if subsample_size <= 1 {
1
} else {
(subsample_size as f64).log2().ceil() as usize
};
Self {
n_trees,
subsample_size,
max_depth,
seed,
}
}
}
#[derive(Debug, Clone)]
pub struct IforFit {
pub trees: Vec<IforTree>,
pub n_train: usize,
pub d: usize,
pub cfg: IforConfig,
}
fn sample_without_replacement(n: usize, k: usize, rng: &mut LcgRng) -> Vec<usize> {
let k = k.min(n);
let mut indices: Vec<usize> = (0..n).collect();
for i in 0..k {
let j = i + rng.next_usize(n - i);
indices.swap(i, j);
}
indices.truncate(k);
indices
}
fn grow_tree(
data: &[f64],
d: usize,
indices: &[usize],
depth: usize,
max_depth: usize,
nodes: &mut Vec<IforNode>,
rng: &mut LcgRng,
) -> usize {
let size = indices.len();
if depth >= max_depth || size <= 1 {
let node_idx = nodes.len();
nodes.push(IforNode {
feature: 0,
threshold: 0.0,
left: None,
right: None,
depth,
size,
});
return node_idx;
}
let feat = rng.next_usize(d);
let mut feat_min = f64::INFINITY;
let mut feat_max = f64::NEG_INFINITY;
for &idx in indices {
let v = data[idx * d + feat];
if v < feat_min {
feat_min = v;
}
if v > feat_max {
feat_max = v;
}
}
if (feat_max - feat_min).abs() < 1e-15 {
let node_idx = nodes.len();
nodes.push(IforNode {
feature: feat,
threshold: feat_min,
left: None,
right: None,
depth,
size,
});
return node_idx;
}
let t = feat_min + rng.next_f32() as f64 * (feat_max - feat_min);
let left_indices: Vec<usize> = indices
.iter()
.copied()
.filter(|&i| data[i * d + feat] <= t)
.collect();
let right_indices: Vec<usize> = indices
.iter()
.copied()
.filter(|&i| data[i * d + feat] > t)
.collect();
if left_indices.is_empty() || right_indices.is_empty() {
let node_idx = nodes.len();
nodes.push(IforNode {
feature: feat,
threshold: t,
left: None,
right: None,
depth,
size,
});
return node_idx;
}
let this_idx = nodes.len();
nodes.push(IforNode {
feature: feat,
threshold: t,
left: None,
right: None,
depth,
size,
});
let left_child = grow_tree(data, d, &left_indices, depth + 1, max_depth, nodes, rng);
let right_child = grow_tree(data, d, &right_indices, depth + 1, max_depth, nodes, rng);
nodes[this_idx].left = Some(left_child);
nodes[this_idx].right = Some(right_child);
this_idx
}
pub fn ifor_fit(x: &[f64], n: usize, d: usize, cfg: &IforConfig) -> AnomalyResult<IforFit> {
if n == 0 {
return Err(AnomalyError::EmptyInput);
}
if d == 0 {
return Err(AnomalyError::InvalidFeatureCount { n: 0 });
}
if x.len() != n * d {
return Err(AnomalyError::DimensionMismatch {
expected: n * d,
got: x.len(),
});
}
if cfg.n_trees == 0 {
return Err(AnomalyError::Internal {
msg: "n_trees must be > 0".into(),
});
}
let mut rng = LcgRng::new(cfg.seed);
let ss = cfg.subsample_size.min(n);
let mut trees = Vec::with_capacity(cfg.n_trees);
for _ in 0..cfg.n_trees {
let sub_indices = sample_without_replacement(n, ss, &mut rng);
let mut nodes = Vec::new();
grow_tree(x, d, &sub_indices, 0, cfg.max_depth, &mut nodes, &mut rng);
trees.push(IforTree { nodes });
}
Ok(IforFit {
trees,
n_train: n,
d,
cfg: cfg.clone(),
})
}
pub fn ifor_path_length(fit: &IforFit, x: &[f64]) -> f64 {
if fit.trees.is_empty() {
return 0.0;
}
let total: f64 = fit.trees.iter().map(|t| t.path_length(x)).sum();
total / fit.trees.len() as f64
}
pub fn ifor_score(fit: &IforFit, x: &[f64], n: usize) -> AnomalyResult<Vec<f64>> {
if n == 0 {
return Err(AnomalyError::EmptyInput);
}
if x.len() != n * fit.d {
return Err(AnomalyError::DimensionMismatch {
expected: n * fit.d,
got: x.len(),
});
}
let ss = fit.cfg.subsample_size.min(fit.n_train);
let c_n = ifor_c_factor(ss);
let mut scores = Vec::with_capacity(n);
for i in 0..n {
let sample = &x[i * fit.d..(i + 1) * fit.d];
let avg_path = ifor_path_length(fit, sample);
let score = if c_n < 1e-12 {
0.5_f64
} else {
2.0_f64.powf(-avg_path / c_n)
};
scores.push(score.clamp(0.0, 1.0));
}
Ok(scores)
}
pub fn ifor_predict(
fit: &IforFit,
x: &[f64],
n: usize,
threshold: f64,
) -> AnomalyResult<Vec<bool>> {
let scores = ifor_score(fit, x, n)?;
Ok(scores.iter().map(|&s| s > threshold).collect())
}
#[cfg(test)]
mod tests {
use super::*;
fn make_cfg(n_trees: usize, subsample: usize, seed: u64) -> IforConfig {
IforConfig::new(n_trees, subsample, seed)
}
#[test]
fn c_factor_special_cases() {
assert!((ifor_c_factor(0)).abs() < 1e-12);
assert!((ifor_c_factor(1)).abs() < 1e-12);
assert!((ifor_c_factor(2) - 1.0).abs() < 1e-12);
assert!(ifor_c_factor(256) > 0.0);
}
#[test]
fn ifor_fit_basic() {
let data: Vec<f64> = (0..200).map(|i| i as f64 * 0.1).collect();
let cfg = make_cfg(10, 64, 42);
let fit = ifor_fit(&data, 100, 2, &cfg).expect("ifor_fit should succeed with valid input");
assert_eq!(fit.trees.len(), 10);
assert_eq!(fit.d, 2);
}
#[test]
fn ifor_score_in_range() {
let n = 100_usize;
let d = 2_usize;
let data: Vec<f64> = (0..n * d).map(|i| i as f64 * 0.01).collect();
let cfg = make_cfg(20, 64, 7);
let fit = ifor_fit(&data, n, d, &cfg).expect("ifor_fit should succeed with valid data");
let scores = ifor_score(&fit, &data, n).expect("ifor_score should succeed on valid fit");
assert_eq!(scores.len(), n);
for s in &scores {
assert!((0.0..=1.0).contains(s), "score={s}");
}
}
#[test]
fn ifor_outlier_has_higher_score() {
let n = 200_usize;
let d = 1_usize;
let mut data: Vec<f64> = (0..n).map(|i| (i as f64 - 100.0) * 0.01).collect();
let cfg = make_cfg(50, 128, 13);
let fit = ifor_fit(&data, n, d, &cfg).expect("ifor_fit should succeed with cluster data");
let s_inlier =
ifor_score(&fit, &[0.0_f64], 1).expect("ifor_score should succeed for inlier")[0];
let s_outlier =
ifor_score(&fit, &[1000.0_f64], 1).expect("ifor_score should succeed for outlier")[0];
let _ = data.pop();
assert!(
s_outlier > s_inlier,
"outlier={s_outlier} inlier={s_inlier}"
);
}
#[test]
fn ifor_predict_labels() {
let n = 100_usize;
let d = 2_usize;
let data: Vec<f64> = (0..n * d).map(|i| (i as f64).sin()).collect();
let cfg = make_cfg(30, 64, 99);
let fit =
ifor_fit(&data, n, d, &cfg).expect("ifor_fit should succeed with sinusoidal data");
let preds =
ifor_predict(&fit, &data, n, 0.5).expect("ifor_predict should succeed on valid fit");
assert_eq!(preds.len(), n);
}
#[test]
fn ifor_path_length_finite() {
let data: Vec<f64> = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
let cfg = make_cfg(10, 4, 55);
let fit = ifor_fit(&data, 4, 2, &cfg).expect("ifor_fit should succeed with small dataset");
let pl = ifor_path_length(&fit, &[2.5_f64, 3.5]);
assert!(pl.is_finite() && pl >= 0.0, "path_length={pl}");
}
#[test]
fn ifor_empty_input_error() {
let cfg = make_cfg(10, 8, 1);
assert!(ifor_fit(&[], 0, 2, &cfg).is_err());
}
#[test]
fn ifor_subsample_larger_than_n() {
let data: Vec<f64> = vec![1.0, 2.0, 3.0, 4.0];
let cfg = IforConfig {
n_trees: 5,
subsample_size: 1024,
max_depth: 4,
seed: 3,
};
let fit = ifor_fit(&data, 2, 2, &cfg)
.expect("ifor_fit should clamp oversized subsample gracefully");
let s = ifor_score(&fit, &[1.5_f64, 2.5], 1)
.expect("ifor_score should succeed after clamped subsample fit");
assert_eq!(s.len(), 1);
assert!((0.0..=1.0).contains(&s[0]));
}
}