use crate::error::{DatasetsError, Result};
use scirs2_core::ndarray::{Array1, Array2};
use scirs2_core::random::prelude::*;
use scirs2_core::random::rand_distributions::Distribution;
/// Kinds of synthetic anomalies that `synthetic_anomaly_injection` can inject.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AnomalyPattern {
    /// Every feature of the row is shifted by ±5 standard deviations.
    PointAnomaly,
    /// Feature values stay in-range but appear in the wrong context
    /// (implemented as reversing the row's feature order; sign flip if p == 1).
    ContextualAnomaly,
    /// The whole row is jointly extreme (all features scaled by 10).
    CollectiveAnomaly,
    /// Heavy-tailed outliers drawn from a Cauchy-like ratio of normals.
    Outlier,
}
/// Generates a synthetic network-intrusion dataset loosely modeled on KDD Cup 99.
///
/// Returns `(x, y)` where `x` has `n_normal + n_anomaly` rows of 41 features
/// (normal rows first, then anomalies) and `y` holds labels 0 (normal) / 1
/// (attack). Draws are fully determined by `rng`, so a fixed seed reproduces
/// the same dataset.
///
/// # Errors
/// Returns `DatasetsError::InvalidFormat` if `n_normal` or `n_anomaly` is 0,
/// or `DatasetsError::ComputationError` if a sampling distribution cannot be
/// constructed.
pub fn kdd99_like(
    n_normal: usize,
    n_anomaly: usize,
    rng: &mut StdRng,
) -> Result<(Array2<f64>, Array1<usize>)> {
    if n_normal == 0 {
        return Err(DatasetsError::InvalidFormat(
            "kdd99_like: n_normal must be >= 1".to_string(),
        ));
    }
    if n_anomaly == 0 {
        return Err(DatasetsError::InvalidFormat(
            "kdd99_like: n_anomaly must be >= 1".to_string(),
        ));
    }
    const N_FEATURES: usize = 41;
    let total = n_normal + n_anomaly;
    let mut features = Array2::zeros((total, N_FEATURES));
    let mut labels = Array1::zeros(total);
    let gauss = scirs2_core::random::Normal::new(0.0_f64, 1.0_f64)
        .map_err(|e| DatasetsError::ComputationError(format!("Normal(0,1): {e}")))?;
    let unif = scirs2_core::random::Uniform::new(0.0_f64, 1.0_f64)
        .map_err(|e| DatasetsError::ComputationError(format!("Uniform(0,1): {e}")))?;
    // Background (normal) connections.
    for row in 0..n_normal {
        // Duration-like value: gaussian around 10, clipped at zero.
        features[[row, 0]] = (gauss.sample(rng) * 5.0 + 10.0).max(0.0);
        // Categorical-style fields encoded as small non-negative integers.
        features[[row, 1]] = (unif.sample(rng) * 3.0).floor();
        features[[row, 2]] = (unif.sample(rng) * 66.0).floor();
        features[[row, 3]] = (unif.sample(rng) * 11.0).floor();
        // Byte-count style features.
        features[[row, 4]] = (gauss.sample(rng) * 500.0 + 1000.0).max(0.0);
        features[[row, 5]] = (gauss.sample(rng) * 300.0 + 600.0).max(0.0);
        // Rare binary flag (~1% of normal traffic).
        features[[row, 6]] = if unif.sample(rng) < 0.01 { 1.0 } else { 0.0 };
        features[[row, 7]] = (gauss.sample(rng).abs() * 10.0).floor();
        features[[row, 8]] = 0.0;
        // Small count features.
        for col in 9..=21 {
            features[[row, col]] = (gauss.sample(rng).abs() * 2.0).floor();
        }
        // Rate-style features in [0, 1).
        for col in 22..N_FEATURES {
            features[[row, col]] = unif.sample(rng);
        }
        labels[row] = 0;
    }
    // Attack connections appended after the normal block.
    for offset in 0..n_anomaly {
        let row = n_normal + offset;
        features[[row, 0]] = (gauss.sample(rng) * 2.0).abs();
        features[[row, 1]] = (unif.sample(rng) * 3.0).floor();
        features[[row, 2]] = (unif.sample(rng) * 66.0).floor();
        features[[row, 3]] = (unif.sample(rng) * 11.0).floor();
        // Two attack profiles: flooding (huge outbound bytes, no reply)
        // versus probing (tiny transfers in both directions).
        let profile = unif.sample(rng);
        if profile < 0.5 {
            features[[row, 4]] = (gauss.sample(rng) * 2000.0 + 10000.0).max(0.0);
            features[[row, 5]] = 0.0;
        } else {
            features[[row, 4]] = (gauss.sample(rng).abs() * 20.0).floor();
            features[[row, 5]] = (gauss.sample(rng).abs() * 10.0).floor();
        }
        // Flags fire far more often than in normal traffic.
        features[[row, 6]] = if unif.sample(rng) < 0.3 { 1.0 } else { 0.0 };
        features[[row, 7]] = (gauss.sample(rng).abs() * 30.0).floor();
        features[[row, 8]] = if unif.sample(rng) < 0.2 { 1.0 } else { 0.0 };
        // Elevated counts relative to normal rows.
        for col in 9..=21 {
            features[[row, col]] = (gauss.sample(rng).abs() * 15.0 + 5.0).floor();
        }
        // Bimodal rates: pushed toward either end of [0, 1].
        for col in 22..N_FEATURES {
            let u = unif.sample(rng);
            features[[row, col]] = if u < 0.5 { u * 0.2 } else { 0.8 + (u - 0.5) * 0.4 };
        }
        labels[row] = 1;
    }
    Ok((features, labels))
}
/// Generates a synthetic thyroid-screening dataset with 6 features.
///
/// Roughly 2.5% of rows (at least one) are hyperthyroid-style anomalies,
/// placed after the normal rows; `y` holds labels 0 (normal) / 1 (anomaly).
/// Anomalous rows have depressed TSH-like values (feature 0) and elevated
/// hormone-level features relative to normal rows.
///
/// # Errors
/// Returns `DatasetsError::InvalidFormat` if `n == 0`, or
/// `DatasetsError::ComputationError` if a distribution cannot be built.
pub fn thyroid_like(
    n: usize,
    rng: &mut StdRng,
) -> Result<(Array2<f64>, Array1<usize>)> {
    if n == 0 {
        return Err(DatasetsError::InvalidFormat(
            "thyroid_like: n must be >= 1".to_string(),
        ));
    }
    const N_FEATURES: usize = 6;
    // ~2.5% contamination, but always at least one anomalous row.
    let n_anomaly = ((n as f64 * 0.025).round() as usize).max(1);
    let n_normal = n - n_anomaly;
    let gauss = scirs2_core::random::Normal::new(0.0_f64, 1.0_f64)
        .map_err(|e| DatasetsError::ComputationError(format!("Normal(0,1): {e}")))?;
    let unif = scirs2_core::random::Uniform::new(0.0_f64, 1.0_f64)
        .map_err(|e| DatasetsError::ComputationError(format!("Uniform(0,1): {e}")))?;
    let mut data = Array2::zeros((n, N_FEATURES));
    let mut labels = Array1::zeros(n);
    // Healthy (normal) rows.
    for row in 0..n_normal {
        data[[row, 0]] = (gauss.sample(rng) * 0.5 + 1.5).max(0.01);
        data[[row, 1]] = (gauss.sample(rng) * 0.3 + 1.8).max(0.1);
        data[[row, 2]] = (gauss.sample(rng) * 20.0 + 110.0).max(30.0);
        data[[row, 3]] = (gauss.sample(rng) * 0.1 + 1.0).clamp(0.5, 1.5);
        data[[row, 4]] = (gauss.sample(rng) * 20.0 + 110.0).max(30.0);
        data[[row, 5]] = unif.sample(rng) * 0.8 + 0.1;
        labels[row] = 0;
    }
    // Anomalous rows: suppressed feature 0, elevated features 1-4.
    for offset in 0..n_anomaly {
        let row = n_normal + offset;
        data[[row, 0]] = (gauss.sample(rng) * 0.05 + 0.1).max(0.001);
        data[[row, 1]] = (gauss.sample(rng) * 0.8 + 4.5).max(2.0);
        data[[row, 2]] = (gauss.sample(rng) * 30.0 + 185.0).max(100.0);
        data[[row, 3]] = (gauss.sample(rng) * 0.1 + 1.2).clamp(0.8, 1.8);
        data[[row, 4]] = (gauss.sample(rng) * 35.0 + 190.0).max(100.0);
        data[[row, 5]] = unif.sample(rng) * 0.8 + 0.1;
        labels[row] = 1;
    }
    Ok((data, labels))
}
/// Injects synthetic anomalies of the requested pattern into a copy of
/// `normal_data` and returns the row-shuffled result with 0/1 labels
/// (1 = anomalous row).
///
/// `contamination` is the fraction of rows to corrupt, strictly inside
/// (0, 1); the corrupted count is clamped to at least 1 and at most `n - 1`.
/// Injection magnitudes are scaled by per-feature statistics of the clean
/// data.
///
/// # Errors
/// Returns `DatasetsError::InvalidFormat` for empty input or an
/// out-of-range `contamination`, and `DatasetsError::ComputationError` if a
/// sampling distribution cannot be constructed.
pub fn synthetic_anomaly_injection(
    normal_data: &Array2<f64>,
    anomaly_type: AnomalyPattern,
    contamination: f64,
    rng: &mut StdRng,
) -> Result<(Array2<f64>, Array1<usize>)> {
    if normal_data.is_empty() {
        return Err(DatasetsError::InvalidFormat(
            "synthetic_anomaly_injection: normal_data must not be empty".to_string(),
        ));
    }
    if contamination <= 0.0 || contamination >= 1.0 {
        return Err(DatasetsError::InvalidFormat(format!(
            "synthetic_anomaly_injection: contamination must be in (0, 1), got {contamination}"
        )));
    }
    let n = normal_data.nrows();
    let p = normal_data.ncols();
    let n_anomaly = ((n as f64 * contamination).floor() as usize)
        .max(1)
        .min(n - 1);
    // Per-feature mean and (population) std of the clean data.
    let mut feat_mean = vec![0.0_f64; p];
    let mut feat_std = vec![1.0_f64; p];
    for j in 0..p {
        let mut total = 0.0_f64;
        for i in 0..n {
            total += normal_data[[i, j]];
        }
        let mean = total / n as f64;
        let mut sq_dev = 0.0_f64;
        for i in 0..n {
            let d = normal_data[[i, j]] - mean;
            sq_dev += d * d;
        }
        feat_mean[j] = mean;
        // Floor the std so constant features still produce visible shifts.
        feat_std[j] = (sq_dev / n as f64).sqrt().max(1e-8);
    }
    let mut x = normal_data.to_owned();
    let mut y = Array1::zeros(n);
    // Pick the rows to corrupt uniformly at random.
    let mut all_indices: Vec<usize> = (0..n).collect();
    {
        use scirs2_core::random::SliceRandom;
        all_indices.shuffle(rng);
    }
    let anomaly_indices: Vec<usize> = all_indices[..n_anomaly].to_vec();
    let normal01 = scirs2_core::random::Normal::new(0.0_f64, 1.0_f64)
        .map_err(|e| DatasetsError::ComputationError(format!("Normal(0,1): {e}")))?;
    let uniform01 = scirs2_core::random::Uniform::new(0.0_f64, 1.0_f64)
        .map_err(|e| DatasetsError::ComputationError(format!("Uniform(0,1): {e}")))?;
    for &row in &anomaly_indices {
        match anomaly_type {
            AnomalyPattern::PointAnomaly => {
                // Shift every feature five sigmas in a random direction.
                for j in 0..p {
                    let direction = if uniform01.sample(rng) < 0.5 {
                        1.0_f64
                    } else {
                        -1.0_f64
                    };
                    x[[row, j]] += direction * 5.0 * feat_std[j];
                }
            }
            AnomalyPattern::ContextualAnomaly => {
                if p >= 2 {
                    // Reverse the feature order in place: values stay
                    // individually plausible but land in the wrong slots.
                    for j in 0..p / 2 {
                        let tmp = x[[row, j]];
                        x[[row, j]] = x[[row, p - 1 - j]];
                        x[[row, p - 1 - j]] = tmp;
                    }
                } else {
                    // Single-feature data: flip the sign instead.
                    x[[row, 0]] = -x[[row, 0]];
                }
            }
            AnomalyPattern::CollectiveAnomaly => {
                // Scale the whole row so its features are jointly extreme.
                for j in 0..p {
                    x[[row, j]] *= 10.0;
                }
            }
            AnomalyPattern::Outlier => {
                // Heavy-tailed Cauchy-like draw via a ratio of two normals,
                // guarded against division by a near-zero denominator.
                for j in 0..p {
                    let z1 = normal01.sample(rng);
                    let z2 = normal01.sample(rng);
                    let cauchy = if z2.abs() > 1e-9 { z1 / z2 } else { z1 * 10.0 };
                    x[[row, j]] = feat_mean[j] + feat_std[j] * cauchy;
                }
            }
        }
        y[row] = 1;
    }
    // Shuffle rows so anomalies are not clustered at fixed positions.
    let mut order: Vec<usize> = (0..n).collect();
    {
        use scirs2_core::random::SliceRandom;
        order.shuffle(rng);
    }
    let mut x_out = Array2::zeros((n, p));
    let mut y_out = Array1::zeros(n);
    for (dst, &src) in order.iter().enumerate() {
        for j in 0..p {
            x_out[[dst, j]] = x[[src, j]];
        }
        y_out[dst] = y[src];
    }
    Ok((x_out, y_out))
}
/// Scores each row of `x` with a simple isolation-forest ensemble.
///
/// Builds `n_estimators` random isolation trees over sub-samples of at most
/// 256 rows and averages each query row's path length; scores follow the
/// standard `2^(-avg_path / c(sub_size))` formula, so values near 1 indicate
/// likely anomalies. `_contamination` is accepted for interface parity but
/// unused.
///
/// # Errors
/// Returns `DatasetsError::InvalidFormat` for empty input or
/// `n_estimators == 0`, and `DatasetsError::ComputationError` if the feature
/// index distribution cannot be constructed.
pub fn isolation_forest_benchmark(
    x: &Array2<f64>,
    _contamination: f64,
    n_estimators: usize,
    rng: &mut StdRng,
) -> Result<Vec<f64>> {
    if x.is_empty() {
        return Err(DatasetsError::InvalidFormat(
            "isolation_forest_benchmark: x must not be empty".to_string(),
        ));
    }
    if n_estimators == 0 {
        return Err(DatasetsError::InvalidFormat(
            "isolation_forest_benchmark: n_estimators must be >= 1".to_string(),
        ));
    }
    let n = x.nrows();
    let p = x.ncols();
    // Sub-sample size and depth cap follow the isolation-forest convention:
    // trees are grown to about log2(sub_size) + 1 levels.
    let sub_size = n.min(256);
    let height_limit = ((sub_size as f64).ln() / 2.0_f64.ln()).ceil() as usize + 1;
    let mut total_path = vec![0.0_f64; n];
    let feat_dist = scirs2_core::random::Uniform::new(0usize, p.max(1))
        .map_err(|e| DatasetsError::ComputationError(format!("Uniform feat dist: {e}")))?;
    for _ in 0..n_estimators {
        // Draw this tree's random row sub-sample.
        let mut indices: Vec<usize> = (0..n).collect();
        {
            use scirs2_core::random::SliceRandom;
            indices.shuffle(rng);
        }
        indices.truncate(sub_size);
        // Flatten the sub-sample into row-major contiguous storage.
        let mut sub_data = Vec::with_capacity(sub_size * p);
        for &src in &indices {
            for j in 0..p {
                sub_data.push(x[[src, j]]);
            }
        }
        // Accumulate every query row's path length through this tree.
        for (i, acc) in total_path.iter_mut().enumerate() {
            let query: Vec<f64> = (0..p).map(|j| x[[i, j]]).collect();
            *acc += isolation_path_length(
                &query,
                &sub_data,
                sub_size,
                p,
                0,
                height_limit,
                &feat_dist,
                rng,
            );
        }
    }
    // Normalise average depths into anomaly scores in (0, 1).
    let c_n = bst_avg_path_length(sub_size);
    let mut scores = Vec::with_capacity(n);
    for &path_sum in &total_path {
        let mean_depth = path_sum / n_estimators as f64;
        scores.push(2.0_f64.powf(-mean_depth / c_n));
    }
    Ok(scores)
}
/// Average path length of an unsuccessful search in a BST with `n` nodes:
/// the c(n) normalisation term of the isolation-forest score,
/// c(n) = 2·H(n−1) − 2(n−1)/n, with a floor of 1.0 for n ≤ 1.
fn bst_avg_path_length(n: usize) -> f64 {
    if n <= 1 {
        return 1.0;
    }
    let count = n as f64;
    2.0 * harmonic_number(n - 1) - 2.0 * (count - 1.0) / count
}
/// Returns the n-th harmonic number H(n) = 1 + 1/2 + … + 1/n.
///
/// Uses direct summation for n ≤ 20 and an Euler–Maclaurin style
/// approximation (ln n + γ + 1/2n − 1/12n²) for larger n.
fn harmonic_number(n: usize) -> f64 {
    if n == 0 {
        return 0.0;
    }
    if n <= 20 {
        // Small n: exact summation is cheap.
        return (1..=n).map(|i| 1.0 / i as f64).sum();
    }
    // Large n: asymptotic expansion with the Euler–Mascheroni constant.
    let nf = n as f64;
    nf.ln() + 0.5772156649 + 1.0 / (2.0 * nf) - 1.0 / (12.0 * nf * nf)
}
/// Computes the path length of `query` through a randomly grown isolation
/// tree over `sub_data` (row-major, `sub_size` rows of `p` features).
///
/// At each level a split feature and split point are drawn from `rng` and the
/// recursion descends on the side of the split that `query` falls on.
/// Terminal nodes return the depth reached plus the BST average-path
/// adjustment for the node's remaining population, per the isolation-forest
/// formulation.
#[allow(clippy::too_many_arguments)]
fn isolation_path_length(
    query: &[f64],
    sub_data: &[f64],
    sub_size: usize,
    p: usize,
    current_height: usize,
    height_limit: usize,
    feat_dist: &scirs2_core::random::Uniform<usize>,
    rng: &mut StdRng,
) -> f64 {
    // Terminal cases: node isolated down to <= 1 point, or depth cap reached.
    if sub_size <= 1 || current_height >= height_limit {
        return current_height as f64 + bst_avg_path_length(sub_size);
    }
    // Random split feature; the modulo is a defensive guard against p == 0.
    let feat_idx = feat_dist.sample(rng) % p.max(1);
    // Scan the node's population for this feature's observed range.
    let mut feat_min = f64::INFINITY;
    let mut feat_max = f64::NEG_INFINITY;
    for i in 0..sub_size {
        let v = sub_data[i * p + feat_idx];
        if v < feat_min {
            feat_min = v;
        }
        if v > feat_max {
            feat_max = v;
        }
    }
    // Constant feature: the node cannot be split further.
    if (feat_max - feat_min).abs() < 1e-12 {
        return current_height as f64 + bst_avg_path_length(sub_size);
    }
    // Random split point inside the observed range; the fallback widens the
    // range slightly if the first distribution construction is rejected.
    let split_range = scirs2_core::random::Uniform::new(feat_min, feat_max)
        .unwrap_or_else(|_| scirs2_core::random::Uniform::new(feat_min, feat_max + 1e-9)
        .expect("fallback split range"));
    let split = split_range.sample(rng);
    // Defensive: an out-of-range feature index reads as 0.0 rather than panicking.
    let query_val = if feat_idx < query.len() { query[feat_idx] } else { 0.0 };
    if query_val <= split {
        // Descend into the left (<= split) partition.
        let left_indices: Vec<usize> = (0..sub_size)
            .filter(|&i| sub_data[i * p + feat_idx] <= split)
            .collect();
        let left_sub_size = left_indices.len();
        if left_sub_size == 0 {
            return current_height as f64 + 1.0;
        }
        // Copy the partition into fresh row-major storage for the recursion.
        let mut left_data = vec![0.0_f64; left_sub_size * p];
        for (new_i, &old_i) in left_indices.iter().enumerate() {
            for j in 0..p {
                left_data[new_i * p + j] = sub_data[old_i * p + j];
            }
        }
        isolation_path_length(
            query,
            &left_data,
            left_sub_size,
            p,
            current_height + 1,
            height_limit,
            feat_dist,
            rng,
        )
    } else {
        // Descend into the right (> split) partition.
        let right_indices: Vec<usize> = (0..sub_size)
            .filter(|&i| sub_data[i * p + feat_idx] > split)
            .collect();
        let right_sub_size = right_indices.len();
        if right_sub_size == 0 {
            return current_height as f64 + 1.0;
        }
        let mut right_data = vec![0.0_f64; right_sub_size * p];
        for (new_i, &old_i) in right_indices.iter().enumerate() {
            for j in 0..p {
                right_data[new_i * p + j] = sub_data[old_i * p + j];
            }
        }
        isolation_path_length(
            query,
            &right_data,
            right_sub_size,
            p,
            current_height + 1,
            height_limit,
            feat_dist,
            rng,
        )
    }
}
/// Computes average precision (area under the precision-recall step curve)
/// for binary labels (`1` = positive) ranked by descending `y_score`.
///
/// Returns 0.0 when there are no positive labels. Score ties are resolved
/// by the unstable sort, matching the original deterministic behavior for a
/// given input.
///
/// # Errors
/// Returns `DatasetsError::InvalidFormat` when the arrays differ in length
/// or are empty.
pub fn average_precision(y_true: &Array1<usize>, y_score: &Array1<f64>) -> Result<f64> {
    if y_true.len() != y_score.len() {
        return Err(DatasetsError::InvalidFormat(format!(
            "average_precision: y_true.len() ({}) != y_score.len() ({})",
            y_true.len(),
            y_score.len()
        )));
    }
    if y_true.is_empty() {
        return Err(DatasetsError::InvalidFormat(
            "average_precision: arrays must not be empty".to_string(),
        ));
    }
    let n_pos: usize = y_true.iter().filter(|&&v| v == 1).count();
    if n_pos == 0 {
        return Ok(0.0);
    }
    // Rank indices by descending score; NaN comparisons fall back to Equal.
    let mut ranked: Vec<usize> = (0..y_true.len()).collect();
    ranked.sort_unstable_by(|&a, &b| {
        y_score[b]
            .partial_cmp(&y_score[a])
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    // Walk the ranking and accumulate precision weighted by recall steps.
    let mut ap = 0.0_f64;
    let mut true_positives = 0usize;
    let mut last_recall = 0.0_f64;
    for (rank, &idx) in ranked.iter().enumerate() {
        if y_true[idx] != 1 {
            continue;
        }
        true_positives += 1;
        let precision = true_positives as f64 / (rank + 1) as f64;
        let recall = true_positives as f64 / n_pos as f64;
        ap += precision * (recall - last_recall);
        last_recall = recall;
    }
    Ok(ap)
}
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::{array, Array2};

    /// Builds a deterministic RNG so every test is reproducible.
    fn make_rng(seed: u64) -> StdRng {
        StdRng::seed_from_u64(seed)
    }

    #[test]
    fn test_kdd99_shape() {
        let mut rng = make_rng(42);
        let (x, y) = kdd99_like(500, 50, &mut rng).expect("kdd99");
        assert_eq!(x.shape(), &[550, 41]);
        assert_eq!(y.len(), 550);
    }

    #[test]
    fn test_kdd99_label_counts() {
        let mut rng = make_rng(1);
        let (_, y) = kdd99_like(200, 20, &mut rng).expect("kdd99 counts");
        let n0: usize = y.iter().filter(|&&v| v == 0).count();
        let n1: usize = y.iter().filter(|&&v| v == 1).count();
        assert_eq!(n0, 200);
        assert_eq!(n1, 20);
    }

    #[test]
    fn test_kdd99_error_n_normal_zero() {
        let mut rng = make_rng(1);
        assert!(kdd99_like(0, 10, &mut rng).is_err());
    }

    #[test]
    fn test_kdd99_error_n_anomaly_zero() {
        let mut rng = make_rng(1);
        assert!(kdd99_like(100, 0, &mut rng).is_err());
    }

    #[test]
    fn test_kdd99_determinism() {
        // The same seed must reproduce the exact same dataset.
        let mut rng1 = make_rng(99);
        let (x1, y1) = kdd99_like(50, 5, &mut rng1).expect("kdd99 det1");
        let mut rng2 = make_rng(99);
        let (x2, y2) = kdd99_like(50, 5, &mut rng2).expect("kdd99 det2");
        for i in 0..55 {
            for j in 0..41 {
                assert!((x1[[i, j]] - x2[[i, j]]).abs() < 1e-12);
            }
            assert_eq!(y1[i], y2[i]);
        }
    }

    #[test]
    fn test_thyroid_shape() {
        let mut rng = make_rng(42);
        let (x, y) = thyroid_like(400, &mut rng).expect("thyroid");
        assert_eq!(x.shape(), &[400, 6]);
        assert_eq!(y.len(), 400);
    }

    #[test]
    fn test_thyroid_has_anomalies() {
        let mut rng = make_rng(1);
        let (_, y) = thyroid_like(200, &mut rng).expect("thyroid anomalies");
        let n_anom: usize = y.iter().filter(|&&v| v == 1).count();
        assert!(n_anom >= 1, "thyroid should have at least 1 anomaly");
        // Contamination is ~2.5%, so well under 10%.
        let ratio = n_anom as f64 / y.len() as f64;
        assert!(
            ratio < 0.1,
            "anomaly ratio {ratio:.3} should be < 0.1"
        );
    }

    #[test]
    fn test_thyroid_error_n_zero() {
        let mut rng = make_rng(1);
        assert!(thyroid_like(0, &mut rng).is_err());
    }

    #[test]
    fn test_thyroid_tsh_anomaly_lower() {
        // Anomalous rows are generated with a suppressed feature 0 (TSH-like),
        // so their mean must fall below the normal rows' mean.
        let mut rng = make_rng(7);
        let (x, y) = thyroid_like(400, &mut rng).expect("thyroid tsh");
        let mut normal_tsh_sum = 0.0_f64;
        let mut normal_count = 0usize;
        let mut anom_tsh_sum = 0.0_f64;
        let mut anom_count = 0usize;
        for i in 0..400 {
            if y[i] == 0 {
                normal_tsh_sum += x[[i, 0]];
                normal_count += 1;
            } else {
                anom_tsh_sum += x[[i, 0]];
                anom_count += 1;
            }
        }
        let normal_mean_tsh = normal_tsh_sum / normal_count as f64;
        let anom_mean_tsh = anom_tsh_sum / anom_count as f64;
        assert!(
            anom_mean_tsh < normal_mean_tsh,
            "anomalous TSH ({anom_mean_tsh:.4}) should be lower than normal ({normal_mean_tsh:.4})"
        );
    }

    #[test]
    fn test_anomaly_injection_shape() {
        let data = Array2::zeros((200, 3));
        let mut rng = make_rng(1);
        let (x, y) =
            synthetic_anomaly_injection(&data, AnomalyPattern::PointAnomaly, 0.05, &mut rng)
                .expect("injection shape");
        assert_eq!(x.shape(), &[200, 3]);
        assert_eq!(y.len(), 200);
    }

    #[test]
    fn test_anomaly_injection_count() {
        let data = Array2::zeros((200, 3));
        let mut rng = make_rng(2);
        let (_, y) =
            synthetic_anomaly_injection(&data, AnomalyPattern::PointAnomaly, 0.05, &mut rng)
                .expect("injection count");
        let n_anom: usize = y.iter().filter(|&&v| v == 1).count();
        assert_eq!(n_anom, 10, "floor(0.05 * 200) = 10");
    }

    #[test]
    fn test_anomaly_injection_collective() {
        // All-ones base data: a collective anomaly scales every feature by 10,
        // so anomalous rows must be exactly 10.0 everywhere.
        // (Array2::ones already fills with 1.0 — no extra initialization needed.)
        let data = Array2::ones((100, 4));
        let mut rng = make_rng(3);
        let (x, y) =
            synthetic_anomaly_injection(&data, AnomalyPattern::CollectiveAnomaly, 0.1, &mut rng)
                .expect("collective injection");
        for i in 0..100 {
            if y[i] == 1 {
                for j in 0..4 {
                    assert!(
                        (x[[i, j]] - 10.0).abs() < 1e-9,
                        "collective anomaly at ({i},{j}): got {}",
                        x[[i, j]]
                    );
                }
            }
        }
    }

    #[test]
    fn test_anomaly_injection_error_empty() {
        let data: Array2<f64> = Array2::zeros((0, 3));
        let mut rng = make_rng(1);
        assert!(
            synthetic_anomaly_injection(&data, AnomalyPattern::PointAnomaly, 0.1, &mut rng)
                .is_err()
        );
    }

    #[test]
    fn test_anomaly_injection_error_bad_contamination() {
        // Contamination must be strictly inside (0, 1); both endpoints reject.
        let data = Array2::zeros((100, 3));
        let mut rng = make_rng(1);
        assert!(
            synthetic_anomaly_injection(&data, AnomalyPattern::PointAnomaly, 0.0, &mut rng)
                .is_err()
        );
        assert!(
            synthetic_anomaly_injection(&data, AnomalyPattern::PointAnomaly, 1.0, &mut rng)
                .is_err()
        );
    }

    #[test]
    fn test_isolation_forest_len() {
        let mut rng = make_rng(42);
        let (x, _) = kdd99_like(100, 10, &mut rng).expect("kdd99");
        let mut rng2 = make_rng(1);
        let scores = isolation_forest_benchmark(&x, 0.09, 20, &mut rng2)
            .expect("isolation forest len");
        assert_eq!(scores.len(), 110);
    }

    #[test]
    fn test_isolation_forest_scores_in_range() {
        // Isolation-forest scores 2^(-h/c) always lie strictly in (0, 1).
        let mut rng = make_rng(5);
        let (x, _) = kdd99_like(80, 10, &mut rng).expect("kdd99");
        let mut rng2 = make_rng(2);
        let scores = isolation_forest_benchmark(&x, 0.11, 30, &mut rng2)
            .expect("isolation forest range");
        for &s in &scores {
            assert!(s > 0.0 && s < 1.0, "score {s} out of (0, 1)");
        }
    }

    #[test]
    fn test_isolation_forest_error_empty() {
        let x: Array2<f64> = Array2::zeros((0, 5));
        let mut rng = make_rng(1);
        assert!(isolation_forest_benchmark(&x, 0.1, 10, &mut rng).is_err());
    }

    #[test]
    fn test_isolation_forest_error_zero_estimators() {
        let x = Array2::ones((50, 3));
        let mut rng = make_rng(1);
        assert!(isolation_forest_benchmark(&x, 0.1, 0, &mut rng).is_err());
    }

    #[test]
    fn test_ap_perfect() {
        // All positives ranked above all negatives => AP == 1.
        let y_true = array![0usize, 0, 1, 1];
        let y_score = array![0.1_f64, 0.2, 0.8, 0.9];
        let ap = average_precision(&y_true, &y_score).expect("perfect ap");
        assert!((ap - 1.0).abs() < 1e-9, "perfect ap={ap}");
    }

    #[test]
    fn test_ap_no_positives() {
        let y_true = array![0usize, 0, 0];
        let y_score = array![0.1_f64, 0.5, 0.9];
        let ap = average_precision(&y_true, &y_score).expect("no pos ap");
        assert!((ap - 0.0).abs() < 1e-9);
    }

    #[test]
    fn test_ap_error_length_mismatch() {
        let y_true = array![0usize, 1];
        let y_score = array![0.5_f64];
        assert!(average_precision(&y_true, &y_score).is_err());
    }

    #[test]
    fn test_ap_error_empty() {
        let y_true: Array1<usize> = Array1::zeros(0);
        let y_score: Array1<f64> = Array1::zeros(0);
        assert!(average_precision(&y_true, &y_score).is_err());
    }

    #[test]
    fn test_ap_partial_ranking() {
        // Positives occupy the top three ranks, so AP is (near) perfect.
        let y_true = array![0usize, 1, 1, 1];
        let y_score = array![0.2_f64, 0.9, 0.8, 0.7];
        let ap = average_precision(&y_true, &y_score).expect("partial ap");
        assert!(ap > 0.9 && ap <= 1.0, "ap={ap}");
    }
}