use rand::seq::SliceRandom;
use rand::Rng;
use std::collections::HashMap;
/// Per-tree subsample size used when `IsolationForest::new` is called with `None`.
const DEFAULT_SUBSAMPLE_SIZE: usize = 256;
/// A node of an isolation tree.
#[derive(Debug, Clone)]
enum IsolationNode {
    /// Split node: a sample with `sample[feature_idx] < threshold` descends into `left`,
    /// every other sample descends into `right`.
    Internal {
        feature_idx: usize,
        threshold: f64,
        left: Box<IsolationNode>,
        right: Box<IsolationNode>,
    },
    /// Terminal node; `size` is the number of training samples that ended up here.
    Leaf { size: usize },
}

impl IsolationNode {
    /// Returns the path length of `sample` through the subtree rooted at `self`.
    ///
    /// `current_depth` is the depth of `self`; one is added per split traversed. At the leaf
    /// the result is the reached depth plus `average_path_length(size)`, which accounts for
    /// the subtree that was not grown below that leaf.
    ///
    /// # Panics
    ///
    /// Panics if `sample` has fewer elements than a `feature_idx` met on the way down.
    fn path_length(&self, sample: &[f64], current_depth: usize) -> f64 {
        let mut node = self;
        let mut depth = current_depth;
        loop {
            match node {
                IsolationNode::Internal { feature_idx, threshold, left, right } => {
                    // Same `<` rule the tree builder uses when partitioning samples.
                    node = if sample[*feature_idx] < *threshold { &**left } else { &**right };
                    depth += 1;
                }
                IsolationNode::Leaf { size } => {
                    return depth as f64 + Self::average_path_length(*size);
                }
            }
        }
    }

    /// Average path length `c(n)` of an unsuccessful binary-search-tree lookup over `n` items.
    ///
    /// * `c(n) = 0` for `n <= 1`
    /// * `c(2) = 1`
    /// * `c(n) = 2 * H(n - 1) - 2 * (n - 1) / n` otherwise, with the harmonic number
    ///   approximated as `H(i) = ln(i) + γ` (γ is the Euler–Mascheroni constant).
    fn average_path_length(n: usize) -> f64 {
        const EULER_GAMMA: f64 = 0.577_215_664_9;
        match n {
            0 | 1 => 0.0,
            // The logarithmic approximation is poor here; use the exact value.
            2 => 1.0,
            _ => {
                let n = n as f64;
                // γ belongs inside the harmonic term, so it is doubled together with ln(n - 1).
                let harmonic = (n - 1.0).ln() + EULER_GAMMA;
                2.0 * harmonic - 2.0 * (n - 1.0) / n
            }
        }
    }
}
/// A single isolation tree, held as its root node.
#[derive(Debug, Clone)]
pub struct IsolationTree {
/// Root of the tree; a lone leaf when the tree could not be split.
root: IsolationNode,
}
impl IsolationTree {
fn build(samples: &[Vec<f64>], max_depth: usize) -> Self {
let root = Self::build_node(samples, 0, max_depth);
IsolationTree { root }
}
fn build_node(samples: &[Vec<f64>], depth: usize, max_depth: usize) -> IsolationNode {
if samples.is_empty() {
return IsolationNode::Leaf { size: 0 };
}
if depth >= max_depth || samples.len() <= 1 {
return IsolationNode::Leaf { size: samples.len() };
}
if samples.windows(2).all(|w| w[0] == w[1]) {
return IsolationNode::Leaf { size: samples.len() };
}
let num_features = samples[0].len();
let mut rng = rand::thread_rng();
let feature_idx = rng.gen_range(0..num_features);
let mut min_val = f64::MAX;
let mut max_val = f64::MIN;
for sample in samples {
let val = sample[feature_idx];
min_val = min_val.min(val);
max_val = max_val.max(val);
}
if (max_val - min_val).abs() < f64::EPSILON {
return IsolationNode::Leaf { size: samples.len() };
}
let threshold = rng.gen_range(min_val..max_val);
let (left_samples, right_samples): (Vec<Vec<f64>>, Vec<Vec<f64>>) =
samples.iter().cloned().partition(|sample| sample[feature_idx] < threshold);
if left_samples.is_empty() || right_samples.is_empty() {
return IsolationNode::Leaf { size: samples.len() };
}
let left = Box::new(Self::build_node(&left_samples, depth + 1, max_depth));
let right = Box::new(Self::build_node(&right_samples, depth + 1, max_depth));
IsolationNode::Internal { feature_idx, threshold, left, right }
}
fn path_length(&self, sample: &[f64]) -> f64 {
self.root.path_length(sample, 0)
}
}
/// A set of isolation trees together with the settings used to build them.
pub struct IsolationForest {
/// Trees produced by `fit`; empty until `fit` has been called.
trees: Vec<IsolationTree>,
/// Number of trees built per call to `fit`.
num_trees: usize,
/// Upper bound on the rows drawn for each tree; also the `n` used to normalise scores.
subsample_size: usize,
}
impl IsolationForest {
    /// Creates an untrained forest.
    ///
    /// `num_trees` is the number of trees `fit` will build. `subsample_size` caps the number
    /// of rows drawn per tree and falls back to `DEFAULT_SUBSAMPLE_SIZE` when `None`.
    pub fn new(num_trees: usize, subsample_size: Option<usize>) -> Self {
        IsolationForest {
            trees: Vec::new(),
            num_trees,
            subsample_size: subsample_size.unwrap_or(DEFAULT_SUBSAMPLE_SIZE),
        }
    }

    /// Trains the forest on `samples`, replacing any trees left over from an earlier call.
    ///
    /// Every tree is grown from `min(subsample_size, samples.len())` rows drawn without
    /// replacement, with its depth capped at `ceil(log2(subsample_size))`.
    pub fn fit(&mut self, samples: &[Vec<f64>]) {
        let max_depth = (self.subsample_size as f64).log2().ceil() as usize;
        let sample_size = self.subsample_size.min(samples.len());
        let mut rng = rand::thread_rng();
        // Allocated once; reshuffling an already shuffled vector is still a uniform shuffle.
        let mut indices: Vec<usize> = (0..samples.len()).collect();

        // A refit must not stack new trees on top of trees trained on older data.
        self.trees.clear();
        for _ in 0..self.num_trees {
            indices.shuffle(&mut rng);
            let subsamples: Vec<Vec<f64>> =
                indices[..sample_size].iter().map(|&i| samples[i].clone()).collect();
            self.trees.push(IsolationTree::build(&subsamples, max_depth));
        }
    }

    /// Returns the anomaly score of `sample`: `2^(-E[h] / c)`, where `E[h]` is the mean path
    /// length over all trees and `c` is `average_path_length(subsample_size)`.
    ///
    /// Shorter average paths give scores closer to 1. Returns `0.0` when the forest has no
    /// trees and `0.5` when `c` is zero.
    pub fn anomaly_score(&self, sample: &[f64]) -> f64 {
        if self.trees.is_empty() {
            return 0.0;
        }
        let path_sum: f64 = self.trees.iter().map(|tree| tree.path_length(sample)).sum();
        let mean_path = path_sum / self.trees.len() as f64;
        // NOTE(review): `c` is derived from the configured subsample size even when `fit`
        // drew fewer rows because the data set was smaller — confirm whether the number of
        // rows actually drawn should be used for normalisation instead.
        let c = IsolationNode::average_path_length(self.subsample_size);
        if c == 0.0 {
            return 0.5;
        }
        2_f64.powf(-mean_path / c)
    }

    /// Returns `true` when the anomaly score of `sample` exceeds `0.5 + contamination / 2`.
    pub fn predict(&self, sample: &[f64], contamination: f32) -> bool {
        let cutoff = 0.5 + f64::from(contamination) / 2.0;
        self.anomaly_score(sample) > cutoff
    }
}
/// Per-syscall timing summary.
///
/// NOTE(review): nothing in this file constructs this type; the `_us` suffixes suggest
/// microseconds — confirm against the callers.
#[derive(Debug, Clone)]
pub struct SyscallFeature {
/// Name of the syscall.
pub syscall_name: String,
/// Average duration of one call.
pub avg_duration_us: f64,
/// Number of calls observed.
pub call_count: u64,
/// Total duration across all calls.
pub total_duration_us: f64,
}
/// Converts per-syscall `(call count, total time in ns)` pairs into feature vectors.
///
/// Entries with a call count of zero are dropped. Each remaining syscall produces
/// `[average time in µs, ln(count), ln(total time in µs)]`, with both logarithms clamped
/// to be non-negative. The two returned vectors are index-aligned and follow the map's
/// iteration order.
pub fn extract_features(
    syscall_data: &HashMap<String, (u64, u64)>,
) -> (Vec<String>, Vec<Vec<f64>>) {
    syscall_data
        .iter()
        .filter(|(_, stats)| stats.0 != 0)
        .map(|(name, &(count, total_time_ns))| {
            let calls = count as f64;
            let total_us = total_time_ns as f64 / 1000.0;
            let mean_us = total_us / calls;
            let row = vec![mean_us, calls.ln().max(0.0), total_us.ln().max(0.0)];
            (name.clone(), row)
        })
        .unzip()
}
/// A syscall that `analyze_outliers` classified as anomalous.
#[derive(Debug, Clone)]
pub struct Outlier {
/// Name of the syscall, taken from the input map key.
pub syscall: String,
/// Score returned by the isolation forest for this syscall.
pub anomaly_score: f64,
/// Total time converted from nanoseconds to microseconds, divided by the call count.
pub avg_duration_us: f64,
/// Number of calls recorded for this syscall.
pub call_count: u64,
/// `(feature name, percentage)` pairs; empty unless an explanation was requested.
pub feature_importance: Vec<(String, f64)>,
}
/// Result of one `analyze_outliers` run.
#[derive(Debug, Clone)]
pub struct OutlierReport {
/// Detected outliers, ordered by descending anomaly score.
pub outliers: Vec<Outlier>,
/// Number of syscalls that produced a feature vector (those with a non-zero call count).
pub total_samples: usize,
/// The `contamination` argument the analysis was run with.
pub contamination: f32,
/// The `num_trees` argument the analysis was run with.
pub num_trees: usize,
}
pub fn analyze_outliers(
syscall_data: &HashMap<String, (u64, u64)>,
num_trees: usize,
contamination: f32,
explain: bool,
) -> OutlierReport {
let (syscall_names, features) = extract_features(syscall_data);
if features.len() < 2 {
return OutlierReport {
outliers: Vec::new(),
total_samples: features.len(),
contamination,
num_trees,
};
}
let mut forest = IsolationForest::new(num_trees, None);
forest.fit(&features);
let mut outliers = Vec::new();
for (name, feature_vec) in syscall_names.iter().zip(features.iter()) {
let score = forest.anomaly_score(feature_vec);
let is_outlier = forest.predict(feature_vec, contamination);
if is_outlier {
let (count, total_time_ns) = syscall_data[name];
let avg_duration_us = total_time_ns as f64 / 1000.0 / count as f64;
let feature_importance =
if explain { calculate_feature_importance(feature_vec) } else { Vec::new() };
outliers.push(Outlier {
syscall: name.clone(),
anomaly_score: score,
avg_duration_us,
call_count: count,
feature_importance,
});
}
}
outliers.sort_by(|a, b| {
b.anomaly_score.partial_cmp(&a.anomaly_score).unwrap_or(std::cmp::Ordering::Equal)
});
OutlierReport { outliers, total_samples: features.len(), contamination, num_trees }
}
/// Expresses each feature's absolute value as a percentage of the sum of absolute values.
///
/// Values are labelled, in order, `avg_duration`, `call_frequency` and `total_duration`;
/// any values beyond the third are ignored. Every percentage is `0.0` when the sum of
/// absolute values is not positive.
fn calculate_feature_importance(features: &[f64]) -> Vec<(String, f64)> {
    const LABELS: [&str; 3] = ["avg_duration", "call_frequency", "total_duration"];

    let magnitude_sum = features.iter().copied().map(f64::abs).sum::<f64>();

    let mut shares = Vec::with_capacity(LABELS.len().min(features.len()));
    for (label, value) in LABELS.iter().zip(features) {
        let percent = if magnitude_sum > 0.0 {
            (value.abs() / magnitude_sum) * 100.0
        } else {
            0.0
        };
        shares.push(((*label).to_owned(), percent));
    }
    shares
}
// Compile-time checks that the listed types implement both `Send` and `Sync`.
static_assertions::assert_impl_all!(IsolationTree: Send, Sync);
static_assertions::assert_impl_all!(IsolationForest: Send, Sync);
static_assertions::assert_impl_all!(SyscallFeature: Send, Sync);
static_assertions::assert_impl_all!(Outlier: Send, Sync);
static_assertions::assert_impl_all!(OutlierReport: Send, Sync);
// Unit tests for the isolation forest and the syscall analysis helpers.
// Tree construction is randomised, so the tree and forest tests compare aggregates
// rather than exact values.
#[cfg(test)]
mod tests {
use super::*;
// Averaged over ten trees, the far-away point should be isolated in fewer steps than a
// point from the tight cluster.
#[test]
fn test_isolation_tree_creation() {
let samples = vec![
vec![1.0, 2.0],
vec![1.1, 2.1],
vec![10.0, 20.0], ];
let mut outlier_total = 0.0;
let mut normal_total = 0.0;
for _ in 0..10 {
let tree = IsolationTree::build(&samples, 10);
outlier_total += tree.path_length(&[10.0, 20.0]);
normal_total += tree.path_length(&[1.0, 2.0]);
}
let outlier_avg = outlier_total / 10.0;
let normal_avg = normal_total / 10.0;
assert!(
outlier_avg < normal_avg,
"Outlier avg ({}) should be < normal avg ({})",
outlier_avg,
normal_avg
);
}
// A forest of 100 trees with a subsample size of 4 should score the far-away point
// above a clustered point and above 0.5.
#[test]
fn test_isolation_forest_detects_outliers() {
let samples = vec![
vec![1.0, 2.0],
vec![1.1, 2.1],
vec![0.9, 1.9],
vec![1.2, 2.2],
vec![10.0, 20.0], ];
let mut forest = IsolationForest::new(100, Some(4));
forest.fit(&samples);
let outlier_score = forest.anomaly_score(&[10.0, 20.0]);
let normal_score = forest.anomaly_score(&[1.0, 2.0]);
assert!(
outlier_score > normal_score,
"Outlier score ({}) should be > normal score ({})",
outlier_score,
normal_score
);
assert!(outlier_score > 0.50, "Outlier score ({}) should be > 0.50", outlier_score);
}
// Two syscalls with non-zero counts yield two names and two three-element feature vectors.
#[test]
fn test_feature_extraction() {
let mut data = HashMap::new();
data.insert("write".to_string(), (100, 1_000_000)); data.insert("read".to_string(), (10, 10_000_000));
let (names, features) = extract_features(&data);
assert_eq!(names.len(), 2);
assert_eq!(features.len(), 2);
assert_eq!(features[0].len(), 3); }
// Two identical syscalls plus one much slower one: at least one outlier must be reported.
#[test]
fn test_analyze_outliers() {
let mut data = HashMap::new();
data.insert("write".to_string(), (100, 1_000_000));
data.insert("read".to_string(), (100, 1_000_000));
data.insert("slow_syscall".to_string(), (10, 100_000_000));
let report = analyze_outliers(&data, 100, 0.1, false);
assert!(!report.outliers.is_empty());
assert_eq!(report.total_samples, 3);
}
// The three importance percentages must add up to roughly 100.
#[test]
fn test_feature_importance() {
let features = vec![10.0, 5.0, 2.0];
let importance = calculate_feature_importance(&features);
assert_eq!(importance.len(), 3);
let total: f64 = importance.iter().map(|(_, v)| v).sum();
assert!((total - 100.0).abs() < 0.1);
}
// Spot checks: c(1) is exactly zero and c(10) lies strictly between 2 and 4.
#[test]
fn test_average_path_length() {
assert_eq!(IsolationNode::average_path_length(1), 0.0);
let apl_10 = IsolationNode::average_path_length(10);
assert!(apl_10 > 2.0 && apl_10 < 4.0); }
// A single syscall is below the two-sample minimum, so no outliers are reported.
#[test]
fn test_insufficient_data() {
let mut data = HashMap::new();
data.insert("write".to_string(), (1, 1000));
let report = analyze_outliers(&data, 10, 0.1, false);
assert_eq!(report.outliers.len(), 0);
assert_eq!(report.total_samples, 1);
}
}