aprender-profile 0.29.0

Pure Rust system call tracer with source-aware correlation for Rust binaries
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
//! Isolation Forest for ML-based outlier detection (Sprint 22)
//!
//! Implements the Isolation Forest algorithm for unsupervised anomaly detection
//! with explainability features (XAI).
//!
//! # Algorithm Overview
//!
//! Isolation Forest isolates anomalies by randomly partitioning the feature space.
//! Anomalies are easier to isolate (shorter paths in trees) compared to normal points.
//!
//! # References
//!
//! Liu, F. T., Ting, K. M., & Zhou, Z. H. (2008). Isolation forest.
//! In 2008 Eighth IEEE International Conference on Data Mining (pp. 413-422).

use rand::seq::SliceRandom;
use rand::Rng;
use std::collections::HashMap;

/// Default sub-sampling size (following original paper)
const DEFAULT_SUBSAMPLE_SIZE: usize = 256;

/// A node in an Isolation Tree
#[derive(Debug, Clone)]
enum IsolationNode {
    /// Internal node with split feature and threshold
    Internal {
        feature_idx: usize,
        threshold: f64,
        left: Box<IsolationNode>,
        right: Box<IsolationNode>,
    },
    /// Leaf node with sample count (for path length calculation)
    Leaf { size: usize },
}

impl IsolationNode {
    /// Calculate path length from root to this node for a given sample
    fn path_length(&self, sample: &[f64], current_depth: usize) -> f64 {
        match self {
            IsolationNode::Internal { feature_idx, threshold, left, right } => {
                if sample[*feature_idx] < *threshold {
                    left.path_length(sample, current_depth + 1)
                } else {
                    right.path_length(sample, current_depth + 1)
                }
            }
            IsolationNode::Leaf { size } => {
                // Add average path length for unresolved instances
                current_depth as f64 + Self::average_path_length(*size)
            }
        }
    }

    /// Calculate average path length for BST with n samples (expected value)
    fn average_path_length(n: usize) -> f64 {
        if n <= 1 {
            return 0.0;
        }
        // Harmonic number approximation: H(n-1) ≈ ln(n-1) + γ
        const EULER_GAMMA: f64 = 0.5772156649;
        2.0 * ((n - 1) as f64).ln() + EULER_GAMMA - 2.0 * (n - 1) as f64 / n as f64
    }
}

/// Single Isolation Tree
#[derive(Debug, Clone)]
pub struct IsolationTree {
    root: IsolationNode,
}

impl IsolationTree {
    /// Build a tree from samples
    fn build(samples: &[Vec<f64>], max_depth: usize) -> Self {
        let root = Self::build_node(samples, 0, max_depth);
        IsolationTree { root }
    }

    /// Recursively build tree nodes
    fn build_node(samples: &[Vec<f64>], depth: usize, max_depth: usize) -> IsolationNode {
        // Base cases: stop splitting
        if samples.is_empty() {
            return IsolationNode::Leaf { size: 0 };
        }

        if depth >= max_depth || samples.len() <= 1 {
            return IsolationNode::Leaf { size: samples.len() };
        }

        // All samples are identical - create leaf
        if samples.windows(2).all(|w| w[0] == w[1]) {
            return IsolationNode::Leaf { size: samples.len() };
        }

        let num_features = samples[0].len();
        let mut rng = rand::thread_rng();

        // Randomly select a feature to split on
        let feature_idx = rng.gen_range(0..num_features);

        // Get min/max for this feature
        let mut min_val = f64::MAX;
        let mut max_val = f64::MIN;
        for sample in samples {
            let val = sample[feature_idx];
            min_val = min_val.min(val);
            max_val = max_val.max(val);
        }

        // If all values are the same for this feature, create leaf
        if (max_val - min_val).abs() < f64::EPSILON {
            return IsolationNode::Leaf { size: samples.len() };
        }

        // Random split threshold between min and max
        let threshold = rng.gen_range(min_val..max_val);

        // Partition samples
        let (left_samples, right_samples): (Vec<Vec<f64>>, Vec<Vec<f64>>) =
            samples.iter().cloned().partition(|sample| sample[feature_idx] < threshold);

        // If partition is empty on one side, create leaf
        if left_samples.is_empty() || right_samples.is_empty() {
            return IsolationNode::Leaf { size: samples.len() };
        }

        // Recursively build children
        let left = Box::new(Self::build_node(&left_samples, depth + 1, max_depth));
        let right = Box::new(Self::build_node(&right_samples, depth + 1, max_depth));

        IsolationNode::Internal { feature_idx, threshold, left, right }
    }

    /// Calculate path length for a sample
    fn path_length(&self, sample: &[f64]) -> f64 {
        self.root.path_length(sample, 0)
    }
}

/// Isolation Forest - ensemble of Isolation Trees
pub struct IsolationForest {
    trees: Vec<IsolationTree>,
    num_trees: usize,
    subsample_size: usize,
}

impl IsolationForest {
    /// Create a new Isolation Forest
    pub fn new(num_trees: usize, subsample_size: Option<usize>) -> Self {
        IsolationForest {
            trees: Vec::new(),
            num_trees,
            subsample_size: subsample_size.unwrap_or(DEFAULT_SUBSAMPLE_SIZE),
        }
    }

    /// Fit the model on training data
    pub fn fit(&mut self, samples: &[Vec<f64>]) {
        let mut rng = rand::thread_rng();
        let max_depth = (self.subsample_size as f64).log2().ceil() as usize;

        for _ in 0..self.num_trees {
            // Sub-sample data
            let sample_size = self.subsample_size.min(samples.len());
            let mut indices: Vec<_> = (0..samples.len()).collect();
            indices.shuffle(&mut rng);
            let subsamples: Vec<_> =
                indices[..sample_size].iter().map(|&i| samples[i].clone()).collect();

            // Build tree
            let tree = IsolationTree::build(&subsamples, max_depth);
            self.trees.push(tree);
        }
    }

    /// Calculate anomaly score for a sample (higher = more anomalous)
    /// Returns score in range [0, 1]
    pub fn anomaly_score(&self, sample: &[f64]) -> f64 {
        if self.trees.is_empty() {
            return 0.0;
        }

        if self.trees.is_empty() {
            return 0.5; // No trees → indeterminate score
        }

        // Average path length across all trees
        let avg_path_length: f64 =
            self.trees.iter().map(|tree| tree.path_length(sample)).sum::<f64>()
                / self.trees.len() as f64;

        // Normalize by expected path length
        let c = IsolationNode::average_path_length(self.subsample_size);
        if c == 0.0 {
            return 0.5; // subsample_size <= 1 → indeterminate
        }
        2_f64.powf(-avg_path_length / c)
    }

    /// Predict outliers based on contamination threshold
    /// Returns true if sample is an outlier
    pub fn predict(&self, sample: &[f64], contamination: f32) -> bool {
        let score = self.anomaly_score(sample);
        // Scores close to 1.0 are anomalies, close to 0.5 are normal
        score > 0.5 + (f64::from(contamination) / 2.0)
    }
}

/// Feature extracted from syscall data
#[derive(Debug, Clone)]
pub struct SyscallFeature {
    pub syscall_name: String,
    pub avg_duration_us: f64,
    pub call_count: u64,
    pub total_duration_us: f64,
}

/// Extract features from syscall statistics
pub fn extract_features(
    syscall_data: &HashMap<String, (u64, u64)>,
) -> (Vec<String>, Vec<Vec<f64>>) {
    let mut syscall_names = Vec::new();
    let mut features = Vec::new();

    for (name, (count, total_time_ns)) in syscall_data {
        if *count == 0 {
            continue;
        }

        let total_time_us = *total_time_ns as f64 / 1000.0;
        let avg_time_us = total_time_us / *count as f64;

        syscall_names.push(name.clone());

        // Feature vector: [avg_duration, call_count, total_duration]
        features.push(vec![
            avg_time_us,
            (*count as f64).ln().max(0.0), // Log scale for count
            total_time_us.ln().max(0.0),   // Log scale for total time
        ]);
    }

    (syscall_names, features)
}

/// Outlier detected by Isolation Forest
#[derive(Debug, Clone)]
pub struct Outlier {
    pub syscall: String,
    pub anomaly_score: f64,
    pub avg_duration_us: f64,
    pub call_count: u64,
    pub feature_importance: Vec<(String, f64)>,
}

/// Result of Isolation Forest analysis
#[derive(Debug, Clone)]
pub struct OutlierReport {
    pub outliers: Vec<Outlier>,
    pub total_samples: usize,
    pub contamination: f32,
    pub num_trees: usize,
}

/// Analyze syscall data for outliers using Isolation Forest
pub fn analyze_outliers(
    syscall_data: &HashMap<String, (u64, u64)>,
    num_trees: usize,
    contamination: f32,
    explain: bool,
) -> OutlierReport {
    // Extract features
    let (syscall_names, features) = extract_features(syscall_data);

    if features.len() < 2 {
        // Insufficient data
        return OutlierReport {
            outliers: Vec::new(),
            total_samples: features.len(),
            contamination,
            num_trees,
        };
    }

    // Train Isolation Forest
    let mut forest = IsolationForest::new(num_trees, None);
    forest.fit(&features);

    // Detect outliers
    let mut outliers = Vec::new();

    for (name, feature_vec) in syscall_names.iter().zip(features.iter()) {
        let score = forest.anomaly_score(feature_vec);
        let is_outlier = forest.predict(feature_vec, contamination);

        if is_outlier {
            let (count, total_time_ns) = syscall_data[name];
            let avg_duration_us = total_time_ns as f64 / 1000.0 / count as f64;

            let feature_importance =
                if explain { calculate_feature_importance(feature_vec) } else { Vec::new() };

            outliers.push(Outlier {
                syscall: name.clone(),
                anomaly_score: score,
                avg_duration_us,
                call_count: count,
                feature_importance,
            });
        }
    }

    // Sort by anomaly score (highest first, handle NaN gracefully)
    outliers.sort_by(|a, b| {
        b.anomaly_score.partial_cmp(&a.anomaly_score).unwrap_or(std::cmp::Ordering::Equal)
    });

    OutlierReport { outliers, total_samples: features.len(), contamination, num_trees }
}

/// Calculate feature importance for explainability (XAI)
fn calculate_feature_importance(features: &[f64]) -> Vec<(String, f64)> {
    let feature_names = ["avg_duration", "call_frequency", "total_duration"];

    // Simple feature importance: normalized absolute values
    let total: f64 = features.iter().map(|&f| f.abs()).sum();

    feature_names
        .iter()
        .zip(features.iter())
        .map(|(name, &value)| {
            let importance = if total > 0.0 { (value.abs() / total) * 100.0 } else { 0.0 };
            ((*name).to_string(), importance)
        })
        .collect()
}

// Compile-time thread-safety verification (Sprint 59)
static_assertions::assert_impl_all!(IsolationTree: Send, Sync);
static_assertions::assert_impl_all!(IsolationForest: Send, Sync);
static_assertions::assert_impl_all!(SyscallFeature: Send, Sync);
static_assertions::assert_impl_all!(Outlier: Send, Sync);
static_assertions::assert_impl_all!(OutlierReport: Send, Sync);

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_isolation_tree_creation() {
        let samples = vec![
            vec![1.0, 2.0],
            vec![1.1, 2.1],
            vec![10.0, 20.0], // Outlier
        ];

        // Use multiple trees to average out randomness
        let mut outlier_total = 0.0;
        let mut normal_total = 0.0;
        for _ in 0..10 {
            let tree = IsolationTree::build(&samples, 10);
            outlier_total += tree.path_length(&[10.0, 20.0]);
            normal_total += tree.path_length(&[1.0, 2.0]);
        }
        let outlier_avg = outlier_total / 10.0;
        let normal_avg = normal_total / 10.0;

        // Outlier should have shorter average path (more isolated)
        assert!(
            outlier_avg < normal_avg,
            "Outlier avg ({}) should be < normal avg ({})",
            outlier_avg,
            normal_avg
        );
    }

    #[test]
    fn test_isolation_forest_detects_outliers() {
        let samples = vec![
            vec![1.0, 2.0],
            vec![1.1, 2.1],
            vec![0.9, 1.9],
            vec![1.2, 2.2],
            vec![10.0, 20.0], // Clear outlier
        ];

        let mut forest = IsolationForest::new(100, Some(4));
        forest.fit(&samples);

        let outlier_score = forest.anomaly_score(&[10.0, 20.0]);
        let normal_score = forest.anomaly_score(&[1.0, 2.0]);

        // Outlier should have higher score
        assert!(
            outlier_score > normal_score,
            "Outlier score ({}) should be > normal score ({})",
            outlier_score,
            normal_score
        );
        assert!(outlier_score > 0.50, "Outlier score ({}) should be > 0.50", outlier_score);
        // Should be anomalous (>0.5 is anomaly baseline)
    }

    #[test]
    fn test_feature_extraction() {
        let mut data = HashMap::new();
        data.insert("write".to_string(), (100, 1_000_000)); // 1ms total, 10us avg
        data.insert("read".to_string(), (10, 10_000_000)); // 10ms total, 1000us avg (outlier)

        let (names, features) = extract_features(&data);

        assert_eq!(names.len(), 2);
        assert_eq!(features.len(), 2);
        assert_eq!(features[0].len(), 3); // 3 features per syscall
    }

    #[test]
    fn test_analyze_outliers() {
        let mut data = HashMap::new();
        // Normal syscalls
        data.insert("write".to_string(), (100, 1_000_000));
        data.insert("read".to_string(), (100, 1_000_000));
        // Outlier - very slow
        data.insert("slow_syscall".to_string(), (10, 100_000_000));

        let report = analyze_outliers(&data, 100, 0.1, false);

        assert!(!report.outliers.is_empty());
        assert_eq!(report.total_samples, 3);
    }

    #[test]
    fn test_feature_importance() {
        let features = vec![10.0, 5.0, 2.0];
        let importance = calculate_feature_importance(&features);

        assert_eq!(importance.len(), 3);
        // Sum should be ~100%
        let total: f64 = importance.iter().map(|(_, v)| v).sum();
        assert!((total - 100.0).abs() < 0.1);
    }

    #[test]
    fn test_average_path_length() {
        // Test known values from paper
        assert_eq!(IsolationNode::average_path_length(1), 0.0);
        let apl_10 = IsolationNode::average_path_length(10);
        assert!(apl_10 > 2.0 && apl_10 < 4.0); // Reasonable range
    }

    #[test]
    fn test_insufficient_data() {
        let mut data = HashMap::new();
        data.insert("write".to_string(), (1, 1000));

        let report = analyze_outliers(&data, 10, 0.1, false);
        assert_eq!(report.outliers.len(), 0);
        assert_eq!(report.total_samples, 1);
    }
}