// ghostflow_ml/outlier_detection.rs

1//! Outlier Detection - Local Outlier Factor, One-Class SVM, Elliptic Envelope
2
3use ghostflow_core::Tensor;
4
/// Local Outlier Factor for anomaly detection.
///
/// Scores each sample by comparing its local density to the density of its
/// k nearest neighbors; scores well above 1 indicate outliers.
pub struct LocalOutlierFactor {
    /// Number of neighbors used for density estimation.
    pub n_neighbors: usize,
    /// Expected fraction of outliers in the training data; drives the threshold.
    pub contamination: f32,
    /// Distance metric used for neighbor search.
    pub metric: LOFMetric,
    // Flattened row-major training data, kept so new points can be scored.
    x_train_: Option<Vec<f32>>,
    // Local reachability density of each training sample.
    lrd_: Option<Vec<f32>>,
    // LOF score of each training sample.
    lof_scores_: Option<Vec<f32>>,
    // Score cutoff derived from `contamination` during `fit`.
    threshold_: f32,
    // Training-set shape, cached for `decision_function`.
    n_samples_: usize,
    n_features_: usize,
}
17
/// Distance metrics supported by `LocalOutlierFactor`.
#[derive(Clone, Copy, Debug)]
pub enum LOFMetric {
    /// Straight-line (L2) distance.
    Euclidean,
    /// City-block (L1) distance.
    Manhattan,
    /// General Lp distance with the given exponent `p`.
    Minkowski(f32),
}
24
25impl LocalOutlierFactor {
26    pub fn new(n_neighbors: usize) -> Self {
27        LocalOutlierFactor {
28            n_neighbors,
29            contamination: 0.1,
30            metric: LOFMetric::Euclidean,
31            x_train_: None,
32            lrd_: None,
33            lof_scores_: None,
34            threshold_: 0.0,
35            n_samples_: 0,
36            n_features_: 0,
37        }
38    }
39
40    pub fn contamination(mut self, c: f32) -> Self {
41        self.contamination = c.clamp(0.0, 0.5);
42        self
43    }
44
45    fn distance(&self, a: &[f32], b: &[f32]) -> f32 {
46        match self.metric {
47            LOFMetric::Euclidean => {
48                a.iter().zip(b.iter()).map(|(&ai, &bi)| (ai - bi).powi(2)).sum::<f32>().sqrt()
49            }
50            LOFMetric::Manhattan => {
51                a.iter().zip(b.iter()).map(|(&ai, &bi)| (ai - bi).abs()).sum()
52            }
53            LOFMetric::Minkowski(p) => {
54                a.iter().zip(b.iter())
55                    .map(|(&ai, &bi)| (ai - bi).abs().powf(p))
56                    .sum::<f32>()
57                    .powf(1.0 / p)
58            }
59        }
60    }
61
62    fn k_neighbors(&self, x: &[f32], point_idx: usize, n_samples: usize, n_features: usize) -> Vec<(usize, f32)> {
63        let point = &x[point_idx * n_features..(point_idx + 1) * n_features];
64        
65        let mut distances: Vec<(usize, f32)> = (0..n_samples)
66            .filter(|&i| i != point_idx)
67            .map(|i| {
68                let other = &x[i * n_features..(i + 1) * n_features];
69                (i, self.distance(point, other))
70            })
71            .collect();
72
73        distances.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
74        distances.truncate(self.n_neighbors);
75        distances
76    }
77
78    fn k_distance(&self, neighbors: &[(usize, f32)]) -> f32 {
79        neighbors.last().map(|(_, d)| *d).unwrap_or(0.0)
80    }
81
82    fn reachability_distance(&self, x: &[f32], point_a: usize, point_b: usize, k_dist_b: f32, n_features: usize) -> f32 {
83        let a = &x[point_a * n_features..(point_a + 1) * n_features];
84        let b = &x[point_b * n_features..(point_b + 1) * n_features];
85        let dist = self.distance(a, b);
86        dist.max(k_dist_b)
87    }
88
89    pub fn fit(&mut self, x: &Tensor) {
90        let x_data = x.data_f32();
91        let n_samples = x.dims()[0];
92        let n_features = x.dims()[1];
93
94        self.n_samples_ = n_samples;
95        self.n_features_ = n_features;
96
97        // Compute k-neighbors and k-distances for all points
98        let all_neighbors: Vec<Vec<(usize, f32)>> = (0..n_samples)
99            .map(|i| self.k_neighbors(&x_data, i, n_samples, n_features))
100            .collect();
101
102        let k_distances: Vec<f32> = all_neighbors.iter()
103            .map(|neighbors| self.k_distance(neighbors))
104            .collect();
105
106        // Compute Local Reachability Density (LRD)
107        let lrd: Vec<f32> = (0..n_samples)
108            .map(|i| {
109                let neighbors = &all_neighbors[i];
110                let sum_reach_dist: f32 = neighbors.iter()
111                    .map(|&(j, _)| self.reachability_distance(&x_data, i, j, k_distances[j], n_features))
112                    .sum();
113                
114                if sum_reach_dist > 0.0 {
115                    neighbors.len() as f32 / sum_reach_dist
116                } else {
117                    f32::INFINITY
118                }
119            })
120            .collect();
121
122        // Compute LOF scores
123        let lof_scores: Vec<f32> = (0..n_samples)
124            .map(|i| {
125                let neighbors = &all_neighbors[i];
126                let sum_lrd_ratio: f32 = neighbors.iter()
127                    .map(|&(j, _)| lrd[j] / lrd[i].max(1e-10))
128                    .sum();
129                
130                sum_lrd_ratio / neighbors.len() as f32
131            })
132            .collect();
133
134        // Compute threshold based on contamination
135        let mut sorted_scores = lof_scores.clone();
136        sorted_scores.sort_by(|a, b| b.partial_cmp(a).unwrap());  // Descending
137        let threshold_idx = (self.contamination * n_samples as f32) as usize;
138        self.threshold_ = sorted_scores[threshold_idx.min(n_samples - 1)];
139
140        self.x_train_ = Some(x_data);
141        self.lrd_ = Some(lrd);
142        self.lof_scores_ = Some(lof_scores);
143    }
144
145    pub fn fit_predict(&mut self, x: &Tensor) -> Tensor {
146        self.fit(x);
147        
148        let lof_scores = self.lof_scores_.as_ref().unwrap();
149        let predictions: Vec<f32> = lof_scores.iter()
150            .map(|&score| if score > self.threshold_ { -1.0 } else { 1.0 })  // -1 = outlier, 1 = inlier
151            .collect();
152
153        Tensor::from_slice(&predictions, &[predictions.len()]).unwrap()
154    }
155
156    pub fn decision_function(&self, x: &Tensor) -> Tensor {
157        // For new data, compute LOF scores
158        let x_data = x.data_f32();
159        let n_samples = x.dims()[0];
160        let n_features = x.dims()[1];
161
162        let x_train = self.x_train_.as_ref().expect("Model not fitted");
163        let lrd_train = self.lrd_.as_ref().expect("Model not fitted");
164        let n_train = self.n_samples_;
165
166        let scores: Vec<f32> = (0..n_samples)
167            .map(|i| {
168                let point = &x_data[i * n_features..(i + 1) * n_features];
169                
170                // Find k-nearest neighbors in training data
171                let mut distances: Vec<(usize, f32)> = (0..n_train)
172                    .map(|j| {
173                        let train_point = &x_train[j * n_features..(j + 1) * n_features];
174                        (j, self.distance(point, train_point))
175                    })
176                    .collect();
177
178                distances.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
179                let neighbors: Vec<(usize, f32)> = distances.into_iter().take(self.n_neighbors).collect();
180
181                // Compute LRD for this point
182                let k_dist = neighbors.last().map(|(_, d)| *d).unwrap_or(0.0);
183                let sum_reach_dist: f32 = neighbors.iter()
184                    .map(|&(_j, d)| d.max(k_dist))
185                    .sum();
186                
187                let lrd_point = if sum_reach_dist > 0.0 {
188                    neighbors.len() as f32 / sum_reach_dist
189                } else {
190                    f32::INFINITY
191                };
192
193                // Compute LOF
194                let sum_lrd_ratio: f32 = neighbors.iter()
195                    .map(|&(j, _)| lrd_train[j] / lrd_point.max(1e-10))
196                    .sum();
197                
198                -(sum_lrd_ratio / neighbors.len() as f32)  // Negative so higher = more normal
199            })
200            .collect();
201
202        Tensor::from_slice(&scores, &[n_samples]).unwrap()
203    }
204
205    pub fn predict(&self, x: &Tensor) -> Tensor {
206        let scores = self.decision_function(x);
207        let score_data = scores.data_f32();
208        let n_samples = x.dims()[0];
209
210        let predictions: Vec<f32> = score_data.iter()
211            .map(|&s| if -s > self.threshold_ { -1.0 } else { 1.0 })
212            .collect();
213
214        Tensor::from_slice(&predictions, &[n_samples]).unwrap()
215    }
216}
217
/// One-Class SVM for novelty detection.
///
/// Learns a boundary around the training data; points on the negative side
/// of the decision function are treated as novelties/outliers.
pub struct OneClassSVM {
    /// Kernel used to compare samples.
    pub kernel: OCSVMKernel,
    /// Upper bound on the fraction of training outliers / lower bound on the
    /// fraction of support vectors.
    pub nu: f32,
    /// Kernel coefficient.
    pub gamma: f32,
    /// Maximum number of optimization sweeps in `fit`.
    pub max_iter: usize,
    // Training rows retained with non-negligible dual weight.
    support_vectors_: Option<Vec<Vec<f32>>>,
    // Dual coefficient (alpha) for each support vector.
    dual_coef_: Option<Vec<f32>>,
    // Decision-function offset (rho).
    intercept_: Option<f32>,
    // Feature count seen at fit time.
    n_features_: usize,
}
229
/// Kernel functions supported by `OneClassSVM`.
#[derive(Clone, Copy, Debug)]
pub enum OCSVMKernel {
    /// Gaussian radial basis function: exp(-gamma * ||x - y||^2).
    RBF,
    /// Plain dot product.
    Linear,
    /// Polynomial: (gamma * <x, y> + coef0)^degree.
    Poly { degree: i32, coef0: f32 },
    /// Sigmoid: tanh(gamma * <x, y> + coef0).
    Sigmoid { coef0: f32 },
}
237
238impl OneClassSVM {
239    pub fn new() -> Self {
240        OneClassSVM {
241            kernel: OCSVMKernel::RBF,
242            nu: 0.5,
243            gamma: 1.0,
244            max_iter: 1000,
245            support_vectors_: None,
246            dual_coef_: None,
247            intercept_: None,
248            n_features_: 0,
249        }
250    }
251
252    pub fn nu(mut self, nu: f32) -> Self {
253        self.nu = nu.clamp(0.0, 1.0);
254        self
255    }
256
257    pub fn gamma(mut self, gamma: f32) -> Self {
258        self.gamma = gamma;
259        self
260    }
261
262    pub fn kernel(mut self, kernel: OCSVMKernel) -> Self {
263        self.kernel = kernel;
264        self
265    }
266
267    fn kernel_function(&self, xi: &[f32], xj: &[f32]) -> f32 {
268        match self.kernel {
269            OCSVMKernel::RBF => {
270                let dist_sq: f32 = xi.iter().zip(xj.iter()).map(|(&a, &b)| (a - b).powi(2)).sum();
271                (-self.gamma * dist_sq).exp()
272            }
273            OCSVMKernel::Linear => {
274                xi.iter().zip(xj.iter()).map(|(&a, &b)| a * b).sum()
275            }
276            OCSVMKernel::Poly { degree, coef0 } => {
277                let dot: f32 = xi.iter().zip(xj.iter()).map(|(&a, &b)| a * b).sum();
278                (self.gamma * dot + coef0).powi(degree)
279            }
280            OCSVMKernel::Sigmoid { coef0 } => {
281                let dot: f32 = xi.iter().zip(xj.iter()).map(|(&a, &b)| a * b).sum();
282                (self.gamma * dot + coef0).tanh()
283            }
284        }
285    }
286
287    pub fn fit(&mut self, x: &Tensor) {
288        let x_data = x.data_f32();
289        let n_samples = x.dims()[0];
290        let n_features = x.dims()[1];
291
292        self.n_features_ = n_features;
293
294        // Simplified One-Class SVM using SMO-like approach
295        let mut alpha = vec![0.0f32; n_samples];
296        let upper_bound = 1.0 / (self.nu * n_samples as f32);
297
298        // Precompute kernel matrix
299        let mut kernel_matrix = vec![vec![0.0f32; n_samples]; n_samples];
300        for i in 0..n_samples {
301            for j in i..n_samples {
302                let xi = &x_data[i * n_features..(i + 1) * n_features];
303                let xj = &x_data[j * n_features..(j + 1) * n_features];
304                let k = self.kernel_function(xi, xj);
305                kernel_matrix[i][j] = k;
306                kernel_matrix[j][i] = k;
307            }
308        }
309
310        // Initialize alphas uniformly
311        let init_alpha = 1.0 / n_samples as f32;
312        for a in &mut alpha {
313            *a = init_alpha.min(upper_bound);
314        }
315
316        // Simple coordinate descent
317        for _ in 0..self.max_iter {
318            let mut changed = false;
319
320            for i in 0..n_samples {
321                // Compute gradient
322                let mut grad = 0.0f32;
323                for j in 0..n_samples {
324                    grad += alpha[j] * kernel_matrix[i][j];
325                }
326
327                // Update alpha
328                let old_alpha = alpha[i];
329                alpha[i] = (alpha[i] - 0.01 * grad).clamp(0.0, upper_bound);
330
331                if (alpha[i] - old_alpha).abs() > 1e-5 {
332                    changed = true;
333                }
334            }
335
336            // Normalize to sum to 1
337            let sum: f32 = alpha.iter().sum();
338            if sum > 0.0 {
339                for a in &mut alpha {
340                    *a /= sum;
341                }
342            }
343
344            if !changed {
345                break;
346            }
347        }
348
349        // Extract support vectors
350        let eps = 1e-5;
351        let mut support_vectors = Vec::new();
352        let mut dual_coef = Vec::new();
353
354        for i in 0..n_samples {
355            if alpha[i] > eps {
356                support_vectors.push(x_data[i * n_features..(i + 1) * n_features].to_vec());
357                dual_coef.push(alpha[i]);
358            }
359        }
360
361        // Compute intercept (rho)
362        let mut rho = 0.0f32;
363        let mut count = 0;
364        for i in 0..n_samples {
365            if alpha[i] > eps && alpha[i] < upper_bound - eps {
366                let mut sum = 0.0f32;
367                for j in 0..n_samples {
368                    sum += alpha[j] * kernel_matrix[i][j];
369                }
370                rho += sum;
371                count += 1;
372            }
373        }
374        if count > 0 {
375            rho /= count as f32;
376        }
377
378        self.support_vectors_ = Some(support_vectors);
379        self.dual_coef_ = Some(dual_coef);
380        self.intercept_ = Some(rho);
381    }
382
383    pub fn decision_function(&self, x: &Tensor) -> Tensor {
384        let x_data = x.data_f32();
385        let n_samples = x.dims()[0];
386        let n_features = x.dims()[1];
387
388        let support_vectors = self.support_vectors_.as_ref().expect("Model not fitted");
389        let dual_coef = self.dual_coef_.as_ref().expect("Model not fitted");
390        let rho = self.intercept_.unwrap_or(0.0);
391
392        let scores: Vec<f32> = (0..n_samples)
393            .map(|i| {
394                let xi = &x_data[i * n_features..(i + 1) * n_features];
395                let mut score = -rho;
396                
397                for (sv, &coef) in support_vectors.iter().zip(dual_coef.iter()) {
398                    score += coef * self.kernel_function(xi, sv);
399                }
400                
401                score
402            })
403            .collect();
404
405        Tensor::from_slice(&scores, &[n_samples]).unwrap()
406    }
407
408    pub fn predict(&self, x: &Tensor) -> Tensor {
409        let scores = self.decision_function(x);
410        let score_data = scores.data_f32();
411        let n_samples = x.dims()[0];
412
413        let predictions: Vec<f32> = score_data.iter()
414            .map(|&s| if s >= 0.0 { 1.0 } else { -1.0 })
415            .collect();
416
417        Tensor::from_slice(&predictions, &[n_samples]).unwrap()
418    }
419
420    pub fn fit_predict(&mut self, x: &Tensor) -> Tensor {
421        self.fit(x);
422        self.predict(x)
423    }
424}
425
426impl Default for OneClassSVM {
427    fn default() -> Self {
428        Self::new()
429    }
430}
431
432
/// Elliptic Envelope for Gaussian-distributed data.
///
/// Fits a mean and covariance to the data and flags points whose
/// Mahalanobis distance exceeds a contamination-derived threshold.
pub struct EllipticEnvelope {
    /// Expected fraction of outliers in the training data.
    pub contamination: f32,
    /// Fraction of samples to use for the covariance estimate.
    /// NOTE(review): currently unused by `fit` — confirm whether MCD-style
    /// subsetting is planned.
    pub support_fraction: Option<f32>,
    // Estimated feature means (length n_features).
    location_: Option<Vec<f32>>,
    // Row-major covariance matrix (n_features x n_features).
    covariance_: Option<Vec<f32>>,
    // Inverse covariance (precision) matrix, row-major.
    precision_: Option<Vec<f32>>,
    // Mahalanobis-distance cutoff derived from `contamination`.
    threshold_: f32,
    // Feature count seen at fit time.
    n_features_: usize,
}
443
444impl EllipticEnvelope {
445    pub fn new() -> Self {
446        EllipticEnvelope {
447            contamination: 0.1,
448            support_fraction: None,
449            location_: None,
450            covariance_: None,
451            precision_: None,
452            threshold_: 0.0,
453            n_features_: 0,
454        }
455    }
456
457    pub fn contamination(mut self, c: f32) -> Self {
458        self.contamination = c.clamp(0.0, 0.5);
459        self
460    }
461
462    fn compute_mean(&self, x: &[f32], n_samples: usize, n_features: usize) -> Vec<f32> {
463        let mut mean = vec![0.0f32; n_features];
464        for i in 0..n_samples {
465            for j in 0..n_features {
466                mean[j] += x[i * n_features + j];
467            }
468        }
469        for m in &mut mean {
470            *m /= n_samples as f32;
471        }
472        mean
473    }
474
475    fn compute_covariance(&self, x: &[f32], mean: &[f32], n_samples: usize, n_features: usize) -> Vec<f32> {
476        let mut cov = vec![0.0f32; n_features * n_features];
477        
478        for i in 0..n_samples {
479            for j in 0..n_features {
480                for k in 0..n_features {
481                    let diff_j = x[i * n_features + j] - mean[j];
482                    let diff_k = x[i * n_features + k] - mean[k];
483                    cov[j * n_features + k] += diff_j * diff_k;
484                }
485            }
486        }
487
488        let denom = (n_samples - 1).max(1) as f32;
489        for c in &mut cov {
490            *c /= denom;
491        }
492
493        cov
494    }
495
496    fn invert_matrix(&self, matrix: &[f32], n: usize) -> Vec<f32> {
497        // Cholesky-based inversion
498        let mut a = matrix.to_vec();
499        
500        // Add regularization
501        for i in 0..n {
502            a[i * n + i] += 1e-6;
503        }
504
505        // Cholesky decomposition
506        let mut l = vec![0.0f32; n * n];
507        for i in 0..n {
508            for j in 0..=i {
509                let mut sum = 0.0f32;
510                if i == j {
511                    for k in 0..j {
512                        sum += l[j * n + k] * l[j * n + k];
513                    }
514                    l[j * n + j] = (a[j * n + j] - sum).max(1e-10).sqrt();
515                } else {
516                    for k in 0..j {
517                        sum += l[i * n + k] * l[j * n + k];
518                    }
519                    l[i * n + j] = (a[i * n + j] - sum) / l[j * n + j].max(1e-10);
520                }
521            }
522        }
523
524        // Invert L
525        let mut l_inv = vec![0.0f32; n * n];
526        for i in 0..n {
527            l_inv[i * n + i] = 1.0 / l[i * n + i].max(1e-10);
528            for j in (i + 1)..n {
529                let mut sum = 0.0f32;
530                for k in i..j {
531                    sum += l[j * n + k] * l_inv[k * n + i];
532                }
533                l_inv[j * n + i] = -sum / l[j * n + j].max(1e-10);
534            }
535        }
536
537        // Precision = L_inv^T * L_inv
538        let mut precision = vec![0.0f32; n * n];
539        for i in 0..n {
540            for j in 0..n {
541                for k in 0..n {
542                    precision[i * n + j] += l_inv[k * n + i] * l_inv[k * n + j];
543                }
544            }
545        }
546
547        precision
548    }
549
550    fn mahalanobis_distance(&self, x: &[f32], mean: &[f32], precision: &[f32], n_features: usize) -> f32 {
551        let diff: Vec<f32> = x.iter().zip(mean.iter()).map(|(&xi, &mi)| xi - mi).collect();
552        
553        let mut dist = 0.0f32;
554        for i in 0..n_features {
555            for j in 0..n_features {
556                dist += diff[i] * precision[i * n_features + j] * diff[j];
557            }
558        }
559        
560        dist.sqrt()
561    }
562
563    pub fn fit(&mut self, x: &Tensor) {
564        let x_data = x.data_f32();
565        let n_samples = x.dims()[0];
566        let n_features = x.dims()[1];
567
568        self.n_features_ = n_features;
569
570        // Compute robust estimates using all data (simplified - full MCD would be better)
571        let mean = self.compute_mean(&x_data, n_samples, n_features);
572        let cov = self.compute_covariance(&x_data, &mean, n_samples, n_features);
573        let precision = self.invert_matrix(&cov, n_features);
574
575        // Compute Mahalanobis distances
576        let distances: Vec<f32> = (0..n_samples)
577            .map(|i| {
578                let xi = &x_data[i * n_features..(i + 1) * n_features];
579                self.mahalanobis_distance(xi, &mean, &precision, n_features)
580            })
581            .collect();
582
583        // Set threshold based on contamination
584        let mut sorted_distances = distances.clone();
585        sorted_distances.sort_by(|a, b| a.partial_cmp(b).unwrap());
586        let threshold_idx = ((1.0 - self.contamination) * n_samples as f32) as usize;
587        self.threshold_ = sorted_distances[threshold_idx.min(n_samples - 1)];
588
589        self.location_ = Some(mean);
590        self.covariance_ = Some(cov);
591        self.precision_ = Some(precision);
592    }
593
594    pub fn decision_function(&self, x: &Tensor) -> Tensor {
595        let x_data = x.data_f32();
596        let n_samples = x.dims()[0];
597        let n_features = x.dims()[1];
598
599        let mean = self.location_.as_ref().expect("Model not fitted");
600        let precision = self.precision_.as_ref().expect("Model not fitted");
601
602        let scores: Vec<f32> = (0..n_samples)
603            .map(|i| {
604                let xi = &x_data[i * n_features..(i + 1) * n_features];
605                -self.mahalanobis_distance(xi, mean, precision, n_features)
606            })
607            .collect();
608
609        Tensor::from_slice(&scores, &[n_samples]).unwrap()
610    }
611
612    pub fn predict(&self, x: &Tensor) -> Tensor {
613        let scores = self.decision_function(x);
614        let score_data = scores.data_f32();
615        let n_samples = x.dims()[0];
616
617        let predictions: Vec<f32> = score_data.iter()
618            .map(|&s| if -s <= self.threshold_ { 1.0 } else { -1.0 })
619            .collect();
620
621        Tensor::from_slice(&predictions, &[n_samples]).unwrap()
622    }
623
624    pub fn fit_predict(&mut self, x: &Tensor) -> Tensor {
625        self.fit(x);
626        self.predict(x)
627    }
628
629    pub fn mahalanobis(&self, x: &Tensor) -> Tensor {
630        let x_data = x.data_f32();
631        let n_samples = x.dims()[0];
632        let n_features = x.dims()[1];
633
634        let mean = self.location_.as_ref().expect("Model not fitted");
635        let precision = self.precision_.as_ref().expect("Model not fitted");
636
637        let distances: Vec<f32> = (0..n_samples)
638            .map(|i| {
639                let xi = &x_data[i * n_features..(i + 1) * n_features];
640                self.mahalanobis_distance(xi, mean, precision, n_features)
641            })
642            .collect();
643
644        Tensor::from_slice(&distances, &[n_samples]).unwrap()
645    }
646}
647
648impl Default for EllipticEnvelope {
649    fn default() -> Self {
650        Self::new()
651    }
652}
653
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_lof() {
        // Three clustered points plus one far-away outlier.
        let data = [0.0f32, 0.0, 0.1, 0.1, 0.2, 0.0, 10.0, 10.0];
        let x = Tensor::from_slice(&data, &[4, 2]).unwrap();

        let mut detector = LocalOutlierFactor::new(2).contamination(0.25);
        let labels = detector.fit_predict(&x);

        assert_eq!(labels.dims(), &[4]);
    }

    #[test]
    fn test_one_class_svm() {
        // A tight cluster with no obvious outlier.
        let data = [0.0f32, 0.0, 0.1, 0.1, 0.2, 0.0, 0.1, 0.2];
        let x = Tensor::from_slice(&data, &[4, 2]).unwrap();

        let mut model = OneClassSVM::new().nu(0.1);
        model.fit(&x);

        let labels = model.predict(&x);
        assert_eq!(labels.dims(), &[4]);
    }

    #[test]
    fn test_elliptic_envelope() {
        // Three clustered points plus one far-away outlier.
        let data = [0.0f32, 0.0, 0.1, 0.1, 0.2, 0.0, 10.0, 10.0];
        let x = Tensor::from_slice(&data, &[4, 2]).unwrap();

        let mut envelope = EllipticEnvelope::new().contamination(0.25);
        let labels = envelope.fit_predict(&x);

        assert_eq!(labels.dims(), &[4]);
    }
}
701
702