fdars_core/
outliers.rs

1//! Outlier detection for functional data.
2//!
3//! This module provides methods for detecting outliers in functional data
4//! based on depth measures and likelihood ratio tests.
5
6use crate::depth::band::{modified_band_1d, modified_epigraph_index_1d};
7use crate::error::FdarError;
8use crate::iter_maybe_parallel;
9use crate::matrix::FdMatrix;
10use crate::streaming_depth::{SortedReferenceState, StreamingDepth, StreamingFraimanMuniz};
11use rand::prelude::*;
12use rand_distr::StandardNormal;
13#[cfg(feature = "parallel")]
14use rayon::iter::ParallelIterator;
15
16/// Compute trimmed mean and variance from data using depth-based trimming.
17///
18/// Returns (trimmed_mean, trimmed_var) each of length m.
19fn compute_trimmed_stats(data: &FdMatrix, depths: &[f64], n_keep: usize) -> (Vec<f64>, Vec<f64>) {
20    let m = data.ncols();
21
22    let mut depth_idx: Vec<(usize, f64)> =
23        depths.iter().enumerate().map(|(i, &d)| (i, d)).collect();
24    // O(n) partial sort instead of O(n log n) full sort — we only need the top n_keep elements
25    if n_keep < depth_idx.len() {
26        depth_idx.select_nth_unstable_by(n_keep - 1, |a, b| {
27            b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal)
28        });
29    }
30    let keep_idx: Vec<usize> = depth_idx[..n_keep].iter().map(|(i, _)| *i).collect();
31
32    let results: Vec<(f64, f64)> = iter_maybe_parallel!(0..m)
33        .map(|j| {
34            let mut mean_j = 0.0;
35            for &i in &keep_idx {
36                mean_j += data[(i, j)];
37            }
38            mean_j /= n_keep as f64;
39
40            let mut var_j = 0.0;
41            for &i in &keep_idx {
42                let diff = data[(i, j)] - mean_j;
43                var_j += diff * diff;
44            }
45            var_j /= n_keep as f64;
46            var_j = var_j.max(1e-10);
47
48            (mean_j, var_j)
49        })
50        .collect();
51
52    let trimmed_mean: Vec<f64> = results.iter().map(|&(m, _)| m).collect();
53    let trimmed_var: Vec<f64> = results.iter().map(|&(_, v)| v).collect();
54
55    (trimmed_mean, trimmed_var)
56}
57
58/// Compute normalized Mahalanobis-like distance for a single observation.
59fn normalized_distance(
60    data: &FdMatrix,
61    i: usize,
62    trimmed_mean: &[f64],
63    trimmed_var: &[f64],
64) -> f64 {
65    let m = data.ncols();
66    let mut dist = 0.0;
67    for j in 0..m {
68        let diff = data[(i, j)] - trimmed_mean[j];
69        dist += diff * diff / trimmed_var[j];
70    }
71    (dist / m as f64).sqrt()
72}
73
74/// Compute bootstrap threshold for LRT outlier detection.
75///
76/// # Arguments
77/// * `data` - Functional data matrix (n observations x m evaluation points)
78/// * `nb` - Number of bootstrap iterations
79/// * `smo` - Smoothing parameter for bootstrap
80/// * `trim` - Trimming proportion
81/// * `seed` - Random seed
82/// * `percentile` - Percentile for threshold (e.g., 0.99 for 99th percentile)
83///
84/// # Returns
85/// Threshold at specified percentile for outlier detection
86#[must_use = "expensive computation whose result should not be discarded"]
87pub fn outliers_threshold_lrt(
88    data: &FdMatrix,
89    nb: usize,
90    smo: f64,
91    trim: f64,
92    seed: u64,
93    percentile: f64,
94) -> f64 {
95    outliers_threshold_lrt_with_dist(data, nb, smo, trim, seed, percentile).0
96}
97
98/// Compute bootstrap threshold and full null distribution for LRT outlier detection.
99///
100/// Same as [`outliers_threshold_lrt`] but also returns the sorted bootstrap
101/// distribution of max-distances, enabling per-curve p-value computation:
102/// `p = (sum(boot_dist >= d) + 1) / (B + 1)`.
103///
104/// # Arguments
105/// * `data` - Functional data matrix (n observations x m evaluation points)
106/// * `nb` - Number of bootstrap iterations
107/// * `smo` - Smoothing parameter for bootstrap
108/// * `trim` - Trimming proportion
109/// * `seed` - Random seed
110/// * `percentile` - Percentile for threshold (e.g., 0.99 for 99th percentile)
111///
112/// # Returns
113/// `(threshold, sorted_distribution)` — threshold at specified percentile and the
114/// full sorted bootstrap null distribution of max-distances (length `nb`).
115#[must_use = "expensive computation whose result should not be discarded"]
116pub fn outliers_threshold_lrt_with_dist(
117    data: &FdMatrix,
118    nb: usize,
119    smo: f64,
120    trim: f64,
121    seed: u64,
122    percentile: f64,
123) -> (f64, Vec<f64>) {
124    let n = data.nrows();
125    let m = data.ncols();
126
127    if n < 3 || m == 0 {
128        return (0.0, vec![]);
129    }
130
131    let n_keep = ((1.0 - trim) * n as f64).ceil().max(1.0) as usize;
132    let n_keep = n_keep.min(n);
133
134    // Compute column standard deviations for smoothing
135    let col_vars: Vec<f64> = iter_maybe_parallel!(0..m)
136        .map(|j| {
137            let mut sum = 0.0;
138            let mut sum_sq = 0.0;
139            for i in 0..n {
140                let val = data[(i, j)];
141                sum += val;
142                sum_sq += val * val;
143            }
144            let mean = sum / n as f64;
145            // Bootstrap variance, population formula
146            let var = sum_sq / n as f64 - mean * mean;
147            var.max(0.0).sqrt()
148        })
149        .collect();
150
151    // Run bootstrap iterations in parallel
152    let max_dists: Vec<f64> = iter_maybe_parallel!(0..nb)
153        .map(|b| {
154            let mut rng = StdRng::seed_from_u64(seed.wrapping_add(b as u64));
155
156            // Resample with replacement and add smoothing noise
157            let indices: Vec<usize> = (0..n).map(|_| rng.gen_range(0..n)).collect();
158            // Pre-generate all noise values to preserve RNG sequence
159            let noise_vals: Vec<f64> = (0..n * m)
160                .map(|_| rng.sample::<f64, _>(StandardNormal))
161                .collect();
162            let mut boot_data = FdMatrix::zeros(n, m);
163            // Column-first iteration for cache-friendly writes in column-major layout
164            for j in 0..m {
165                let smo_var = smo * col_vars[j];
166                for (new_i, &old_i) in indices.iter().enumerate() {
167                    let noise = noise_vals[new_i * m + j] * smo_var;
168                    boot_data[(new_i, j)] = data[(old_i, j)] + noise;
169                }
170            }
171
172            // Compute trimmed stats from bootstrap sample
173            let state = SortedReferenceState::from_reference(&boot_data);
174            let streaming_fm = StreamingFraimanMuniz::new(state, true);
175            let depths = streaming_fm.depth_batch(&boot_data);
176            let (trimmed_mean, trimmed_var) = compute_trimmed_stats(&boot_data, &depths, n_keep);
177
178            // Find max normalized distance across all observations
179            (0..n)
180                .map(|i| normalized_distance(&boot_data, i, &trimmed_mean, &trimmed_var))
181                .fold(0.0_f64, f64::max)
182        })
183        .collect();
184
185    // Sort and extract threshold at specified percentile
186    let mut sorted_dists = max_dists;
187    crate::helpers::sort_nan_safe(&mut sorted_dists);
188    let idx =
189        crate::utility::f64_to_usize_clamped(nb as f64 * percentile).min(nb.saturating_sub(1));
190    let threshold = sorted_dists.get(idx).copied().unwrap_or(0.0);
191    (threshold, sorted_dists)
192}
193
194/// Detect outliers using LRT method.
195///
196/// # Arguments
197/// * `data` - Functional data matrix (n observations x m evaluation points)
198/// * `threshold` - Outlier threshold
199/// * `trim` - Trimming proportion
200///
201/// # Returns
202/// Vector of booleans indicating outliers
203#[must_use = "expensive computation whose result should not be discarded"]
204/// Detect outliers in functional data using the Likelihood Ratio Test.
205///
206/// Compares each observation's normalized distance against a threshold.
207/// Observations exceeding the threshold are flagged as outliers.
208///
209/// # Arguments
210/// * `data` - Functional data matrix (n x m)
211/// * `threshold` - Decision threshold (from [`outliers_threshold_lrt`])
212/// * `trim` - Trimming proportion for robust estimation
213///
214/// # Examples
215///
216/// ```
217/// use fdars_core::matrix::FdMatrix;
218/// use fdars_core::outliers::detect_outliers_lrt;
219///
220/// let data = FdMatrix::from_column_major(
221///     (0..50).map(|i| (i as f64 * 0.1).sin()).collect(),
222///     5, 10,
223/// ).unwrap();
224/// let outliers = detect_outliers_lrt(&data, 3.0, 0.1);
225/// assert_eq!(outliers.len(), 5);
226/// ```
227pub fn detect_outliers_lrt(data: &FdMatrix, threshold: f64, trim: f64) -> Vec<bool> {
228    let n = data.nrows();
229    let m = data.ncols();
230
231    if n < 3 || m == 0 {
232        return vec![false; n];
233    }
234
235    let n_keep = ((1.0 - trim) * n as f64).ceil().max(1.0) as usize;
236    let n_keep = n_keep.min(n);
237
238    let state = SortedReferenceState::from_reference(data);
239    let streaming_fm = StreamingFraimanMuniz::new(state, true);
240    let depths = streaming_fm.depth_batch(data);
241    let (trimmed_mean, trimmed_var) = compute_trimmed_stats(data, &depths, n_keep);
242
243    iter_maybe_parallel!(0..n)
244        .map(|i| normalized_distance(data, i, &trimmed_mean, &trimmed_var) > threshold)
245        .collect()
246}
247
248/// Result of the outliergram analysis.
249#[derive(Debug, Clone, PartialEq)]
250#[non_exhaustive]
251pub struct OutligramResult {
252    /// Modified Epigraph Index for each curve
253    pub mei: Vec<f64>,
254    /// Modified Band Depth for each curve
255    pub mbd: Vec<f64>,
256    /// Parabola coefficients: MBD = a0 + a1*MEI + a2*MEI²
257    pub a0: f64,
258    pub a1: f64,
259    pub a2: f64,
260    /// Outlier threshold (IQR-based)
261    pub threshold: f64,
262    /// Outlier flags (true = outlier)
263    pub outlier_flags: Vec<bool>,
264}
265
266/// Compute outliergram with parabolic outlier boundary.
267///
268/// Combines modified band depth and modified epigraph index to detect
269/// shape outliers. Fits a parabolic upper bound `MBD = a0 + a1*MEI + a2*MEI²`
270/// and flags curves whose residuals fall below an IQR-based threshold.
271///
272/// # Arguments
273/// * `data` - Functional data matrix (n x m)
274/// * `factor` - IQR multiplier for the outlier threshold (typically 1.5)
275///
276/// # Returns
277/// Outliergram result with depths, parabola coefficients, and outlier flags.
278pub fn outliergram(data: &FdMatrix, factor: f64) -> Result<OutligramResult, FdarError> {
279    let n = data.nrows();
280    if n < 3 {
281        return Err(FdarError::InvalidDimension {
282            parameter: "data",
283            expected: "at least 3 rows".to_string(),
284            actual: format!("{n} rows"),
285        });
286    }
287
288    let mei = modified_epigraph_index_1d(data, data);
289    let mbd = modified_band_1d(data, data);
290
291    // Fit parabola: MBD = a0 + a1*MEI + a2*MEI²
292    // Using normal equations: X = [1, mei, mei²], y = mbd
293    let mut xtx = [[0.0; 3]; 3];
294    let mut xty = [0.0; 3];
295    for i in 0..n {
296        let x = [1.0, mei[i], mei[i] * mei[i]];
297        for r in 0..3 {
298            for c in 0..3 {
299                xtx[r][c] += x[r] * x[c];
300            }
301            xty[r] += x[r] * mbd[i];
302        }
303    }
304
305    // Solve 3x3 system via Cramer's rule
306    let (a0, a1, a2) = solve_3x3(xtx, xty);
307
308    // Compute residuals below the parabola
309    let residuals: Vec<f64> = (0..n)
310        .map(|i| mbd[i] - (a0 + a1 * mei[i] + a2 * mei[i] * mei[i]))
311        .collect();
312
313    // IQR-based threshold on residuals
314    let mut sorted_resid = residuals.clone();
315    sorted_resid.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
316    let q1 = sorted_resid[n / 4];
317    let q3 = sorted_resid[3 * n / 4];
318    let iqr = q3 - q1;
319    let threshold = q1 - factor * iqr;
320
321    let outlier_flags: Vec<bool> = residuals.iter().map(|&r| r < threshold).collect();
322
323    Ok(OutligramResult {
324        mei,
325        mbd,
326        a0,
327        a1,
328        a2,
329        threshold,
330        outlier_flags,
331    })
332}
333
334/// Result of magnitude-shape outlyingness decomposition.
335#[derive(Debug, Clone, PartialEq)]
336#[non_exhaustive]
337pub struct MagnitudeShapeResult {
338    /// Magnitude outlyingness: 1 - MBD (how far from the center)
339    pub magnitude: Vec<f64>,
340    /// Shape outlyingness: L2 distance of normalized curve direction from mean direction
341    pub shape: Vec<f64>,
342}
343
344/// Decompose outlyingness into magnitude and shape components.
345///
346/// Magnitude measures how far a curve is from the center (1 - modified band depth).
347/// Shape measures how different the curve's direction is from the mean direction
348/// (L2 distance of normalized centered curves).
349///
350/// # Arguments
351/// * `data` - Functional data matrix (n x m)
352pub fn magnitude_shape_outlyingness(data: &FdMatrix) -> Result<MagnitudeShapeResult, FdarError> {
353    let (n, m) = data.shape();
354    if n < 2 || m == 0 {
355        return Err(FdarError::InvalidDimension {
356            parameter: "data",
357            expected: "at least 2 rows and 1 column".to_string(),
358            actual: format!("{n} rows, {m} columns"),
359        });
360    }
361
362    // Magnitude: 1 - modified band depth
363    let mbd = modified_band_1d(data, data);
364    let magnitude: Vec<f64> = mbd.iter().map(|&d| 1.0 - d).collect();
365
366    // Shape: compute centered curves, normalize directions, compare to mean direction
367    // Step 1: compute column means
368    let mut col_means = vec![0.0; m];
369    for j in 0..m {
370        for i in 0..n {
371            col_means[j] += data[(i, j)];
372        }
373        col_means[j] /= n as f64;
374    }
375
376    // Step 2: center and normalize each curve (direction)
377    let mut directions = vec![vec![0.0; m]; n];
378    for i in 0..n {
379        let mut norm_sq = 0.0;
380        for j in 0..m {
381            let c = data[(i, j)] - col_means[j];
382            directions[i][j] = c;
383            norm_sq += c * c;
384        }
385        let norm = norm_sq.sqrt().max(1e-15);
386        for j in 0..m {
387            directions[i][j] /= norm;
388        }
389    }
390
391    // Step 3: mean direction
392    let mut mean_dir = vec![0.0; m];
393    for i in 0..n {
394        for j in 0..m {
395            mean_dir[j] += directions[i][j];
396        }
397    }
398    let mut mean_norm_sq = 0.0;
399    for j in 0..m {
400        mean_dir[j] /= n as f64;
401        mean_norm_sq += mean_dir[j] * mean_dir[j];
402    }
403    let mean_norm = mean_norm_sq.sqrt().max(1e-15);
404    for j in 0..m {
405        mean_dir[j] /= mean_norm;
406    }
407
408    // Step 4: shape = L2 distance from mean direction
409    let shape: Vec<f64> = (0..n)
410        .map(|i| {
411            let dist_sq: f64 = (0..m)
412                .map(|j| {
413                    let d = directions[i][j] - mean_dir[j];
414                    d * d
415                })
416                .sum();
417            dist_sq.sqrt()
418        })
419        .collect();
420
421    Ok(MagnitudeShapeResult { magnitude, shape })
422}
423
424/// Solve a 3x3 linear system via Cramer's rule.
425fn solve_3x3(a: [[f64; 3]; 3], b: [f64; 3]) -> (f64, f64, f64) {
426    let det = a[0][0] * (a[1][1] * a[2][2] - a[1][2] * a[2][1])
427        - a[0][1] * (a[1][0] * a[2][2] - a[1][2] * a[2][0])
428        + a[0][2] * (a[1][0] * a[2][1] - a[1][1] * a[2][0]);
429    if det.abs() < 1e-15 {
430        return (0.0, 0.0, 0.0);
431    }
432
433    let det_x = b[0] * (a[1][1] * a[2][2] - a[1][2] * a[2][1])
434        - a[0][1] * (b[1] * a[2][2] - a[1][2] * b[2])
435        + a[0][2] * (b[1] * a[2][1] - a[1][1] * b[2]);
436
437    let det_y = a[0][0] * (b[1] * a[2][2] - a[1][2] * b[2])
438        - b[0] * (a[1][0] * a[2][2] - a[1][2] * a[2][0])
439        + a[0][2] * (a[1][0] * b[2] - b[1] * a[2][0]);
440
441    let det_z = a[0][0] * (a[1][1] * b[2] - b[1] * a[2][1])
442        - a[0][1] * (a[1][0] * b[2] - b[1] * a[2][0])
443        + b[0] * (a[1][0] * a[2][1] - a[1][1] * a[2][0]);
444
445    (det_x / det, det_y / det, det_z / det)
446}
447
448#[cfg(test)]
449mod tests {
450    use super::*;
451    use std::f64::consts::PI;
452
453    /// Generate homogeneous functional data
454    fn generate_normal_fdata(n: usize, m: usize, seed: u64) -> FdMatrix {
455        let mut rng = StdRng::seed_from_u64(seed);
456        let t: Vec<f64> = (0..m).map(|j| j as f64 / (m - 1) as f64).collect();
457
458        let mut data = FdMatrix::zeros(n, m);
459        for i in 0..n {
460            let phase: f64 = rng.gen::<f64>() * 0.2;
461            let amp: f64 = 1.0 + rng.gen::<f64>() * 0.1;
462            for j in 0..m {
463                let noise: f64 = rng.sample::<f64, _>(StandardNormal) * 0.05;
464                data[(i, j)] = amp * (2.0 * PI * t[j] + phase).sin() + noise;
465            }
466        }
467        data
468    }
469
470    /// Generate data with obvious outliers
471    fn generate_data_with_outlier(n: usize, m: usize, n_outliers: usize) -> FdMatrix {
472        let t: Vec<f64> = (0..m).map(|j| j as f64 / (m - 1) as f64).collect();
473
474        let mut data = FdMatrix::zeros(n, m);
475
476        // Normal curves
477        for i in 0..(n - n_outliers) {
478            for j in 0..m {
479                data[(i, j)] = (2.0 * PI * t[j]).sin();
480            }
481        }
482
483        // Outlier curves (shifted up by 10)
484        for i in (n - n_outliers)..n {
485            for j in 0..m {
486                data[(i, j)] = (2.0 * PI * t[j]).sin() + 10.0;
487            }
488        }
489
490        data
491    }
492
493    // ============== Threshold tests ==============
494
495    #[test]
496    fn test_outliers_threshold_lrt_returns_positive() {
497        let n = 20;
498        let m = 30;
499        let data = generate_normal_fdata(n, m, 42);
500
501        let threshold = outliers_threshold_lrt(&data, 50, 0.1, 0.1, 42, 0.95);
502
503        assert!(threshold > 0.0, "Threshold should be positive");
504    }
505
506    #[test]
507    fn test_outliers_threshold_lrt_deterministic() {
508        let n = 15;
509        let m = 25;
510        let data = generate_normal_fdata(n, m, 42);
511
512        let t1 = outliers_threshold_lrt(&data, 30, 0.1, 0.1, 123, 0.95);
513        let t2 = outliers_threshold_lrt(&data, 30, 0.1, 0.1, 123, 0.95);
514
515        assert!(
516            (t1 - t2).abs() < 1e-10,
517            "Same seed should give same threshold"
518        );
519    }
520
521    #[test]
522    fn test_outliers_threshold_lrt_percentile_effect() {
523        let n = 20;
524        let m = 30;
525        let data = generate_normal_fdata(n, m, 42);
526
527        let t_low = outliers_threshold_lrt(&data, 50, 0.1, 0.1, 42, 0.50);
528        let t_high = outliers_threshold_lrt(&data, 50, 0.1, 0.1, 42, 0.99);
529
530        assert!(
531            t_high >= t_low,
532            "Higher percentile should give higher or equal threshold"
533        );
534    }
535
536    #[test]
537    fn test_outliers_threshold_lrt_invalid_input() {
538        // Too few observations
539        let data = FdMatrix::zeros(2, 30);
540        let threshold = outliers_threshold_lrt(&data, 50, 0.1, 0.1, 42, 0.95);
541        assert!(threshold.abs() < 1e-10, "Should return 0 for n < 3");
542
543        // Empty m
544        let data = FdMatrix::zeros(10, 0);
545        let threshold = outliers_threshold_lrt(&data, 50, 0.1, 0.1, 42, 0.95);
546        assert!(threshold.abs() < 1e-10);
547    }
548
549    // ============== Detection tests ==============
550
551    #[test]
552    fn test_detect_outliers_lrt_finds_obvious_outlier() {
553        let n = 20;
554        let m = 30;
555        let data = generate_data_with_outlier(n, m, 1);
556
557        // Use a reasonable threshold
558        let outliers = detect_outliers_lrt(&data, 3.0, 0.1);
559
560        assert_eq!(outliers.len(), n);
561
562        // The last curve (outlier) should be detected
563        assert!(outliers[n - 1], "Obvious outlier should be detected");
564
565        // Most normal curves should not be outliers
566        let n_detected: usize = outliers.iter().filter(|&&x| x).count();
567        assert!(n_detected <= 3, "Should not detect too many outliers");
568    }
569
570    #[test]
571    fn test_detect_outliers_lrt_homogeneous_data() {
572        let n = 20;
573        let m = 30;
574        let data = generate_normal_fdata(n, m, 42);
575
576        // With very high threshold, no outliers
577        let outliers = detect_outliers_lrt(&data, 100.0, 0.1);
578
579        let n_detected: usize = outliers.iter().filter(|&&x| x).count();
580        assert_eq!(
581            n_detected, 0,
582            "Very high threshold should detect no outliers"
583        );
584    }
585
586    #[test]
587    fn test_detect_outliers_lrt_threshold_effect() {
588        let n = 20;
589        let m = 30;
590        let data = generate_data_with_outlier(n, m, 3);
591
592        let low_thresh = detect_outliers_lrt(&data, 2.0, 0.1);
593        let high_thresh = detect_outliers_lrt(&data, 10.0, 0.1);
594
595        let n_low: usize = low_thresh.iter().filter(|&&x| x).count();
596        let n_high: usize = high_thresh.iter().filter(|&&x| x).count();
597
598        assert!(
599            n_low >= n_high,
600            "Lower threshold should detect more or equal outliers"
601        );
602    }
603
604    #[test]
605    fn test_detect_outliers_lrt_invalid_input() {
606        // Too few observations
607        let data = FdMatrix::zeros(2, 30);
608        let outliers = detect_outliers_lrt(&data, 3.0, 0.1);
609        assert_eq!(outliers.len(), 2);
610        assert!(
611            outliers.iter().all(|&x| !x),
612            "Should return all false for n < 3"
613        );
614    }
615
616    #[test]
617    fn test_identical_data_outliers() {
618        let n = 10;
619        let m = 20;
620        let data = FdMatrix::from_column_major(vec![1.0; n * m], n, m).unwrap();
621        let flags = detect_outliers_lrt(&data, 1.0, 0.15);
622        assert_eq!(flags.len(), n);
623        // All identical → no outliers
624        for &f in &flags {
625            assert!(!f);
626        }
627    }
628
629    #[test]
630    fn test_n3_minimal_outliers() {
631        // Minimum viable: 3 curves
632        let n = 3;
633        let m = 10;
634        let mut data_vec = vec![0.0; n * m];
635        // Third curve is an outlier
636        for j in 0..m {
637            data_vec[j * n] = 0.0;
638            data_vec[1 + j * n] = 0.1;
639            data_vec[2 + j * n] = 100.0;
640        }
641        let data = FdMatrix::from_column_major(data_vec, n, m).unwrap();
642        let flags = detect_outliers_lrt(&data, 0.5, 0.15);
643        assert_eq!(flags.len(), n);
644    }
645
646    // ============== With-distribution tests ==============
647
648    #[test]
649    fn test_with_dist_returns_sorted_distribution() {
650        let data = generate_normal_fdata(20, 30, 42);
651        let nb = 50;
652        let (threshold, dist) = outliers_threshold_lrt_with_dist(&data, nb, 0.1, 0.1, 42, 0.95);
653
654        assert_eq!(dist.len(), nb, "Distribution length should equal nb");
655        for w in dist.windows(2) {
656            assert!(w[0] <= w[1], "Distribution should be sorted");
657        }
658        let idx = ((nb as f64 * 0.95) as usize).min(nb - 1);
659        assert!(
660            (threshold - dist[idx]).abs() < 1e-10,
661            "Threshold should match distribution at percentile index"
662        );
663    }
664
665    #[test]
666    fn test_with_dist_matches_scalar() {
667        let data = generate_normal_fdata(15, 25, 99);
668        let scalar = outliers_threshold_lrt(&data, 40, 0.1, 0.1, 123, 0.95);
669        let (with_dist, _) = outliers_threshold_lrt_with_dist(&data, 40, 0.1, 0.1, 123, 0.95);
670        assert!(
671            (scalar - with_dist).abs() < 1e-10,
672            "Scalar version should match with_dist version"
673        );
674    }
675
676    #[test]
677    fn test_bootstrap_dist_enables_pvalue() {
678        let n = 20;
679        let m = 30;
680        let data = generate_data_with_outlier(n, m, 1);
681        let trim = 0.1;
682
683        let (_, dist) = outliers_threshold_lrt_with_dist(&data, 200, 0.1, trim, 42, 0.99);
684        let nb = dist.len();
685
686        // Compute per-curve distances
687        let n_keep = ((1.0 - trim) * n as f64).ceil() as usize;
688        let state = SortedReferenceState::from_reference(&data);
689        let streaming_fm = StreamingFraimanMuniz::new(state, true);
690        let depths = streaming_fm.depth_batch(&data);
691        let (tmean, tvar) = compute_trimmed_stats(&data, &depths, n_keep);
692
693        // p-value for the outlier curve (last one)
694        let d_outlier = normalized_distance(&data, n - 1, &tmean, &tvar);
695        let p_outlier =
696            (dist.iter().filter(|&&v| v >= d_outlier).count() as f64 + 1.0) / (nb as f64 + 1.0);
697
698        // p-value for a normal curve (first one)
699        let d_normal = normalized_distance(&data, 0, &tmean, &tvar);
700        let p_normal =
701            (dist.iter().filter(|&&v| v >= d_normal).count() as f64 + 1.0) / (nb as f64 + 1.0);
702
703        assert!(
704            p_outlier < 0.05,
705            "Outlier should have small p-value, got {p_outlier}"
706        );
707        assert!(
708            p_normal > 0.05,
709            "Normal curve should have large p-value, got {p_normal}"
710        );
711    }
712
713    #[test]
714    fn test_with_dist_invalid_input() {
715        let data = FdMatrix::zeros(2, 30);
716        let (threshold, dist) = outliers_threshold_lrt_with_dist(&data, 50, 0.1, 0.1, 42, 0.95);
717        assert!(threshold.abs() < 1e-10);
718        assert!(dist.is_empty(), "Should return empty dist for n < 3");
719    }
720
721    #[test]
722    fn test_all_false_high_threshold() {
723        let n = 10;
724        let m = 20;
725        let data_vec: Vec<f64> = (0..n * m).map(|i| (i as f64 * 0.1).sin()).collect();
726        let data = FdMatrix::from_column_major(data_vec, n, m).unwrap();
727        // Very high threshold → no outliers
728        let flags = detect_outliers_lrt(&data, 1e10, 0.15);
729        for &f in &flags {
730            assert!(!f, "High threshold should produce no outliers");
731        }
732    }
733
734    // ============== n_keep clamping tests ==============
735
736    #[test]
737    fn test_trim_zero_no_trimming() {
738        // trim=0 → n_keep=n, exercises the skip-partial-sort branch in compute_trimmed_stats
739        let data = generate_normal_fdata(10, 20, 42);
740        let threshold = outliers_threshold_lrt(&data, 30, 0.1, 0.0, 42, 0.95);
741        assert!(threshold > 0.0);
742        let flags = detect_outliers_lrt(&data, threshold, 0.0);
743        assert_eq!(flags.len(), 10);
744    }
745
746    #[test]
747    fn test_trim_near_one_heavy_trimming() {
748        // trim=0.9 → n_keep=1, exercises minimal trim set (single deepest curve)
749        let data = generate_normal_fdata(10, 20, 42);
750        let threshold = outliers_threshold_lrt(&data, 30, 0.1, 0.9, 42, 0.95);
751        assert!(threshold >= 0.0);
752        let flags = detect_outliers_lrt(&data, threshold, 0.9);
753        assert_eq!(flags.len(), 10);
754    }
755
756    #[test]
757    fn test_trim_one_clamps_to_one() {
758        // trim=1.0 → n_keep would be 0, must clamp to 1 (was a panic before fix)
759        let data = generate_normal_fdata(10, 20, 42);
760        let threshold = outliers_threshold_lrt(&data, 30, 0.1, 1.0, 42, 0.95);
761        assert!(threshold >= 0.0);
762        let flags = detect_outliers_lrt(&data, threshold, 1.0);
763        assert_eq!(flags.len(), 10);
764    }
765
766    #[test]
767    fn test_trim_negative_clamps_to_n() {
768        // trim=-0.5 → n_keep would exceed n, must clamp to n (was a panic before fix)
769        let data = generate_normal_fdata(10, 20, 42);
770        let threshold = outliers_threshold_lrt(&data, 30, 0.1, -0.5, 42, 0.95);
771        assert!(threshold > 0.0);
772        let flags = detect_outliers_lrt(&data, threshold, -0.5);
773        assert_eq!(flags.len(), 10);
774    }
775
776    // ============== Bootstrap parameter edge cases ==============
777
778    #[test]
779    fn test_smo_zero_no_noise() {
780        // smo=0 → bootstrap resamples without smoothing noise
781        let data = generate_normal_fdata(10, 20, 42);
782        let (threshold, dist) = outliers_threshold_lrt_with_dist(&data, 30, 0.0, 0.1, 42, 0.95);
783        assert!(threshold > 0.0);
784        assert_eq!(dist.len(), 30);
785    }
786
787    #[test]
788    fn test_nb_zero_empty_bootstrap() {
789        let data = generate_normal_fdata(10, 20, 42);
790        let (threshold, dist) = outliers_threshold_lrt_with_dist(&data, 0, 0.1, 0.1, 42, 0.95);
791        assert!(threshold.abs() < 1e-10);
792        assert!(dist.is_empty());
793    }
794
795    #[test]
796    fn test_nb_one_single_bootstrap() {
797        let data = generate_normal_fdata(10, 20, 42);
798        let (threshold, dist) = outliers_threshold_lrt_with_dist(&data, 1, 0.1, 0.1, 42, 0.95);
799        assert_eq!(dist.len(), 1);
800        // With 1 iteration, threshold must equal the single value
801        assert!((threshold - dist[0]).abs() < 1e-10);
802    }
803
804    #[test]
805    fn test_percentile_zero_returns_minimum() {
806        let data = generate_normal_fdata(15, 20, 42);
807        let nb = 50;
808        let (_, dist) = outliers_threshold_lrt_with_dist(&data, nb, 0.1, 0.1, 42, 0.95);
809        let t_zero = outliers_threshold_lrt(&data, nb, 0.1, 0.1, 42, 0.0);
810        assert!(
811            (t_zero - dist[0]).abs() < 1e-10,
812            "percentile=0 should return the minimum of the distribution"
813        );
814    }
815
816    #[test]
817    fn test_percentile_one_returns_maximum() {
818        let data = generate_normal_fdata(15, 20, 42);
819        let nb = 50;
820        let (_, dist) = outliers_threshold_lrt_with_dist(&data, nb, 0.1, 0.1, 42, 0.95);
821        let t_one = outliers_threshold_lrt(&data, nb, 0.1, 0.1, 42, 1.0);
822        assert!(
823            (t_one - *dist.last().unwrap()).abs() < 1e-10,
824            "percentile=1 should return the maximum of the distribution"
825        );
826    }
827
828    // ============== Distribution invariant tests ==============
829
830    #[test]
831    fn test_distribution_values_non_negative() {
832        let data = generate_normal_fdata(15, 20, 42);
833        let (_, dist) = outliers_threshold_lrt_with_dist(&data, 50, 0.1, 0.1, 42, 0.95);
834        for &v in &dist {
835            assert!(v >= 0.0, "Max-distances must be non-negative, got {v}");
836        }
837    }
838
839    // ============== detect_outliers_lrt edge cases ==============
840
841    #[test]
842    fn test_detect_m_zero_returns_all_false() {
843        let data = FdMatrix::zeros(10, 0);
844        let flags = detect_outliers_lrt(&data, 3.0, 0.1);
845        assert_eq!(flags.len(), 10);
846        assert!(flags.iter().all(|&f| !f));
847    }
848
849    #[test]
850    fn test_detect_multiple_outliers() {
851        let data = generate_data_with_outlier(20, 30, 3);
852        let flags = detect_outliers_lrt(&data, 3.0, 0.1);
853        // All three outlier curves (indices 17, 18, 19) should be detected
854        let outlier_count = flags[17..20].iter().filter(|&&x| x).count();
855        assert!(
856            outlier_count >= 2,
857            "At least 2 of 3 outliers should be detected, got {outlier_count}"
858        );
859    }
860
861    // ============== End-to-end integration ==============
862
863    #[test]
864    fn test_end_to_end_threshold_then_detect() {
865        let data = generate_data_with_outlier(20, 30, 2);
866        let threshold = outliers_threshold_lrt(&data, 100, 0.1, 0.1, 42, 0.99);
867        let flags = detect_outliers_lrt(&data, threshold, 0.1);
868
869        // Outlier curves (last 2) should be flagged
870        assert!(
871            flags[18] || flags[19],
872            "At least one outlier should be detected in end-to-end flow"
873        );
874        // Normal curves should mostly not be flagged
875        let false_positives = flags[..18].iter().filter(|&&x| x).count();
876        assert!(
877            false_positives <= 2,
878            "False positive count should be low, got {false_positives}"
879        );
880    }
881
882    #[test]
883    fn test_end_to_end_with_dist_pvalues_all_curves() {
884        // Full pipeline: bootstrap dist → per-curve p-values → outlier classification
885        let n = 25;
886        let m = 30;
887        let data = generate_data_with_outlier(n, m, 2);
888        let trim = 0.1;
889
890        let (_, dist) = outliers_threshold_lrt_with_dist(&data, 200, 0.1, trim, 42, 0.99);
891        let nb = dist.len();
892
893        let n_keep = ((1.0 - trim) * n as f64).ceil().max(1.0) as usize;
894        let n_keep = n_keep.min(n);
895        let state = SortedReferenceState::from_reference(&data);
896        let streaming_fm = StreamingFraimanMuniz::new(state, true);
897        let depths = streaming_fm.depth_batch(&data);
898        let (tmean, tvar) = compute_trimmed_stats(&data, &depths, n_keep);
899
900        // Compute p-values for all curves
901        let pvalues: Vec<f64> = (0..n)
902            .map(|i| {
903                let d = normalized_distance(&data, i, &tmean, &tvar);
904                (dist.iter().filter(|&&v| v >= d).count() as f64 + 1.0) / (nb as f64 + 1.0)
905            })
906            .collect();
907
908        // Normal curves (0..23) should have large p-values
909        let normal_small_p = pvalues[..23].iter().filter(|&&p| p < 0.01).count();
910        assert_eq!(
911            normal_small_p, 0,
912            "Normal curves should not have tiny p-values"
913        );
914
915        // Outlier curves (23, 24) should have small p-values
916        for &i in &[23, 24] {
917            assert!(
918                pvalues[i] < 0.05,
919                "Outlier curve {i} should have small p-value, got {}",
920                pvalues[i]
921            );
922        }
923    }
924
925    // ============== Outliergram tests ==============
926
927    fn outliergram_test_data() -> FdMatrix {
928        // 20 curves on 30 grid points, with 2 outliers
929        let n = 20;
930        let m = 30;
931        let t: Vec<f64> = (0..m).map(|j| j as f64 / (m - 1) as f64).collect();
932        let mut vals = vec![0.0; n * m];
933        for i in 0..n {
934            for (j, &tj) in t.iter().enumerate() {
935                let base = tj.sin();
936                vals[i + j * n] = if i < 18 {
937                    base + 0.1 * (i as f64 * 0.5).sin()
938                } else {
939                    // Outliers: large deviation
940                    base + 2.0 * (if i == 18 { 1.0 } else { -1.0 })
941                };
942            }
943        }
944        FdMatrix::from_column_major(vals, n, m).unwrap()
945    }
946
947    #[test]
948    fn outliergram_runs() {
949        let data = outliergram_test_data();
950        let result = outliergram(&data, 1.5).unwrap();
951        assert_eq!(result.mei.len(), 20);
952        assert_eq!(result.mbd.len(), 20);
953        assert_eq!(result.outlier_flags.len(), 20);
954        // The two outlier curves should have lower MBD
955        let central_mbd: f64 = result.mbd[..18].iter().sum::<f64>() / 18.0;
956        assert!(result.mbd[18] < central_mbd || result.mbd[19] < central_mbd);
957    }
958
959    #[test]
960    fn outliergram_parabola_coefficients() {
961        let data = outliergram_test_data();
962        let result = outliergram(&data, 1.5).unwrap();
963        // Parabola should have a2 <= 0 (concave) for the theoretical relationship
964        // (this is typical but not strictly required)
965        assert!(result.a0.is_finite());
966        assert!(result.a1.is_finite());
967        assert!(result.a2.is_finite());
968    }
969
970    #[test]
971    fn magnitude_shape_dimensions() {
972        let data = outliergram_test_data();
973        let result = magnitude_shape_outlyingness(&data).unwrap();
974        assert_eq!(result.magnitude.len(), 20);
975        assert_eq!(result.shape.len(), 20);
976        // All values should be non-negative
977        assert!(result.magnitude.iter().all(|&v| v >= 0.0));
978        assert!(result.shape.iter().all(|&v| v >= 0.0));
979    }
980
981    #[test]
982    fn magnitude_outliers_have_high_magnitude() {
983        let data = outliergram_test_data();
984        let result = magnitude_shape_outlyingness(&data).unwrap();
985        let central_mag: f64 = result.magnitude[..18].iter().sum::<f64>() / 18.0;
986        // At least one outlier should have higher magnitude
987        assert!(result.magnitude[18] > central_mag || result.magnitude[19] > central_mag);
988    }
989
990    #[test]
991    fn outliergram_too_few_curves() {
992        let data = FdMatrix::from_column_major(vec![1.0, 2.0, 3.0, 4.0], 2, 2).unwrap();
993        assert!(outliergram(&data, 1.5).is_err());
994    }
995}
fdars_core/outliers.rs

fdars_core/
outliers.rs