// fdars_core/spm/ncomp.rs
//! Automatic selection of the number of principal components (ncomp).
//!
//! Provides methods for choosing how many FPC scores to retain in
//! FPCA-based monitoring: cumulative variance, elbow detection, or
//! a fixed count.
//!
//! For functional linear regression, the optimal ncomp grows as
//! O(n^{1/5}) (Hall & Horowitz, 2007, Theorem 1, p. 73). In practice,
//! retaining 95% cumulative variance (Jackson, 1991, Chapter 4) is a
//! widely used default that adapts to the eigenvalue decay rate.
//!
//! # References
//!
//! - Cattell, R.B. (1966). The scree test for the number of factors.
//!   *Multivariate Behavioral Research*, 1(2), 245–276. p. 252.
//! - Hall, P. & Horowitz, J.L. (2007). Methodology and convergence rates
//!   for functional linear regression. *Annals of Statistics*, 35(1), 70–91.
//!   Theorem 1, p. 73.
//! - Jackson, J.E. (1991). *A User's Guide to Principal Components*. Wiley,
//!   Chapter 4.
//! - Kaiser, H.F. (1960). The application of electronic computers to factor
//!   analysis. *Educational and Psychological Measurement*, 20, 141–151.
//!   pp. 145–146.

use crate::error::FdarError;

/// Strategy for choosing how many principal components to keep.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub enum NcompMethod {
    /// Keep the smallest k whose cumulative explained variance reaches the
    /// given threshold.
    ///
    /// A threshold of 0.95 (95% of total variance) is a widely used default
    /// (Jackson, 1991). For monitoring, 0.90 may suffice: variance excluded
    /// from T-squared is still monitored through SPE.
    CumulativeVariance(f64),
    /// Locate the scree-plot elbow as the maximum second finite difference
    /// (Cattell, 1966, p. 252).
    ///
    /// Elbow detection is sensitive to noise in the eigenvalue spectrum:
    /// second finite differences amplify noise by roughly 3x (condition
    /// number of the differencing operator). A 3-point moving-average
    /// pre-smoothing brings the effective amplification down to about 1.5x.
    /// With fewer than 3 eigenvalues this falls back to
    /// `CumulativeVariance(0.95)`.
    Elbow,
    /// Always keep a fixed number of components.
    Fixed(usize),
    /// Keep eigenvalues strictly above their mean (Kaiser criterion variant;
    /// Kaiser, 1960, pp. 145–146).
    ///
    /// Equivalent to keeping components with above-average explanatory power.
    /// Tends to over-extract when variables are many and under-extract when
    /// few.
    Kaiser,
}

58/// Select the number of principal components from an eigenvalue spectrum.
59///
60/// # Arguments
61/// * `eigenvalues` - Eigenvalues in decreasing order (e.g. from FPCA)
62/// * `method` - Selection method
63///
64/// # Example
65///
66/// ```
67/// use fdars_core::spm::ncomp::{select_ncomp, NcompMethod};
68/// let eigenvalues = vec![10.0, 5.0, 1.0, 0.1, 0.01];
69/// let k = select_ncomp(&eigenvalues, &NcompMethod::CumulativeVariance(0.95)).unwrap();
70/// assert!(k >= 2 && k <= 4);
71/// let k_fixed = select_ncomp(&eigenvalues, &NcompMethod::Fixed(3)).unwrap();
72/// assert_eq!(k_fixed, 3);
73/// ```
74///
75/// # Errors
76///
77/// Returns [`FdarError::InvalidDimension`] if `eigenvalues` is empty.
78/// Returns [`FdarError::InvalidParameter`] if any eigenvalue is NaN or
79/// negative, or if `CumulativeVariance` threshold is not in (0, 1].
80#[must_use = "selected ncomp should not be discarded"]
81pub fn select_ncomp(eigenvalues: &[f64], method: &NcompMethod) -> Result<usize, FdarError> {
82    if eigenvalues.is_empty() {
83        return Err(FdarError::InvalidDimension {
84            parameter: "eigenvalues",
85            expected: "at least 1 eigenvalue".to_string(),
86            actual: "0 eigenvalues".to_string(),
87        });
88    }
89
90    // Validate eigenvalues: NaN or negative values indicate upstream issues
91    // (e.g., numerical instability in FPCA or corrupted data).
92    for (i, &ev) in eigenvalues.iter().enumerate() {
93        if ev.is_nan() {
94            return Err(FdarError::InvalidParameter {
95                parameter: "eigenvalues",
96                message: format!("eigenvalue[{i}] is NaN"),
97            });
98        }
99        if ev < 0.0 {
100            return Err(FdarError::InvalidParameter {
101                parameter: "eigenvalues",
102                message: format!("eigenvalue[{i}] = {ev} is negative"),
103            });
104        }
105    }
106
107    match method {
108        NcompMethod::CumulativeVariance(threshold) => {
109            if *threshold <= 0.0 || *threshold > 1.0 {
110                return Err(FdarError::InvalidParameter {
111                    parameter: "threshold",
112                    message: format!("threshold must be in (0, 1], got {threshold}"),
113                });
114            }
115            cumulative_variance_ncomp(eigenvalues, *threshold)
116        }
117        NcompMethod::Elbow => {
118            if eigenvalues.len() < 3 {
119                // Fallback to CV(0.95) when too few values for elbow
120                return cumulative_variance_ncomp(eigenvalues, 0.95);
121            }
122            Ok(elbow_ncomp(eigenvalues))
123        }
124        NcompMethod::Fixed(k) => Ok((*k).max(1).min(eigenvalues.len())),
125        NcompMethod::Kaiser => {
126            if eigenvalues.len() < 2 {
127                return Ok(1); // With 1 eigenvalue, always keep it
128            }
129            Ok(kaiser_ncomp(eigenvalues))
130        }
131    }
132}
133
134/// Cumulative variance method: first k where cumsum/total >= threshold.
135fn cumulative_variance_ncomp(eigenvalues: &[f64], threshold: f64) -> Result<usize, FdarError> {
136    let total: f64 = eigenvalues.iter().sum();
137    if total <= 0.0 {
138        return Err(FdarError::ComputationFailed {
139            operation: "select_ncomp",
140            detail: "total variance is non-positive".to_string(),
141        });
142    }
143
144    let mut cumsum = 0.0;
145    for (k, &ev) in eigenvalues.iter().enumerate() {
146        cumsum += ev;
147        if cumsum / total >= threshold {
148            return Ok(k + 1);
149        }
150    }
151    // If threshold is exactly 1.0, we need all components
152    Ok(eigenvalues.len())
153}
154
/// Kaiser criterion: retain components with eigenvalue > mean(eigenvalues).
///
/// Equivalent to retaining components with above-average explanatory power
/// (Kaiser, 1960, pp. 145–146). Tends to over-extract when the number of
/// variables is large and under-extract when small.
fn kaiser_ncomp(eigenvalues: &[f64]) -> usize {
    let count = eigenvalues.len() as f64;
    let average = eigenvalues.iter().sum::<f64>() / count;
    let above_average = eigenvalues.iter().filter(|&&ev| ev > average).count();
    // A flat spectrum leaves nothing strictly above the mean; keep one anyway.
    if above_average == 0 { 1 } else { above_average }
}

/// Elbow method: argmax of second finite difference d2[k] = λ[k-1] - 2λ[k] + λ[k+1].
///
/// Applies a 3-point moving average to reduce noise sensitivity before
/// computing the second differences (Cattell, 1966, p. 252). Second finite
/// differences amplify noise by a factor of ~3 (condition number of the
/// differencing operator); the 3-point moving average pre-smoothing reduces
/// the effective amplification to ~1.5.
fn elbow_ncomp(eigenvalues: &[f64]) -> usize {
    let n = eigenvalues.len();

    // Pre-smooth with a 3-point moving average; the first and last entries
    // have only one neighbour, so they use a 2-point average instead.
    let mut smoothed = Vec::with_capacity(n);
    smoothed.push((eigenvalues[0] + eigenvalues[1]) / 2.0);
    for window in eigenvalues.windows(3) {
        smoothed.push((window[0] + window[1] + window[2]) / 3.0);
    }
    smoothed.push((eigenvalues[n - 2] + eigenvalues[n - 1]) / 2.0);

    // Argmax of the second difference over interior indices k = 1..=n-2,
    // keeping the earliest maximum (strict > comparison preserves first-wins
    // tie-breaking).
    let (elbow, _) = smoothed
        .windows(3)
        .enumerate()
        .map(|(i, w)| (i + 1, w[0] - 2.0 * w[1] + w[2]))
        .fold((1_usize, f64::NEG_INFINITY), |best, candidate| {
            if candidate.1 > best.1 { candidate } else { best }
        });

    // Retain everything up to and including the elbow component.
    (elbow + 1).min(n)
}
200}