// fdars_core/spm/ncomp.rs
1//! Automatic selection of the number of principal components (ncomp).
2//!
3//! Provides methods for choosing how many FPC scores to retain in
4//! FPCA-based monitoring: cumulative variance, elbow detection, or
5//! a fixed count.
6//!
7//! For functional linear regression, the optimal ncomp grows as
8//! O(n^{1/5}) (Hall & Horowitz, 2007, Theorem 1, p. 73). In practice,
9//! retaining 95% cumulative variance (Jackson, 1991, Chapter 4) is a
10//! widely used default that adapts to the eigenvalue decay rate.
11//!
12//! # References
13//!
14//! - Cattell, R.B. (1966). The scree test for the number of factors.
15//! *Multivariate Behavioral Research*, 1(2), 245–276. p. 252.
16//! - Hall, P. & Horowitz, J.L. (2007). Methodology and convergence rates
17//! for functional linear regression. *Annals of Statistics*, 35(1), 70–91.
18//! Theorem 1, p. 73.
19//! - Jackson, J.E. (1991). *A User's Guide to Principal Components*. Wiley,
20//! Chapter 4.
21//! - Kaiser, H.F. (1960). The application of electronic computers to factor
22//! analysis. *Educational and Psychological Measurement*, 20, 141–151.
23//! pp. 145–146.
24
25use crate::error::FdarError;
26
/// Method for selecting the number of principal components.
///
/// Consumed by [`select_ncomp`], which validates the eigenvalue spectrum
/// before dispatching to the chosen strategy.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub enum NcompMethod {
    /// Retain components until cumulative variance >= threshold.
    ///
    /// Threshold 0.95 retains 95% of variance, which is a widely-used default
    /// (Jackson, 1991). For monitoring applications, 0.90 may suffice as the
    /// remaining variance contributes to SPE rather than T-squared.
    ///
    /// The threshold must lie in (0, 1]; [`select_ncomp`] rejects any other
    /// value with an `InvalidParameter` error.
    CumulativeVariance(f64),
    /// Detect the elbow (max second finite difference) in the scree plot
    /// (Cattell, 1966, p. 252).
    ///
    /// The elbow method is sensitive to noise in the eigenvalue spectrum.
    /// Second finite differences amplify noise by a factor of ~3 (condition
    /// number of the differencing operator). The 3-point moving average
    /// pre-smoothing reduces the effective amplification to ~1.5.
    /// Falls back to `CumulativeVariance(0.95)` when fewer than 3 eigenvalues
    /// are available.
    Elbow,
    /// Use a fixed number of components.
    ///
    /// [`select_ncomp`] clamps the value to `[1, eigenvalues.len()]`, so
    /// `Fixed(0)` retains one component and an oversized count retains all.
    Fixed(usize),
    /// Retain eigenvalues above the mean (Kaiser criterion variant;
    /// Kaiser, 1960, pp. 145–146).
    ///
    /// The Kaiser criterion tends to over-extract when variables are many
    /// and under-extract when few. It is equivalent to retaining components
    /// with above-average explanatory power.
    Kaiser,
}
57
58/// Select the number of principal components from an eigenvalue spectrum.
59///
60/// # Arguments
61/// * `eigenvalues` - Eigenvalues in decreasing order (e.g. from FPCA)
62/// * `method` - Selection method
63///
64/// # Example
65///
66/// ```
67/// use fdars_core::spm::ncomp::{select_ncomp, NcompMethod};
68/// let eigenvalues = vec![10.0, 5.0, 1.0, 0.1, 0.01];
69/// let k = select_ncomp(&eigenvalues, &NcompMethod::CumulativeVariance(0.95)).unwrap();
70/// assert!(k >= 2 && k <= 4);
71/// let k_fixed = select_ncomp(&eigenvalues, &NcompMethod::Fixed(3)).unwrap();
72/// assert_eq!(k_fixed, 3);
73/// ```
74///
75/// # Errors
76///
77/// Returns [`FdarError::InvalidDimension`] if `eigenvalues` is empty.
78/// Returns [`FdarError::InvalidParameter`] if any eigenvalue is NaN or
79/// negative, or if `CumulativeVariance` threshold is not in (0, 1].
80#[must_use = "selected ncomp should not be discarded"]
81pub fn select_ncomp(eigenvalues: &[f64], method: &NcompMethod) -> Result<usize, FdarError> {
82 if eigenvalues.is_empty() {
83 return Err(FdarError::InvalidDimension {
84 parameter: "eigenvalues",
85 expected: "at least 1 eigenvalue".to_string(),
86 actual: "0 eigenvalues".to_string(),
87 });
88 }
89
90 // Validate eigenvalues: NaN or negative values indicate upstream issues
91 // (e.g., numerical instability in FPCA or corrupted data).
92 for (i, &ev) in eigenvalues.iter().enumerate() {
93 if ev.is_nan() {
94 return Err(FdarError::InvalidParameter {
95 parameter: "eigenvalues",
96 message: format!("eigenvalue[{i}] is NaN"),
97 });
98 }
99 if ev < 0.0 {
100 return Err(FdarError::InvalidParameter {
101 parameter: "eigenvalues",
102 message: format!("eigenvalue[{i}] = {ev} is negative"),
103 });
104 }
105 }
106
107 match method {
108 NcompMethod::CumulativeVariance(threshold) => {
109 if *threshold <= 0.0 || *threshold > 1.0 {
110 return Err(FdarError::InvalidParameter {
111 parameter: "threshold",
112 message: format!("threshold must be in (0, 1], got {threshold}"),
113 });
114 }
115 cumulative_variance_ncomp(eigenvalues, *threshold)
116 }
117 NcompMethod::Elbow => {
118 if eigenvalues.len() < 3 {
119 // Fallback to CV(0.95) when too few values for elbow
120 return cumulative_variance_ncomp(eigenvalues, 0.95);
121 }
122 Ok(elbow_ncomp(eigenvalues))
123 }
124 NcompMethod::Fixed(k) => Ok((*k).max(1).min(eigenvalues.len())),
125 NcompMethod::Kaiser => {
126 if eigenvalues.len() < 2 {
127 return Ok(1); // With 1 eigenvalue, always keep it
128 }
129 Ok(kaiser_ncomp(eigenvalues))
130 }
131 }
132}
133
134/// Cumulative variance method: first k where cumsum/total >= threshold.
135fn cumulative_variance_ncomp(eigenvalues: &[f64], threshold: f64) -> Result<usize, FdarError> {
136 let total: f64 = eigenvalues.iter().sum();
137 if total <= 0.0 {
138 return Err(FdarError::ComputationFailed {
139 operation: "select_ncomp",
140 detail: "total variance is non-positive".to_string(),
141 });
142 }
143
144 let mut cumsum = 0.0;
145 for (k, &ev) in eigenvalues.iter().enumerate() {
146 cumsum += ev;
147 if cumsum / total >= threshold {
148 return Ok(k + 1);
149 }
150 }
151 // If threshold is exactly 1.0, we need all components
152 Ok(eigenvalues.len())
153}
154
/// Kaiser criterion: retain components with eigenvalue > mean(eigenvalues).
///
/// Equivalent to retaining components with above-average explanatory power
/// (Kaiser, 1960, pp. 145–146). Tends to over-extract when the number of
/// variables is large and under-extract when small.
fn kaiser_ncomp(eigenvalues: &[f64]) -> usize {
    let mean = eigenvalues.iter().sum::<f64>() / eigenvalues.len() as f64;
    // Count strictly-above-mean eigenvalues, retaining at least one component.
    match eigenvalues.iter().filter(|&&ev| ev > mean).count() {
        0 => 1,
        k => k,
    }
}
165
/// Elbow method: argmax of second finite difference d2[k] = λ[k-1] - 2λ[k] + λ[k+1].
///
/// Applies a 3-point moving average to reduce noise sensitivity before
/// computing the second differences (Cattell, 1966, p. 252). Second finite
/// differences amplify noise by a factor of ~3 (condition number of the
/// differencing operator); the 3-point moving average pre-smoothing reduces
/// the effective amplification to ~1.5.
///
/// Caller (`select_ncomp`) guarantees at least 3 eigenvalues.
fn elbow_ncomp(eigenvalues: &[f64]) -> usize {
    let n = eigenvalues.len();

    // Pre-smooth with a moving average: 3 points interior, 2 at each edge.
    // The [lo, hi) window shrinks automatically at the boundaries.
    let smoothed: Vec<f64> = (0..n)
        .map(|i| {
            let lo = i.saturating_sub(1);
            let hi = (i + 2).min(n);
            let window = &eigenvalues[lo..hi];
            window.iter().sum::<f64>() / window.len() as f64
        })
        .collect();

    // Argmax of d2 over interior indices k = 1..n-2; strict `>` keeps the
    // earliest index on ties, matching a scree reader's leftmost elbow.
    let mut best = (1_usize, f64::NEG_INFINITY);
    for (i, w) in smoothed.windows(3).enumerate() {
        let d2 = w[0] - 2.0 * w[1] + w[2];
        if d2 > best.1 {
            best = (i + 1, d2);
        }
    }

    // Keep components up to and including the elbow position.
    (best.0 + 1).min(n)
}