fdars_core/spm/bootstrap.rs
1//! Bootstrap and robust control limit estimation.
2//!
3//! Provides alternative methods for computing T-squared and SPE control
4//! limits when the parametric chi-squared assumption may not hold.
5//!
6//! # Note on bootstrap intervals
7//!
8//! The bootstrap methods here use the percentile method. For small samples
9//! (n < 30), bias-corrected and accelerated (BCa) intervals may give better
10//! coverage, but are not yet implemented.
11//!
12//! # Convergence properties
13//!
14//! The bootstrap percentile method has convergence rate O(n^{-1/2}) for
15//! quantile estimation. The BCa method (not yet implemented) achieves
16//! O(n^{-1}) via bias and skewness corrections (Efron & Tibshirani, 1993,
17//! Section 14.3).
18//!
19//! The KDE estimator with Silverman bandwidth converges at rate O(n^{-4/5})
20//! for smooth densities (the minimax-optimal rate for twice-differentiable
21//! densities). The bisection root-finding achieves machine-precision
22//! (~1e-10 relative) within 200 iterations.
23//!
24//! # Method selection guide
25//!
26//! 1. **Parametric**: use when SPE/T² is well-approximated by chi-squared
27//! (check via `spe_moment_match_diagnostic`).
28//! 2. **Empirical**: use for n > 50 when no distributional assumption is
29//! desired.
30//! 3. **Bootstrap**: use for n = 10–50 when parametric may be inaccurate.
31//! 4. **KDE**: use for smooth unimodal distributions with n > 30.
32//!
33//! # References
34//!
35//! - Efron, B. & Tibshirani, R.J. (1993). *An Introduction to the Bootstrap*.
36//! Chapman & Hall/CRC. Section 13.3, pp. 178–182 (bootstrap methodology);
37//! Section 14.3 (BCa convergence).
38//! - Silverman, B.W. (1986). *Density Estimation for Statistics and Data
39//! Analysis*. Chapman & Hall. Section 3.4.1, pp. 47–48 (kernel density
40//! estimation bandwidth selection).
41
42use crate::error::FdarError;
43use crate::helpers::sort_nan_safe;
44use crate::iter_maybe_parallel;
45
46use super::chi_squared::regularized_gamma_p;
47use super::control::{spe_control_limit, t2_control_limit, ControlLimit};
48
49use rand::rngs::StdRng;
50use rand::{Rng, SeedableRng};
51#[cfg(feature = "parallel")]
52use rayon::iter::ParallelIterator;
53
54/// Method for computing control limits.
55#[derive(Debug, Clone, PartialEq)]
56#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
57#[non_exhaustive]
58pub enum ControlLimitMethod {
59 /// Standard parametric (chi-squared) limits.
60 Parametric,
61 /// Empirical quantile from the observed values.
62 Empirical,
63 /// Bootstrap resampling to estimate the quantile.
64 ///
65 /// This implements the percentile method. For small samples (n < 30),
66 /// consider bias-corrected and accelerated (BCa) intervals, which are
67 /// not yet implemented.
68 Bootstrap {
69 /// Number of bootstrap resamples.
70 n_bootstrap: usize,
71 /// Random seed for reproducibility.
72 seed: u64,
73 },
74 /// Kernel density estimation with Gaussian kernel.
75 KernelDensity {
76 /// Bandwidth (None for Silverman's rule of thumb).
77 bandwidth: Option<f64>,
78 },
79}
80
81/// Compute a robust T-squared control limit.
82///
83/// # Arguments
84/// * `t2_values` - Observed T-squared values from calibration data
85/// * `ncomp` - Number of principal components
86/// * `alpha` - Significance level
87/// * `method` - Control limit method
88///
89/// # Recommended settings
90///
91/// - **Empirical**: No parameters; fast but noisy for small samples (n < 50).
92/// - **Bootstrap**: n_bootstrap >= 1000 recommended; 5000-10000 for stable
93/// results. Seed ensures reproducibility across runs.
94/// - **KDE**: Automatic Silverman bandwidth works well for unimodal distributions.
95/// For multimodal or heavy-tailed data, specify bandwidth manually.
96///
97/// # Errors
98///
99/// Returns errors from the underlying limit computation method.
100///
101/// # Example
102/// ```
103/// use fdars_core::spm::bootstrap::{t2_limit_robust, ControlLimitMethod};
104/// let t2_values = vec![1.0, 2.5, 1.8, 3.2, 2.1, 1.5, 2.8, 1.9, 2.3, 1.7];
105/// let limit = t2_limit_robust(&t2_values, 3, 0.05, &ControlLimitMethod::Empirical).unwrap();
106/// assert!(limit.ucl > 0.0);
107/// ```
108#[must_use = "control limit should not be discarded"]
109pub fn t2_limit_robust(
110 t2_values: &[f64],
111 ncomp: usize,
112 alpha: f64,
113 method: &ControlLimitMethod,
114) -> Result<ControlLimit, FdarError> {
115 validate_inputs(t2_values, alpha)?;
116
117 match method {
118 ControlLimitMethod::Parametric => t2_control_limit(ncomp, alpha),
119 ControlLimitMethod::Empirical => {
120 let ucl = empirical_quantile(t2_values, 1.0 - alpha);
121 Ok(ControlLimit {
122 ucl,
123 alpha,
124 description: format!("T2 empirical quantile, alpha={alpha}"),
125 })
126 }
127 ControlLimitMethod::Bootstrap { n_bootstrap, seed } => {
128 let ucl = bootstrap_quantile(t2_values, 1.0 - alpha, *n_bootstrap, *seed);
129 Ok(ControlLimit {
130 ucl,
131 alpha,
132 description: format!("T2 bootstrap ({n_bootstrap} resamples), alpha={alpha}"),
133 })
134 }
135 ControlLimitMethod::KernelDensity { bandwidth } => {
136 let ucl = kde_quantile(t2_values, 1.0 - alpha, *bandwidth)?;
137 Ok(ControlLimit {
138 ucl,
139 alpha,
140 description: format!("T2 KDE, alpha={alpha}"),
141 })
142 }
143 }
144}
145
146/// Compute a robust SPE control limit.
147///
148/// # Arguments
149/// * `spe_values` - Observed SPE values from calibration data
150/// * `alpha` - Significance level
151/// * `method` - Control limit method
152///
153/// # Recommended settings
154///
155/// - **Empirical**: No parameters; fast but noisy for small samples (n < 50).
156/// - **Bootstrap**: n_bootstrap >= 1000 recommended; 5000-10000 for stable
157/// results. Seed ensures reproducibility across runs.
158/// - **KDE**: Automatic Silverman bandwidth works well for unimodal distributions.
159/// For multimodal or heavy-tailed data, specify bandwidth manually.
160///
161/// # Errors
162///
163/// Returns errors from the underlying limit computation method.
164///
165/// # Example
166/// ```
167/// use fdars_core::spm::bootstrap::{spe_limit_robust, ControlLimitMethod};
168/// let spe_values = vec![0.5, 1.2, 0.8, 1.5, 0.9, 1.1, 0.7, 1.3, 0.6, 1.0];
169/// let limit = spe_limit_robust(&spe_values, 0.05, &ControlLimitMethod::Empirical).unwrap();
170/// assert!(limit.ucl > 0.0);
171/// ```
172#[must_use = "control limit should not be discarded"]
173pub fn spe_limit_robust(
174 spe_values: &[f64],
175 alpha: f64,
176 method: &ControlLimitMethod,
177) -> Result<ControlLimit, FdarError> {
178 validate_inputs(spe_values, alpha)?;
179
180 match method {
181 ControlLimitMethod::Parametric => spe_control_limit(spe_values, alpha),
182 ControlLimitMethod::Empirical => {
183 let ucl = empirical_quantile(spe_values, 1.0 - alpha);
184 Ok(ControlLimit {
185 ucl,
186 alpha,
187 description: format!("SPE empirical quantile, alpha={alpha}"),
188 })
189 }
190 ControlLimitMethod::Bootstrap { n_bootstrap, seed } => {
191 let ucl = bootstrap_quantile(spe_values, 1.0 - alpha, *n_bootstrap, *seed);
192 Ok(ControlLimit {
193 ucl,
194 alpha,
195 description: format!("SPE bootstrap ({n_bootstrap} resamples), alpha={alpha}"),
196 })
197 }
198 ControlLimitMethod::KernelDensity { bandwidth } => {
199 let ucl = kde_quantile(spe_values, 1.0 - alpha, *bandwidth)?;
200 Ok(ControlLimit {
201 ucl,
202 alpha,
203 description: format!("SPE KDE, alpha={alpha}"),
204 })
205 }
206 }
207}
208
209fn validate_inputs(values: &[f64], alpha: f64) -> Result<(), FdarError> {
210 if values.len() < 2 {
211 return Err(FdarError::InvalidDimension {
212 parameter: "values",
213 expected: "at least 2 values".to_string(),
214 actual: format!("{} values", values.len()),
215 });
216 }
217 if alpha <= 0.0 || alpha >= 1.0 {
218 return Err(FdarError::InvalidParameter {
219 parameter: "alpha",
220 message: format!("alpha must be in (0, 1), got {alpha}"),
221 });
222 }
223 Ok(())
224}
225
226/// Empirical quantile: sort, return values[ceil(p*n) - 1].
227///
228/// Full sort is O(n log n) per call. For repeated quantile queries on the
229/// same data, consider pre-sorting.
230fn empirical_quantile(values: &[f64], p: f64) -> f64 {
231 let mut sorted = values.to_vec();
232 sort_nan_safe(&mut sorted);
233 let n = sorted.len();
234 let idx = ((p * n as f64).ceil() as usize)
235 .saturating_sub(1)
236 .min(n - 1);
237 sorted[idx]
238}
239
240/// Bootstrap percentile method: resample `n_bootstrap` times, compute the
241/// p-quantile from each resample, return the median of bootstrap quantiles.
242///
243/// The median is used instead of the mean for robustness: bootstrap resamples
244/// can occasionally produce extreme quantile estimates (especially with small
245/// samples or heavy tails), and the median is less sensitive to these outliers
246/// (Efron & Tibshirani, 1993, §13.3).
247fn bootstrap_quantile(values: &[f64], p: f64, n_bootstrap: usize, seed: u64) -> f64 {
248 let n = values.len();
249
250 // Collect all bootstrap maxima (the p-quantile from each resample)
251 let quantiles: Vec<f64> = iter_maybe_parallel!((0..n_bootstrap).collect::<Vec<_>>())
252 .map(|rep| {
253 let mut rng = StdRng::seed_from_u64(seed + rep as u64);
254 let mut sample: Vec<f64> = (0..n).map(|_| values[rng.gen_range(0..n)]).collect();
255 sort_nan_safe(&mut sample);
256 let idx = ((p * n as f64).ceil() as usize)
257 .saturating_sub(1)
258 .min(n - 1);
259 sample[idx]
260 })
261 .collect();
262
263 // Return the median of bootstrap quantiles (more robust than mean)
264 let mut sorted_q = quantiles;
265 sort_nan_safe(&mut sorted_q);
266 let mid = sorted_q.len() / 2;
267 if sorted_q.len() % 2 == 0 {
268 (sorted_q[mid - 1] + sorted_q[mid]) / 2.0
269 } else {
270 sorted_q[mid]
271 }
272}
273
274/// KDE quantile: Gaussian kernel density, Silverman bandwidth, bisection.
275fn kde_quantile(values: &[f64], p: f64, bandwidth: Option<f64>) -> Result<f64, FdarError> {
276 let n = values.len() as f64;
277 let mean: f64 = values.iter().sum::<f64>() / n;
278 let var: f64 = values.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / (n - 1.0);
279 let std = var.sqrt();
280
281 if std <= 0.0 {
282 return Err(FdarError::ComputationFailed {
283 operation: "kde_quantile",
284 detail: "standard deviation is zero; cannot estimate KDE".to_string(),
285 });
286 }
287
288 // Silverman's rule of thumb with skewness adjustment
289 let h = bandwidth.unwrap_or_else(|| {
290 let iqr = {
291 let mut sorted = values.to_vec();
292 sort_nan_safe(&mut sorted);
293 let q75_idx = ((0.75 * n).ceil() as usize)
294 .saturating_sub(1)
295 .min(sorted.len() - 1);
296 let q25_idx = ((0.25 * n).ceil() as usize)
297 .saturating_sub(1)
298 .min(sorted.len() - 1);
299 sorted[q75_idx] - sorted[q25_idx]
300 };
301 let s = std.min(iqr / 1.34);
302 let s = if s > 0.0 { s } else { std };
303 // Skewness adjustment follows Silverman (1986, Section 3.4.1): the bandwidth
304 // is multiplied by (1 + |skewness|)^{1/5} to widen the kernel for asymmetric
305 // distributions, preventing undersmoothing in the tails.
306 let skewness = {
307 let m3: f64 = values
308 .iter()
309 .map(|&x| ((x - mean) / std).powi(3))
310 .sum::<f64>()
311 / n;
312 m3
313 };
314 let skew_factor = (1.0 + skewness.abs() * 0.2).min(1.5);
315 0.9 * s * n.powf(-0.2) * skew_factor
316 });
317
318 if h <= 0.0 {
319 return Err(FdarError::InvalidParameter {
320 parameter: "bandwidth",
321 message: format!("bandwidth must be positive, got {h}"),
322 });
323 }
324
325 // KDE CDF: F_hat(x) = (1/n) * sum_i Phi((x - x_i) / h)
326 // where Phi is the standard normal CDF
327 let kde_cdf = |x: f64| -> f64 {
328 let sum: f64 = values.iter().map(|&xi| normal_cdf((x - xi) / h)).sum();
329 sum / n
330 };
331
332 // Bisection to find x such that kde_cdf(x) = p
333 let mut sorted = values.to_vec();
334 sort_nan_safe(&mut sorted);
335 let mut lo = sorted[0] - 5.0 * h;
336 let mut hi = sorted[sorted.len() - 1] + 5.0 * h;
337
338 // 200 bisection iterations give precision of (range / 2^200) ~ machine epsilon
339 // for any practical range. This is conservative but guarantees convergence.
340 for _ in 0..200 {
341 let mid = (lo + hi) / 2.0;
342 if kde_cdf(mid) < p {
343 lo = mid;
344 } else {
345 hi = mid;
346 }
347 if (hi - lo) < 1e-10 * (1.0 + hi.abs()) {
348 break;
349 }
350 }
351
352 Ok((lo + hi) / 2.0)
353}
354
355/// Standard normal CDF using the regularized gamma function.
356///
357/// Phi(x) = 0.5 * (1 + erf(x / sqrt(2)))
358///
359/// The error function is computed via `regularized_gamma_p(0.5, x^2)`,
360/// exploiting the identity erf(x) = P(0.5, x^2) for x >= 0.
361///
362/// Uses the complementary error function (erfc) path internally for
363/// numerical stability; accuracy is approximately 1e-7 across the standard
364/// normal range.
365fn normal_cdf(x: f64) -> f64 {
366 if x >= 0.0 {
367 0.5 * (1.0 + erf_via_gamma(x))
368 } else {
369 0.5 * (1.0 - erf_via_gamma(-x))
370 }
371}
372
373/// erf(x) for x >= 0 via the regularized lower incomplete gamma function.
374///
375/// Uses the identity: erf(x) = P(0.5, x^2), which follows from the
376/// substitution u = sqrt(t) in the integral representation of P(0.5, x^2).
377fn erf_via_gamma(x: f64) -> f64 {
378 regularized_gamma_p(0.5, x * x)
379}