Skip to main content

fdars_core/spm/
bootstrap.rs

1//! Bootstrap and robust control limit estimation.
2//!
3//! Provides alternative methods for computing T-squared and SPE control
4//! limits when the parametric chi-squared assumption may not hold.
5//!
6//! # Note on bootstrap intervals
7//!
8//! The bootstrap methods here use the percentile method. For small samples
9//! (n < 30), bias-corrected and accelerated (BCa) intervals may give better
10//! coverage, but are not yet implemented.
11//!
12//! # Convergence properties
13//!
14//! The bootstrap percentile method has convergence rate O(n^{-1/2}) for
15//! quantile estimation. The BCa method (not yet implemented) achieves
16//! O(n^{-1}) via bias and skewness corrections (Efron & Tibshirani, 1993,
17//! Section 14.3).
18//!
19//! The KDE estimator with Silverman bandwidth converges at rate O(n^{-4/5})
20//! for smooth densities (the minimax-optimal rate for twice-differentiable
21//! densities). The bisection root-finding achieves machine-precision
22//! (~1e-10 relative) within 200 iterations.
23//!
24//! # Method selection guide
25//!
26//! 1. **Parametric**: use when SPE/T² is well-approximated by chi-squared
27//!    (check via `spe_moment_match_diagnostic`).
28//! 2. **Empirical**: use for n > 50 when no distributional assumption is
29//!    desired.
30//! 3. **Bootstrap**: use for n = 10–50 when parametric may be inaccurate.
31//! 4. **KDE**: use for smooth unimodal distributions with n > 30.
32//!
33//! # References
34//!
35//! - Efron, B. & Tibshirani, R.J. (1993). *An Introduction to the Bootstrap*.
36//!   Chapman & Hall/CRC. Section 13.3, pp. 178–182 (bootstrap methodology);
37//!   Section 14.3 (BCa convergence).
38//! - Silverman, B.W. (1986). *Density Estimation for Statistics and Data
39//!   Analysis*. Chapman & Hall. Section 3.4.1, pp. 47–48 (kernel density
40//!   estimation bandwidth selection).
41
42use crate::error::FdarError;
43use crate::helpers::sort_nan_safe;
44use crate::iter_maybe_parallel;
45
46use super::chi_squared::regularized_gamma_p;
47use super::control::{spe_control_limit, t2_control_limit, ControlLimit};
48
49use rand::rngs::StdRng;
50use rand::{Rng, SeedableRng};
51#[cfg(feature = "parallel")]
52use rayon::iter::ParallelIterator;
53
54/// Method for computing control limits.
55#[derive(Debug, Clone, PartialEq)]
56#[non_exhaustive]
57pub enum ControlLimitMethod {
58    /// Standard parametric (chi-squared) limits.
59    Parametric,
60    /// Empirical quantile from the observed values.
61    Empirical,
62    /// Bootstrap resampling to estimate the quantile.
63    ///
64    /// This implements the percentile method. For small samples (n < 30),
65    /// consider bias-corrected and accelerated (BCa) intervals, which are
66    /// not yet implemented.
67    Bootstrap {
68        /// Number of bootstrap resamples.
69        n_bootstrap: usize,
70        /// Random seed for reproducibility.
71        seed: u64,
72    },
73    /// Kernel density estimation with Gaussian kernel.
74    KernelDensity {
75        /// Bandwidth (None for Silverman's rule of thumb).
76        bandwidth: Option<f64>,
77    },
78}
79
80/// Compute a robust T-squared control limit.
81///
82/// # Arguments
83/// * `t2_values` - Observed T-squared values from calibration data
84/// * `ncomp` - Number of principal components
85/// * `alpha` - Significance level
86/// * `method` - Control limit method
87///
88/// # Recommended settings
89///
90/// - **Empirical**: No parameters; fast but noisy for small samples (n < 50).
91/// - **Bootstrap**: n_bootstrap >= 1000 recommended; 5000-10000 for stable
92///   results. Seed ensures reproducibility across runs.
93/// - **KDE**: Automatic Silverman bandwidth works well for unimodal distributions.
94///   For multimodal or heavy-tailed data, specify bandwidth manually.
95///
96/// # Errors
97///
98/// Returns errors from the underlying limit computation method.
99///
100/// # Example
101/// ```
102/// use fdars_core::spm::bootstrap::{t2_limit_robust, ControlLimitMethod};
103/// let t2_values = vec![1.0, 2.5, 1.8, 3.2, 2.1, 1.5, 2.8, 1.9, 2.3, 1.7];
104/// let limit = t2_limit_robust(&t2_values, 3, 0.05, &ControlLimitMethod::Empirical).unwrap();
105/// assert!(limit.ucl > 0.0);
106/// ```
107#[must_use = "control limit should not be discarded"]
108pub fn t2_limit_robust(
109    t2_values: &[f64],
110    ncomp: usize,
111    alpha: f64,
112    method: &ControlLimitMethod,
113) -> Result<ControlLimit, FdarError> {
114    validate_inputs(t2_values, alpha)?;
115
116    match method {
117        ControlLimitMethod::Parametric => t2_control_limit(ncomp, alpha),
118        ControlLimitMethod::Empirical => {
119            let ucl = empirical_quantile(t2_values, 1.0 - alpha);
120            Ok(ControlLimit {
121                ucl,
122                alpha,
123                description: format!("T2 empirical quantile, alpha={alpha}"),
124            })
125        }
126        ControlLimitMethod::Bootstrap { n_bootstrap, seed } => {
127            let ucl = bootstrap_quantile(t2_values, 1.0 - alpha, *n_bootstrap, *seed);
128            Ok(ControlLimit {
129                ucl,
130                alpha,
131                description: format!("T2 bootstrap ({n_bootstrap} resamples), alpha={alpha}"),
132            })
133        }
134        ControlLimitMethod::KernelDensity { bandwidth } => {
135            let ucl = kde_quantile(t2_values, 1.0 - alpha, *bandwidth)?;
136            Ok(ControlLimit {
137                ucl,
138                alpha,
139                description: format!("T2 KDE, alpha={alpha}"),
140            })
141        }
142    }
143}
144
145/// Compute a robust SPE control limit.
146///
147/// # Arguments
148/// * `spe_values` - Observed SPE values from calibration data
149/// * `alpha` - Significance level
150/// * `method` - Control limit method
151///
152/// # Recommended settings
153///
154/// - **Empirical**: No parameters; fast but noisy for small samples (n < 50).
155/// - **Bootstrap**: n_bootstrap >= 1000 recommended; 5000-10000 for stable
156///   results. Seed ensures reproducibility across runs.
157/// - **KDE**: Automatic Silverman bandwidth works well for unimodal distributions.
158///   For multimodal or heavy-tailed data, specify bandwidth manually.
159///
160/// # Errors
161///
162/// Returns errors from the underlying limit computation method.
163///
164/// # Example
165/// ```
166/// use fdars_core::spm::bootstrap::{spe_limit_robust, ControlLimitMethod};
167/// let spe_values = vec![0.5, 1.2, 0.8, 1.5, 0.9, 1.1, 0.7, 1.3, 0.6, 1.0];
168/// let limit = spe_limit_robust(&spe_values, 0.05, &ControlLimitMethod::Empirical).unwrap();
169/// assert!(limit.ucl > 0.0);
170/// ```
171#[must_use = "control limit should not be discarded"]
172pub fn spe_limit_robust(
173    spe_values: &[f64],
174    alpha: f64,
175    method: &ControlLimitMethod,
176) -> Result<ControlLimit, FdarError> {
177    validate_inputs(spe_values, alpha)?;
178
179    match method {
180        ControlLimitMethod::Parametric => spe_control_limit(spe_values, alpha),
181        ControlLimitMethod::Empirical => {
182            let ucl = empirical_quantile(spe_values, 1.0 - alpha);
183            Ok(ControlLimit {
184                ucl,
185                alpha,
186                description: format!("SPE empirical quantile, alpha={alpha}"),
187            })
188        }
189        ControlLimitMethod::Bootstrap { n_bootstrap, seed } => {
190            let ucl = bootstrap_quantile(spe_values, 1.0 - alpha, *n_bootstrap, *seed);
191            Ok(ControlLimit {
192                ucl,
193                alpha,
194                description: format!("SPE bootstrap ({n_bootstrap} resamples), alpha={alpha}"),
195            })
196        }
197        ControlLimitMethod::KernelDensity { bandwidth } => {
198            let ucl = kde_quantile(spe_values, 1.0 - alpha, *bandwidth)?;
199            Ok(ControlLimit {
200                ucl,
201                alpha,
202                description: format!("SPE KDE, alpha={alpha}"),
203            })
204        }
205    }
206}
207
208fn validate_inputs(values: &[f64], alpha: f64) -> Result<(), FdarError> {
209    if values.len() < 2 {
210        return Err(FdarError::InvalidDimension {
211            parameter: "values",
212            expected: "at least 2 values".to_string(),
213            actual: format!("{} values", values.len()),
214        });
215    }
216    if alpha <= 0.0 || alpha >= 1.0 {
217        return Err(FdarError::InvalidParameter {
218            parameter: "alpha",
219            message: format!("alpha must be in (0, 1), got {alpha}"),
220        });
221    }
222    Ok(())
223}
224
225/// Empirical quantile: sort, return values[ceil(p*n) - 1].
226///
227/// Full sort is O(n log n) per call. For repeated quantile queries on the
228/// same data, consider pre-sorting.
229fn empirical_quantile(values: &[f64], p: f64) -> f64 {
230    let mut sorted = values.to_vec();
231    sort_nan_safe(&mut sorted);
232    let n = sorted.len();
233    let idx = ((p * n as f64).ceil() as usize)
234        .saturating_sub(1)
235        .min(n - 1);
236    sorted[idx]
237}
238
239/// Bootstrap percentile method: resample `n_bootstrap` times, compute the
240/// p-quantile from each resample, return the median of bootstrap quantiles.
241///
242/// The median is used instead of the mean for robustness: bootstrap resamples
243/// can occasionally produce extreme quantile estimates (especially with small
244/// samples or heavy tails), and the median is less sensitive to these outliers
245/// (Efron & Tibshirani, 1993, §13.3).
246fn bootstrap_quantile(values: &[f64], p: f64, n_bootstrap: usize, seed: u64) -> f64 {
247    let n = values.len();
248
249    // Collect all bootstrap maxima (the p-quantile from each resample)
250    let quantiles: Vec<f64> = iter_maybe_parallel!((0..n_bootstrap).collect::<Vec<_>>())
251        .map(|rep| {
252            let mut rng = StdRng::seed_from_u64(seed + rep as u64);
253            let mut sample: Vec<f64> = (0..n).map(|_| values[rng.gen_range(0..n)]).collect();
254            sort_nan_safe(&mut sample);
255            let idx = ((p * n as f64).ceil() as usize)
256                .saturating_sub(1)
257                .min(n - 1);
258            sample[idx]
259        })
260        .collect();
261
262    // Return the median of bootstrap quantiles (more robust than mean)
263    let mut sorted_q = quantiles;
264    sort_nan_safe(&mut sorted_q);
265    let mid = sorted_q.len() / 2;
266    if sorted_q.len() % 2 == 0 {
267        (sorted_q[mid - 1] + sorted_q[mid]) / 2.0
268    } else {
269        sorted_q[mid]
270    }
271}
272
273/// KDE quantile: Gaussian kernel density, Silverman bandwidth, bisection.
274fn kde_quantile(values: &[f64], p: f64, bandwidth: Option<f64>) -> Result<f64, FdarError> {
275    let n = values.len() as f64;
276    let mean: f64 = values.iter().sum::<f64>() / n;
277    let var: f64 = values.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / (n - 1.0);
278    let std = var.sqrt();
279
280    if std <= 0.0 {
281        return Err(FdarError::ComputationFailed {
282            operation: "kde_quantile",
283            detail: "standard deviation is zero; cannot estimate KDE".to_string(),
284        });
285    }
286
287    // Silverman's rule of thumb with skewness adjustment
288    let h = bandwidth.unwrap_or_else(|| {
289        let iqr = {
290            let mut sorted = values.to_vec();
291            sort_nan_safe(&mut sorted);
292            let q75_idx = ((0.75 * n).ceil() as usize)
293                .saturating_sub(1)
294                .min(sorted.len() - 1);
295            let q25_idx = ((0.25 * n).ceil() as usize)
296                .saturating_sub(1)
297                .min(sorted.len() - 1);
298            sorted[q75_idx] - sorted[q25_idx]
299        };
300        let s = std.min(iqr / 1.34);
301        let s = if s > 0.0 { s } else { std };
302        // Skewness adjustment follows Silverman (1986, Section 3.4.1): the bandwidth
303        // is multiplied by (1 + |skewness|)^{1/5} to widen the kernel for asymmetric
304        // distributions, preventing undersmoothing in the tails.
305        let skewness = {
306            let m3: f64 = values
307                .iter()
308                .map(|&x| ((x - mean) / std).powi(3))
309                .sum::<f64>()
310                / n;
311            m3
312        };
313        let skew_factor = (1.0 + skewness.abs() * 0.2).min(1.5);
314        0.9 * s * n.powf(-0.2) * skew_factor
315    });
316
317    if h <= 0.0 {
318        return Err(FdarError::InvalidParameter {
319            parameter: "bandwidth",
320            message: format!("bandwidth must be positive, got {h}"),
321        });
322    }
323
324    // KDE CDF: F_hat(x) = (1/n) * sum_i Phi((x - x_i) / h)
325    // where Phi is the standard normal CDF
326    let kde_cdf = |x: f64| -> f64 {
327        let sum: f64 = values.iter().map(|&xi| normal_cdf((x - xi) / h)).sum();
328        sum / n
329    };
330
331    // Bisection to find x such that kde_cdf(x) = p
332    let mut sorted = values.to_vec();
333    sort_nan_safe(&mut sorted);
334    let mut lo = sorted[0] - 5.0 * h;
335    let mut hi = sorted[sorted.len() - 1] + 5.0 * h;
336
337    // 200 bisection iterations give precision of (range / 2^200) ~ machine epsilon
338    // for any practical range. This is conservative but guarantees convergence.
339    for _ in 0..200 {
340        let mid = (lo + hi) / 2.0;
341        if kde_cdf(mid) < p {
342            lo = mid;
343        } else {
344            hi = mid;
345        }
346        if (hi - lo) < 1e-10 * (1.0 + hi.abs()) {
347            break;
348        }
349    }
350
351    Ok((lo + hi) / 2.0)
352}
353
354/// Standard normal CDF using the regularized gamma function.
355///
356/// Phi(x) = 0.5 * (1 + erf(x / sqrt(2)))
357///
358/// The error function is computed via `regularized_gamma_p(0.5, x^2)`,
359/// exploiting the identity erf(x) = P(0.5, x^2) for x >= 0.
360///
361/// Uses the complementary error function (erfc) path internally for
362/// numerical stability; accuracy is approximately 1e-7 across the standard
363/// normal range.
364fn normal_cdf(x: f64) -> f64 {
365    if x >= 0.0 {
366        0.5 * (1.0 + erf_via_gamma(x))
367    } else {
368        0.5 * (1.0 - erf_via_gamma(-x))
369    }
370}
371
372/// erf(x) for x >= 0 via the regularized lower incomplete gamma function.
373///
374/// Uses the identity: erf(x) = P(0.5, x^2), which follows from the
375/// substitution u = sqrt(t) in the integral representation of P(0.5, x^2).
376fn erf_via_gamma(x: f64) -> f64 {
377    regularized_gamma_p(0.5, x * x)
378}