Skip to main content

fdars_core/spm/
bootstrap.rs

1//! Bootstrap and robust control limit estimation.
2//!
3//! Provides alternative methods for computing T-squared and SPE control
4//! limits when the parametric chi-squared assumption may not hold.
5//!
6//! # Note on bootstrap intervals
7//!
8//! The bootstrap methods here use the percentile method. For small samples
9//! (n < 30), bias-corrected and accelerated (BCa) intervals may give better
10//! coverage, but are not yet implemented.
11//!
12//! # Convergence properties
13//!
14//! The bootstrap percentile method has convergence rate O(n^{-1/2}) for
15//! quantile estimation. The BCa method (not yet implemented) achieves
16//! O(n^{-1}) via bias and skewness corrections (Efron & Tibshirani, 1993,
17//! Section 14.3).
18//!
19//! The KDE estimator with Silverman bandwidth converges at rate O(n^{-4/5})
20//! for smooth densities (the minimax-optimal rate for twice-differentiable
21//! densities). The bisection root-finding achieves machine-precision
22//! (~1e-10 relative) within 200 iterations.
23//!
24//! # Method selection guide
25//!
26//! 1. **Parametric**: use when SPE/T² is well-approximated by chi-squared
27//!    (check via `spe_moment_match_diagnostic`).
28//! 2. **Empirical**: use for n > 50 when no distributional assumption is
29//!    desired.
30//! 3. **Bootstrap**: use for n = 10–50 when parametric may be inaccurate.
31//! 4. **KDE**: use for smooth unimodal distributions with n > 30.
32//!
33//! # References
34//!
35//! - Efron, B. & Tibshirani, R.J. (1993). *An Introduction to the Bootstrap*.
36//!   Chapman & Hall/CRC. Section 13.3, pp. 178–182 (bootstrap methodology);
37//!   Section 14.3 (BCa convergence).
38//! - Silverman, B.W. (1986). *Density Estimation for Statistics and Data
39//!   Analysis*. Chapman & Hall. Section 3.4.1, pp. 47–48 (kernel density
40//!   estimation bandwidth selection).
41
42use crate::error::FdarError;
43use crate::helpers::sort_nan_safe;
44use crate::iter_maybe_parallel;
45
46use super::chi_squared::regularized_gamma_p;
47use super::control::{spe_control_limit, t2_control_limit, ControlLimit};
48
49use rand::rngs::StdRng;
50use rand::{Rng, SeedableRng};
51#[cfg(feature = "parallel")]
52use rayon::iter::ParallelIterator;
53
54/// Method for computing control limits.
55#[derive(Debug, Clone, PartialEq)]
56#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
57#[non_exhaustive]
58pub enum ControlLimitMethod {
59    /// Standard parametric (chi-squared) limits.
60    Parametric,
61    /// Empirical quantile from the observed values.
62    Empirical,
63    /// Bootstrap resampling to estimate the quantile.
64    ///
65    /// This implements the percentile method. For small samples (n < 30),
66    /// consider bias-corrected and accelerated (BCa) intervals, which are
67    /// not yet implemented.
68    Bootstrap {
69        /// Number of bootstrap resamples.
70        n_bootstrap: usize,
71        /// Random seed for reproducibility.
72        seed: u64,
73    },
74    /// Kernel density estimation with Gaussian kernel.
75    KernelDensity {
76        /// Bandwidth (None for Silverman's rule of thumb).
77        bandwidth: Option<f64>,
78    },
79}
80
81/// Compute a robust T-squared control limit.
82///
83/// # Arguments
84/// * `t2_values` - Observed T-squared values from calibration data
85/// * `ncomp` - Number of principal components
86/// * `alpha` - Significance level
87/// * `method` - Control limit method
88///
89/// # Recommended settings
90///
91/// - **Empirical**: No parameters; fast but noisy for small samples (n < 50).
92/// - **Bootstrap**: n_bootstrap >= 1000 recommended; 5000-10000 for stable
93///   results. Seed ensures reproducibility across runs.
94/// - **KDE**: Automatic Silverman bandwidth works well for unimodal distributions.
95///   For multimodal or heavy-tailed data, specify bandwidth manually.
96///
97/// # Errors
98///
99/// Returns errors from the underlying limit computation method.
100///
101/// # Example
102/// ```
103/// use fdars_core::spm::bootstrap::{t2_limit_robust, ControlLimitMethod};
104/// let t2_values = vec![1.0, 2.5, 1.8, 3.2, 2.1, 1.5, 2.8, 1.9, 2.3, 1.7];
105/// let limit = t2_limit_robust(&t2_values, 3, 0.05, &ControlLimitMethod::Empirical).unwrap();
106/// assert!(limit.ucl > 0.0);
107/// ```
108#[must_use = "control limit should not be discarded"]
109pub fn t2_limit_robust(
110    t2_values: &[f64],
111    ncomp: usize,
112    alpha: f64,
113    method: &ControlLimitMethod,
114) -> Result<ControlLimit, FdarError> {
115    validate_inputs(t2_values, alpha)?;
116
117    match method {
118        ControlLimitMethod::Parametric => t2_control_limit(ncomp, alpha),
119        ControlLimitMethod::Empirical => {
120            let ucl = empirical_quantile(t2_values, 1.0 - alpha);
121            Ok(ControlLimit {
122                ucl,
123                alpha,
124                description: format!("T2 empirical quantile, alpha={alpha}"),
125            })
126        }
127        ControlLimitMethod::Bootstrap { n_bootstrap, seed } => {
128            let ucl = bootstrap_quantile(t2_values, 1.0 - alpha, *n_bootstrap, *seed);
129            Ok(ControlLimit {
130                ucl,
131                alpha,
132                description: format!("T2 bootstrap ({n_bootstrap} resamples), alpha={alpha}"),
133            })
134        }
135        ControlLimitMethod::KernelDensity { bandwidth } => {
136            let ucl = kde_quantile(t2_values, 1.0 - alpha, *bandwidth)?;
137            Ok(ControlLimit {
138                ucl,
139                alpha,
140                description: format!("T2 KDE, alpha={alpha}"),
141            })
142        }
143    }
144}
145
146/// Compute a robust SPE control limit.
147///
148/// # Arguments
149/// * `spe_values` - Observed SPE values from calibration data
150/// * `alpha` - Significance level
151/// * `method` - Control limit method
152///
153/// # Recommended settings
154///
155/// - **Empirical**: No parameters; fast but noisy for small samples (n < 50).
156/// - **Bootstrap**: n_bootstrap >= 1000 recommended; 5000-10000 for stable
157///   results. Seed ensures reproducibility across runs.
158/// - **KDE**: Automatic Silverman bandwidth works well for unimodal distributions.
159///   For multimodal or heavy-tailed data, specify bandwidth manually.
160///
161/// # Errors
162///
163/// Returns errors from the underlying limit computation method.
164///
165/// # Example
166/// ```
167/// use fdars_core::spm::bootstrap::{spe_limit_robust, ControlLimitMethod};
168/// let spe_values = vec![0.5, 1.2, 0.8, 1.5, 0.9, 1.1, 0.7, 1.3, 0.6, 1.0];
169/// let limit = spe_limit_robust(&spe_values, 0.05, &ControlLimitMethod::Empirical).unwrap();
170/// assert!(limit.ucl > 0.0);
171/// ```
172#[must_use = "control limit should not be discarded"]
173pub fn spe_limit_robust(
174    spe_values: &[f64],
175    alpha: f64,
176    method: &ControlLimitMethod,
177) -> Result<ControlLimit, FdarError> {
178    validate_inputs(spe_values, alpha)?;
179
180    match method {
181        ControlLimitMethod::Parametric => spe_control_limit(spe_values, alpha),
182        ControlLimitMethod::Empirical => {
183            let ucl = empirical_quantile(spe_values, 1.0 - alpha);
184            Ok(ControlLimit {
185                ucl,
186                alpha,
187                description: format!("SPE empirical quantile, alpha={alpha}"),
188            })
189        }
190        ControlLimitMethod::Bootstrap { n_bootstrap, seed } => {
191            let ucl = bootstrap_quantile(spe_values, 1.0 - alpha, *n_bootstrap, *seed);
192            Ok(ControlLimit {
193                ucl,
194                alpha,
195                description: format!("SPE bootstrap ({n_bootstrap} resamples), alpha={alpha}"),
196            })
197        }
198        ControlLimitMethod::KernelDensity { bandwidth } => {
199            let ucl = kde_quantile(spe_values, 1.0 - alpha, *bandwidth)?;
200            Ok(ControlLimit {
201                ucl,
202                alpha,
203                description: format!("SPE KDE, alpha={alpha}"),
204            })
205        }
206    }
207}
208
209fn validate_inputs(values: &[f64], alpha: f64) -> Result<(), FdarError> {
210    if values.len() < 2 {
211        return Err(FdarError::InvalidDimension {
212            parameter: "values",
213            expected: "at least 2 values".to_string(),
214            actual: format!("{} values", values.len()),
215        });
216    }
217    if alpha <= 0.0 || alpha >= 1.0 {
218        return Err(FdarError::InvalidParameter {
219            parameter: "alpha",
220            message: format!("alpha must be in (0, 1), got {alpha}"),
221        });
222    }
223    Ok(())
224}
225
226/// Empirical quantile: sort, return values[ceil(p*n) - 1].
227///
228/// Full sort is O(n log n) per call. For repeated quantile queries on the
229/// same data, consider pre-sorting.
230fn empirical_quantile(values: &[f64], p: f64) -> f64 {
231    let mut sorted = values.to_vec();
232    sort_nan_safe(&mut sorted);
233    let n = sorted.len();
234    let idx = ((p * n as f64).ceil() as usize)
235        .saturating_sub(1)
236        .min(n - 1);
237    sorted[idx]
238}
239
240/// Bootstrap percentile method: resample `n_bootstrap` times, compute the
241/// p-quantile from each resample, return the median of bootstrap quantiles.
242///
243/// The median is used instead of the mean for robustness: bootstrap resamples
244/// can occasionally produce extreme quantile estimates (especially with small
245/// samples or heavy tails), and the median is less sensitive to these outliers
246/// (Efron & Tibshirani, 1993, §13.3).
247fn bootstrap_quantile(values: &[f64], p: f64, n_bootstrap: usize, seed: u64) -> f64 {
248    let n = values.len();
249
250    // Collect all bootstrap maxima (the p-quantile from each resample)
251    let quantiles: Vec<f64> = iter_maybe_parallel!((0..n_bootstrap).collect::<Vec<_>>())
252        .map(|rep| {
253            let mut rng = StdRng::seed_from_u64(seed + rep as u64);
254            let mut sample: Vec<f64> = (0..n).map(|_| values[rng.gen_range(0..n)]).collect();
255            sort_nan_safe(&mut sample);
256            let idx = ((p * n as f64).ceil() as usize)
257                .saturating_sub(1)
258                .min(n - 1);
259            sample[idx]
260        })
261        .collect();
262
263    // Return the median of bootstrap quantiles (more robust than mean)
264    let mut sorted_q = quantiles;
265    sort_nan_safe(&mut sorted_q);
266    let mid = sorted_q.len() / 2;
267    if sorted_q.len() % 2 == 0 {
268        (sorted_q[mid - 1] + sorted_q[mid]) / 2.0
269    } else {
270        sorted_q[mid]
271    }
272}
273
274/// KDE quantile: Gaussian kernel density, Silverman bandwidth, bisection.
275fn kde_quantile(values: &[f64], p: f64, bandwidth: Option<f64>) -> Result<f64, FdarError> {
276    let n = values.len() as f64;
277    let mean: f64 = values.iter().sum::<f64>() / n;
278    let var: f64 = values.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / (n - 1.0);
279    let std = var.sqrt();
280
281    if std <= 0.0 {
282        return Err(FdarError::ComputationFailed {
283            operation: "kde_quantile",
284            detail: "standard deviation is zero; cannot estimate KDE".to_string(),
285        });
286    }
287
288    // Silverman's rule of thumb with skewness adjustment
289    let h = bandwidth.unwrap_or_else(|| {
290        let iqr = {
291            let mut sorted = values.to_vec();
292            sort_nan_safe(&mut sorted);
293            let q75_idx = ((0.75 * n).ceil() as usize)
294                .saturating_sub(1)
295                .min(sorted.len() - 1);
296            let q25_idx = ((0.25 * n).ceil() as usize)
297                .saturating_sub(1)
298                .min(sorted.len() - 1);
299            sorted[q75_idx] - sorted[q25_idx]
300        };
301        let s = std.min(iqr / 1.34);
302        let s = if s > 0.0 { s } else { std };
303        // Skewness adjustment follows Silverman (1986, Section 3.4.1): the bandwidth
304        // is multiplied by (1 + |skewness|)^{1/5} to widen the kernel for asymmetric
305        // distributions, preventing undersmoothing in the tails.
306        let skewness = {
307            let m3: f64 = values
308                .iter()
309                .map(|&x| ((x - mean) / std).powi(3))
310                .sum::<f64>()
311                / n;
312            m3
313        };
314        let skew_factor = (1.0 + skewness.abs() * 0.2).min(1.5);
315        0.9 * s * n.powf(-0.2) * skew_factor
316    });
317
318    if h <= 0.0 {
319        return Err(FdarError::InvalidParameter {
320            parameter: "bandwidth",
321            message: format!("bandwidth must be positive, got {h}"),
322        });
323    }
324
325    // KDE CDF: F_hat(x) = (1/n) * sum_i Phi((x - x_i) / h)
326    // where Phi is the standard normal CDF
327    let kde_cdf = |x: f64| -> f64 {
328        let sum: f64 = values.iter().map(|&xi| normal_cdf((x - xi) / h)).sum();
329        sum / n
330    };
331
332    // Bisection to find x such that kde_cdf(x) = p
333    let mut sorted = values.to_vec();
334    sort_nan_safe(&mut sorted);
335    let mut lo = sorted[0] - 5.0 * h;
336    let mut hi = sorted[sorted.len() - 1] + 5.0 * h;
337
338    // 200 bisection iterations give precision of (range / 2^200) ~ machine epsilon
339    // for any practical range. This is conservative but guarantees convergence.
340    for _ in 0..200 {
341        let mid = (lo + hi) / 2.0;
342        if kde_cdf(mid) < p {
343            lo = mid;
344        } else {
345            hi = mid;
346        }
347        if (hi - lo) < 1e-10 * (1.0 + hi.abs()) {
348            break;
349        }
350    }
351
352    Ok((lo + hi) / 2.0)
353}
354
355/// Standard normal CDF using the regularized gamma function.
356///
357/// Phi(x) = 0.5 * (1 + erf(x / sqrt(2)))
358///
359/// The error function is computed via `regularized_gamma_p(0.5, x^2)`,
360/// exploiting the identity erf(x) = P(0.5, x^2) for x >= 0.
361///
362/// Uses the complementary error function (erfc) path internally for
363/// numerical stability; accuracy is approximately 1e-7 across the standard
364/// normal range.
365fn normal_cdf(x: f64) -> f64 {
366    if x >= 0.0 {
367        0.5 * (1.0 + erf_via_gamma(x))
368    } else {
369        0.5 * (1.0 - erf_via_gamma(-x))
370    }
371}
372
373/// erf(x) for x >= 0 via the regularized lower incomplete gamma function.
374///
375/// Uses the identity: erf(x) = P(0.5, x^2), which follows from the
376/// substitution u = sqrt(t) in the integral representation of P(0.5, x^2).
377fn erf_via_gamma(x: f64) -> f64 {
378    regularized_gamma_p(0.5, x * x)
379}