1use crate::error::{EvalError, EvalResult};
7use serde::{Deserialize, Serialize};
8
/// Distribution families a sample can be tested against with the
/// Anderson-Darling goodness-of-fit statistic.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
pub enum TargetDistribution {
    /// Gaussian distribution (the default target).
    #[default]
    Normal,
    /// Log-normal distribution; only strictly positive samples are usable.
    LogNormal,
    /// Exponential distribution; only strictly positive samples are usable.
    Exponential,
    /// Continuous uniform distribution over the sample's observed range.
    Uniform,
}
22
/// Distribution parameters estimated from the sample while computing the
/// Anderson-Darling statistic; variant matches the chosen target.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum FittedParameters {
    /// Fitted normal distribution.
    Normal {
        /// Sample mean.
        mean: f64,
        /// Sample standard deviation.
        std_dev: f64,
    },
    /// Fitted log-normal distribution (parameters of ln(X)).
    LogNormal {
        /// Mean of the log-transformed values.
        mu: f64,
        /// Standard deviation of the log-transformed values.
        sigma: f64,
    },
    /// Fitted exponential distribution.
    Exponential {
        /// Rate parameter, estimated as 1 / sample mean.
        lambda: f64,
    },
    /// Fitted uniform distribution.
    Uniform {
        /// Smallest observed value.
        min: f64,
        /// Largest observed value.
        max: f64,
    },
}
53
/// Full result of one Anderson-Darling analysis run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AndersonDarlingAnalysis {
    /// Number of finite values that were actually tested.
    pub sample_size: usize,
    /// Distribution family the sample was tested against.
    pub target_distribution: TargetDistribution,
    /// The (small-sample corrected) A² statistic.
    pub statistic: f64,
    /// Critical-value table used for the pass/fail decision.
    pub critical_values: CriticalValues,
    /// Approximate p-value for the statistic.
    pub p_value: f64,
    /// Significance level α the test was run at.
    pub significance_level: f64,
    /// True when the statistic does not exceed the critical value at α.
    pub passes: bool,
    /// Parameters estimated from the sample during the test.
    pub fitted_params: FittedParameters,
    /// Human-readable notes (skipped values, failure explanation, ...).
    pub issues: Vec<String>,
}
76
/// Tabulated Anderson-Darling critical values at the common significance
/// levels; the statistic is compared against the entry matching α.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CriticalValues {
    /// Critical value at α = 0.15.
    pub cv_15: f64,
    /// Critical value at α = 0.10.
    pub cv_10: f64,
    /// Critical value at α = 0.05.
    pub cv_05: f64,
    /// Critical value at α = 0.025.
    pub cv_025: f64,
    /// Critical value at α = 0.01.
    pub cv_01: f64,
}
91
92impl CriticalValues {
93 pub fn normal() -> Self {
95 Self {
97 cv_15: 0.576,
98 cv_10: 0.656,
99 cv_05: 0.787,
100 cv_025: 0.918,
101 cv_01: 1.092,
102 }
103 }
104
105 pub fn exponential() -> Self {
107 Self {
108 cv_15: 0.922,
109 cv_10: 1.078,
110 cv_05: 1.341,
111 cv_025: 1.606,
112 cv_01: 1.957,
113 }
114 }
115}
116
/// Configurable Anderson-Darling goodness-of-fit analyzer.
///
/// Construct with [`AndersonDarlingAnalyzer::new`], then adjust via the
/// builder-style `with_*` methods and call `analyze`.
pub struct AndersonDarlingAnalyzer {
    /// Distribution family samples are tested against.
    target_distribution: TargetDistribution,
    /// Significance level α used for the pass/fail decision.
    significance_level: f64,
}
124
125impl AndersonDarlingAnalyzer {
126 pub fn new() -> Self {
128 Self {
129 target_distribution: TargetDistribution::Normal,
130 significance_level: 0.05,
131 }
132 }
133
134 pub fn with_target_distribution(mut self, dist: TargetDistribution) -> Self {
136 self.target_distribution = dist;
137 self
138 }
139
140 pub fn with_significance_level(mut self, level: f64) -> Self {
142 self.significance_level = level;
143 self
144 }
145
146 pub fn analyze(&self, values: &[f64]) -> EvalResult<AndersonDarlingAnalysis> {
148 let n = values.len();
149 if n < 8 {
150 return Err(EvalError::InsufficientData {
151 required: 8,
152 actual: n,
153 });
154 }
155
156 let valid_values: Vec<f64> = values.iter().filter(|&&v| v.is_finite()).copied().collect();
158
159 if valid_values.len() < 8 {
160 return Err(EvalError::InsufficientData {
161 required: 8,
162 actual: valid_values.len(),
163 });
164 }
165
166 let mut issues = Vec::new();
167
168 let (statistic, fitted_params) = match self.target_distribution {
170 TargetDistribution::Normal => self.test_normal(&valid_values),
171 TargetDistribution::LogNormal => self.test_lognormal(&valid_values, &mut issues)?,
172 TargetDistribution::Exponential => self.test_exponential(&valid_values, &mut issues)?,
173 TargetDistribution::Uniform => self.test_uniform(&valid_values),
174 };
175
176 let critical_values = match self.target_distribution {
178 TargetDistribution::Exponential => CriticalValues::exponential(),
179 _ => CriticalValues::normal(),
180 };
181
182 let p_value = self.approximate_p_value(statistic, self.target_distribution);
184
185 let critical_threshold = match self.significance_level {
187 s if s >= 0.15 => critical_values.cv_15,
188 s if s >= 0.10 => critical_values.cv_10,
189 s if s >= 0.05 => critical_values.cv_05,
190 s if s >= 0.025 => critical_values.cv_025,
191 _ => critical_values.cv_01,
192 };
193 let passes = statistic <= critical_threshold;
194
195 if !passes {
196 issues.push(format!(
197 "A² = {:.4} exceeds critical value {:.4} at α = {:.2}",
198 statistic, critical_threshold, self.significance_level
199 ));
200 }
201
202 Ok(AndersonDarlingAnalysis {
203 sample_size: valid_values.len(),
204 target_distribution: self.target_distribution,
205 statistic,
206 critical_values,
207 p_value,
208 significance_level: self.significance_level,
209 passes,
210 fitted_params,
211 issues,
212 })
213 }
214
215 fn test_normal(&self, values: &[f64]) -> (f64, FittedParameters) {
217 let n = values.len();
218 let n_f = n as f64;
219
220 let mean = values.iter().sum::<f64>() / n_f;
222 let variance = values.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / n_f;
223 let std_dev = variance.sqrt();
224
225 let mut z: Vec<f64> = values
227 .iter()
228 .map(|&x| (x - mean) / std_dev.max(1e-10))
229 .collect();
230 z.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
231
232 let a2 = self.compute_a2_statistic(&z, Self::normal_cdf);
234
235 let a2_corrected = a2 * (1.0 + 0.75 / n_f + 2.25 / (n_f * n_f));
237
238 (a2_corrected, FittedParameters::Normal { mean, std_dev })
239 }
240
241 fn test_lognormal(
243 &self,
244 values: &[f64],
245 issues: &mut Vec<String>,
246 ) -> EvalResult<(f64, FittedParameters)> {
247 let positive_values: Vec<f64> = values.iter().filter(|&&v| v > 0.0).copied().collect();
249
250 if positive_values.len() < 8 {
251 return Err(EvalError::InvalidParameter(
252 "Log-normal test requires at least 8 positive values".to_string(),
253 ));
254 }
255
256 let skipped = values.len() - positive_values.len();
257 if skipped > 0 {
258 issues.push(format!(
259 "Skipped {} non-positive values for log-normal test",
260 skipped
261 ));
262 }
263
264 let log_values: Vec<f64> = positive_values.iter().map(|&x| x.ln()).collect();
266
267 let (a2, normal_params) = self.test_normal(&log_values);
269
270 let (mu, sigma) = match normal_params {
272 FittedParameters::Normal { mean, std_dev } => (mean, std_dev),
273 _ => unreachable!(),
274 };
275
276 Ok((a2, FittedParameters::LogNormal { mu, sigma }))
277 }
278
279 fn test_exponential(
281 &self,
282 values: &[f64],
283 issues: &mut Vec<String>,
284 ) -> EvalResult<(f64, FittedParameters)> {
285 let positive_values: Vec<f64> = values.iter().filter(|&&v| v > 0.0).copied().collect();
287
288 if positive_values.len() < 8 {
289 return Err(EvalError::InvalidParameter(
290 "Exponential test requires at least 8 positive values".to_string(),
291 ));
292 }
293
294 let skipped = values.len() - positive_values.len();
295 if skipped > 0 {
296 issues.push(format!(
297 "Skipped {} non-positive values for exponential test",
298 skipped
299 ));
300 }
301
302 let n = positive_values.len();
303 let n_f = n as f64;
304
305 let mean = positive_values.iter().sum::<f64>() / n_f;
307 let lambda = 1.0 / mean;
308
309 let mut sorted: Vec<f64> = positive_values;
311 sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
312
313 let u: Vec<f64> = sorted.iter().map(|&x| 1.0 - (-lambda * x).exp()).collect();
315
316 let a2 = self.compute_a2_uniform(&u);
318
319 let a2_corrected = a2 * (1.0 + 0.6 / n_f);
321
322 Ok((a2_corrected, FittedParameters::Exponential { lambda }))
323 }
324
325 fn test_uniform(&self, values: &[f64]) -> (f64, FittedParameters) {
327 let min = values.iter().cloned().fold(f64::INFINITY, f64::min);
328 let max = values.iter().cloned().fold(f64::NEG_INFINITY, f64::max);
329
330 let mut sorted: Vec<f64> = values.to_vec();
332 sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
333
334 let range = (max - min).max(1e-10);
336 let u: Vec<f64> = sorted.iter().map(|&x| (x - min) / range).collect();
337
338 let a2 = self.compute_a2_uniform(&u);
339
340 (a2, FittedParameters::Uniform { min, max })
341 }
342
343 fn compute_a2_statistic<F>(&self, z: &[f64], cdf: F) -> f64
345 where
346 F: Fn(f64) -> f64,
347 {
348 let n = z.len();
349 let n_f = n as f64;
350
351 let mut sum = 0.0;
352 for (i, &zi) in z.iter().enumerate() {
353 let fi = cdf(zi).clamp(1e-10, 1.0 - 1e-10);
354 let fn_minus_i = cdf(z[n - 1 - i]).clamp(1e-10, 1.0 - 1e-10);
355
356 let term = (2.0 * (i as f64) + 1.0) * (fi.ln() + (1.0 - fn_minus_i).ln());
357 sum += term;
358 }
359
360 -n_f - sum / n_f
361 }
362
363 fn compute_a2_uniform(&self, u: &[f64]) -> f64 {
365 let n = u.len();
366 let n_f = n as f64;
367
368 let mut sum = 0.0;
369 for (i, &ui) in u.iter().enumerate() {
370 let ui = ui.clamp(1e-10, 1.0 - 1e-10);
371 let u_n_minus_i = u[n - 1 - i].clamp(1e-10, 1.0 - 1e-10);
372
373 let term = (2.0 * (i as f64) + 1.0) * (ui.ln() + (1.0 - u_n_minus_i).ln());
374 sum += term;
375 }
376
377 -n_f - sum / n_f
378 }
379
380 fn normal_cdf(x: f64) -> f64 {
382 0.5 * (1.0 + erf(x / std::f64::consts::SQRT_2))
383 }
384
385 fn approximate_p_value(&self, a2: f64, dist: TargetDistribution) -> f64 {
387 match dist {
388 TargetDistribution::Normal => {
389 if a2 < 0.2 {
391 1.0 - (-13.436 + 101.14 * a2 - 223.73 * a2.powi(2)).exp()
392 } else if a2 < 0.34 {
393 1.0 - (-8.318 + 42.796 * a2 - 59.938 * a2.powi(2)).exp()
394 } else if a2 < 0.6 {
395 (0.9177 - 4.279 * a2 - 1.38 * a2.powi(2)).exp()
396 } else if a2 < 13.0 {
397 (1.2937 - 5.709 * a2 + 0.0186 * a2.powi(2)).exp()
398 } else {
399 0.0
400 }
401 }
402 TargetDistribution::Exponential => {
403 if a2 < 0.26 {
405 1.0 - (-12.0 + 70.0 * a2 - 100.0 * a2.powi(2)).exp()
406 } else if a2 < 0.51 {
407 1.0 - (-6.0 + 24.0 * a2 - 24.0 * a2.powi(2)).exp()
408 } else if a2 < 0.95 {
409 (0.7 - 3.5 * a2 + 0.6 * a2.powi(2)).exp()
410 } else if a2 < 10.0 {
411 (0.9 - 4.0 * a2 + 0.01 * a2.powi(2)).exp()
412 } else {
413 0.0
414 }
415 }
416 _ => {
417 if a2 < 2.0 {
419 (-a2 + 0.5).exp().clamp(0.0, 1.0)
420 } else {
421 0.0
422 }
423 }
424 }
425 }
426}
427
428impl Default for AndersonDarlingAnalyzer {
429 fn default() -> Self {
430 Self::new()
431 }
432}
433
/// Error function via the Abramowitz & Stegun 7.1.26 rational
/// approximation (absolute error below ~1.5e-7), using the odd symmetry
/// erf(-x) = -erf(x).
fn erf(x: f64) -> f64 {
    const P: f64 = 0.3275911;
    // Polynomial coefficients a1..a5, lowest order first.
    const A: [f64; 5] = [
        0.254829592,
        -0.284496736,
        1.421413741,
        -1.453152027,
        1.061405429,
    ];

    let negative = x < 0.0;
    let ax = x.abs();

    let t = 1.0 / (1.0 + P * ax);

    // Horner evaluation of a5·t⁴ + a4·t³ + a3·t² + a2·t + a1, then one
    // final multiply by t — same association order as the closed form.
    let mut poly = A[4];
    for &coeff in A[..4].iter().rev() {
        poly = poly * t + coeff;
    }
    let magnitude = 1.0 - poly * t * (-ax * ax).exp();

    if negative { -magnitude } else { magnitude }
}
451
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;
    use rand::SeedableRng;
    use rand_chacha::ChaCha8Rng;
    use rand_distr::{Distribution, Exp, LogNormal, Normal, Uniform};

    /// Draws `count` f64 samples from `dist` with a fixed-seed ChaCha RNG
    /// so every test run sees the same data.
    fn draw<D: Distribution<f64>>(dist: D, count: usize) -> Vec<f64> {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        (0..count).map(|_| dist.sample(&mut rng)).collect()
    }

    /// Builds an analyzer targeting `dist` at α = 0.05.
    fn analyzer_for(dist: TargetDistribution) -> AndersonDarlingAnalyzer {
        AndersonDarlingAnalyzer::new()
            .with_target_distribution(dist)
            .with_significance_level(0.05)
    }

    #[test]
    fn test_normal_sample() {
        let data = draw(Normal::new(0.0, 1.0).unwrap(), 500);
        let report = analyzer_for(TargetDistribution::Normal)
            .analyze(&data)
            .unwrap();

        assert!(report.passes, "Normal sample should pass normality test");
        assert!(report.p_value > 0.05);
    }

    #[test]
    fn test_non_normal_sample() {
        let data = draw(Exp::new(1.0).unwrap(), 500);
        let report = analyzer_for(TargetDistribution::Normal)
            .analyze(&data)
            .unwrap();

        assert!(
            !report.passes,
            "Exponential sample should fail normality test"
        );
    }

    #[test]
    fn test_lognormal_sample() {
        let data = draw(LogNormal::new(3.0, 0.5).unwrap(), 500);
        let report = analyzer_for(TargetDistribution::LogNormal)
            .analyze(&data)
            .unwrap();

        assert!(
            report.passes,
            "Log-normal sample should pass log-normality test"
        );

        match report.fitted_params {
            FittedParameters::LogNormal { mu, sigma } => {
                assert!((mu - 3.0).abs() < 0.5, "Mu should be close to 3.0");
                assert!((sigma - 0.5).abs() < 0.2, "Sigma should be close to 0.5");
            }
            _ => panic!("Expected LogNormal parameters"),
        }
    }

    #[test]
    fn test_exponential_sample() {
        let data = draw(Exp::new(2.0).unwrap(), 500);
        let report = analyzer_for(TargetDistribution::Exponential)
            .analyze(&data)
            .unwrap();

        assert!(
            report.passes,
            "Exponential sample should pass exponential test"
        );

        match report.fitted_params {
            FittedParameters::Exponential { lambda } => assert!(
                (lambda - 2.0).abs() < 0.5,
                "Lambda should be close to 2.0, got {}",
                lambda
            ),
            _ => panic!("Expected Exponential parameters"),
        }
    }

    #[test]
    fn test_uniform_sample() {
        let data = draw(Uniform::new(0.0, 10.0), 500);
        let report = analyzer_for(TargetDistribution::Uniform)
            .analyze(&data)
            .unwrap();

        // Only the bookkeeping is pinned here, not the pass/fail outcome.
        assert!(report.sample_size == 500);
    }

    #[test]
    fn test_insufficient_data() {
        let too_few = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let outcome = AndersonDarlingAnalyzer::new().analyze(&too_few);

        assert!(matches!(
            outcome,
            Err(EvalError::InsufficientData {
                required: 8,
                actual: 5
            })
        ));
    }

    #[test]
    fn test_critical_values() {
        // Thresholds must grow strictly as α shrinks.
        let cv = CriticalValues::normal();
        let ascending = [cv.cv_15, cv.cv_10, cv.cv_05, cv.cv_025, cv.cv_01];
        assert!(ascending.windows(2).all(|pair| pair[0] < pair[1]));
    }
}