1use crate::error::{EvalError, EvalResult};
7use serde::{Deserialize, Serialize};
8
/// Theoretical distribution a sample is tested against.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
pub enum TargetDistribution {
    /// Gaussian distribution (the default target).
    #[default]
    Normal,
    /// Log-normal: ln(x) is tested for normality.
    LogNormal,
    /// Exponential with rate fitted as 1 / sample mean.
    Exponential,
    /// Uniform over the sample's [min, max] range.
    Uniform,
}
22
/// Parameters estimated from the sample for the distribution under test.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum FittedParameters {
    Normal {
        /// Sample mean.
        mean: f64,
        /// Population (divide-by-n) standard deviation.
        std_dev: f64,
    },
    LogNormal {
        /// Mean of ln(x).
        mu: f64,
        /// Standard deviation of ln(x).
        sigma: f64,
    },
    Exponential {
        /// Rate parameter, fitted as 1 / sample mean.
        lambda: f64,
    },
    Uniform {
        /// Sample minimum.
        min: f64,
        /// Sample maximum.
        max: f64,
    },
}
53
/// Full result of an Anderson-Darling goodness-of-fit analysis.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AndersonDarlingAnalysis {
    /// Number of finite values retained for the analysis.
    pub sample_size: usize,
    /// Distribution the sample was tested against.
    pub target_distribution: TargetDistribution,
    /// The (small-sample-corrected) A² test statistic.
    pub statistic: f64,
    /// Critical-value table used for the pass/fail decision.
    pub critical_values: CriticalValues,
    /// Approximate p-value for the observed statistic.
    pub p_value: f64,
    /// Significance level α requested by the caller.
    pub significance_level: f64,
    /// True when the statistic does not exceed the critical value at α.
    pub passes: bool,
    /// Parameters fitted from the sample for the target distribution.
    pub fitted_params: FittedParameters,
    /// Human-readable notes (skipped values, failure explanation, …).
    pub issues: Vec<String>,
}
76
/// A² critical values at the five tabulated significance levels.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CriticalValues {
    /// Critical value at α = 0.15.
    pub cv_15: f64,
    /// Critical value at α = 0.10.
    pub cv_10: f64,
    /// Critical value at α = 0.05.
    pub cv_05: f64,
    /// Critical value at α = 0.025.
    pub cv_025: f64,
    /// Critical value at α = 0.01.
    pub cv_01: f64,
}
91
impl CriticalValues {
    /// Critical values for the normal test with mean and variance
    /// estimated from the data, applied to the corrected statistic.
    // NOTE(review): constants appear to follow the D'Agostino & Stephens
    // case-3 normal table — confirm against the published table.
    pub fn normal() -> Self {
        Self {
            cv_15: 0.576,
            cv_10: 0.656,
            cv_05: 0.787,
            cv_025: 0.918,
            cv_01: 1.092,
        }
    }

    /// Critical values for the exponential test with the rate estimated
    /// from the data.
    // NOTE(review): constants appear to follow the Stephens table for the
    // exponential case with estimated scale — confirm against the source.
    pub fn exponential() -> Self {
        Self {
            cv_15: 0.922,
            cv_10: 1.078,
            cv_05: 1.341,
            cv_025: 1.606,
            cv_01: 1.957,
        }
    }
}
116
/// Configurable Anderson-Darling goodness-of-fit analyzer.
///
/// Construct with [`AndersonDarlingAnalyzer::new`], customize via the
/// `with_*` builder methods, then call `analyze` on a sample.
pub struct AndersonDarlingAnalyzer {
    // Distribution the sample is tested against.
    target_distribution: TargetDistribution,
    // Significance level α for the pass/fail decision.
    significance_level: f64,
}
124
125impl AndersonDarlingAnalyzer {
126 pub fn new() -> Self {
128 Self {
129 target_distribution: TargetDistribution::Normal,
130 significance_level: 0.05,
131 }
132 }
133
134 pub fn with_target_distribution(mut self, dist: TargetDistribution) -> Self {
136 self.target_distribution = dist;
137 self
138 }
139
140 pub fn with_significance_level(mut self, level: f64) -> Self {
142 self.significance_level = level;
143 self
144 }
145
146 pub fn analyze(&self, values: &[f64]) -> EvalResult<AndersonDarlingAnalysis> {
148 let n = values.len();
149 if n < 8 {
150 return Err(EvalError::InsufficientData {
151 required: 8,
152 actual: n,
153 });
154 }
155
156 let valid_values: Vec<f64> = values.iter().filter(|&&v| v.is_finite()).copied().collect();
158
159 if valid_values.len() < 8 {
160 return Err(EvalError::InsufficientData {
161 required: 8,
162 actual: valid_values.len(),
163 });
164 }
165
166 let mut issues = Vec::new();
167
168 let (statistic, fitted_params) = match self.target_distribution {
170 TargetDistribution::Normal => self.test_normal(&valid_values),
171 TargetDistribution::LogNormal => self.test_lognormal(&valid_values, &mut issues)?,
172 TargetDistribution::Exponential => self.test_exponential(&valid_values, &mut issues)?,
173 TargetDistribution::Uniform => self.test_uniform(&valid_values),
174 };
175
176 let critical_values = match self.target_distribution {
178 TargetDistribution::Exponential => CriticalValues::exponential(),
179 _ => CriticalValues::normal(),
180 };
181
182 let p_value = self.approximate_p_value(statistic, self.target_distribution);
184
185 let critical_threshold = match self.significance_level {
187 s if s >= 0.15 => critical_values.cv_15,
188 s if s >= 0.10 => critical_values.cv_10,
189 s if s >= 0.05 => critical_values.cv_05,
190 s if s >= 0.025 => critical_values.cv_025,
191 _ => critical_values.cv_01,
192 };
193 let passes = statistic <= critical_threshold;
194
195 if !passes {
196 issues.push(format!(
197 "A² = {:.4} exceeds critical value {:.4} at α = {:.2}",
198 statistic, critical_threshold, self.significance_level
199 ));
200 }
201
202 Ok(AndersonDarlingAnalysis {
203 sample_size: valid_values.len(),
204 target_distribution: self.target_distribution,
205 statistic,
206 critical_values,
207 p_value,
208 significance_level: self.significance_level,
209 passes,
210 fitted_params,
211 issues,
212 })
213 }
214
215 fn test_normal(&self, values: &[f64]) -> (f64, FittedParameters) {
217 let n = values.len();
218 let n_f = n as f64;
219
220 let mean = values.iter().sum::<f64>() / n_f;
222 let variance = values.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / n_f;
223 let std_dev = variance.sqrt();
224
225 let mut z: Vec<f64> = values
227 .iter()
228 .map(|&x| (x - mean) / std_dev.max(1e-10))
229 .collect();
230 z.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
231
232 let a2 = self.compute_a2_statistic(&z, Self::normal_cdf);
234
235 let a2_corrected = a2 * (1.0 + 0.75 / n_f + 2.25 / (n_f * n_f));
237
238 (a2_corrected, FittedParameters::Normal { mean, std_dev })
239 }
240
241 fn test_lognormal(
243 &self,
244 values: &[f64],
245 issues: &mut Vec<String>,
246 ) -> EvalResult<(f64, FittedParameters)> {
247 let positive_values: Vec<f64> = values.iter().filter(|&&v| v > 0.0).copied().collect();
249
250 if positive_values.len() < 8 {
251 return Err(EvalError::InvalidParameter(
252 "Log-normal test requires at least 8 positive values".to_string(),
253 ));
254 }
255
256 let skipped = values.len() - positive_values.len();
257 if skipped > 0 {
258 issues.push(format!(
259 "Skipped {} non-positive values for log-normal test",
260 skipped
261 ));
262 }
263
264 let log_values: Vec<f64> = positive_values.iter().map(|&x| x.ln()).collect();
266
267 let (a2, normal_params) = self.test_normal(&log_values);
269
270 let (mu, sigma) = match normal_params {
272 FittedParameters::Normal { mean, std_dev } => (mean, std_dev),
273 _ => unreachable!(),
274 };
275
276 Ok((a2, FittedParameters::LogNormal { mu, sigma }))
277 }
278
279 fn test_exponential(
281 &self,
282 values: &[f64],
283 issues: &mut Vec<String>,
284 ) -> EvalResult<(f64, FittedParameters)> {
285 let positive_values: Vec<f64> = values.iter().filter(|&&v| v > 0.0).copied().collect();
287
288 if positive_values.len() < 8 {
289 return Err(EvalError::InvalidParameter(
290 "Exponential test requires at least 8 positive values".to_string(),
291 ));
292 }
293
294 let skipped = values.len() - positive_values.len();
295 if skipped > 0 {
296 issues.push(format!(
297 "Skipped {} non-positive values for exponential test",
298 skipped
299 ));
300 }
301
302 let n = positive_values.len();
303 let n_f = n as f64;
304
305 let mean = positive_values.iter().sum::<f64>() / n_f;
307 let lambda = 1.0 / mean;
308
309 let mut sorted: Vec<f64> = positive_values;
311 sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
312
313 let u: Vec<f64> = sorted.iter().map(|&x| 1.0 - (-lambda * x).exp()).collect();
315
316 let a2 = self.compute_a2_uniform(&u);
318
319 let a2_corrected = a2 * (1.0 + 0.6 / n_f);
321
322 Ok((a2_corrected, FittedParameters::Exponential { lambda }))
323 }
324
325 fn test_uniform(&self, values: &[f64]) -> (f64, FittedParameters) {
327 let min = values.iter().cloned().fold(f64::INFINITY, f64::min);
328 let max = values.iter().cloned().fold(f64::NEG_INFINITY, f64::max);
329
330 let mut sorted: Vec<f64> = values.to_vec();
332 sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
333
334 let range = (max - min).max(1e-10);
336 let u: Vec<f64> = sorted.iter().map(|&x| (x - min) / range).collect();
337
338 let a2 = self.compute_a2_uniform(&u);
339
340 (a2, FittedParameters::Uniform { min, max })
341 }
342
343 fn compute_a2_statistic<F>(&self, z: &[f64], cdf: F) -> f64
345 where
346 F: Fn(f64) -> f64,
347 {
348 let n = z.len();
349 let n_f = n as f64;
350
351 let mut sum = 0.0;
352 for (i, &zi) in z.iter().enumerate() {
353 let fi = cdf(zi).clamp(1e-10, 1.0 - 1e-10);
354 let fn_minus_i = cdf(z[n - 1 - i]).clamp(1e-10, 1.0 - 1e-10);
355
356 let term = (2.0 * (i as f64) + 1.0) * (fi.ln() + (1.0 - fn_minus_i).ln());
357 sum += term;
358 }
359
360 -n_f - sum / n_f
361 }
362
363 fn compute_a2_uniform(&self, u: &[f64]) -> f64 {
365 let n = u.len();
366 let n_f = n as f64;
367
368 let mut sum = 0.0;
369 for (i, &ui) in u.iter().enumerate() {
370 let ui = ui.clamp(1e-10, 1.0 - 1e-10);
371 let u_n_minus_i = u[n - 1 - i].clamp(1e-10, 1.0 - 1e-10);
372
373 let term = (2.0 * (i as f64) + 1.0) * (ui.ln() + (1.0 - u_n_minus_i).ln());
374 sum += term;
375 }
376
377 -n_f - sum / n_f
378 }
379
380 fn normal_cdf(x: f64) -> f64 {
382 0.5 * (1.0 + erf(x / std::f64::consts::SQRT_2))
383 }
384
385 fn approximate_p_value(&self, a2: f64, dist: TargetDistribution) -> f64 {
387 match dist {
388 TargetDistribution::Normal => {
389 if a2 < 0.2 {
391 1.0 - (-13.436 + 101.14 * a2 - 223.73 * a2.powi(2)).exp()
392 } else if a2 < 0.34 {
393 1.0 - (-8.318 + 42.796 * a2 - 59.938 * a2.powi(2)).exp()
394 } else if a2 < 0.6 {
395 (0.9177 - 4.279 * a2 - 1.38 * a2.powi(2)).exp()
396 } else if a2 < 13.0 {
397 (1.2937 - 5.709 * a2 + 0.0186 * a2.powi(2)).exp()
398 } else {
399 0.0
400 }
401 }
402 TargetDistribution::Exponential => {
403 if a2 < 0.26 {
405 1.0 - (-12.0 + 70.0 * a2 - 100.0 * a2.powi(2)).exp()
406 } else if a2 < 0.51 {
407 1.0 - (-6.0 + 24.0 * a2 - 24.0 * a2.powi(2)).exp()
408 } else if a2 < 0.95 {
409 (0.7 - 3.5 * a2 + 0.6 * a2.powi(2)).exp()
410 } else if a2 < 10.0 {
411 (0.9 - 4.0 * a2 + 0.01 * a2.powi(2)).exp()
412 } else {
413 0.0
414 }
415 }
416 _ => {
417 let cv = CriticalValues::normal();
421 if a2 <= cv.cv_15 {
422 0.25 } else if a2 <= cv.cv_10 {
424 0.15
425 } else if a2 <= cv.cv_05 {
426 0.10
427 } else if a2 <= cv.cv_025 {
428 0.05
429 } else if a2 <= cv.cv_01 {
430 0.025
431 } else {
432 (1.2937 - 5.709 * a2 + 0.0186 * a2.powi(2)).exp().max(0.0)
434 }
435 }
436 }
437 }
438}
439
impl Default for AndersonDarlingAnalyzer {
    /// Equivalent to [`AndersonDarlingAnalyzer::new`]: normal target
    /// distribution at α = 0.05.
    fn default() -> Self {
        Self::new()
    }
}
445
/// Error function via the Abramowitz & Stegun 7.1.26 rational
/// approximation (maximum absolute error about 1.5e-7).
fn erf(x: f64) -> f64 {
    // Polynomial coefficients and scale constant from A&S 7.1.26.
    const A1: f64 = 0.254829592;
    const A2: f64 = -0.284496736;
    const A3: f64 = 1.421413741;
    const A4: f64 = -1.453152027;
    const A5: f64 = 1.061405429;
    const P: f64 = 0.3275911;

    // erf is odd: evaluate on |x| and restore the sign at the end.
    let ax = x.abs();
    let t = 1.0 / (1.0 + P * ax);
    let poly = ((((A5 * t + A4) * t + A3) * t + A2) * t + A1) * t;
    let y = 1.0 - poly * (-ax * ax).exp();

    if x < 0.0 {
        -y
    } else {
        y
    }
}
463
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;
    use rand::SeedableRng;
    use rand_chacha::ChaCha8Rng;
    use rand_distr::{Distribution, Exp, LogNormal, Normal, Uniform};

    /// A standard-normal sample should be accepted by the normality test.
    #[test]
    fn test_normal_sample() {
        // Fixed seed keeps the sampled sequence (and thus A²) deterministic.
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let normal = Normal::new(0.0, 1.0).unwrap();
        let values: Vec<f64> = (0..500).map(|_| normal.sample(&mut rng)).collect();

        let analyzer = AndersonDarlingAnalyzer::new()
            .with_target_distribution(TargetDistribution::Normal)
            .with_significance_level(0.05);

        let result = analyzer.analyze(&values).unwrap();
        assert!(result.passes, "Normal sample should pass normality test");
        assert!(result.p_value > 0.05);
    }

    /// A strongly skewed (exponential) sample should fail normality.
    #[test]
    fn test_non_normal_sample() {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let exp = Exp::new(1.0).unwrap();
        let values: Vec<f64> = (0..500).map(|_| exp.sample(&mut rng)).collect();

        let analyzer = AndersonDarlingAnalyzer::new()
            .with_target_distribution(TargetDistribution::Normal)
            .with_significance_level(0.05);

        let result = analyzer.analyze(&values).unwrap();
        assert!(
            !result.passes,
            "Exponential sample should fail normality test"
        );
    }

    /// Log-normal data should pass the log-normal test and recover the
    /// generating parameters (mu = 3.0, sigma = 0.5) within tolerance.
    #[test]
    fn test_lognormal_sample() {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let lognormal = LogNormal::new(3.0, 0.5).unwrap();
        let values: Vec<f64> = (0..500).map(|_| lognormal.sample(&mut rng)).collect();

        let analyzer = AndersonDarlingAnalyzer::new()
            .with_target_distribution(TargetDistribution::LogNormal)
            .with_significance_level(0.05);

        let result = analyzer.analyze(&values).unwrap();
        assert!(
            result.passes,
            "Log-normal sample should pass log-normality test"
        );

        if let FittedParameters::LogNormal { mu, sigma } = result.fitted_params {
            assert!((mu - 3.0).abs() < 0.5, "Mu should be close to 3.0");
            assert!((sigma - 0.5).abs() < 0.2, "Sigma should be close to 0.5");
        } else {
            panic!("Expected LogNormal parameters");
        }
    }

    /// Exponential data should pass the exponential test and recover
    /// lambda = 2.0 within tolerance.
    #[test]
    fn test_exponential_sample() {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let exp = Exp::new(2.0).unwrap();
        let values: Vec<f64> = (0..500).map(|_| exp.sample(&mut rng)).collect();

        let analyzer = AndersonDarlingAnalyzer::new()
            .with_target_distribution(TargetDistribution::Exponential)
            .with_significance_level(0.05);

        let result = analyzer.analyze(&values).unwrap();
        assert!(
            result.passes,
            "Exponential sample should pass exponential test"
        );

        if let FittedParameters::Exponential { lambda } = result.fitted_params {
            assert!(
                (lambda - 2.0).abs() < 0.5,
                "Lambda should be close to 2.0, got {}",
                lambda
            );
        } else {
            panic!("Expected Exponential parameters");
        }
    }

    /// Uniform target: only the sample count is asserted here — the
    /// uniform case uses approximate critical values, so pass/fail is
    /// deliberately not pinned.
    #[test]
    fn test_uniform_sample() {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let uniform = Uniform::new(0.0, 10.0).unwrap();
        let values: Vec<f64> = (0..500).map(|_| uniform.sample(&mut rng)).collect();

        let analyzer = AndersonDarlingAnalyzer::new()
            .with_target_distribution(TargetDistribution::Uniform)
            .with_significance_level(0.05);

        let result = analyzer.analyze(&values).unwrap();
        assert!(result.sample_size == 500);
    }

    /// Fewer than 8 values must be rejected with `InsufficientData`.
    #[test]
    fn test_insufficient_data() {
        let values = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let analyzer = AndersonDarlingAnalyzer::new();
        let result = analyzer.analyze(&values);

        assert!(matches!(
            result,
            Err(EvalError::InsufficientData {
                required: 8,
                actual: 5
            })
        ));
    }

    /// Critical values must be strictly increasing as α decreases.
    #[test]
    fn test_critical_values() {
        let cv = CriticalValues::normal();
        assert!(cv.cv_15 < cv.cv_10);
        assert!(cv.cv_10 < cv.cv_05);
        assert!(cv.cv_05 < cv.cv_025);
        assert!(cv.cv_025 < cv.cv_01);
    }
}