// datasynth_eval/statistical/anderson_darling.rs
use crate::error::{EvalError, EvalResult};
7use serde::{Deserialize, Serialize};
8
/// Distribution family the sample is tested against.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
pub enum TargetDistribution {
    /// Normal (Gaussian) distribution; the default target.
    #[default]
    Normal,
    /// Log-normal distribution (only positive values are usable).
    LogNormal,
    /// Exponential distribution (only positive values are usable).
    Exponential,
    /// Continuous uniform distribution over the sample's own [min, max].
    Uniform,
}
22
/// Parameters estimated from the sample for the distribution under test.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum FittedParameters {
    /// Fitted normal distribution.
    Normal {
        /// Sample mean.
        mean: f64,
        /// Sample standard deviation (population form: divides by n).
        std_dev: f64,
    },
    /// Fitted log-normal distribution (parameters of ln X).
    LogNormal {
        /// Mean of the log-transformed values.
        mu: f64,
        /// Standard deviation of the log-transformed values.
        sigma: f64,
    },
    /// Fitted exponential distribution.
    Exponential {
        /// Rate parameter, estimated as 1 / mean.
        lambda: f64,
    },
    /// Fitted uniform distribution.
    Uniform {
        /// Smallest observed value.
        min: f64,
        /// Largest observed value.
        max: f64,
    },
}
53
/// Complete result of one Anderson-Darling goodness-of-fit analysis.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AndersonDarlingAnalysis {
    /// Number of finite values actually used in the test.
    pub sample_size: usize,
    /// Distribution the sample was tested against.
    pub target_distribution: TargetDistribution,
    /// The (small-sample corrected) A² test statistic.
    pub statistic: f64,
    /// Critical-value table used for the pass/fail decision.
    pub critical_values: CriticalValues,
    /// Approximate p-value for the statistic.
    pub p_value: f64,
    /// Significance level α the decision was made at.
    pub significance_level: f64,
    /// True when the statistic does not exceed the critical value at α.
    pub passes: bool,
    /// Parameters fitted to the sample for the target distribution.
    pub fitted_params: FittedParameters,
    /// Human-readable findings (test failure, skipped non-positive values, …).
    pub issues: Vec<String>,
}
76
/// Critical values of the A² statistic at common significance levels.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CriticalValues {
    /// Critical value at α = 0.15.
    pub cv_15: f64,
    /// Critical value at α = 0.10.
    pub cv_10: f64,
    /// Critical value at α = 0.05.
    pub cv_05: f64,
    /// Critical value at α = 0.025.
    pub cv_025: f64,
    /// Critical value at α = 0.01.
    pub cv_01: f64,
}
91
impl CriticalValues {
    /// Critical-value table used for the normal target (and, per
    /// `analyze`, reused for the log-normal and uniform targets).
    // NOTE(review): values match published tables for the
    // estimated-parameters normality case (Stephens-style) — confirm
    // against the intended source.
    pub fn normal() -> Self {
        Self {
            cv_15: 0.576,
            cv_10: 0.656,
            cv_05: 0.787,
            cv_025: 0.918,
            cv_01: 1.092,
        }
    }

    /// Critical-value table used for the exponential target.
    pub fn exponential() -> Self {
        Self {
            cv_15: 0.922,
            cv_10: 1.078,
            cv_05: 1.341,
            cv_025: 1.606,
            cv_01: 1.957,
        }
    }
}
116
/// Builder-style runner for the Anderson-Darling goodness-of-fit test.
pub struct AndersonDarlingAnalyzer {
    /// Distribution to test the sample against (default: `Normal`).
    target_distribution: TargetDistribution,
    /// Significance level α for the pass/fail decision (default: 0.05).
    significance_level: f64,
}
124
impl AndersonDarlingAnalyzer {
    /// Creates an analyzer targeting the normal distribution at α = 0.05.
    pub fn new() -> Self {
        Self {
            target_distribution: TargetDistribution::Normal,
            significance_level: 0.05,
        }
    }

    /// Sets the distribution the sample will be tested against.
    pub fn with_target_distribution(mut self, dist: TargetDistribution) -> Self {
        self.target_distribution = dist;
        self
    }

    /// Sets the significance level α used for the pass/fail decision.
    pub fn with_significance_level(mut self, level: f64) -> Self {
        self.significance_level = level;
        self
    }

    /// Runs the Anderson-Darling test on `values`.
    ///
    /// Non-finite entries (NaN, ±∞) are dropped before testing; the
    /// reported `sample_size` is the count of finite values used.
    ///
    /// # Errors
    ///
    /// Returns `EvalError::InsufficientData` when fewer than 8 (finite)
    /// values are supplied, and `EvalError::InvalidParameter` when the
    /// log-normal or exponential target sees fewer than 8 positive values.
    pub fn analyze(&self, values: &[f64]) -> EvalResult<AndersonDarlingAnalysis> {
        let n = values.len();
        if n < 8 {
            return Err(EvalError::InsufficientData {
                required: 8,
                actual: n,
            });
        }

        // The statistic is only defined on finite data.
        let valid_values: Vec<f64> = values.iter().filter(|&&v| v.is_finite()).copied().collect();

        if valid_values.len() < 8 {
            return Err(EvalError::InsufficientData {
                required: 8,
                actual: valid_values.len(),
            });
        }

        let mut issues = Vec::new();

        // Each per-distribution test returns the (corrected) A² statistic
        // together with the parameters it fitted to the sample.
        let (statistic, fitted_params) = match self.target_distribution {
            TargetDistribution::Normal => self.test_normal(&valid_values),
            TargetDistribution::LogNormal => self.test_lognormal(&valid_values, &mut issues)?,
            TargetDistribution::Exponential => self.test_exponential(&valid_values, &mut issues)?,
            TargetDistribution::Uniform => self.test_uniform(&valid_values),
        };

        // Only the exponential case has its own table; the normal-case
        // table is reused for the remaining targets.
        let critical_values = match self.target_distribution {
            TargetDistribution::Exponential => CriticalValues::exponential(),
            _ => CriticalValues::normal(),
        };

        let p_value = self.approximate_p_value(statistic, self.target_distribution);

        // Pick the critical value for the requested α (largest α first, so
        // e.g. α = 0.07 falls through to the 0.05 row).
        let critical_threshold = match self.significance_level {
            s if s >= 0.15 => critical_values.cv_15,
            s if s >= 0.10 => critical_values.cv_10,
            s if s >= 0.05 => critical_values.cv_05,
            s if s >= 0.025 => critical_values.cv_025,
            _ => critical_values.cv_01,
        };
        let passes = statistic <= critical_threshold;

        if !passes {
            issues.push(format!(
                "A² = {:.4} exceeds critical value {:.4} at α = {:.2}",
                statistic, critical_threshold, self.significance_level
            ));
        }

        Ok(AndersonDarlingAnalysis {
            sample_size: valid_values.len(),
            target_distribution: self.target_distribution,
            statistic,
            critical_values,
            p_value,
            significance_level: self.significance_level,
            passes,
            fitted_params,
            issues,
        })
    }

    /// Tests normality with mean and std-dev estimated from the sample.
    fn test_normal(&self, values: &[f64]) -> (f64, FittedParameters) {
        let n = values.len();
        let n_f = n as f64;

        let mean = values.iter().sum::<f64>() / n_f;
        // NOTE(review): population variance (divides by n); published AD
        // tables for estimated parameters are usually derived with the
        // n-1 sample estimate — confirm this is intentional.
        let variance = values.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / n_f;
        let std_dev = variance.sqrt();

        // Standardize; `max(1e-10)` guards a zero-variance sample.
        let mut z: Vec<f64> = values
            .iter()
            .map(|&x| (x - mean) / std_dev.max(1e-10))
            .collect();
        z.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));

        let a2 = self.compute_a2_statistic(&z, Self::normal_cdf);

        // Small-sample correction for the estimated-parameters case.
        let a2_corrected = a2 * (1.0 + 0.75 / n_f + 2.25 / (n_f * n_f));

        (a2_corrected, FittedParameters::Normal { mean, std_dev })
    }

    /// Tests log-normality by log-transforming the positive values and
    /// delegating to the normal test (X is log-normal iff ln X is normal).
    fn test_lognormal(
        &self,
        values: &[f64],
        issues: &mut Vec<String>,
    ) -> EvalResult<(f64, FittedParameters)> {
        // ln() is only defined for positive inputs; drop the rest.
        let positive_values: Vec<f64> = values.iter().filter(|&&v| v > 0.0).copied().collect();

        if positive_values.len() < 8 {
            return Err(EvalError::InvalidParameter(
                "Log-normal test requires at least 8 positive values".to_string(),
            ));
        }

        let skipped = values.len() - positive_values.len();
        if skipped > 0 {
            issues.push(format!(
                "Skipped {skipped} non-positive values for log-normal test"
            ));
        }

        let log_values: Vec<f64> = positive_values.iter().map(|&x| x.ln()).collect();

        let (a2, normal_params) = self.test_normal(&log_values);

        // test_normal always returns the Normal variant.
        let (mu, sigma) = match normal_params {
            FittedParameters::Normal { mean, std_dev } => (mean, std_dev),
            _ => unreachable!(),
        };

        Ok((a2, FittedParameters::LogNormal { mu, sigma }))
    }

    /// Tests against an exponential distribution with rate λ = 1 / mean.
    fn test_exponential(
        &self,
        values: &[f64],
        issues: &mut Vec<String>,
    ) -> EvalResult<(f64, FittedParameters)> {
        // The exponential distribution has positive support only.
        let positive_values: Vec<f64> = values.iter().filter(|&&v| v > 0.0).copied().collect();

        if positive_values.len() < 8 {
            return Err(EvalError::InvalidParameter(
                "Exponential test requires at least 8 positive values".to_string(),
            ));
        }

        let skipped = values.len() - positive_values.len();
        if skipped > 0 {
            issues.push(format!(
                "Skipped {skipped} non-positive values for exponential test"
            ));
        }

        let n = positive_values.len();
        let n_f = n as f64;

        // Maximum-likelihood rate estimate.
        let mean = positive_values.iter().sum::<f64>() / n_f;
        let lambda = 1.0 / mean;

        let mut sorted: Vec<f64> = positive_values;
        sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));

        // Probability-integral transform: F(x) = 1 - e^{-λx}.
        let u: Vec<f64> = sorted.iter().map(|&x| 1.0 - (-lambda * x).exp()).collect();

        let a2 = self.compute_a2_uniform(&u);

        // Small-sample correction for the estimated-rate case.
        let a2_corrected = a2 * (1.0 + 0.6 / n_f);

        Ok((a2_corrected, FittedParameters::Exponential { lambda }))
    }

    /// Tests uniformity over the sample's own [min, max] range.
    fn test_uniform(&self, values: &[f64]) -> (f64, FittedParameters) {
        let min = values.iter().cloned().fold(f64::INFINITY, f64::min);
        let max = values.iter().cloned().fold(f64::NEG_INFINITY, f64::max);

        let mut sorted: Vec<f64> = values.to_vec();
        sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));

        // `max(1e-10)` guards a constant sample (range == 0).
        let range = (max - min).max(1e-10);
        let u: Vec<f64> = sorted.iter().map(|&x| (x - min) / range).collect();

        let a2 = self.compute_a2_uniform(&u);

        (a2, FittedParameters::Uniform { min, max })
    }

    /// Computes A² for sorted standardized values `z` under CDF `cdf`:
    /// A² = -n - (1/n) Σᵢ (2i + 1) [ln F(zᵢ) + ln(1 - F(z_{n-1-i}))].
    fn compute_a2_statistic<F>(&self, z: &[f64], cdf: F) -> f64
    where
        F: Fn(f64) -> f64,
    {
        let n = z.len();
        let n_f = n as f64;

        let mut sum = 0.0;
        for (i, &zi) in z.iter().enumerate() {
            // Clamp away from 0 and 1 so both logarithms stay finite.
            let fi = cdf(zi).clamp(1e-10, 1.0 - 1e-10);
            let fn_minus_i = cdf(z[n - 1 - i]).clamp(1e-10, 1.0 - 1e-10);

            let term = (2.0 * (i as f64) + 1.0) * (fi.ln() + (1.0 - fn_minus_i).ln());
            sum += term;
        }

        -n_f - sum / n_f
    }

    /// Same statistic as [`Self::compute_a2_statistic`] for values `u`
    /// that are already probability-transformed (sorted, in [0, 1]),
    /// i.e. with the identity CDF.
    fn compute_a2_uniform(&self, u: &[f64]) -> f64 {
        let n = u.len();
        let n_f = n as f64;

        let mut sum = 0.0;
        for (i, &ui) in u.iter().enumerate() {
            // Clamp away from 0 and 1 so both logarithms stay finite.
            let ui = ui.clamp(1e-10, 1.0 - 1e-10);
            let u_n_minus_i = u[n - 1 - i].clamp(1e-10, 1.0 - 1e-10);

            let term = (2.0 * (i as f64) + 1.0) * (ui.ln() + (1.0 - u_n_minus_i).ln());
            sum += term;
        }

        -n_f - sum / n_f
    }

    /// Standard normal CDF, Φ(x) = ½(1 + erf(x / √2)), via the `erf`
    /// approximation defined at module level.
    fn normal_cdf(x: f64) -> f64 {
        0.5 * (1.0 + erf(x / std::f64::consts::SQRT_2))
    }

    /// Approximates the p-value of the corrected statistic `a2`.
    ///
    /// The normal branch appears to follow the D'Agostino–Stephens
    /// piecewise exponential fit; the exponential branch uses a coarser
    /// piecewise fit, and the fallback (log-normal/uniform targets)
    /// interpolates crudely from the normal critical-value table.
    // NOTE(review): exponential-branch coefficients look hand-tuned —
    // confirm against the intended reference.
    fn approximate_p_value(&self, a2: f64, dist: TargetDistribution) -> f64 {
        match dist {
            TargetDistribution::Normal => {
                if a2 < 0.2 {
                    1.0 - (-13.436 + 101.14 * a2 - 223.73 * a2.powi(2)).exp()
                } else if a2 < 0.34 {
                    1.0 - (-8.318 + 42.796 * a2 - 59.938 * a2.powi(2)).exp()
                } else if a2 < 0.6 {
                    (0.9177 - 4.279 * a2 - 1.38 * a2.powi(2)).exp()
                } else if a2 < 13.0 {
                    (1.2937 - 5.709 * a2 + 0.0186 * a2.powi(2)).exp()
                } else {
                    0.0
                }
            }
            TargetDistribution::Exponential => {
                if a2 < 0.26 {
                    1.0 - (-12.0 + 70.0 * a2 - 100.0 * a2.powi(2)).exp()
                } else if a2 < 0.51 {
                    1.0 - (-6.0 + 24.0 * a2 - 24.0 * a2.powi(2)).exp()
                } else if a2 < 0.95 {
                    (0.7 - 3.5 * a2 + 0.6 * a2.powi(2)).exp()
                } else if a2 < 10.0 {
                    (0.9 - 4.0 * a2 + 0.01 * a2.powi(2)).exp()
                } else {
                    0.0
                }
            }
            _ => {
                // Bracket the p-value between the tabled α levels.
                let cv = CriticalValues::normal();
                if a2 <= cv.cv_15 {
                    0.25
                } else if a2 <= cv.cv_10 {
                    0.15
                } else if a2 <= cv.cv_05 {
                    0.10
                } else if a2 <= cv.cv_025 {
                    0.05
                } else if a2 <= cv.cv_01 {
                    0.025
                } else {
                    // Beyond the table: reuse the normal-branch tail fit.
                    (1.2937 - 5.709 * a2 + 0.0186 * a2.powi(2)).exp().max(0.0)
                }
            }
        }
    }
}
437
impl Default for AndersonDarlingAnalyzer {
    /// Same configuration as [`AndersonDarlingAnalyzer::new`]:
    /// normal target, α = 0.05.
    fn default() -> Self {
        Self::new()
    }
}
443
/// Error function erf(x), approximated with the Abramowitz & Stegun
/// formula 7.1.26 (maximum absolute error ≈ 1.5e-7).
///
/// Exploits erf(-x) = -erf(x): the polynomial fit is evaluated on |x|
/// and the sign is restored at the end.
fn erf(x: f64) -> f64 {
    // Rational-fit constants from A&S 7.1.26, lowest degree first.
    const COEFFS: [f64; 5] = [
        0.254829592,
        -0.284496736,
        1.421413741,
        -1.453152027,
        1.061405429,
    ];
    const P: f64 = 0.3275911;

    let negative = x < 0.0;
    let ax = x.abs();

    let t = 1.0 / (1.0 + P * ax);
    // Horner evaluation of t·(a1 + t·(a2 + t·(a3 + t·(a4 + t·a5)))).
    let poly = COEFFS.iter().rev().fold(0.0, |acc, &c| (acc + c) * t);
    let approx = 1.0 - poly * (-ax * ax).exp();

    if negative { -approx } else { approx }
}
461
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;
    use rand::SeedableRng;
    use rand_chacha::ChaCha8Rng;
    use rand_distr::{Distribution, Exp, LogNormal, Normal, Uniform};

    /// A seeded standard-normal sample should pass the normality test.
    #[test]
    fn test_normal_sample() {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let normal = Normal::new(0.0, 1.0).unwrap();
        let values: Vec<f64> = (0..500).map(|_| normal.sample(&mut rng)).collect();

        let analyzer = AndersonDarlingAnalyzer::new()
            .with_target_distribution(TargetDistribution::Normal)
            .with_significance_level(0.05);

        let result = analyzer.analyze(&values).unwrap();
        assert!(result.passes, "Normal sample should pass normality test");
        assert!(result.p_value > 0.05);
    }

    /// A heavily skewed (exponential) sample must fail the normality test.
    #[test]
    fn test_non_normal_sample() {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let exp = Exp::new(1.0).unwrap();
        let values: Vec<f64> = (0..500).map(|_| exp.sample(&mut rng)).collect();

        let analyzer = AndersonDarlingAnalyzer::new()
            .with_target_distribution(TargetDistribution::Normal)
            .with_significance_level(0.05);

        let result = analyzer.analyze(&values).unwrap();
        assert!(
            !result.passes,
            "Exponential sample should fail normality test"
        );
    }

    /// Log-normal sample passes the log-normal test and recovers
    /// (mu, sigma) close to the generating parameters.
    #[test]
    fn test_lognormal_sample() {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let lognormal = LogNormal::new(3.0, 0.5).unwrap();
        let values: Vec<f64> = (0..500).map(|_| lognormal.sample(&mut rng)).collect();

        let analyzer = AndersonDarlingAnalyzer::new()
            .with_target_distribution(TargetDistribution::LogNormal)
            .with_significance_level(0.05);

        let result = analyzer.analyze(&values).unwrap();
        assert!(
            result.passes,
            "Log-normal sample should pass log-normality test"
        );

        if let FittedParameters::LogNormal { mu, sigma } = result.fitted_params {
            assert!((mu - 3.0).abs() < 0.5, "Mu should be close to 3.0");
            assert!((sigma - 0.5).abs() < 0.2, "Sigma should be close to 0.5");
        } else {
            panic!("Expected LogNormal parameters");
        }
    }

    /// Exponential sample passes the exponential test and recovers a
    /// lambda near the generating rate.
    #[test]
    fn test_exponential_sample() {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let exp = Exp::new(2.0).unwrap();
        let values: Vec<f64> = (0..500).map(|_| exp.sample(&mut rng)).collect();

        let analyzer = AndersonDarlingAnalyzer::new()
            .with_target_distribution(TargetDistribution::Exponential)
            .with_significance_level(0.05);

        let result = analyzer.analyze(&values).unwrap();
        assert!(
            result.passes,
            "Exponential sample should pass exponential test"
        );

        if let FittedParameters::Exponential { lambda } = result.fitted_params {
            assert!(
                (lambda - 2.0).abs() < 0.5,
                "Lambda should be close to 2.0, got {}",
                lambda
            );
        } else {
            panic!("Expected Exponential parameters");
        }
    }

    /// Uniform target: only checks that all 500 finite values are used
    /// (no pass/fail assertion, since the uniform p-value is coarse).
    #[test]
    fn test_uniform_sample() {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let uniform = Uniform::new(0.0, 10.0).unwrap();
        let values: Vec<f64> = (0..500).map(|_| uniform.sample(&mut rng)).collect();

        let analyzer = AndersonDarlingAnalyzer::new()
            .with_target_distribution(TargetDistribution::Uniform)
            .with_significance_level(0.05);

        let result = analyzer.analyze(&values).unwrap();
        assert!(result.sample_size == 500);
    }

    /// Fewer than 8 values must produce InsufficientData, not a result.
    #[test]
    fn test_insufficient_data() {
        let values = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let analyzer = AndersonDarlingAnalyzer::new();
        let result = analyzer.analyze(&values);

        assert!(matches!(
            result,
            Err(EvalError::InsufficientData {
                required: 8,
                actual: 5
            })
        ));
    }

    /// The critical-value table must be strictly increasing as α shrinks.
    #[test]
    fn test_critical_values() {
        let cv = CriticalValues::normal();
        assert!(cv.cv_15 < cv.cv_10);
        assert!(cv.cv_10 < cv.cv_05);
        assert!(cv.cv_05 < cv.cv_025);
        assert!(cv.cv_025 < cv.cv_01);
    }
}