// datasynth_eval/statistical/anderson_darling.rs
use crate::error::{EvalError, EvalResult};
7use serde::{Deserialize, Serialize};
8
/// Distribution family the sample is tested against.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
pub enum TargetDistribution {
    /// Normal (Gaussian) distribution; the default target.
    #[default]
    Normal,
    /// Log-normal distribution (only positive values are usable).
    LogNormal,
    /// Exponential distribution (only positive values are usable).
    Exponential,
    /// Continuous uniform distribution over the sample's own [min, max].
    Uniform,
}
22
/// Parameters estimated from the sample for the distribution under test.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum FittedParameters {
    /// Fitted normal distribution.
    Normal {
        /// Sample mean.
        mean: f64,
        /// Sample standard deviation (population form: divides by n).
        std_dev: f64,
    },
    /// Fitted log-normal distribution (parameters of ln X).
    LogNormal {
        /// Mean of the log-transformed values.
        mu: f64,
        /// Standard deviation of the log-transformed values.
        sigma: f64,
    },
    /// Fitted exponential distribution.
    Exponential {
        /// Rate parameter, estimated as 1 / mean.
        lambda: f64,
    },
    /// Fitted uniform distribution.
    Uniform {
        /// Smallest observed value.
        min: f64,
        /// Largest observed value.
        max: f64,
    },
}
53
/// Complete result of one Anderson-Darling goodness-of-fit analysis.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AndersonDarlingAnalysis {
    /// Number of finite values actually used in the test.
    pub sample_size: usize,
    /// Distribution the sample was tested against.
    pub target_distribution: TargetDistribution,
    /// The (small-sample corrected) A² test statistic.
    pub statistic: f64,
    /// Critical-value table used for the pass/fail decision.
    pub critical_values: CriticalValues,
    /// Approximate p-value for the statistic.
    pub p_value: f64,
    /// Significance level α the decision was made at.
    pub significance_level: f64,
    /// True when the statistic does not exceed the critical value at α.
    pub passes: bool,
    /// Parameters fitted to the sample for the target distribution.
    pub fitted_params: FittedParameters,
    /// Human-readable findings (test failure, skipped non-positive values, …).
    pub issues: Vec<String>,
}
76
/// Critical values of the A² statistic at common significance levels.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CriticalValues {
    /// Critical value at α = 0.15.
    pub cv_15: f64,
    /// Critical value at α = 0.10.
    pub cv_10: f64,
    /// Critical value at α = 0.05.
    pub cv_05: f64,
    /// Critical value at α = 0.025.
    pub cv_025: f64,
    /// Critical value at α = 0.01.
    pub cv_01: f64,
}
91
impl CriticalValues {
    /// Critical-value table used for the normal target (and, per
    /// `analyze`, reused for the log-normal and uniform targets).
    // NOTE(review): values match published tables for the
    // estimated-parameters normality case (Stephens-style) — confirm
    // against the intended source.
    pub fn normal() -> Self {
        Self {
            cv_15: 0.576,
            cv_10: 0.656,
            cv_05: 0.787,
            cv_025: 0.918,
            cv_01: 1.092,
        }
    }

    /// Critical-value table used for the exponential target.
    pub fn exponential() -> Self {
        Self {
            cv_15: 0.922,
            cv_10: 1.078,
            cv_05: 1.341,
            cv_025: 1.606,
            cv_01: 1.957,
        }
    }
}
116
/// Builder-style runner for the Anderson-Darling goodness-of-fit test.
pub struct AndersonDarlingAnalyzer {
    /// Distribution to test the sample against (default: `Normal`).
    target_distribution: TargetDistribution,
    /// Significance level α for the pass/fail decision (default: 0.05).
    significance_level: f64,
}
124
impl AndersonDarlingAnalyzer {
    /// Creates an analyzer targeting the normal distribution at α = 0.05.
    pub fn new() -> Self {
        Self {
            target_distribution: TargetDistribution::Normal,
            significance_level: 0.05,
        }
    }

    /// Sets the distribution the sample will be tested against.
    pub fn with_target_distribution(mut self, dist: TargetDistribution) -> Self {
        self.target_distribution = dist;
        self
    }

    /// Sets the significance level α used for the pass/fail decision.
    pub fn with_significance_level(mut self, level: f64) -> Self {
        self.significance_level = level;
        self
    }

    /// Runs the Anderson-Darling test on `values`.
    ///
    /// Non-finite entries (NaN, ±∞) are dropped before testing; the
    /// reported `sample_size` is the count of finite values used.
    ///
    /// # Errors
    ///
    /// Returns `EvalError::InsufficientData` when fewer than 8 (finite)
    /// values are supplied, and `EvalError::InvalidParameter` when the
    /// log-normal or exponential target sees fewer than 8 positive values.
    pub fn analyze(&self, values: &[f64]) -> EvalResult<AndersonDarlingAnalysis> {
        let n = values.len();
        if n < 8 {
            return Err(EvalError::InsufficientData {
                required: 8,
                actual: n,
            });
        }

        // The statistic is only defined on finite data.
        let valid_values: Vec<f64> = values.iter().filter(|&&v| v.is_finite()).copied().collect();

        if valid_values.len() < 8 {
            return Err(EvalError::InsufficientData {
                required: 8,
                actual: valid_values.len(),
            });
        }

        let mut issues = Vec::new();

        // Each per-distribution test returns the (corrected) A² statistic
        // together with the parameters it fitted to the sample.
        let (statistic, fitted_params) = match self.target_distribution {
            TargetDistribution::Normal => self.test_normal(&valid_values),
            TargetDistribution::LogNormal => self.test_lognormal(&valid_values, &mut issues)?,
            TargetDistribution::Exponential => self.test_exponential(&valid_values, &mut issues)?,
            TargetDistribution::Uniform => self.test_uniform(&valid_values),
        };

        // Only the exponential case has its own table; the normal-case
        // table is reused for the remaining targets.
        let critical_values = match self.target_distribution {
            TargetDistribution::Exponential => CriticalValues::exponential(),
            _ => CriticalValues::normal(),
        };

        let p_value = self.approximate_p_value(statistic, self.target_distribution);

        // Pick the critical value for the requested α (largest α first, so
        // e.g. α = 0.07 falls through to the 0.05 row).
        let critical_threshold = match self.significance_level {
            s if s >= 0.15 => critical_values.cv_15,
            s if s >= 0.10 => critical_values.cv_10,
            s if s >= 0.05 => critical_values.cv_05,
            s if s >= 0.025 => critical_values.cv_025,
            _ => critical_values.cv_01,
        };
        let passes = statistic <= critical_threshold;

        if !passes {
            issues.push(format!(
                "A² = {:.4} exceeds critical value {:.4} at α = {:.2}",
                statistic, critical_threshold, self.significance_level
            ));
        }

        Ok(AndersonDarlingAnalysis {
            sample_size: valid_values.len(),
            target_distribution: self.target_distribution,
            statistic,
            critical_values,
            p_value,
            significance_level: self.significance_level,
            passes,
            fitted_params,
            issues,
        })
    }

    /// Tests normality with mean and std-dev estimated from the sample.
    fn test_normal(&self, values: &[f64]) -> (f64, FittedParameters) {
        let n = values.len();
        let n_f = n as f64;

        let mean = values.iter().sum::<f64>() / n_f;
        // NOTE(review): population variance (divides by n); published AD
        // tables for estimated parameters are usually derived with the
        // n-1 sample estimate — confirm this is intentional.
        let variance = values.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / n_f;
        let std_dev = variance.sqrt();

        // Standardize; `max(1e-10)` guards a zero-variance sample.
        let mut z: Vec<f64> = values
            .iter()
            .map(|&x| (x - mean) / std_dev.max(1e-10))
            .collect();
        z.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));

        let a2 = self.compute_a2_statistic(&z, Self::normal_cdf);

        // Small-sample correction for the estimated-parameters case.
        let a2_corrected = a2 * (1.0 + 0.75 / n_f + 2.25 / (n_f * n_f));

        (a2_corrected, FittedParameters::Normal { mean, std_dev })
    }

    /// Tests log-normality by log-transforming the positive values and
    /// delegating to the normal test (X is log-normal iff ln X is normal).
    fn test_lognormal(
        &self,
        values: &[f64],
        issues: &mut Vec<String>,
    ) -> EvalResult<(f64, FittedParameters)> {
        // ln() is only defined for positive inputs; drop the rest.
        let positive_values: Vec<f64> = values.iter().filter(|&&v| v > 0.0).copied().collect();

        if positive_values.len() < 8 {
            return Err(EvalError::InvalidParameter(
                "Log-normal test requires at least 8 positive values".to_string(),
            ));
        }

        let skipped = values.len() - positive_values.len();
        if skipped > 0 {
            issues.push(format!(
                "Skipped {skipped} non-positive values for log-normal test"
            ));
        }

        let log_values: Vec<f64> = positive_values.iter().map(|&x| x.ln()).collect();

        let (a2, normal_params) = self.test_normal(&log_values);

        // test_normal always returns the Normal variant.
        let (mu, sigma) = match normal_params {
            FittedParameters::Normal { mean, std_dev } => (mean, std_dev),
            _ => unreachable!(),
        };

        Ok((a2, FittedParameters::LogNormal { mu, sigma }))
    }

    /// Tests against an exponential distribution with rate λ = 1 / mean.
    fn test_exponential(
        &self,
        values: &[f64],
        issues: &mut Vec<String>,
    ) -> EvalResult<(f64, FittedParameters)> {
        // The exponential distribution has positive support only.
        let positive_values: Vec<f64> = values.iter().filter(|&&v| v > 0.0).copied().collect();

        if positive_values.len() < 8 {
            return Err(EvalError::InvalidParameter(
                "Exponential test requires at least 8 positive values".to_string(),
            ));
        }

        let skipped = values.len() - positive_values.len();
        if skipped > 0 {
            issues.push(format!(
                "Skipped {skipped} non-positive values for exponential test"
            ));
        }

        let n = positive_values.len();
        let n_f = n as f64;

        // Maximum-likelihood rate estimate.
        let mean = positive_values.iter().sum::<f64>() / n_f;
        let lambda = 1.0 / mean;

        let mut sorted: Vec<f64> = positive_values;
        sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));

        // Probability-integral transform: F(x) = 1 - e^{-λx}.
        let u: Vec<f64> = sorted.iter().map(|&x| 1.0 - (-lambda * x).exp()).collect();

        let a2 = self.compute_a2_uniform(&u);

        // Small-sample correction for the estimated-rate case.
        let a2_corrected = a2 * (1.0 + 0.6 / n_f);

        Ok((a2_corrected, FittedParameters::Exponential { lambda }))
    }

    /// Tests uniformity over the sample's own [min, max] range.
    fn test_uniform(&self, values: &[f64]) -> (f64, FittedParameters) {
        let min = values.iter().cloned().fold(f64::INFINITY, f64::min);
        let max = values.iter().cloned().fold(f64::NEG_INFINITY, f64::max);

        let mut sorted: Vec<f64> = values.to_vec();
        sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));

        // `max(1e-10)` guards a constant sample (range == 0).
        let range = (max - min).max(1e-10);
        let u: Vec<f64> = sorted.iter().map(|&x| (x - min) / range).collect();

        let a2 = self.compute_a2_uniform(&u);

        (a2, FittedParameters::Uniform { min, max })
    }

    /// Computes A² for sorted standardized values `z` under CDF `cdf`:
    /// A² = -n - (1/n) Σᵢ (2i + 1) [ln F(zᵢ) + ln(1 - F(z_{n-1-i}))].
    fn compute_a2_statistic<F>(&self, z: &[f64], cdf: F) -> f64
    where
        F: Fn(f64) -> f64,
    {
        let n = z.len();
        let n_f = n as f64;

        let mut sum = 0.0;
        for (i, &zi) in z.iter().enumerate() {
            // Clamp away from 0 and 1 so both logarithms stay finite.
            let fi = cdf(zi).clamp(1e-10, 1.0 - 1e-10);
            let fn_minus_i = cdf(z[n - 1 - i]).clamp(1e-10, 1.0 - 1e-10);

            let term = (2.0 * (i as f64) + 1.0) * (fi.ln() + (1.0 - fn_minus_i).ln());
            sum += term;
        }

        -n_f - sum / n_f
    }

    /// Same statistic as [`Self::compute_a2_statistic`] for values `u`
    /// that are already probability-transformed (sorted, in [0, 1]),
    /// i.e. with the identity CDF.
    fn compute_a2_uniform(&self, u: &[f64]) -> f64 {
        let n = u.len();
        let n_f = n as f64;

        let mut sum = 0.0;
        for (i, &ui) in u.iter().enumerate() {
            // Clamp away from 0 and 1 so both logarithms stay finite.
            let ui = ui.clamp(1e-10, 1.0 - 1e-10);
            let u_n_minus_i = u[n - 1 - i].clamp(1e-10, 1.0 - 1e-10);

            let term = (2.0 * (i as f64) + 1.0) * (ui.ln() + (1.0 - u_n_minus_i).ln());
            sum += term;
        }

        -n_f - sum / n_f
    }

    /// Standard normal CDF, Φ(x) = ½(1 + erf(x / √2)), via the `erf`
    /// approximation defined at module level.
    fn normal_cdf(x: f64) -> f64 {
        0.5 * (1.0 + erf(x / std::f64::consts::SQRT_2))
    }

    /// Approximates the p-value of the corrected statistic `a2`.
    ///
    /// The normal branch appears to follow the D'Agostino–Stephens
    /// piecewise exponential fit; the exponential branch uses a coarser
    /// piecewise fit, and the fallback (log-normal/uniform targets)
    /// interpolates crudely from the normal critical-value table.
    // NOTE(review): exponential-branch coefficients look hand-tuned —
    // confirm against the intended reference.
    fn approximate_p_value(&self, a2: f64, dist: TargetDistribution) -> f64 {
        match dist {
            TargetDistribution::Normal => {
                if a2 < 0.2 {
                    1.0 - (-13.436 + 101.14 * a2 - 223.73 * a2.powi(2)).exp()
                } else if a2 < 0.34 {
                    1.0 - (-8.318 + 42.796 * a2 - 59.938 * a2.powi(2)).exp()
                } else if a2 < 0.6 {
                    (0.9177 - 4.279 * a2 - 1.38 * a2.powi(2)).exp()
                } else if a2 < 13.0 {
                    (1.2937 - 5.709 * a2 + 0.0186 * a2.powi(2)).exp()
                } else {
                    0.0
                }
            }
            TargetDistribution::Exponential => {
                if a2 < 0.26 {
                    1.0 - (-12.0 + 70.0 * a2 - 100.0 * a2.powi(2)).exp()
                } else if a2 < 0.51 {
                    1.0 - (-6.0 + 24.0 * a2 - 24.0 * a2.powi(2)).exp()
                } else if a2 < 0.95 {
                    (0.7 - 3.5 * a2 + 0.6 * a2.powi(2)).exp()
                } else if a2 < 10.0 {
                    (0.9 - 4.0 * a2 + 0.01 * a2.powi(2)).exp()
                } else {
                    0.0
                }
            }
            _ => {
                // Bracket the p-value between the tabled α levels.
                let cv = CriticalValues::normal();
                if a2 <= cv.cv_15 {
                    0.25
                } else if a2 <= cv.cv_10 {
                    0.15
                } else if a2 <= cv.cv_05 {
                    0.10
                } else if a2 <= cv.cv_025 {
                    0.05
                } else if a2 <= cv.cv_01 {
                    0.025
                } else {
                    // Beyond the table: reuse the normal-branch tail fit.
                    (1.2937 - 5.709 * a2 + 0.0186 * a2.powi(2)).exp().max(0.0)
                }
            }
        }
    }
}
437
impl Default for AndersonDarlingAnalyzer {
    /// Same configuration as [`AndersonDarlingAnalyzer::new`]:
    /// normal target, α = 0.05.
    fn default() -> Self {
        Self::new()
    }
}
443
/// Error function erf(x), approximated with the Abramowitz & Stegun
/// formula 7.1.26 (maximum absolute error ≈ 1.5e-7).
///
/// Exploits erf(-x) = -erf(x): the polynomial fit is evaluated on |x|
/// and the sign is restored at the end.
fn erf(x: f64) -> f64 {
    // Rational-fit constants from A&S 7.1.26, lowest degree first.
    const COEFFS: [f64; 5] = [
        0.254829592,
        -0.284496736,
        1.421413741,
        -1.453152027,
        1.061405429,
    ];
    const P: f64 = 0.3275911;

    let negative = x < 0.0;
    let ax = x.abs();

    let t = 1.0 / (1.0 + P * ax);
    // Horner evaluation of t·(a1 + t·(a2 + t·(a3 + t·(a4 + t·a5)))).
    let poly = COEFFS.iter().rev().fold(0.0, |acc, &c| (acc + c) * t);
    let approx = 1.0 - poly * (-ax * ax).exp();

    if negative { -approx } else { approx }
}
461
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;
    use rand::SeedableRng;
    use rand_chacha::ChaCha8Rng;
    use rand_distr::{Distribution, Exp, LogNormal, Normal, Uniform};

    /// A seeded standard-normal sample should pass the normality test.
    #[test]
    fn test_normal_sample() {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let normal = Normal::new(0.0, 1.0).unwrap();
        let values: Vec<f64> = (0..500).map(|_| normal.sample(&mut rng)).collect();

        let analyzer = AndersonDarlingAnalyzer::new()
            .with_target_distribution(TargetDistribution::Normal)
            .with_significance_level(0.05);

        let result = analyzer.analyze(&values).unwrap();
        assert!(result.passes, "Normal sample should pass normality test");
        assert!(result.p_value > 0.05);
    }

    /// A heavily skewed (exponential) sample must fail the normality test.
    #[test]
    fn test_non_normal_sample() {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let exp = Exp::new(1.0).unwrap();
        let values: Vec<f64> = (0..500).map(|_| exp.sample(&mut rng)).collect();

        let analyzer = AndersonDarlingAnalyzer::new()
            .with_target_distribution(TargetDistribution::Normal)
            .with_significance_level(0.05);

        let result = analyzer.analyze(&values).unwrap();
        assert!(
            !result.passes,
            "Exponential sample should fail normality test"
        );
    }

    /// Log-normal sample passes the log-normal test and recovers
    /// (mu, sigma) close to the generating parameters.
    #[test]
    fn test_lognormal_sample() {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let lognormal = LogNormal::new(3.0, 0.5).unwrap();
        let values: Vec<f64> = (0..500).map(|_| lognormal.sample(&mut rng)).collect();

        let analyzer = AndersonDarlingAnalyzer::new()
            .with_target_distribution(TargetDistribution::LogNormal)
            .with_significance_level(0.05);

        let result = analyzer.analyze(&values).unwrap();
        assert!(
            result.passes,
            "Log-normal sample should pass log-normality test"
        );

        if let FittedParameters::LogNormal { mu, sigma } = result.fitted_params {
            assert!((mu - 3.0).abs() < 0.5, "Mu should be close to 3.0");
            assert!((sigma - 0.5).abs() < 0.2, "Sigma should be close to 0.5");
        } else {
            panic!("Expected LogNormal parameters");
        }
    }

    /// Exponential sample passes the exponential test and recovers a
    /// lambda near the generating rate.
    #[test]
    fn test_exponential_sample() {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let exp = Exp::new(2.0).unwrap();
        let values: Vec<f64> = (0..500).map(|_| exp.sample(&mut rng)).collect();

        let analyzer = AndersonDarlingAnalyzer::new()
            .with_target_distribution(TargetDistribution::Exponential)
            .with_significance_level(0.05);

        let result = analyzer.analyze(&values).unwrap();
        assert!(
            result.passes,
            "Exponential sample should pass exponential test"
        );

        if let FittedParameters::Exponential { lambda } = result.fitted_params {
            assert!(
                (lambda - 2.0).abs() < 0.5,
                "Lambda should be close to 2.0, got {}",
                lambda
            );
        } else {
            panic!("Expected Exponential parameters");
        }
    }

    /// Uniform target: only checks that all 500 finite values are used
    /// (no pass/fail assertion, since the uniform p-value is coarse).
    #[test]
    fn test_uniform_sample() {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let uniform = Uniform::new(0.0, 10.0).unwrap();
        let values: Vec<f64> = (0..500).map(|_| uniform.sample(&mut rng)).collect();

        let analyzer = AndersonDarlingAnalyzer::new()
            .with_target_distribution(TargetDistribution::Uniform)
            .with_significance_level(0.05);

        let result = analyzer.analyze(&values).unwrap();
        assert!(result.sample_size == 500);
    }

    /// Fewer than 8 values must produce InsufficientData, not a result.
    #[test]
    fn test_insufficient_data() {
        let values = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let analyzer = AndersonDarlingAnalyzer::new();
        let result = analyzer.analyze(&values);

        assert!(matches!(
            result,
            Err(EvalError::InsufficientData {
                required: 8,
                actual: 5
            })
        ));
    }

    /// The critical-value table must be strictly increasing as α shrinks.
    #[test]
    fn test_critical_values() {
        let cv = CriticalValues::normal();
        assert!(cv.cv_15 < cv.cv_10);
        assert!(cv.cv_10 < cv.cv_05);
        assert!(cv.cv_05 < cv.cv_025);
        assert!(cv.cv_025 < cv.cv_01);
    }
}