1use rust_decimal::prelude::ToPrimitive;
18use rust_decimal::Decimal;
19use serde::{Deserialize, Serialize};
20
21use super::benford::get_first_digit;
22
23#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
25#[serde(rename_all = "snake_case")]
26pub enum TestOutcome {
27 Passed,
29 Warning,
31 Failed,
33 Skipped,
35}
36
37#[derive(Debug, Clone, Serialize, Deserialize)]
39pub struct StatisticalTestResult {
40 pub name: String,
42 pub outcome: TestOutcome,
44 pub statistic: f64,
46 pub threshold: f64,
48 pub message: String,
50}
51
52#[derive(Debug, Clone, Default, Serialize, Deserialize)]
54pub struct StatisticalValidationReport {
55 pub sample_count: usize,
57 pub results: Vec<StatisticalTestResult>,
59}
60
61impl StatisticalValidationReport {
62 pub fn all_passed(&self) -> bool {
64 self.results
65 .iter()
66 .all(|r| !matches!(r.outcome, TestOutcome::Failed))
67 }
68
69 pub fn has_warnings(&self) -> bool {
71 self.results
72 .iter()
73 .any(|r| matches!(r.outcome, TestOutcome::Warning))
74 }
75
76 pub fn failed_names(&self) -> Vec<String> {
78 self.results
79 .iter()
80 .filter(|r| matches!(r.outcome, TestOutcome::Failed))
81 .map(|r| r.name.clone())
82 .collect()
83 }
84}
85
86pub fn run_benford_first_digit(
93 amounts: &[Decimal],
94 threshold_mad: f64,
95 warning_mad: f64,
96) -> StatisticalTestResult {
97 let mut counts = [0u32; 10]; let mut total = 0u32;
99 for amount in amounts {
100 if let Some(d) = get_first_digit(*amount) {
101 counts[d as usize] += 1;
102 total += 1;
103 }
104 }
105
106 if total < 100 {
107 return StatisticalTestResult {
108 name: "benford_first_digit".to_string(),
109 outcome: TestOutcome::Skipped,
110 statistic: 0.0,
111 threshold: threshold_mad,
112 message: format!("only {total} samples with valid first digit; need ≥100"),
113 };
114 }
115
116 const EXPECTED: [f64; 10] = [
119 0.0,
120 std::f64::consts::LOG10_2, 0.17609125905568124, 0.12493873660829995,
123 0.09691001300805642,
124 0.07918124604762482,
125 0.06694678963061322,
126 0.057991946977686726,
127 0.05115252244738129,
128 0.04575749056067514,
129 ];
130
131 let total_f = total as f64;
132 let mad: f64 = (1..=9)
133 .map(|d| (counts[d] as f64 / total_f - EXPECTED[d]).abs())
134 .sum::<f64>()
135 / 9.0;
136
137 let outcome = if mad > threshold_mad {
138 TestOutcome::Failed
139 } else if mad > warning_mad {
140 TestOutcome::Warning
141 } else {
142 TestOutcome::Passed
143 };
144
145 StatisticalTestResult {
146 name: "benford_first_digit".to_string(),
147 outcome,
148 statistic: mad,
149 threshold: threshold_mad,
150 message: format!(
151 "MAD={mad:.4} over {total} first digits (threshold={threshold_mad:.4}, warn={warning_mad:.4})"
152 ),
153 }
154}
155
156pub fn run_chi_squared(
164 amounts: &[Decimal],
165 bins: usize,
166 significance: f64,
167) -> StatisticalTestResult {
168 if amounts.len() < 100 {
169 return StatisticalTestResult {
170 name: "chi_squared".to_string(),
171 outcome: TestOutcome::Skipped,
172 statistic: 0.0,
173 threshold: 0.0,
174 message: format!("only {} samples; need ≥100", amounts.len()),
175 };
176 }
177
178 let bins = bins.max(2);
179 let positives: Vec<f64> = amounts
180 .iter()
181 .filter_map(|a| a.to_f64())
182 .filter(|v| *v > 0.0)
183 .collect();
184 if positives.len() < 100 {
185 return StatisticalTestResult {
186 name: "chi_squared".to_string(),
187 outcome: TestOutcome::Skipped,
188 statistic: 0.0,
189 threshold: 0.0,
190 message: format!("only {} positive samples; need ≥100", positives.len()),
191 };
192 }
193
194 let logs: Vec<f64> = positives.iter().map(|v| v.ln()).collect();
195 let min = logs.iter().cloned().fold(f64::INFINITY, f64::min);
196 let max = logs.iter().cloned().fold(f64::NEG_INFINITY, f64::max);
197 if !min.is_finite() || !max.is_finite() || max <= min {
198 return StatisticalTestResult {
199 name: "chi_squared".to_string(),
200 outcome: TestOutcome::Skipped,
201 statistic: 0.0,
202 threshold: 0.0,
203 message: "degenerate log-range".to_string(),
204 };
205 }
206
207 let bin_width = (max - min) / bins as f64;
208 let mut observed = vec![0u32; bins];
209 for v in &logs {
210 let idx = (((v - min) / bin_width) as usize).min(bins - 1);
211 observed[idx] += 1;
212 }
213
214 let n = logs.len() as f64;
215 let expected_per_bin = n / bins as f64;
216 let chi_sq: f64 = observed
217 .iter()
218 .map(|o| {
219 let diff = *o as f64 - expected_per_bin;
220 diff * diff / expected_per_bin
221 })
222 .sum();
223
224 let df = bins - 1;
229 let critical = chi_sq_critical(df, significance);
230
231 let outcome = if chi_sq > critical {
232 TestOutcome::Failed
233 } else {
234 TestOutcome::Passed
235 };
236
237 StatisticalTestResult {
238 name: "chi_squared".to_string(),
239 outcome,
240 statistic: chi_sq,
241 threshold: critical,
242 message: format!(
243 "χ²={chi_sq:.2} over {bins} log-bins ({n} samples), critical={critical:.2} at α={significance}"
244 ),
245 }
246}
247
248pub fn run_ks_uniform_log(amounts: &[Decimal], significance: f64) -> StatisticalTestResult {
256 let positives: Vec<f64> = amounts
257 .iter()
258 .filter_map(|a| a.to_f64())
259 .filter(|v| *v > 0.0)
260 .collect();
261 if positives.len() < 100 {
262 return StatisticalTestResult {
263 name: "ks_uniform_log".to_string(),
264 outcome: TestOutcome::Skipped,
265 statistic: 0.0,
266 threshold: 0.0,
267 message: format!("only {} positive samples; need ≥100", positives.len()),
268 };
269 }
270
271 let mut logs: Vec<f64> = positives.iter().map(|v| v.ln()).collect();
272 logs.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
273 let min = logs[0];
274 let max = logs[logs.len() - 1];
275 if max <= min {
276 return StatisticalTestResult {
277 name: "ks_uniform_log".to_string(),
278 outcome: TestOutcome::Skipped,
279 statistic: 0.0,
280 threshold: 0.0,
281 message: "degenerate log-range".to_string(),
282 };
283 }
284
285 let n = logs.len() as f64;
286 let mut max_diff: f64 = 0.0;
287 for (i, v) in logs.iter().enumerate() {
288 let empirical = (i as f64 + 1.0) / n;
289 let uniform = (v - min) / (max - min);
290 let diff = (empirical - uniform).abs();
291 if diff > max_diff {
292 max_diff = diff;
293 }
294 }
295
296 let c = if significance <= 0.011 {
300 1.628
301 } else if significance <= 0.051 {
302 1.358
303 } else {
304 1.224
305 };
306 let critical = c / n.sqrt();
307
308 let outcome = if max_diff > critical {
309 TestOutcome::Failed
310 } else {
311 TestOutcome::Passed
312 };
313
314 StatisticalTestResult {
315 name: "ks_uniform_log".to_string(),
316 outcome,
317 statistic: max_diff,
318 threshold: critical,
319 message: format!(
320 "D={max_diff:.4} over {n} samples, critical={critical:.4} at α={significance}"
321 ),
322 }
323}
324
325pub fn spearman_rank_correlation(xs: &[f64], ys: &[f64]) -> f64 {
330 let n = xs.len().min(ys.len());
331 if n < 2 {
332 return 0.0;
333 }
334 let rank = |v: &[f64]| -> Vec<f64> {
335 let mut idx: Vec<usize> = (0..v.len()).collect();
336 idx.sort_by(|&a, &b| v[a].partial_cmp(&v[b]).unwrap_or(std::cmp::Ordering::Equal));
337 let mut ranks = vec![0.0; v.len()];
338 let mut i = 0;
340 while i < idx.len() {
341 let mut j = i;
342 while j + 1 < idx.len() && v[idx[j + 1]] == v[idx[i]] {
343 j += 1;
344 }
345 let avg = (i + j) as f64 / 2.0 + 1.0;
347 for k in i..=j {
348 ranks[idx[k]] = avg;
349 }
350 i = j + 1;
351 }
352 ranks
353 };
354 let rx = rank(&xs[..n]);
355 let ry = rank(&ys[..n]);
356 let mean_x = rx.iter().sum::<f64>() / n as f64;
357 let mean_y = ry.iter().sum::<f64>() / n as f64;
358 let mut num = 0.0;
359 let mut den_x = 0.0;
360 let mut den_y = 0.0;
361 for i in 0..n {
362 let dx = rx[i] - mean_x;
363 let dy = ry[i] - mean_y;
364 num += dx * dy;
365 den_x += dx * dx;
366 den_y += dy * dy;
367 }
368 let denom = (den_x * den_y).sqrt();
369 if denom == 0.0 {
370 0.0
371 } else {
372 num / denom
373 }
374}
375
376pub fn run_correlation_check(
381 name: &str,
382 xs: &[f64],
383 ys: &[f64],
384 expected: f64,
385 tolerance: f64,
386) -> StatisticalTestResult {
387 if xs.len().min(ys.len()) < 100 {
388 return StatisticalTestResult {
389 name: format!("correlation_check_{name}"),
390 outcome: TestOutcome::Skipped,
391 statistic: 0.0,
392 threshold: tolerance,
393 message: format!("only {} paired samples; need ≥100", xs.len().min(ys.len())),
394 };
395 }
396 let rho = spearman_rank_correlation(xs, ys);
397 let diff = (rho - expected).abs();
398 let outcome = if diff > tolerance {
399 TestOutcome::Failed
400 } else {
401 TestOutcome::Passed
402 };
403 StatisticalTestResult {
404 name: format!("correlation_check_{name}"),
405 outcome,
406 statistic: rho,
407 threshold: tolerance,
408 message: format!(
409 "Spearman ρ={rho:.4} (expected {expected:.4} ±{tolerance:.4}; diff {diff:.4})"
410 ),
411 }
412}
413
414pub fn run_anderson_darling(amounts: &[Decimal], significance: f64) -> StatisticalTestResult {
420 let positives: Vec<f64> = amounts
421 .iter()
422 .filter_map(|a| a.to_f64())
423 .filter(|v| *v > 0.0)
424 .collect();
425 if positives.len() < 100 {
426 return StatisticalTestResult {
427 name: "anderson_darling".to_string(),
428 outcome: TestOutcome::Skipped,
429 statistic: 0.0,
430 threshold: 0.0,
431 message: format!("only {} positive samples; need ≥100", positives.len()),
432 };
433 }
434 let mut logs: Vec<f64> = positives.iter().map(|v| v.ln()).collect();
435 let n = logs.len() as f64;
436 let mean = logs.iter().sum::<f64>() / n;
437 let var = logs.iter().map(|v| (v - mean).powi(2)).sum::<f64>() / (n - 1.0);
438 let sd = var.sqrt();
439 if sd == 0.0 {
440 return StatisticalTestResult {
441 name: "anderson_darling".to_string(),
442 outcome: TestOutcome::Skipped,
443 statistic: 0.0,
444 threshold: 0.0,
445 message: "zero log-variance (degenerate input)".to_string(),
446 };
447 }
448 logs.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
449 let standard_normal_cdf = |x: f64| -> f64 { 0.5 * (1.0 + erf(x / std::f64::consts::SQRT_2)) };
451 let mut s = 0.0;
452 for (i, v) in logs.iter().enumerate() {
453 let z = (v - mean) / sd;
454 let p = standard_normal_cdf(z).clamp(1e-12, 1.0 - 1e-12);
455 let q = 1.0
456 - standard_normal_cdf((logs[logs.len() - 1 - i] - mean) / sd).clamp(1e-12, 1.0 - 1e-12);
457 s += (2.0 * (i + 1) as f64 - 1.0) * (p.ln() + q.ln());
458 }
459 let a_sq = -n - s / n;
460 let a_sq_star = a_sq * (1.0 + 0.75 / n + 2.25 / n.powi(2));
462 let critical = if significance <= 0.011 {
464 1.035
465 } else if significance <= 0.026 {
466 0.873
467 } else if significance <= 0.051 {
468 0.752
469 } else if significance <= 0.101 {
470 0.631
471 } else {
472 0.500
473 };
474 let outcome = if a_sq_star > critical {
475 TestOutcome::Failed
476 } else {
477 TestOutcome::Passed
478 };
479 StatisticalTestResult {
480 name: "anderson_darling".to_string(),
481 outcome,
482 statistic: a_sq_star,
483 threshold: critical,
484 message: format!(
485 "A*²={a_sq_star:.4} vs log-normal, critical={critical:.4} at α={significance} (n={n})"
486 ),
487 }
488}
489
490fn erf(x: f64) -> f64 {
493 let sign = if x < 0.0 { -1.0 } else { 1.0 };
494 let x = x.abs();
495 let t = 1.0 / (1.0 + 0.3275911 * x);
496 let y = 1.0
497 - (((((1.061405429 * t - 1.453152027) * t) + 1.421413741) * t - 0.284496736) * t
498 + 0.254829592)
499 * t
500 * (-x * x).exp();
501 sign * y
502}
503
504fn chi_sq_critical(df: usize, alpha: f64) -> f64 {
508 let table: &[(usize, f64, f64, f64)] = &[
510 (1, 2.706, 3.841, 6.635),
511 (2, 4.605, 5.991, 9.210),
512 (3, 6.251, 7.815, 11.345),
513 (4, 7.779, 9.488, 13.277),
514 (5, 9.236, 11.070, 15.086),
515 (6, 10.645, 12.592, 16.812),
516 (7, 12.017, 14.067, 18.475),
517 (8, 13.362, 15.507, 20.090),
518 (9, 14.684, 16.919, 21.666),
519 (10, 15.987, 18.307, 23.209),
520 (14, 21.064, 23.685, 29.141),
521 (19, 27.204, 30.144, 36.191),
522 (24, 33.196, 36.415, 42.980),
523 (29, 39.087, 42.557, 49.588),
524 ];
525
526 let row = table
527 .iter()
528 .min_by_key(|(d, _, _, _)| (*d as i64 - df as i64).unsigned_abs());
529 if let Some(&(_, c_10, c_05, c_01)) = row {
530 if alpha <= 0.011 {
531 c_01
532 } else if alpha <= 0.051 {
533 c_05
534 } else {
535 c_10
536 }
537 } else {
538 1_000_000.0
540 }
541}
542
543#[cfg(test)]
544mod tests {
545 use super::*;
546 use rand::SeedableRng;
547 use rand_chacha::ChaCha8Rng;
548 use rand_distr::{Distribution, LogNormal};
549
550 fn lognormal_samples(n: usize, mu: f64, sigma: f64, seed: u64) -> Vec<Decimal> {
551 let mut rng = ChaCha8Rng::seed_from_u64(seed);
552 let ln = LogNormal::new(mu, sigma).unwrap();
553 (0..n)
554 .map(|_| Decimal::from_f64_retain(ln.sample(&mut rng)).unwrap_or(Decimal::ONE))
555 .collect()
556 }
557
558 #[test]
559 fn benford_passes_for_lognormal() {
560 let samples = lognormal_samples(2000, 7.0, 2.0, 42);
561 let r = run_benford_first_digit(&samples, 0.015, 0.010);
562 assert!(
563 !matches!(r.outcome, TestOutcome::Failed),
564 "expected pass/warning, got {:?}: {}",
565 r.outcome,
566 r.message
567 );
568 }
569
570 #[test]
571 fn benford_fails_for_concentrated_single_digit() {
572 let samples: Vec<Decimal> = (0..500).map(|i| Decimal::from(5000 + i)).collect();
574 let r = run_benford_first_digit(&samples, 0.015, 0.010);
575 assert!(matches!(r.outcome, TestOutcome::Failed));
576 }
577
578 #[test]
579 fn benford_skipped_below_100_samples() {
580 let samples: Vec<Decimal> = (0..50).map(Decimal::from).collect();
581 let r = run_benford_first_digit(&samples, 0.015, 0.010);
582 assert!(matches!(r.outcome, TestOutcome::Skipped));
583 }
584
585 #[test]
586 fn chi_squared_passes_for_log_uniform() {
587 let samples: Vec<Decimal> = (0..1000)
591 .map(|i| {
592 let log_val = (i as f64 / 1000.0) * 10.0;
594 let v = log_val.exp();
595 Decimal::from_f64_retain(v).unwrap_or(Decimal::ONE)
596 })
597 .collect();
598 let r = run_chi_squared(&samples, 10, 0.05);
599 assert!(
600 !matches!(r.outcome, TestOutcome::Failed),
601 "expected pass, got {:?}: {}",
602 r.outcome,
603 r.message
604 );
605 }
606
607 #[test]
608 fn chi_squared_fails_for_bimodal_concentration() {
609 let mut samples: Vec<Decimal> = (0..450).map(|_| Decimal::from(1000)).collect();
612 samples.extend((0..50).map(|_| Decimal::from(1_000_000)));
613 let r = run_chi_squared(&samples, 10, 0.05);
614 assert!(
615 matches!(r.outcome, TestOutcome::Failed),
616 "expected Failed for bimodal, got {:?}: {}",
617 r.outcome,
618 r.message
619 );
620 }
621
622 #[test]
623 fn spearman_rho_perfect_positive() {
624 let xs: Vec<f64> = (1..=100).map(|i| i as f64).collect();
625 let ys: Vec<f64> = (1..=100).map(|i| i as f64).collect();
626 let rho = spearman_rank_correlation(&xs, &ys);
627 assert!((rho - 1.0).abs() < 1e-6, "expected ρ=1.0, got {rho}");
628 }
629
630 #[test]
631 fn spearman_rho_perfect_negative() {
632 let xs: Vec<f64> = (1..=100).map(|i| i as f64).collect();
633 let ys: Vec<f64> = (1..=100).rev().map(|i| i as f64).collect();
634 let rho = spearman_rank_correlation(&xs, &ys);
635 assert!((rho + 1.0).abs() < 1e-6, "expected ρ=-1.0, got {rho}");
636 }
637
638 #[test]
639 fn correlation_check_passes_when_within_tolerance() {
640 let xs: Vec<f64> = (1..=200).map(|i| i as f64).collect();
641 let ys: Vec<f64> = xs.iter().map(|v| v + 0.5).collect();
642 let r = run_correlation_check("test", &xs, &ys, 1.0, 0.05);
643 assert!(matches!(r.outcome, TestOutcome::Passed));
644 }
645
646 #[test]
647 fn correlation_check_fails_when_off_target() {
648 let xs: Vec<f64> = (1..=200).map(|i| i as f64).collect();
649 let ys: Vec<f64> = xs.iter().rev().copied().collect();
650 let r = run_correlation_check("test", &xs, &ys, 1.0, 0.05);
651 assert!(matches!(r.outcome, TestOutcome::Failed));
652 }
653
654 #[test]
655 fn anderson_darling_passes_for_lognormal() {
656 let samples = lognormal_samples(2000, 7.0, 1.5, 42);
657 let r = run_anderson_darling(&samples, 0.05);
658 assert!(
659 !matches!(r.outcome, TestOutcome::Failed),
660 "expected pass/warning for log-normal, got {:?}: {}",
661 r.outcome,
662 r.message
663 );
664 }
665
666 #[test]
667 fn anderson_darling_fails_for_uniform() {
668 let samples: Vec<Decimal> = (0..2000)
670 .map(|i| Decimal::from(1000 + (i % 500) * 20))
671 .collect();
672 let r = run_anderson_darling(&samples, 0.05);
673 assert!(
674 matches!(r.outcome, TestOutcome::Failed),
675 "expected fail for uniform-like data, got {:?}: {}",
676 r.outcome,
677 r.message
678 );
679 }
680
681 #[test]
682 fn report_all_passed_tracks_failures() {
683 let rep = StatisticalValidationReport {
684 sample_count: 100,
685 results: vec![
686 StatisticalTestResult {
687 name: "a".into(),
688 outcome: TestOutcome::Passed,
689 statistic: 0.0,
690 threshold: 1.0,
691 message: "".into(),
692 },
693 StatisticalTestResult {
694 name: "b".into(),
695 outcome: TestOutcome::Warning,
696 statistic: 0.0,
697 threshold: 1.0,
698 message: "".into(),
699 },
700 ],
701 };
702 assert!(rep.all_passed()); assert!(rep.has_warnings());
704
705 let rep_failed = StatisticalValidationReport {
706 sample_count: 100,
707 results: vec![StatisticalTestResult {
708 name: "c".into(),
709 outcome: TestOutcome::Failed,
710 statistic: 2.0,
711 threshold: 1.0,
712 message: "".into(),
713 }],
714 };
715 assert!(!rep_failed.all_passed());
716 assert_eq!(rep_failed.failed_names(), vec!["c".to_string()]);
717 }
718}