1use crate::{
2 common::risk::RiskLevel,
3 error::{BenfError, Result},
4};
5
/// Full set of normality diagnostics computed for one dataset.
#[derive(Debug, Clone)]
pub struct NormalResult {
    pub dataset_name: String,
    pub numbers_analyzed: usize,
    pub risk_level: RiskLevel,

    // Basic sample moments.
    pub mean: f64,
    pub std_dev: f64,
    pub variance: f64,
    pub skewness: f64,
    pub kurtosis: f64,

    // Normality test statistics and their (coarse, bucketed) p-values.
    pub shapiro_wilk_statistic: f64,
    pub shapiro_wilk_p_value: f64,
    pub anderson_darling_statistic: f64,
    pub anderson_darling_p_value: f64,
    pub kolmogorov_smirnov_statistic: f64,
    pub kolmogorov_smirnov_p_value: f64,

    // Aggregate scores, clamped to [0, 1].
    pub normality_score: f64,
    pub qq_correlation: f64,
    pub distribution_quality: f64,

    // Outliers as (index, value, score) / (index, value) tuples.
    pub outliers_z_score: Vec<(usize, f64, f64)>,
    pub outliers_modified_z: Vec<(usize, f64, f64)>,
    pub outliers_iqr: Vec<(usize, f64)>,

    // Interval estimates (lower, upper).
    pub mean_confidence_interval: (f64, f64),
    pub prediction_interval_95: (f64, f64),
    pub three_sigma_limits: (f64, f64),

    // Empirical coverage of the 1/2/3-sigma bands, in percent.
    pub within_1_sigma_percent: f64,
    pub within_2_sigma_percent: f64,
    pub within_3_sigma_percent: f64,
}
48
49impl NormalResult {
50 pub fn new(dataset_name: String, numbers: &[f64]) -> Result<Self> {
51 if numbers.len() < 8 {
52 return Err(BenfError::InsufficientData(numbers.len()));
53 }
54
55 let numbers_analyzed = numbers.len();
56
57 let mean = calculate_mean(numbers);
59 let variance = calculate_variance(numbers, mean);
60 let std_dev = variance.sqrt();
61 let skewness = calculate_skewness(numbers, mean, std_dev);
62 let kurtosis = calculate_kurtosis(numbers, mean, std_dev);
63
64 let shapiro_result = shapiro_wilk_test(numbers);
66 let anderson_result = anderson_darling_test(numbers, mean, std_dev);
67 let ks_result = kolmogorov_smirnov_test(numbers, mean, std_dev);
68
69 let qq_correlation = calculate_qq_correlation(numbers, mean, std_dev);
71 let normality_score = calculate_normality_score(
72 shapiro_result.1,
73 anderson_result.1,
74 ks_result.1,
75 qq_correlation,
76 );
77 let distribution_quality =
78 calculate_distribution_quality(skewness, kurtosis, normality_score);
79
80 let outliers_z_score = detect_outliers_z_score(numbers, mean, std_dev);
82 let outliers_modified_z = detect_outliers_modified_z_score(numbers);
83 let outliers_iqr = detect_outliers_iqr(numbers);
84
85 let mean_confidence_interval = calculate_mean_confidence_interval(numbers, mean, std_dev);
87 let prediction_interval_95 = (mean - 1.96 * std_dev, mean + 1.96 * std_dev);
88 let three_sigma_limits = (mean - 3.0 * std_dev, mean + 3.0 * std_dev);
89
90 let within_1_sigma_percent = calculate_within_sigma_percent(numbers, mean, std_dev, 1.0);
92 let within_2_sigma_percent = calculate_within_sigma_percent(numbers, mean, std_dev, 2.0);
93 let within_3_sigma_percent = calculate_within_sigma_percent(numbers, mean, std_dev, 3.0);
94
95 let risk_level =
97 determine_risk_level(normality_score, &outliers_z_score, skewness, kurtosis);
98
99 Ok(NormalResult {
100 dataset_name,
101 numbers_analyzed,
102 risk_level,
103 mean,
104 std_dev,
105 variance,
106 skewness,
107 kurtosis,
108 shapiro_wilk_statistic: shapiro_result.0,
109 shapiro_wilk_p_value: shapiro_result.1,
110 anderson_darling_statistic: anderson_result.0,
111 anderson_darling_p_value: anderson_result.1,
112 kolmogorov_smirnov_statistic: ks_result.0,
113 kolmogorov_smirnov_p_value: ks_result.1,
114 normality_score,
115 qq_correlation,
116 distribution_quality,
117 outliers_z_score,
118 outliers_modified_z,
119 outliers_iqr,
120 mean_confidence_interval,
121 prediction_interval_95,
122 three_sigma_limits,
123 within_1_sigma_percent,
124 within_2_sigma_percent,
125 within_3_sigma_percent,
126 })
127 }
128}
129
/// Arithmetic mean of the slice (NaN for an empty slice; callers pass >= 8 values).
fn calculate_mean(numbers: &[f64]) -> f64 {
    let total: f64 = numbers.iter().copied().sum();
    total / numbers.len() as f64
}
134
/// Sample variance about `mean`, with Bessel's correction (divides by n - 1).
/// Assumes at least two values; callers guarantee >= 8.
fn calculate_variance(numbers: &[f64], mean: f64) -> f64 {
    numbers
        .iter()
        .map(|&x| {
            let deviation = x - mean;
            deviation * deviation
        })
        .sum::<f64>()
        / (numbers.len() - 1) as f64
}
140
/// Sample skewness: the mean of the standardized deviations cubed.
/// Zero for perfectly symmetric data.
fn calculate_skewness(numbers: &[f64], mean: f64, std_dev: f64) -> f64 {
    let total = numbers.iter().fold(0.0, |acc, &x| {
        let z = (x - mean) / std_dev;
        acc + z * z * z
    });
    total / numbers.len() as f64
}
151
/// Excess kurtosis: the mean of the standardized deviations to the fourth
/// power, minus 3 (so a perfect normal distribution scores 0).
fn calculate_kurtosis(numbers: &[f64], mean: f64, std_dev: f64) -> f64 {
    let total = numbers.iter().fold(0.0, |acc, &x| {
        let z = (x - mean) / std_dev;
        acc + z.powi(4)
    });
    total / numbers.len() as f64 - 3.0
}
162
163fn shapiro_wilk_test(numbers: &[f64]) -> (f64, f64) {
165 let n = numbers.len();
166 if !(8..=5000).contains(&n) {
167 return (0.0, 1.0); }
169
170 let mut sorted = numbers.to_vec();
171 sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
172
173 let qq_corr = calculate_qq_correlation(
175 &sorted,
176 calculate_mean(&sorted),
177 calculate_variance(&sorted, calculate_mean(&sorted)).sqrt(),
178 );
179 let w_statistic = qq_corr * qq_corr;
180
181 let p_value = if w_statistic > 0.95 {
183 0.5
184 } else if w_statistic > 0.90 {
185 0.1
186 } else if w_statistic > 0.80 {
187 0.01
188 } else {
189 0.001
190 };
191
192 (w_statistic, p_value)
193}
194
195fn anderson_darling_test(numbers: &[f64], mean: f64, std_dev: f64) -> (f64, f64) {
197 let mut sorted = numbers.to_vec();
198 sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
199
200 let n = sorted.len() as f64;
201 let mut a_squared = 0.0;
202
203 for (i, &x) in sorted.iter().enumerate() {
204 let z = (x - mean) / std_dev;
205 let phi = standard_normal_cdf(z);
206 let phi_complement = 1.0 - standard_normal_cdf(-z);
207
208 if phi > 0.0 && phi < 1.0 && phi_complement > 0.0 {
209 a_squared += (2.0 * (i + 1) as f64 - 1.0) * (phi.ln() + phi_complement.ln()) / n;
210 }
211 }
212
213 a_squared = -n - a_squared;
214
215 let p_value = if a_squared < 0.5 {
217 0.5
218 } else if a_squared < 1.0 {
219 0.1
220 } else if a_squared < 2.0 {
221 0.01
222 } else {
223 0.001
224 };
225
226 (a_squared, p_value)
227}
228
229fn kolmogorov_smirnov_test(numbers: &[f64], mean: f64, std_dev: f64) -> (f64, f64) {
231 let mut sorted = numbers.to_vec();
232 sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
233
234 let n = sorted.len() as f64;
235 let mut max_diff: f64 = 0.0;
236
237 for (i, &x) in sorted.iter().enumerate() {
238 let z = (x - mean) / std_dev;
239 let expected_cdf = standard_normal_cdf(z);
240 let empirical_cdf = (i + 1) as f64 / n;
241
242 let diff = (expected_cdf - empirical_cdf).abs();
243 max_diff = max_diff.max(diff);
244 }
245
246 let ks_critical = 1.36 / n.sqrt(); let p_value = if max_diff < ks_critical * 0.5 {
249 0.5
250 } else if max_diff < ks_critical {
251 0.1
252 } else if max_diff < ks_critical * 1.5 {
253 0.01
254 } else {
255 0.001
256 };
257
258 (max_diff, p_value)
259}
260
/// Standard normal CDF via the Abramowitz & Stegun 26.2.17 polynomial
/// approximation (|error| < 7.5e-8). Saturates to exactly 0/1 beyond |z| = 6.
fn standard_normal_cdf(z: f64) -> f64 {
    if z > 6.0 {
        return 1.0;
    }
    if z < -6.0 {
        return 0.0;
    }

    // Evaluate the upper-tail probability for |z|, then mirror by sign.
    let t = 1.0 / (1.0 + 0.2316419 * z.abs());
    let pdf = 0.3989423 * (-z * z / 2.0).exp();
    let poly = 0.3193815 + t * (-0.3565638 + t * (1.7814779 + t * (-1.8212560 + t * 1.3302744)));
    let tail = pdf * t * poly;

    if z < 0.0 {
        tail
    } else {
        1.0 - tail
    }
}
282
283fn calculate_qq_correlation(numbers: &[f64], mean: f64, std_dev: f64) -> f64 {
285 let mut sorted = numbers.to_vec();
286 sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
287
288 let n = sorted.len();
289 let mut theoretical_quantiles = Vec::new();
290
291 for i in 0..n {
292 let p = (i + 1) as f64 / (n + 1) as f64;
293 let z = inverse_standard_normal(p);
294 theoretical_quantiles.push(mean + std_dev * z);
295 }
296
297 pearson_correlation(&sorted, &theoretical_quantiles)
298}
299
/// Inverse standard normal CDF (probit), clamped to [-6, 6].
///
/// Uses the Abramowitz & Stegun 26.2.23 rational approximation for the
/// lower tail, reflected for the upper half; |error| < 4.5e-4 over (0, 1).
///
/// Fix over the original: the tail branch used `sqrt(-ln r)` where the
/// formula requires `sqrt(-2 ln r)` — e.g. p = 0.975 returned ≈1.00
/// instead of ≈1.96 — and the central branch was also substantially
/// inaccurate (≈0.57 instead of 0.674 at p = 0.75). The single corrected
/// formula now covers the whole range.
fn inverse_standard_normal(p: f64) -> f64 {
    if p <= 0.0 {
        return -6.0;
    }
    if p >= 1.0 {
        return 6.0;
    }

    // Work in the lower tail; mirror the result for p >= 0.5.
    let (tail_p, sign) = if p < 0.5 { (p, -1.0) } else { (1.0 - p, 1.0) };
    let t = (-2.0 * tail_p.ln()).sqrt();
    let numerator = 2.515517 + t * (0.802853 + t * 0.010328);
    let denominator = 1.0 + t * (1.432788 + t * (0.189269 + t * 0.001308));

    (sign * (t - numerator / denominator)).clamp(-6.0, 6.0)
}
330
/// Pearson product-moment correlation of two equal-length slices.
/// Returns 0.0 on length mismatch or when either series has zero variance.
fn pearson_correlation(x: &[f64], y: &[f64]) -> f64 {
    if x.len() != y.len() {
        return 0.0;
    }

    let n = x.len() as f64;
    let mean_x = x.iter().sum::<f64>() / n;
    let mean_y = y.iter().sum::<f64>() / n;

    let (mut covariance, mut var_x, mut var_y) = (0.0, 0.0, 0.0);
    for (&a, &b) in x.iter().zip(y) {
        let dx = a - mean_x;
        let dy = b - mean_y;
        covariance += dx * dy;
        var_x += dx * dx;
        var_y += dy * dy;
    }

    let scale = (var_x * var_y).sqrt();
    if scale == 0.0 {
        0.0
    } else {
        covariance / scale
    }
}
360
/// Blends the three test p-values (Shapiro–Wilk weighted highest) with the
/// Q–Q correlation into a single score clamped to [0, 1].
fn calculate_normality_score(sw_p: f64, ad_p: f64, ks_p: f64, qq_corr: f64) -> f64 {
    let weighted_p = f64::min(sw_p * 0.4 + ad_p * 0.3 + ks_p * 0.3, 1.0);
    (weighted_p * 0.6 + qq_corr.abs() * 0.4).clamp(0.0, 1.0)
}
369
/// Overall distribution-shape quality in [0, 1]: half from the normality
/// score, a quarter each from how small |skewness| and |kurtosis| are
/// (saturating at 2 and 3 respectively).
fn calculate_distribution_quality(skewness: f64, kurtosis: f64, normality_score: f64) -> f64 {
    let shape_score = |value: f64, scale: f64| 1.0 - (value.abs() / scale).min(1.0);
    let skew_component = shape_score(skewness, 2.0);
    let kurt_component = shape_score(kurtosis, 3.0);

    (normality_score * 0.5 + skew_component * 0.25 + kurt_component * 0.25).clamp(0.0, 1.0)
}
378
/// Flags values whose |z-score| exceeds 2.5 as (index, value, z-score).
/// Degenerate (inf/NaN z) when std_dev is zero, as in the original.
fn detect_outliers_z_score(numbers: &[f64], mean: f64, std_dev: f64) -> Vec<(usize, f64, f64)> {
    const THRESHOLD: f64 = 2.5;

    numbers
        .iter()
        .enumerate()
        .filter_map(|(index, &value)| {
            let z_score = (value - mean) / std_dev;
            (z_score.abs() > THRESHOLD).then_some((index, value, z_score))
        })
        .collect()
}
393
394fn detect_outliers_modified_z_score(numbers: &[f64]) -> Vec<(usize, f64, f64)> {
396 let median = calculate_median(numbers);
397 let mad = calculate_mad(numbers, median);
398 let mut outliers = Vec::new();
399 let threshold = 3.5; for (i, &value) in numbers.iter().enumerate() {
402 let modified_z = 0.6745 * (value - median) / mad;
403 if modified_z.abs() > threshold {
404 outliers.push((i, value, modified_z));
405 }
406 }
407
408 outliers
409}
410
/// Tukey's fences: flags values outside [Q1 - 1.5·IQR, Q3 + 1.5·IQR] as
/// (index, value). Quartiles are taken by simple index (sorted[n/4],
/// sorted[3n/4]), matching the original's coarse estimate.
fn detect_outliers_iqr(numbers: &[f64]) -> Vec<(usize, f64)> {
    let mut sorted = numbers.to_vec();
    sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());

    let len = sorted.len();
    let (q1, q3) = (sorted[len / 4], sorted[3 * len / 4]);
    let iqr = q3 - q1;
    let (lower_bound, upper_bound) = (q1 - 1.5 * iqr, q3 + 1.5 * iqr);

    numbers
        .iter()
        .enumerate()
        .filter(|&(_, &value)| value < lower_bound || value > upper_bound)
        .map(|(index, &value)| (index, value))
        .collect()
}
432
/// Median of the slice: the middle element for odd lengths, the average of
/// the two middle elements for even lengths.
fn calculate_median(numbers: &[f64]) -> f64 {
    let mut sorted = numbers.to_vec();
    sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());

    let mid = sorted.len() / 2;
    if sorted.len() % 2 == 1 {
        sorted[mid]
    } else {
        (sorted[mid - 1] + sorted[mid]) / 2.0
    }
}
445
446fn calculate_mad(numbers: &[f64], median: f64) -> f64 {
448 let deviations: Vec<f64> = numbers.iter().map(|&x| (x - median).abs()).collect();
449 calculate_median(&deviations)
450}
451
/// Approximate 95% confidence interval for the mean using the normal
/// critical value 1.96 (a t-quantile would be slightly wider for small n).
fn calculate_mean_confidence_interval(numbers: &[f64], mean: f64, std_dev: f64) -> (f64, f64) {
    const Z_95: f64 = 1.96;
    let standard_error = std_dev / (numbers.len() as f64).sqrt();
    let half_width = Z_95 * standard_error;
    (mean - half_width, mean + half_width)
}
460
/// Percentage of values falling inside [mean - sigma·std_dev, mean + sigma·std_dev]
/// (bounds inclusive).
fn calculate_within_sigma_percent(numbers: &[f64], mean: f64, std_dev: f64, sigma: f64) -> f64 {
    let band = sigma * std_dev;
    let (lower, upper) = (mean - band, mean + band);

    let inside = numbers
        .iter()
        .filter(|&&x| (lower..=upper).contains(&x))
        .count();

    (inside as f64 / numbers.len() as f64) * 100.0
}
473
474fn determine_risk_level(
476 normality_score: f64,
477 outliers: &[(usize, f64, f64)],
478 skewness: f64,
479 kurtosis: f64,
480) -> RiskLevel {
481 let outlier_ratio = outliers.len() as f64 / 100.0; if normality_score > 0.8 && outlier_ratio < 0.05 && skewness.abs() < 0.5 && kurtosis.abs() < 0.5
484 {
485 RiskLevel::Low
486 } else if normality_score > 0.6
487 && outlier_ratio < 0.1
488 && skewness.abs() < 1.0
489 && kurtosis.abs() < 1.0
490 {
491 RiskLevel::Medium
492 } else if normality_score > 0.3 && outlier_ratio < 0.2 {
493 RiskLevel::High
494 } else {
495 RiskLevel::Critical
496 }
497}
498
#[cfg(test)]
mod tests {
    use super::*;

    // Roughly symmetric values near zero should analyze cleanly and report
    // a near-zero mean.
    #[test]
    fn test_normal_distribution() {
        let numbers = vec![
            0.0, 0.5, -0.3, 1.2, -0.8, 0.2, -0.1, 0.9, -0.6, 0.7, -0.4, 0.3, 1.1, -0.9, 0.6, -0.2,
            0.8, -0.5, 0.1, 0.4,
        ];

        let result = NormalResult::new("test".to_string(), &numbers).unwrap();

        assert_eq!(result.numbers_analyzed, 20);
        assert!((result.mean - 0.0).abs() < 0.3);
        assert!(result.std_dev > 0.0);
        assert!(result.normality_score > 0.0);
    }

    // Fewer than 8 values must be rejected with an error.
    #[test]
    fn test_insufficient_data() {
        let numbers = vec![1.0, 2.0, 3.0];
        let result = NormalResult::new("test".to_string(), &numbers);

        assert!(result.is_err());
    }

    // A single extreme value (100 among 1..=8) must be flagged by the
    // z-score detector.
    #[test]
    fn test_outlier_detection() {
        let numbers = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 100.0];
        let result = NormalResult::new("outlier_test".to_string(), &numbers).unwrap();

        assert!(!result.outliers_z_score.is_empty());
    }
}