temporal_neural_solver/benchmarks/
statistical_validation.rs1use std::time::Duration;
7use std::collections::HashMap;
8use serde::{Serialize, Deserialize};
9
/// Result of a single statistical test.
///
/// NOTE: for the assumption tests produced in this module (normality,
/// homogeneity of variance), `is_significant == true` means "the assumption
/// holds"; for the performance tests (Welch / Mann-Whitney) it means "a
/// difference was detected". See the individual test constructors.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StatisticalTest {
    /// Human-readable test identifier, e.g. "Welch's t-test (impl)".
    pub test_name: String,
    /// Approximate p-value (coarse threshold lookup, not an exact CDF).
    pub p_value: f64,
    /// Test-dependent effect measure (t/z statistic, W, F-ratio, or Cohen's d).
    pub effect_size: f64,
    /// (lower, upper) bounds of the associated confidence interval.
    pub confidence_interval: (f64, f64),
    /// See the type-level note: meaning depends on the kind of test.
    pub is_significant: bool,
    /// Estimated statistical power of the test.
    pub power: f64,
}
20
/// Aggregated outcome of validating one baseline-vs-optimized benchmark
/// comparison.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StatisticalAnalysis {
    /// Smaller of the two group sizes.
    pub sample_size: usize,
    /// Normality check of the baseline group (the optimized group's check is
    /// used internally for test selection only and is not reported).
    pub normality_test: StatisticalTest,
    /// Equal-variances check across the two groups.
    pub homogeneity_test: StatisticalTest,
    /// The performance comparison test(s) that were run.
    pub performance_tests: Vec<StatisticalTest>,
    /// Named effect sizes, e.g. "cohens_d" and "speedup_ratio".
    pub effect_sizes: HashMap<String, f64>,
    /// Confidence level the validator was configured with (e.g. 0.95).
    pub confidence_level: f64,
    /// True when the comparison is significant AND effect size and power
    /// clear the validator's configured minimums.
    pub validated: bool,
}
32
/// Validates benchmark comparisons using (approximate) statistical tests.
pub struct StatisticalValidator {
    /// Confidence level for decisions/reporting, e.g. 0.95.
    confidence_level: f64,
    /// Minimum Cohen's d required for a comparison to validate.
    min_effect_size: f64,
    /// Minimum statistical power required for a comparison to validate.
    min_power: f64,
}
39
40impl Default for StatisticalValidator {
41 fn default() -> Self {
42 Self {
43 confidence_level: 0.95,
44 min_effect_size: 0.8, min_power: 0.8, }
47 }
48}
49
50impl StatisticalValidator {
51 pub fn new(confidence_level: f64, min_effect_size: f64, min_power: f64) -> Self {
52 Self {
53 confidence_level,
54 min_effect_size,
55 min_power,
56 }
57 }
58
59 pub fn validate_benchmarks(
61 &self,
62 baseline_timings: &[Duration],
63 optimized_timings: &[Duration],
64 implementation_name: &str,
65 ) -> StatisticalAnalysis {
66 let baseline_us: Vec<f64> = baseline_timings
68 .iter()
69 .map(|d| d.as_secs_f64() * 1_000_000.0)
70 .collect();
71
72 let optimized_us: Vec<f64> = optimized_timings
73 .iter()
74 .map(|d| d.as_secs_f64() * 1_000_000.0)
75 .collect();
76
77 let mut performance_tests = Vec::new();
78 let mut effect_sizes = HashMap::new();
79
80 let baseline_normality = self.shapiro_wilk_test(&baseline_us);
82 let optimized_normality = self.shapiro_wilk_test(&optimized_us);
83
84 let homogeneity = self.levene_test(&baseline_us, &optimized_us);
86
87 let use_parametric = baseline_normality.is_significant &&
89 optimized_normality.is_significant &&
90 homogeneity.is_significant;
91
92 let performance_test = if use_parametric {
93 self.welch_t_test(&baseline_us, &optimized_us, implementation_name)
95 } else {
96 self.mann_whitney_test(&baseline_us, &optimized_us, implementation_name)
98 };
99
100 performance_tests.push(performance_test.clone());
101
102 let cohens_d = self.cohens_d(&baseline_us, &optimized_us);
104 let speedup_ratio = self.median(&baseline_us) / self.median(&optimized_us);
105
106 effect_sizes.insert("cohens_d".to_string(), cohens_d);
107 effect_sizes.insert("speedup_ratio".to_string(), speedup_ratio);
108
109 let power = self.power_analysis(&baseline_us, &optimized_us, cohens_d);
111
112 let ci = self.bootstrap_confidence_interval(&baseline_us, &optimized_us);
114
115 let mut updated_test = performance_test;
117 updated_test.power = power;
118 updated_test.confidence_interval = ci;
119 updated_test.effect_size = cohens_d;
120
121 performance_tests[0] = updated_test.clone();
122
123 let validated = updated_test.is_significant &&
125 updated_test.effect_size >= self.min_effect_size &&
126 updated_test.power >= self.min_power;
127
128 StatisticalAnalysis {
129 sample_size: baseline_us.len().min(optimized_us.len()),
130 normality_test: baseline_normality,
131 homogeneity_test: homogeneity,
132 performance_tests,
133 effect_sizes,
134 confidence_level: self.confidence_level,
135 validated,
136 }
137 }
138
139 fn shapiro_wilk_test(&self, data: &[f64]) -> StatisticalTest {
141 let n = data.len();
142 if n < 3 {
143 return StatisticalTest {
144 test_name: "Shapiro-Wilk".to_string(),
145 p_value: 1.0,
146 effect_size: 0.0,
147 confidence_interval: (0.0, 1.0),
148 is_significant: false,
149 power: 0.0,
150 };
151 }
152
153 let mean = self.mean(data);
155 let std_dev = self.std_dev(data);
156
157 let skewness = self.skewness(data, mean, std_dev);
158 let kurtosis = self.kurtosis(data, mean, std_dev);
159
160 let w_stat = 1.0 - (skewness.powi(2) / 6.0 + (kurtosis - 3.0).powi(2) / 24.0);
162
163 let p_value = if w_stat > 0.95 {
165 0.1
166 } else if w_stat > 0.90 {
167 0.05
168 } else {
169 0.01
170 };
171
172 StatisticalTest {
173 test_name: "Shapiro-Wilk (approx)".to_string(),
174 p_value,
175 effect_size: w_stat,
176 confidence_interval: (0.0, 1.0),
177 is_significant: p_value > 0.05,
178 power: 0.8,
179 }
180 }
181
182 fn levene_test(&self, group1: &[f64], group2: &[f64]) -> StatisticalTest {
184 let median1 = self.median(group1);
185 let median2 = self.median(group2);
186
187 let dev1: Vec<f64> = group1.iter().map(|&x| (x - median1).abs()).collect();
189 let dev2: Vec<f64> = group2.iter().map(|&x| (x - median2).abs()).collect();
190
191 let mean_dev1 = self.mean(&dev1);
192 let mean_dev2 = self.mean(&dev2);
193
194 let var1 = self.variance(&dev1);
196 let var2 = self.variance(&dev2);
197
198 let f_stat = var1.max(var2) / var1.min(var2);
199
200 let p_value = if f_stat < 2.0 { 0.1 } else { 0.01 };
202
203 StatisticalTest {
204 test_name: "Levene's Test (approx)".to_string(),
205 p_value,
206 effect_size: f_stat,
207 confidence_interval: (0.0, f_stat * 1.2),
208 is_significant: p_value > 0.05,
209 power: 0.8,
210 }
211 }
212
213 fn welch_t_test(&self, group1: &[f64], group2: &[f64], name: &str) -> StatisticalTest {
215 let n1 = group1.len() as f64;
216 let n2 = group2.len() as f64;
217
218 let mean1 = self.mean(group1);
219 let mean2 = self.mean(group2);
220 let var1 = self.variance(group1);
221 let var2 = self.variance(group2);
222
223 let t_stat = (mean1 - mean2) / ((var1 / n1) + (var2 / n2)).sqrt();
225
226 let df_num = ((var1 / n1) + (var2 / n2)).powi(2);
228 let df_denom = (var1 / n1).powi(2) / (n1 - 1.0) + (var2 / n2).powi(2) / (n2 - 1.0);
229 let df = df_num / df_denom;
230
231 let p_value = if t_stat.abs() > 2.5 { 0.01 } else if t_stat.abs() > 1.96 { 0.05 } else { 0.1 };
233
234 StatisticalTest {
235 test_name: format!("Welch's t-test ({})", name),
236 p_value,
237 effect_size: t_stat.abs(),
238 confidence_interval: (mean1 - mean2 - 1.96 * (var1/n1 + var2/n2).sqrt(),
239 mean1 - mean2 + 1.96 * (var1/n1 + var2/n2).sqrt()),
240 is_significant: p_value < 0.05,
241 power: 0.8,
242 }
243 }
244
245 fn mann_whitney_test(&self, group1: &[f64], group2: &[f64], name: &str) -> StatisticalTest {
247 let n1 = group1.len();
248 let n2 = group2.len();
249
250 let mut combined: Vec<(f64, usize)> = Vec::new();
252 for &val in group1 {
253 combined.push((val, 1));
254 }
255 for &val in group2 {
256 combined.push((val, 2));
257 }
258
259 combined.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
260
261 let mut r1_sum = 0.0;
263 for (i, &(_, group)) in combined.iter().enumerate() {
264 if group == 1 {
265 r1_sum += (i + 1) as f64;
266 }
267 }
268
269 let u1 = r1_sum - (n1 as f64 * (n1 as f64 + 1.0)) / 2.0;
271 let u2 = (n1 * n2) as f64 - u1;
272 let u = u1.min(u2);
273
274 let mean_u = (n1 * n2) as f64 / 2.0;
276 let std_u = ((n1 * n2 * (n1 + n2 + 1)) as f64 / 12.0).sqrt();
277 let z = (u - mean_u) / std_u;
278
279 let p_value = if z.abs() > 2.5 { 0.01 } else if z.abs() > 1.96 { 0.05 } else { 0.1 };
280
281 StatisticalTest {
282 test_name: format!("Mann-Whitney U ({})", name),
283 p_value,
284 effect_size: z.abs(),
285 confidence_interval: (u - 1.96 * std_u, u + 1.96 * std_u),
286 is_significant: p_value < 0.05,
287 power: 0.8,
288 }
289 }
290
291 fn cohens_d(&self, group1: &[f64], group2: &[f64]) -> f64 {
293 let mean1 = self.mean(group1);
294 let mean2 = self.mean(group2);
295 let var1 = self.variance(group1);
296 let var2 = self.variance(group2);
297
298 let pooled_std = (((group1.len() - 1) as f64 * var1 + (group2.len() - 1) as f64 * var2) /
299 (group1.len() + group2.len() - 2) as f64).sqrt();
300
301 (mean1 - mean2).abs() / pooled_std
302 }
303
304 fn power_analysis(&self, group1: &[f64], group2: &[f64], effect_size: f64) -> f64 {
306 let n = group1.len().min(group2.len()) as f64;
307
308 let ncp = effect_size * (n / 2.0).sqrt(); if ncp > 2.8 { 0.95 }
312 else if ncp > 2.0 { 0.8 }
313 else if ncp > 1.0 { 0.5 }
314 else { 0.2 }
315 }
316
317 fn bootstrap_confidence_interval(&self, group1: &[f64], group2: &[f64]) -> (f64, f64) {
319 let median1 = self.median(group1);
320 let median2 = self.median(group2);
321 let diff = median1 - median2;
322
323 let combined_std = (self.variance(group1) + self.variance(group2)).sqrt();
325 let margin = 1.96 * combined_std / (group1.len() as f64).sqrt();
326
327 (diff - margin, diff + margin)
328 }
329
330 fn mean(&self, data: &[f64]) -> f64 {
332 data.iter().sum::<f64>() / data.len() as f64
333 }
334
335 fn variance(&self, data: &[f64]) -> f64 {
336 let mean = self.mean(data);
337 data.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / (data.len() - 1) as f64
338 }
339
340 fn std_dev(&self, data: &[f64]) -> f64 {
341 self.variance(data).sqrt()
342 }
343
344 fn median(&self, data: &[f64]) -> f64 {
345 let mut sorted = data.to_vec();
346 sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
347 let n = sorted.len();
348 if n % 2 == 0 {
349 (sorted[n/2 - 1] + sorted[n/2]) / 2.0
350 } else {
351 sorted[n/2]
352 }
353 }
354
355 fn skewness(&self, data: &[f64], mean: f64, std_dev: f64) -> f64 {
356 let n = data.len() as f64;
357 let sum_cubed = data.iter()
358 .map(|&x| ((x - mean) / std_dev).powi(3))
359 .sum::<f64>();
360 sum_cubed / n
361 }
362
363 fn kurtosis(&self, data: &[f64], mean: f64, std_dev: f64) -> f64 {
364 let n = data.len() as f64;
365 let sum_fourth = data.iter()
366 .map(|&x| ((x - mean) / std_dev).powi(4))
367 .sum::<f64>();
368 sum_fourth / n
369 }
370
371 pub fn generate_report(&self, analysis: &StatisticalAnalysis) -> String {
373 let mut report = String::new();
374
375 report.push_str(&format!("\n{}\n", "=".repeat(60)));
376 report.push_str("STATISTICAL VALIDATION REPORT\n");
377 report.push_str(&format!("{}\n", "=".repeat(60)));
378
379 report.push_str(&format!("Sample Size: {}\n", analysis.sample_size));
380 report.push_str(&format!("Confidence Level: {:.1}%\n", analysis.confidence_level * 100.0));
381
382 report.push_str("\nš ASSUMPTION TESTS:\n");
383 report.push_str(&format!("⢠Normality: {} (p = {:.4})\n",
384 if analysis.normality_test.is_significant { "ā
Normal" } else { "ā Non-normal" },
385 analysis.normality_test.p_value));
386 report.push_str(&format!("⢠Homogeneity: {} (p = {:.4})\n",
387 if analysis.homogeneity_test.is_significant { "ā
Equal variances" } else { "ā Unequal variances" },
388 analysis.homogeneity_test.p_value));
389
390 report.push_str("\nš PERFORMANCE TESTS:\n");
391 for test in &analysis.performance_tests {
392 report.push_str(&format!("⢠{}: {} (p = {:.6})\n",
393 test.test_name,
394 if test.is_significant { "ā
Significant" } else { "ā Not significant" },
395 test.p_value));
396 report.push_str(&format!(" Effect Size: {:.3}, Power: {:.3}\n",
397 test.effect_size, test.power));
398 }
399
400 report.push_str("\nš EFFECT SIZES:\n");
401 for (name, value) in &analysis.effect_sizes {
402 let interpretation = match name.as_str() {
403 "cohens_d" => {
404 if *value > 0.8 { "Large effect" }
405 else if *value > 0.5 { "Medium effect" }
406 else if *value > 0.2 { "Small effect" }
407 else { "Negligible effect" }
408 },
409 "speedup_ratio" => {
410 format!("{:.1}x faster", value).leak()
411 },
412 _ => "Unknown"
413 };
414 report.push_str(&format!("⢠{}: {:.3} ({})\n", name, value, interpretation));
415 }
416
417 report.push_str(&format!("\nšÆ OVERALL VALIDATION: {}\n",
418 if analysis.validated { "ā
PASSED" } else { "ā FAILED" }));
419
420 if analysis.validated {
421 report.push_str("⢠Performance improvement is statistically significant\n");
422 report.push_str("⢠Effect size is large enough to be meaningful\n");
423 report.push_str("⢠Statistical power is adequate\n");
424 } else {
425 report.push_str("⢠Review statistical assumptions and/or increase sample size\n");
426 }
427
428 report
429 }
430}
431
#[cfg(test)]
mod tests {
    use super::*;

    /// End-to-end sanity check: a clearly faster "optimized" distribution
    /// must validate and report a large speedup.
    #[test]
    fn test_statistical_validation() {
        let validator = StatisticalValidator::default();

        // Baseline spans 100-149 us; optimized spans 20-29 us. The two
        // distributions do not overlap, so the improvement is unambiguous.
        let baseline: Vec<Duration> = (0..1000u64)
            .map(|i| Duration::from_micros(100 + i % 50))
            .collect();
        let optimized: Vec<Duration> = (0..1000u64)
            .map(|i| Duration::from_micros(20 + i % 10))
            .collect();

        let analysis = validator.validate_benchmarks(&baseline, &optimized, "Test");
        println!("{}", validator.generate_report(&analysis));

        assert!(analysis.validated, "Should show significant improvement");
        assert!(
            analysis.effect_sizes["speedup_ratio"] > 2.0,
            "Should show significant speedup"
        );
    }
}