1use serde::{Deserialize, Serialize};
7
/// Aggregated first-try / eventual success statistics for a measured corpus.
///
/// Fields are private; values are produced by [`BaselineMetrics::new`] or
/// [`BaselineMetrics::with_iterations`] and read back through the accessor
/// methods. The type derives `Serialize`/`Deserialize`, so it can be
/// round-tripped through any serde format.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BaselineMetrics {
    /// Total number of measured items (the denominator of the first-try rate).
    corpus_size: u64,
    /// Number of items that succeeded on the first try.
    first_try_successes: u64,
    /// Number of items that succeeded at all, on any iteration.
    eventual_successes: u64,
    /// Total iterations divided by `eventual_successes`; `0.0` when there were
    /// no eventual successes, `1.0` when built via `new`.
    average_iterations: f64,
    /// Lower bound of the 95% Wilson score interval for the first-try rate.
    ci_lower: f64,
    /// Upper bound of the 95% Wilson score interval for the first-try rate.
    ci_upper: f64,
}
38
39impl BaselineMetrics {
40 pub const TARGET_RATE: f64 = 0.85;
42
43 pub fn new(first_try_successes: u64, corpus_size: u64) -> Self {
45 let (ci_lower, ci_upper) = wilson_score_interval(first_try_successes, corpus_size, 0.95);
46
47 Self {
48 corpus_size,
49 first_try_successes,
50 eventual_successes: first_try_successes, average_iterations: 1.0,
52 ci_lower,
53 ci_upper,
54 }
55 }
56
57 pub fn with_iterations(
59 first_try_successes: u64,
60 eventual_successes: u64,
61 total_iterations: u64,
62 corpus_size: u64,
63 ) -> Self {
64 let (ci_lower, ci_upper) = wilson_score_interval(first_try_successes, corpus_size, 0.95);
65
66 let average_iterations = if eventual_successes == 0 {
67 0.0
68 } else {
69 total_iterations as f64 / eventual_successes as f64
70 };
71
72 Self {
73 corpus_size,
74 first_try_successes,
75 eventual_successes,
76 average_iterations,
77 ci_lower,
78 ci_upper,
79 }
80 }
81
82 pub fn corpus_size(&self) -> u64 {
84 self.corpus_size
85 }
86
87 pub fn first_try_successes(&self) -> u64 {
89 self.first_try_successes
90 }
91
92 pub fn eventual_successes(&self) -> u64 {
94 self.eventual_successes
95 }
96
97 pub fn first_try_rate(&self) -> f64 {
99 if self.corpus_size == 0 {
100 return 0.0;
101 }
102 self.first_try_successes as f64 / self.corpus_size as f64
103 }
104
105 pub fn confidence_interval(&self) -> (f64, f64) {
107 (self.ci_lower, self.ci_upper)
108 }
109
110 pub fn average_iterations(&self) -> f64 {
112 self.average_iterations
113 }
114
115 pub fn meets_target(&self) -> bool {
117 self.first_try_rate() >= Self::TARGET_RATE
118 }
119
120 pub fn significantly_below_target(&self) -> bool {
122 self.ci_upper < Self::TARGET_RATE
123 }
124
125 pub fn includes_target(&self) -> bool {
127 self.ci_lower <= Self::TARGET_RATE && self.ci_upper >= Self::TARGET_RATE
128 }
129
130 pub fn to_markdown(&self) -> String {
132 let status = if self.meets_target() {
133 "PASSED"
134 } else if self.significantly_below_target() {
135 "FAILED (significantly below target)"
136 } else {
137 "PENDING (includes target in CI)"
138 };
139
140 format!(
141 r#"## Baseline Measurement Report
142
143| Metric | Value |
144|--------|-------|
145| Corpus Size | {} |
146| First-Try Successes | {} |
147| First-Try Rate | {:.1}% |
148| 95% CI | [{:.1}%, {:.1}%] |
149| Target Rate | {:.1}% |
150| Average Iterations | {:.2} |
151
152### Status: {}
153"#,
154 self.corpus_size,
155 self.first_try_successes,
156 self.first_try_rate() * 100.0,
157 self.ci_lower * 100.0,
158 self.ci_upper * 100.0,
159 Self::TARGET_RATE * 100.0,
160 self.average_iterations,
161 status
162 )
163 }
164}
165
/// Computes the Wilson score confidence interval for a binomial proportion.
///
/// # Arguments
///
/// * `successes` - number of observed successes. Values larger than `total`
///   are clamped to `total`, so the observed proportion never exceeds `1.0`
///   and the square root below never receives a negative argument.
/// * `total` - number of trials.
/// * `confidence` - confidence level. Levels within `0.01` of `0.90`, `0.95`
///   and `0.99` select the z-scores `1.645`, `1.96` and `2.576`; any other
///   value (including NaN) falls back to the 95% z-score `1.96`.
///
/// # Returns
///
/// `(lower, upper)`, each clamped to `[0.0, 1.0]`. With `total == 0` there is
/// no information, so the full range `(0.0, 1.0)` is returned.
pub fn wilson_score_interval(successes: u64, total: u64, confidence: f64) -> (f64, f64) {
    // (confidence level, two-sided z-score) pairs that are recognised.
    const Z_TABLE: [(f64, f64); 3] = [(0.90, 1.645), (0.95, 1.96), (0.99, 2.576)];
    const DEFAULT_Z: f64 = 1.96;
    const LEVEL_TOLERANCE: f64 = 0.01;

    if total == 0 {
        return (0.0, 1.0);
    }

    let n = total as f64;
    // Clamp so that p stays in [0, 1]; otherwise p * (1 - p) goes negative and
    // the result would hinge on NaN propagation through `max`/`min`.
    let p = successes.min(total) as f64 / n;

    let z = Z_TABLE
        .iter()
        .find(|&&(level, _)| (confidence - level).abs() < LEVEL_TOLERANCE)
        .map_or(DEFAULT_Z, |&(_, z_score)| z_score);
    let z2 = z * z;

    let denominator = 1.0 + z2 / n;
    let center = (p + z2 / (2.0 * n)) / denominator;
    let spread = (p * (1.0 - p) / n + z2 / (4.0 * n * n)).sqrt();
    let margin = (z / denominator) * spread;

    // Rounding can push the bounds marginally outside [0, 1].
    let lower = (center - margin).max(0.0);
    let upper = (center + margin).min(1.0);

    (lower, upper)
}
206
/// Outcome recorded for a single measured file.
///
/// All fields are public; the associated constructors
/// (`first_try_success`, `success_after`, `failure`) fill them consistently.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileMeasurement {
    /// Identifier of the measured file (the tests use names such as `test.c`).
    pub path: String,
    /// Whether the file succeeded on its first iteration.
    pub first_try_success: bool,
    /// Whether the file succeeded on any iteration.
    pub eventual_success: bool,
    /// Number of iterations recorded for this file.
    pub iterations: u32,
    /// Error codes collected for this file (the tests use codes such as `E0382`).
    pub error_codes: Vec<String>,
}
221
222impl FileMeasurement {
223 pub fn first_try_success(path: impl Into<String>) -> Self {
225 Self {
226 path: path.into(),
227 first_try_success: true,
228 eventual_success: true,
229 iterations: 1,
230 error_codes: Vec::new(),
231 }
232 }
233
234 pub fn success_after(path: impl Into<String>, iterations: u32, errors: Vec<String>) -> Self {
236 Self {
237 path: path.into(),
238 first_try_success: iterations == 1,
239 eventual_success: true,
240 iterations,
241 error_codes: errors,
242 }
243 }
244
245 pub fn failure(path: impl Into<String>, iterations: u32, errors: Vec<String>) -> Self {
247 Self {
248 path: path.into(),
249 first_try_success: false,
250 eventual_success: false,
251 iterations,
252 error_codes: errors,
253 }
254 }
255}
256
257pub fn aggregate_measurements(measurements: &[FileMeasurement]) -> BaselineMetrics {
259 let corpus_size = measurements.len() as u64;
260
261 let first_try_successes = measurements.iter().filter(|m| m.first_try_success).count() as u64;
262
263 let eventual_successes = measurements.iter().filter(|m| m.eventual_success).count() as u64;
264
265 let total_iterations: u64 =
266 measurements.iter().filter(|m| m.eventual_success).map(|m| m.iterations as u64).sum();
267
268 BaselineMetrics::with_iterations(
269 first_try_successes,
270 eventual_successes,
271 total_iterations,
272 corpus_size,
273 )
274}
275
/// Unit tests for the metrics types, the Wilson score helper and aggregation.
#[cfg(test)]
mod tests {
    use super::*;

    // ------------------------------------------------------------------
    // BaselineMetrics: construction and accessors
    // ------------------------------------------------------------------

    #[test]
    fn baseline_metrics_new() {
        let metrics = BaselineMetrics::new(85, 100);
        assert_eq!(metrics.corpus_size(), 100);
        assert_eq!(metrics.first_try_successes(), 85);
    }

    #[test]
    fn baseline_metrics_first_try_rate() {
        let metrics = BaselineMetrics::new(85, 100);
        assert!((metrics.first_try_rate() - 0.85).abs() < 0.001);
    }

    // An empty corpus must not divide by zero.
    #[test]
    fn baseline_metrics_empty_corpus() {
        let metrics = BaselineMetrics::new(0, 0);
        assert_eq!(metrics.first_try_rate(), 0.0);
    }

    // The target comparison is inclusive: exactly 85% passes.
    #[test]
    fn baseline_metrics_meets_target() {
        let passing = BaselineMetrics::new(85, 100);
        assert!(passing.meets_target());

        let failing = BaselineMetrics::new(80, 100);
        assert!(!failing.meets_target());
    }

    #[test]
    fn baseline_metrics_confidence_interval_exists() {
        let metrics = BaselineMetrics::new(85, 100);
        let (lower, upper) = metrics.confidence_interval();

        // The interval must bracket the observed rate...
        assert!(lower < 0.85);
        assert!(upper > 0.85);
        // ...and stay inside the valid probability range.
        assert!(lower >= 0.0);
        assert!(upper <= 1.0);
    }

    // Same observed rate (85%), different sample sizes.
    #[test]
    fn baseline_metrics_ci_narrows_with_larger_samples() {
        let small = BaselineMetrics::new(17, 20);
        let large = BaselineMetrics::new(850, 1000);

        let (small_lo, small_hi) = small.confidence_interval();
        let (large_lo, large_hi) = large.confidence_interval();

        let small_width = small_hi - small_lo;
        let large_width = large_hi - large_lo;

        // More samples must give a tighter interval.
        assert!(large_width < small_width);
    }

    #[test]
    fn baseline_metrics_significantly_below_target() {
        // 50% on 1000 samples: the whole interval sits below the 85% target.
        let metrics = BaselineMetrics::new(500, 1000);
        assert!(metrics.significantly_below_target());

        // 84% on 1000 samples: the upper bound still reaches the target.
        let close = BaselineMetrics::new(840, 1000);
        assert!(!close.significantly_below_target());
    }

    #[test]
    fn baseline_metrics_with_iterations() {
        let metrics = BaselineMetrics::with_iterations(80, 95, 150, 100);

        assert_eq!(metrics.first_try_successes(), 80);
        assert_eq!(metrics.eventual_successes(), 95);
        // 150 iterations / 95 eventual successes ~= 1.578
        assert!((metrics.average_iterations() - 1.578).abs() < 0.01);
    }

    #[test]
    fn baseline_metrics_to_markdown() {
        let metrics = BaselineMetrics::new(85, 100);
        let md = metrics.to_markdown();

        assert!(md.contains("Baseline Measurement Report"));
        assert!(md.contains("| Corpus Size | 100 |"));
        assert!(md.contains("| First-Try Successes | 85 |"));
        assert!(md.contains("PASSED"));
    }

    // ------------------------------------------------------------------
    // wilson_score_interval
    // ------------------------------------------------------------------

    // No trials: the interval is the full [0, 1] range.
    #[test]
    fn wilson_score_empty() {
        let (lower, upper) = wilson_score_interval(0, 0, 0.95);
        assert_eq!(lower, 0.0);
        assert_eq!(upper, 1.0);
    }

    #[test]
    fn wilson_score_all_success() {
        let (lower, upper) = wilson_score_interval(100, 100, 0.95);
        assert!(lower > 0.95);
        assert!((upper - 1.0).abs() < 1e-10);
    }

    #[test]
    fn wilson_score_all_failure() {
        let (lower, upper) = wilson_score_interval(0, 100, 0.95);
        assert_eq!(lower, 0.0);
        assert!(upper < 0.05);
    }

    #[test]
    fn wilson_score_typical_case() {
        let (lower, upper) = wilson_score_interval(85, 100, 0.95);

        // For 85/100 at 95% the lower bound must fall in (0.75, 0.80)...
        assert!(lower > 0.75 && lower < 0.80);
        // ...and the upper bound in (0.89, 0.93).
        assert!(upper > 0.89 && upper < 0.93);
    }

    // ------------------------------------------------------------------
    // FileMeasurement constructors
    // ------------------------------------------------------------------

    #[test]
    fn file_measurement_first_try_success() {
        let m = FileMeasurement::first_try_success("test.c");
        assert!(m.first_try_success);
        assert!(m.eventual_success);
        assert_eq!(m.iterations, 1);
        assert!(m.error_codes.is_empty());
    }

    #[test]
    fn file_measurement_success_after_iterations() {
        let m = FileMeasurement::success_after("test.c", 3, vec!["E0382".to_string()]);
        assert!(!m.first_try_success);
        assert!(m.eventual_success);
        assert_eq!(m.iterations, 3);
        assert_eq!(m.error_codes.len(), 1);
    }

    #[test]
    fn file_measurement_failure() {
        let m =
            FileMeasurement::failure("test.c", 5, vec!["E0382".to_string(), "E0499".to_string()]);
        assert!(!m.first_try_success);
        assert!(!m.eventual_success);
        assert_eq!(m.iterations, 5);
        assert_eq!(m.error_codes.len(), 2);
    }

    // ------------------------------------------------------------------
    // aggregate_measurements
    // ------------------------------------------------------------------

    #[test]
    fn aggregate_empty() {
        let metrics = aggregate_measurements(&[]);
        assert_eq!(metrics.corpus_size(), 0);
        assert_eq!(metrics.first_try_rate(), 0.0);
    }

    #[test]
    fn aggregate_all_first_try() {
        let measurements = vec![
            FileMeasurement::first_try_success("a.c"),
            FileMeasurement::first_try_success("b.c"),
            FileMeasurement::first_try_success("c.c"),
        ];
        let metrics = aggregate_measurements(&measurements);

        assert_eq!(metrics.corpus_size(), 3);
        assert_eq!(metrics.first_try_successes(), 3);
        assert!((metrics.first_try_rate() - 1.0).abs() < 0.001);
    }

    #[test]
    fn aggregate_mixed_results() {
        let measurements = vec![
            FileMeasurement::first_try_success("a.c"),
            FileMeasurement::first_try_success("b.c"),
            FileMeasurement::success_after("c.c", 2, vec!["E0382".to_string()]),
            FileMeasurement::success_after("d.c", 3, vec!["E0499".to_string()]),
            FileMeasurement::failure("e.c", 5, vec!["E0515".to_string()]),
        ];
        let metrics = aggregate_measurements(&measurements);

        assert_eq!(metrics.corpus_size(), 5);
        assert_eq!(metrics.first_try_successes(), 2);
        assert_eq!(metrics.eventual_successes(), 4);
        assert!((metrics.first_try_rate() - 0.4).abs() < 0.001);
        // Only eventual successes count: (1 + 1 + 2 + 3) / 4 = 1.75
        assert!((metrics.average_iterations() - 1.75).abs() < 0.001);
    }

    // ------------------------------------------------------------------
    // Additional coverage: status branches, confidence levels, edge cases
    // ------------------------------------------------------------------

    #[test]
    fn baseline_metrics_includes_target() {
        // 84% on 1000 samples: the interval straddles the 85% target.
        let close = BaselineMetrics::new(840, 1000);
        assert!(close.includes_target(), "CI should include 85% target");

        // 99% on 1000 samples: the whole interval lies above the target.
        let high = BaselineMetrics::new(990, 1000);
        assert!(!high.includes_target(), "99% rate CI lower bound should be above 85%");
    }

    #[test]
    fn baseline_metrics_to_markdown_failed() {
        // 50% is significantly below the target, so the report must say FAILED.
        let metrics = BaselineMetrics::new(500, 1000);
        let md = metrics.to_markdown();
        assert!(md.contains("FAILED"));
    }

    #[test]
    fn baseline_metrics_to_markdown_pending() {
        // 84%: below the target, but the target is still inside the interval.
        let metrics = BaselineMetrics::new(840, 1000);
        assert!(!metrics.meets_target());
        assert!(metrics.includes_target());
        let md = metrics.to_markdown();
        assert!(md.contains("PENDING"));
    }

    #[test]
    fn wilson_score_90_confidence() {
        let (lower, upper) = wilson_score_interval(85, 100, 0.90);
        let (lower_95, upper_95) = wilson_score_interval(85, 100, 0.95);
        // Lower confidence gives a narrower interval.
        assert!((upper - lower) < (upper_95 - lower_95));
    }

    #[test]
    fn wilson_score_99_confidence() {
        let (lower, upper) = wilson_score_interval(85, 100, 0.99);
        let (lower_95, upper_95) = wilson_score_interval(85, 100, 0.95);
        // Higher confidence gives a wider interval.
        assert!((upper - lower) > (upper_95 - lower_95));
    }

    #[test]
    fn wilson_score_non_standard_confidence() {
        let (lower, upper) = wilson_score_interval(85, 100, 0.80);
        // An unrecognised level falls back to the 95% z-score.
        let (lower_95, upper_95) = wilson_score_interval(85, 100, 0.95);
        assert!((lower - lower_95).abs() < 0.001);
        assert!((upper - upper_95).abs() < 0.001);
    }

    #[test]
    fn file_measurement_success_after_one_iteration() {
        let m = FileMeasurement::success_after("test.c", 1, vec![]);
        // Succeeding after exactly one iteration counts as a first-try success.
        assert!(m.first_try_success);
        assert!(m.eventual_success);
        assert_eq!(m.iterations, 1);
    }

    #[test]
    fn baseline_metrics_with_iterations_zero_eventual() {
        let metrics = BaselineMetrics::with_iterations(0, 0, 0, 100);
        // No eventual successes: the average is reported as 0.0, not NaN.
        assert_eq!(metrics.average_iterations(), 0.0);
    }

    #[test]
    fn aggregate_all_failures() {
        let measurements = vec![
            FileMeasurement::failure("a.c", 5, vec![]),
            FileMeasurement::failure("b.c", 5, vec![]),
        ];
        let metrics = aggregate_measurements(&measurements);

        assert_eq!(metrics.corpus_size(), 2);
        assert_eq!(metrics.first_try_successes(), 0);
        assert_eq!(metrics.eventual_successes(), 0);
        assert!((metrics.first_try_rate() - 0.0).abs() < 0.001);
        assert!((metrics.average_iterations() - 0.0).abs() < 0.001);
    }
}