// decy_oracle/baseline.rs
1//! Baseline measurement for ownership inference quality (DECY-ML-005).
2//!
3//! Measures "compiles on first try" rate with 95% confidence interval
4//! against a test corpus of C files.
5
6use serde::{Deserialize, Serialize};
7
8/// Baseline measurement results.
9///
10/// Records the "compiles on first try" rate and confidence interval
11/// from running transpilation on a test corpus.
12///
13/// # Example
14///
15/// ```
16/// use decy_oracle::baseline::BaselineMetrics;
17///
18/// let metrics = BaselineMetrics::new(85, 100);
19/// assert_eq!(metrics.first_try_rate(), 0.85);
20/// assert!(metrics.confidence_interval().0 < 0.85);
21/// assert!(metrics.confidence_interval().1 > 0.85);
22/// ```
23#[derive(Debug, Clone, Serialize, Deserialize)]
24pub struct BaselineMetrics {
25    /// Total files in corpus
26    corpus_size: u64,
27    /// Files that compiled on first try
28    first_try_successes: u64,
29    /// Files that eventually compiled (with iterations)
30    eventual_successes: u64,
31    /// Average iterations needed for success
32    average_iterations: f64,
33    /// 95% confidence interval lower bound
34    ci_lower: f64,
35    /// 95% confidence interval upper bound
36    ci_upper: f64,
37}
38
39impl BaselineMetrics {
40    /// Target "compiles on first try" rate (85%).
41    pub const TARGET_RATE: f64 = 0.85;
42
43    /// Create new baseline metrics from success count and total.
44    pub fn new(first_try_successes: u64, corpus_size: u64) -> Self {
45        let (ci_lower, ci_upper) = wilson_score_interval(first_try_successes, corpus_size, 0.95);
46
47        Self {
48            corpus_size,
49            first_try_successes,
50            eventual_successes: first_try_successes, // Default to same
51            average_iterations: 1.0,
52            ci_lower,
53            ci_upper,
54        }
55    }
56
57    /// Create metrics with full iteration data.
58    pub fn with_iterations(
59        first_try_successes: u64,
60        eventual_successes: u64,
61        total_iterations: u64,
62        corpus_size: u64,
63    ) -> Self {
64        let (ci_lower, ci_upper) = wilson_score_interval(first_try_successes, corpus_size, 0.95);
65
66        let average_iterations = if eventual_successes == 0 {
67            0.0
68        } else {
69            total_iterations as f64 / eventual_successes as f64
70        };
71
72        Self {
73            corpus_size,
74            first_try_successes,
75            eventual_successes,
76            average_iterations,
77            ci_lower,
78            ci_upper,
79        }
80    }
81
82    /// Get corpus size.
83    pub fn corpus_size(&self) -> u64 {
84        self.corpus_size
85    }
86
87    /// Get number of first-try successes.
88    pub fn first_try_successes(&self) -> u64 {
89        self.first_try_successes
90    }
91
92    /// Get number of eventual successes.
93    pub fn eventual_successes(&self) -> u64 {
94        self.eventual_successes
95    }
96
97    /// Calculate "compiles on first try" rate (0.0 - 1.0).
98    pub fn first_try_rate(&self) -> f64 {
99        if self.corpus_size == 0 {
100            return 0.0;
101        }
102        self.first_try_successes as f64 / self.corpus_size as f64
103    }
104
105    /// Get 95% confidence interval as (lower, upper).
106    pub fn confidence_interval(&self) -> (f64, f64) {
107        (self.ci_lower, self.ci_upper)
108    }
109
110    /// Get average iterations needed for success.
111    pub fn average_iterations(&self) -> f64 {
112        self.average_iterations
113    }
114
115    /// Check if first-try rate meets target.
116    pub fn meets_target(&self) -> bool {
117        self.first_try_rate() >= Self::TARGET_RATE
118    }
119
120    /// Check if confidence interval excludes target (significantly below).
121    pub fn significantly_below_target(&self) -> bool {
122        self.ci_upper < Self::TARGET_RATE
123    }
124
125    /// Check if confidence interval includes target (not significantly different).
126    pub fn includes_target(&self) -> bool {
127        self.ci_lower <= Self::TARGET_RATE && self.ci_upper >= Self::TARGET_RATE
128    }
129
130    /// Format as markdown report.
131    pub fn to_markdown(&self) -> String {
132        let status = if self.meets_target() {
133            "PASSED"
134        } else if self.significantly_below_target() {
135            "FAILED (significantly below target)"
136        } else {
137            "PENDING (includes target in CI)"
138        };
139
140        format!(
141            r#"## Baseline Measurement Report
142
143| Metric | Value |
144|--------|-------|
145| Corpus Size | {} |
146| First-Try Successes | {} |
147| First-Try Rate | {:.1}% |
148| 95% CI | [{:.1}%, {:.1}%] |
149| Target Rate | {:.1}% |
150| Average Iterations | {:.2} |
151
152### Status: {}
153"#,
154            self.corpus_size,
155            self.first_try_successes,
156            self.first_try_rate() * 100.0,
157            self.ci_lower * 100.0,
158            self.ci_upper * 100.0,
159            Self::TARGET_RATE * 100.0,
160            self.average_iterations,
161            status
162        )
163    }
164}
165
/// Calculate Wilson score confidence interval for a proportion.
///
/// This is more accurate than the normal approximation for small samples
/// or extreme proportions (near 0 or 1).
///
/// # Arguments
/// * `successes` - Number of successes
/// * `total` - Total trials
/// * `confidence` - Confidence level (e.g., 0.95 for 95%)
///
/// # Returns
/// Tuple of (lower_bound, upper_bound). For `total == 0` the interval is the
/// uninformative (0.0, 1.0).
pub fn wilson_score_interval(successes: u64, total: u64, confidence: f64) -> (f64, f64) {
    if total == 0 {
        return (0.0, 1.0);
    }

    let n = total as f64;
    let p = successes as f64 / n;

    // Z-score for confidence level; only 90/95/99% are recognized, anything
    // else falls back to the 95% critical value (1.96).
    let z = match confidence {
        c if (c - 0.90).abs() < 0.01 => 1.645,
        c if (c - 0.95).abs() < 0.01 => 1.96,
        c if (c - 0.99).abs() < 0.01 => 2.576,
        _ => 1.96, // Default to 95%
    };

    let z2 = z * z;

    // Wilson score formula: center shrinks p toward 0.5, margin accounts for
    // both sampling variance and the z^2/4n^2 continuity term.
    let denominator = 1.0 + z2 / n;
    let center = (p + z2 / (2.0 * n)) / denominator;
    let margin = (z / denominator) * ((p * (1.0 - p) / n + z2 / (4.0 * n * n)).sqrt());

    // Clamp to [0, 1] — the raw bounds can slip outside for extreme p.
    let lower = (center - margin).max(0.0);
    let upper = (center + margin).min(1.0);

    (lower, upper)
}

207/// Result of measuring a single file.
208#[derive(Debug, Clone, Serialize, Deserialize)]
209pub struct FileMeasurement {
210    /// File path
211    pub path: String,
212    /// Whether compilation succeeded on first try
213    pub first_try_success: bool,
214    /// Whether compilation eventually succeeded
215    pub eventual_success: bool,
216    /// Number of iterations needed
217    pub iterations: u32,
218    /// Error codes encountered (if any)
219    pub error_codes: Vec<String>,
220}
221
222impl FileMeasurement {
223    /// Create a successful first-try measurement.
224    pub fn first_try_success(path: impl Into<String>) -> Self {
225        Self {
226            path: path.into(),
227            first_try_success: true,
228            eventual_success: true,
229            iterations: 1,
230            error_codes: Vec::new(),
231        }
232    }
233
234    /// Create a measurement that succeeded after iterations.
235    pub fn success_after(path: impl Into<String>, iterations: u32, errors: Vec<String>) -> Self {
236        Self {
237            path: path.into(),
238            first_try_success: iterations == 1,
239            eventual_success: true,
240            iterations,
241            error_codes: errors,
242        }
243    }
244
245    /// Create a failed measurement.
246    pub fn failure(path: impl Into<String>, iterations: u32, errors: Vec<String>) -> Self {
247        Self {
248            path: path.into(),
249            first_try_success: false,
250            eventual_success: false,
251            iterations,
252            error_codes: errors,
253        }
254    }
255}
256
257/// Aggregate file measurements into baseline metrics.
258pub fn aggregate_measurements(measurements: &[FileMeasurement]) -> BaselineMetrics {
259    let corpus_size = measurements.len() as u64;
260
261    let first_try_successes = measurements.iter().filter(|m| m.first_try_success).count() as u64;
262
263    let eventual_successes = measurements.iter().filter(|m| m.eventual_success).count() as u64;
264
265    let total_iterations: u64 =
266        measurements.iter().filter(|m| m.eventual_success).map(|m| m.iterations as u64).sum();
267
268    BaselineMetrics::with_iterations(
269        first_try_successes,
270        eventual_successes,
271        total_iterations,
272        corpus_size,
273    )
274}
275
#[cfg(test)]
mod tests {
    use super::*;

    // ========================================================================
    // BaselineMetrics tests
    // ========================================================================

    #[test]
    fn baseline_metrics_new() {
        let metrics = BaselineMetrics::new(85, 100);
        assert_eq!(metrics.corpus_size(), 100);
        assert_eq!(metrics.first_try_successes(), 85);
    }

    #[test]
    fn baseline_metrics_first_try_rate() {
        let metrics = BaselineMetrics::new(85, 100);
        assert!((metrics.first_try_rate() - 0.85).abs() < 0.001);
    }

    #[test]
    fn baseline_metrics_empty_corpus() {
        let metrics = BaselineMetrics::new(0, 0);
        assert_eq!(metrics.first_try_rate(), 0.0);
    }

    #[test]
    fn baseline_metrics_meets_target() {
        let passing = BaselineMetrics::new(85, 100);
        assert!(passing.meets_target());

        let failing = BaselineMetrics::new(80, 100);
        assert!(!failing.meets_target());
    }

    #[test]
    fn baseline_metrics_confidence_interval_exists() {
        let metrics = BaselineMetrics::new(85, 100);
        let (lower, upper) = metrics.confidence_interval();

        // CI should bracket the point estimate
        assert!(lower < 0.85);
        assert!(upper > 0.85);
        assert!(lower >= 0.0);
        assert!(upper <= 1.0);
    }

    #[test]
    fn baseline_metrics_ci_narrows_with_larger_samples() {
        let small = BaselineMetrics::new(17, 20);
        let large = BaselineMetrics::new(850, 1000);

        let (small_lo, small_hi) = small.confidence_interval();
        let (large_lo, large_hi) = large.confidence_interval();

        let small_width = small_hi - small_lo;
        let large_width = large_hi - large_lo;

        // Larger sample should have narrower CI
        assert!(large_width < small_width);
    }

    #[test]
    fn baseline_metrics_significantly_below_target() {
        // Very low rate with large sample - CI upper bound below target
        let metrics = BaselineMetrics::new(500, 1000);
        assert!(metrics.significantly_below_target());

        // Rate that includes target
        let close = BaselineMetrics::new(840, 1000);
        assert!(!close.significantly_below_target());
    }

    #[test]
    fn baseline_metrics_with_iterations() {
        let metrics = BaselineMetrics::with_iterations(80, 95, 150, 100);

        assert_eq!(metrics.first_try_successes(), 80);
        assert_eq!(metrics.eventual_successes(), 95);
        assert!((metrics.average_iterations() - 1.578).abs() < 0.01);
    }

    #[test]
    fn baseline_metrics_to_markdown() {
        let metrics = BaselineMetrics::new(85, 100);
        let md = metrics.to_markdown();

        assert!(md.contains("Baseline Measurement Report"));
        assert!(md.contains("| Corpus Size | 100 |"));
        assert!(md.contains("| First-Try Successes | 85 |"));
        assert!(md.contains("PASSED"));
    }

    // ========================================================================
    // Wilson score interval tests
    // ========================================================================

    #[test]
    fn wilson_score_empty() {
        let (lower, upper) = wilson_score_interval(0, 0, 0.95);
        assert_eq!(lower, 0.0);
        assert_eq!(upper, 1.0);
    }

    #[test]
    fn wilson_score_all_success() {
        let (lower, upper) = wilson_score_interval(100, 100, 0.95);
        assert!(lower > 0.95);
        assert!((upper - 1.0).abs() < 1e-10);
    }

    #[test]
    fn wilson_score_all_failure() {
        let (lower, upper) = wilson_score_interval(0, 100, 0.95);
        assert_eq!(lower, 0.0);
        assert!(upper < 0.05);
    }

    #[test]
    fn wilson_score_typical_case() {
        // 85% success rate with 100 samples
        let (lower, upper) = wilson_score_interval(85, 100, 0.95);

        // Should be roughly [0.77, 0.91]
        assert!(lower > 0.75 && lower < 0.80);
        assert!(upper > 0.89 && upper < 0.93);
    }

    // ========================================================================
    // FileMeasurement tests
    // ========================================================================

    #[test]
    fn file_measurement_first_try_success() {
        let m = FileMeasurement::first_try_success("test.c");
        assert!(m.first_try_success);
        assert!(m.eventual_success);
        assert_eq!(m.iterations, 1);
        assert!(m.error_codes.is_empty());
    }

    #[test]
    fn file_measurement_success_after_iterations() {
        let m = FileMeasurement::success_after("test.c", 3, vec!["E0382".to_string()]);
        assert!(!m.first_try_success);
        assert!(m.eventual_success);
        assert_eq!(m.iterations, 3);
        assert_eq!(m.error_codes.len(), 1);
    }

    #[test]
    fn file_measurement_failure() {
        let m =
            FileMeasurement::failure("test.c", 5, vec!["E0382".to_string(), "E0499".to_string()]);
        assert!(!m.first_try_success);
        assert!(!m.eventual_success);
        assert_eq!(m.iterations, 5);
        assert_eq!(m.error_codes.len(), 2);
    }

    // ========================================================================
    // Aggregation tests
    // ========================================================================

    #[test]
    fn aggregate_empty() {
        let metrics = aggregate_measurements(&[]);
        assert_eq!(metrics.corpus_size(), 0);
        assert_eq!(metrics.first_try_rate(), 0.0);
    }

    #[test]
    fn aggregate_all_first_try() {
        let measurements = vec![
            FileMeasurement::first_try_success("a.c"),
            FileMeasurement::first_try_success("b.c"),
            FileMeasurement::first_try_success("c.c"),
        ];
        let metrics = aggregate_measurements(&measurements);

        assert_eq!(metrics.corpus_size(), 3);
        assert_eq!(metrics.first_try_successes(), 3);
        assert!((metrics.first_try_rate() - 1.0).abs() < 0.001);
    }

    #[test]
    fn aggregate_mixed_results() {
        let measurements = vec![
            FileMeasurement::first_try_success("a.c"),
            FileMeasurement::first_try_success("b.c"),
            FileMeasurement::success_after("c.c", 2, vec!["E0382".to_string()]),
            FileMeasurement::success_after("d.c", 3, vec!["E0499".to_string()]),
            FileMeasurement::failure("e.c", 5, vec!["E0515".to_string()]),
        ];
        let metrics = aggregate_measurements(&measurements);

        assert_eq!(metrics.corpus_size(), 5);
        assert_eq!(metrics.first_try_successes(), 2);
        assert_eq!(metrics.eventual_successes(), 4);
        assert!((metrics.first_try_rate() - 0.4).abs() < 0.001);
        // (1 + 1 + 2 + 3) / 4 = 1.75
        assert!((metrics.average_iterations() - 1.75).abs() < 0.001);
    }

    // ========================================================================
    // Additional coverage: uncovered paths
    // ========================================================================

    #[test]
    fn baseline_metrics_includes_target() {
        // Rate below target but CI includes target
        let close = BaselineMetrics::new(840, 1000);
        assert!(close.includes_target(), "CI should include 85% target");

        // Rate way above target — CI doesn't include target (lower bound > target)
        let high = BaselineMetrics::new(990, 1000);
        assert!(!high.includes_target(), "99% rate CI lower bound should be above 85%");
    }

    #[test]
    fn baseline_metrics_to_markdown_failed() {
        // significantly_below_target → "FAILED" status
        let metrics = BaselineMetrics::new(500, 1000);
        let md = metrics.to_markdown();
        assert!(md.contains("FAILED"));
    }

    #[test]
    fn baseline_metrics_to_markdown_pending() {
        // includes_target but !meets_target → "PENDING" status
        let metrics = BaselineMetrics::new(840, 1000);
        assert!(!metrics.meets_target());
        assert!(metrics.includes_target());
        let md = metrics.to_markdown();
        assert!(md.contains("PENDING"));
    }

    #[test]
    fn wilson_score_90_confidence() {
        let (lower, upper) = wilson_score_interval(85, 100, 0.90);
        // 90% CI should be narrower than 95%
        let (lower_95, upper_95) = wilson_score_interval(85, 100, 0.95);
        assert!((upper - lower) < (upper_95 - lower_95));
    }

    #[test]
    fn wilson_score_99_confidence() {
        let (lower, upper) = wilson_score_interval(85, 100, 0.99);
        // 99% CI should be wider than 95%
        let (lower_95, upper_95) = wilson_score_interval(85, 100, 0.95);
        assert!((upper - lower) > (upper_95 - lower_95));
    }

    #[test]
    fn wilson_score_non_standard_confidence() {
        // Non-standard confidence → defaults to 1.96 (same as 95%)
        let (lower, upper) = wilson_score_interval(85, 100, 0.80);
        let (lower_95, upper_95) = wilson_score_interval(85, 100, 0.95);
        assert!((lower - lower_95).abs() < 0.001);
        assert!((upper - upper_95).abs() < 0.001);
    }

    #[test]
    fn file_measurement_success_after_one_iteration() {
        // iterations == 1 → first_try_success is true
        let m = FileMeasurement::success_after("test.c", 1, vec![]);
        assert!(m.first_try_success);
        assert!(m.eventual_success);
        assert_eq!(m.iterations, 1);
    }

    #[test]
    fn baseline_metrics_with_iterations_zero_eventual() {
        // No eventual successes → average_iterations = 0.0
        let metrics = BaselineMetrics::with_iterations(0, 0, 0, 100);
        assert_eq!(metrics.average_iterations(), 0.0);
    }

    #[test]
    fn aggregate_all_failures() {
        let measurements = vec![
            FileMeasurement::failure("a.c", 5, vec![]),
            FileMeasurement::failure("b.c", 5, vec![]),
        ];
        let metrics = aggregate_measurements(&measurements);

        assert_eq!(metrics.corpus_size(), 2);
        assert_eq!(metrics.first_try_successes(), 0);
        assert_eq!(metrics.eventual_successes(), 0);
        assert!((metrics.first_try_rate() - 0.0).abs() < 0.001);
        assert!((metrics.average_iterations() - 0.0).abs() < 0.001);
    }
}