entrenar/eval/generative/
code.rs

1//! Code generation evaluation metrics
2//!
3//! Provides pass@k — the unbiased estimator for functional correctness
4//! of code generation models (Chen et al., 2021 "Evaluating Large Language
5//! Models Trained on Code").
6
7/// Compute pass@k: unbiased estimator of functional correctness.
8///
9/// Formula: `1 - C(n-c, k) / C(n, k)`
10///
11/// where n = total samples, c = correct samples, k = top-k threshold.
12///
13/// Returns a value in [0, 1] where 1.0 means all k samples pass.
14///
15/// # Arguments
16/// * `n` - Total number of generated code samples
17/// * `c` - Number of correct (passing) samples
18/// * `k` - Number of samples to consider (typically 1, 10, or 100)
19///
20/// # Edge Cases
21/// * If `k > n`, returns `if c > 0 { 1.0 } else { 0.0 }`
22/// * If `c >= n`, returns 1.0
23/// * If `c == 0`, returns 0.0
24pub fn pass_at_k(n: usize, c: usize, k: usize) -> f64 {
25    if c == 0 {
26        return 0.0;
27    }
28    if c >= n || k > n {
29        return 1.0;
30    }
31
32    // 1 - C(n-c, k) / C(n, k)
33    // Compute in log space to avoid overflow for large n
34    // C(n-c, k) / C(n, k) = product_{i=0..k} (n-c-i) / (n-i)
35    let mut log_ratio = 0.0f64;
36    for i in 0..k {
37        let numerator = (n - c - i) as f64;
38        let denominator = (n - i) as f64;
39        if numerator <= 0.0 {
40            return 1.0;
41        }
42        log_ratio +=
43            numerator.max(f64::MIN_POSITIVE).ln() - denominator.max(f64::MIN_POSITIVE).ln();
44    }
45
46    1.0 - log_ratio.exp()
47}
entrenar/eval/generative/code.rs

entrenar/eval/generative/
code.rs