/// Nominal HumanEval pass@1 floor for AC-SHIP1-005, in percent.
pub const AC_SHIP1_005_NOMINAL_HUMANEVAL_PASS_AT_1_PCT: f32 = 86.0_f32;

/// Measurement-noise allowance for AC-SHIP1-005, in percentage points.
pub const AC_SHIP1_005_NOISE_ALLOWANCE_PP: f32 = 1.2_f32;

/// Effective pass@1 floor in percent: the nominal floor minus the noise
/// allowance.
///
/// Derived from the two constants above rather than written as a literal so
/// the three values cannot drift apart. The subtraction is done in `f32`, so
/// the result is only approximately 84.80.
pub const AC_SHIP1_005_EFFECTIVE_HUMANEVAL_PASS_AT_1_PCT: f32 =
    AC_SHIP1_005_NOMINAL_HUMANEVAL_PASS_AT_1_PCT - AC_SHIP1_005_NOISE_ALLOWANCE_PP;
/// Binary outcome of the AC-SHIP1-005 pass@1 gate.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Ship005Verdict {
    /// The measured pass rate met or exceeded the threshold.
    Pass,
    /// The measured pass rate fell short of the threshold, or the inputs were
    /// rejected as invalid.
    Fail,
}
/// Decides whether `correct` successes out of `total` attempts clear a pass@1
/// floor expressed in percent.
///
/// The verdict is [`Ship005Verdict::Fail`] whenever the inputs cannot yield a
/// meaningful rate:
///
/// * `total` is zero (the rate would be a division by zero),
/// * `correct` exceeds `total`,
/// * `threshold_pct` is NaN or infinite.
///
/// Otherwise the pass rate `correct / total * 100` is evaluated in `f32` and
/// compared inclusively: a rate equal to `threshold_pct` is a
/// [`Ship005Verdict::Pass`].
#[must_use]
pub fn verdict_from_pass_at_1(correct: usize, total: usize, threshold_pct: f32) -> Ship005Verdict {
    // Reject every malformed input in one place before doing any arithmetic.
    let counts_are_sane = total != 0 && correct <= total;
    if !(counts_are_sane && threshold_pct.is_finite()) {
        return Ship005Verdict::Fail;
    }

    // usize -> f32 is lossy for counts that need more than f32's 24-bit
    // significand; the lint is silenced rather than the cast avoided.
    #[allow(clippy::cast_precision_loss)]
    let (solved, attempted) = (correct as f32, total as f32);

    // Divide first, then scale, so rounding matches the documented formula.
    let pass_rate_pct = solved / attempted * 100.0_f32;
    let clears_floor = pass_rate_pct >= threshold_pct;

    if clears_floor {
        Ship005Verdict::Pass
    } else {
        Ship005Verdict::Fail
    }
}
#[cfg(test)]
mod ship_005_tests {
    use super::*;

    /// Falsification test for the AC-SHIP1-005 pass@1 gate.
    ///
    /// Covers, in order: threshold comparisons on a 100-attempt denominator
    /// (including exact equality with the nominal floor), the flip point on a
    /// 164-attempt denominator, monotonicity of the verdict in `correct`,
    /// rejection of malformed inputs, and provenance of the constants.
    #[test]
    fn falsify_ship_005_humaneval_pass_at_1_threshold_logic() {
        // --- Threshold comparisons, 100 attempts ---------------------------
        assert_eq!(
            verdict_from_pass_at_1(85, 100, AC_SHIP1_005_EFFECTIVE_HUMANEVAL_PASS_AT_1_PCT),
            Ship005Verdict::Pass,
            "85/100 = 85.0% must Pass the effective floor",
        );
        assert_eq!(
            verdict_from_pass_at_1(87, 100, AC_SHIP1_005_NOMINAL_HUMANEVAL_PASS_AT_1_PCT),
            Ship005Verdict::Pass,
            "87/100 = 87.00% must Pass the nominal floor",
        );
        // Exact equality with the floor: pins the comparison as inclusive, so
        // a regression from `>=` to `>` is caught.
        assert_eq!(
            verdict_from_pass_at_1(86, 100, AC_SHIP1_005_NOMINAL_HUMANEVAL_PASS_AT_1_PCT),
            Ship005Verdict::Pass,
            "86/100 = 86.00% must Pass the nominal floor (comparison is inclusive)",
        );
        assert_eq!(
            verdict_from_pass_at_1(85, 100, AC_SHIP1_005_NOMINAL_HUMANEVAL_PASS_AT_1_PCT),
            Ship005Verdict::Fail,
            "85% must Fail against the nominal floor (no noise allowance)",
        );
        assert_eq!(
            verdict_from_pass_at_1(84, 100, AC_SHIP1_005_EFFECTIVE_HUMANEVAL_PASS_AT_1_PCT),
            Ship005Verdict::Fail,
            "84/100 = 84.00% must Fail the effective floor",
        );

        // --- Flip point, 164 attempts --------------------------------------
        assert_eq!(
            verdict_from_pass_at_1(139, 164, AC_SHIP1_005_EFFECTIVE_HUMANEVAL_PASS_AT_1_PCT),
            Ship005Verdict::Fail,
            "139/164 = 84.756% must Fail the effective floor",
        );
        assert_eq!(
            verdict_from_pass_at_1(140, 164, AC_SHIP1_005_EFFECTIVE_HUMANEVAL_PASS_AT_1_PCT),
            Ship005Verdict::Pass,
            "140/164 = 85.365% must Pass the effective floor",
        );

        // --- Monotonicity ---------------------------------------------------
        // Once some count passes, every larger count must pass too. The first
        // passing count is recorded so the scan cannot succeed vacuously when
        // nothing ever passes.
        let mut first_pass: Option<usize> = None;
        for correct in 0..=164 {
            let v = verdict_from_pass_at_1(
                correct,
                164,
                AC_SHIP1_005_EFFECTIVE_HUMANEVAL_PASS_AT_1_PCT,
            );
            match (v, first_pass) {
                (Ship005Verdict::Pass, None) => first_pass = Some(correct),
                (Ship005Verdict::Fail, Some(_)) => {
                    panic!("monotonicity broken: correct={correct} flipped back to Fail after Pass");
                }
                _ => {}
            }
        }
        assert_eq!(
            first_pass,
            Some(140),
            "the smallest passing count out of 164 must be 140 against the effective floor",
        );

        // --- Malformed inputs fail conservatively ---------------------------
        assert_eq!(
            verdict_from_pass_at_1(0, 0, AC_SHIP1_005_EFFECTIVE_HUMANEVAL_PASS_AT_1_PCT),
            Ship005Verdict::Fail,
            "total=0 must Fail (div-by-zero guard)",
        );
        assert_eq!(
            verdict_from_pass_at_1(200, 100, AC_SHIP1_005_EFFECTIVE_HUMANEVAL_PASS_AT_1_PCT),
            Ship005Verdict::Fail,
            "correct>total must Fail (sanity guard)",
        );
        assert_eq!(
            verdict_from_pass_at_1(164, 164, f32::NAN),
            Ship005Verdict::Fail,
            "NaN threshold must Fail conservatively",
        );
        assert_eq!(
            verdict_from_pass_at_1(164, 164, f32::INFINITY),
            Ship005Verdict::Fail,
            "+∞ threshold must Fail conservatively",
        );
        assert_eq!(
            verdict_from_pass_at_1(164, 164, f32::NEG_INFINITY),
            Ship005Verdict::Fail,
            "-∞ threshold must Fail conservatively",
        );

        // --- Constant provenance --------------------------------------------
        // Exact float equality is intended: each constant is compared with the
        // same literal it is defined from.
        #[allow(clippy::float_cmp)]
        {
            assert_eq!(
                AC_SHIP1_005_NOMINAL_HUMANEVAL_PASS_AT_1_PCT, 86.00,
                "nominal floor is 86.00% (spec §4.2 AC-SHIP1-005)",
            );
            assert_eq!(
                AC_SHIP1_005_NOISE_ALLOWANCE_PP, 1.20,
                "noise allowance is 1.2 pp (spec §4.2 AC-SHIP1-005)",
            );
        }
        // The derived floor comes from an f32 subtraction, so it is checked
        // with a tolerance instead of exact equality.
        assert!(
            (AC_SHIP1_005_EFFECTIVE_HUMANEVAL_PASS_AT_1_PCT - 84.80).abs() < 1e-4,
            "effective floor must be ~84.80% (nominal − noise); got {}",
            AC_SHIP1_005_EFFECTIVE_HUMANEVAL_PASS_AT_1_PCT,
        );
    }
}