/// Maximum allowed HumanEval score drift, in percentage points (pp).
///
/// The unit test in this file pins the value to `1.2` (citing spec §7.1,
/// FALSIFY-SHIP-023) and passes it as the `tolerance_pp` argument of
/// [`verdict_from_score_drift`], where the bound is treated as inclusive.
pub const AC_SHIP1_023_MAX_HUMANEVAL_DRIFT_PP: f32 = 1.2;
/// Outcome of the SHIP-023 score-drift check produced by
/// [`verdict_from_score_drift`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Ship023Verdict {
/// All inputs were valid and the absolute drift was within the tolerance
/// (bound inclusive).
Pass,
/// The drift exceeded the tolerance, or an input was rejected: a non-finite
/// argument, a score outside `0.0..=100.0`, or a negative tolerance.
Fail,
}
/// Decides whether two percentage scores agree to within a tolerance.
///
/// Returns [`Ship023Verdict::Pass`] only when every input is valid and
/// `|day1_pct - day2_pct| <= tolerance_pp` (the bound is inclusive).
/// The function never panics; any of the following yields
/// [`Ship023Verdict::Fail`]:
///
/// - an argument is NaN or infinite,
/// - either score lies outside `0.0..=100.0`,
/// - `tolerance_pp` is negative,
/// - the absolute difference of the scores exceeds `tolerance_pp`.
///
/// Swapping `day1_pct` and `day2_pct` does not change the verdict, because
/// only the absolute difference is compared.
#[must_use]
pub fn verdict_from_score_drift(day1_pct: f32, day2_pct: f32, tolerance_pp: f32) -> Ship023Verdict {
    // A usable score is a finite percentage inside the closed range [0, 100].
    let usable_score = |pct: f32| pct.is_finite() && (0.0_f32..=100.0_f32).contains(&pct);
    // A usable tolerance is finite and not below zero.
    let usable_tolerance = tolerance_pp.is_finite() && tolerance_pp >= 0.0;

    // Validation comes first in the chain, so the drift comparison is only
    // reached once all three inputs are known to be well-formed.
    let within_tolerance = usable_score(day1_pct)
        && usable_score(day2_pct)
        && usable_tolerance
        && (day1_pct - day2_pct).abs() <= tolerance_pp;

    if within_tolerance {
        Ship023Verdict::Pass
    } else {
        Ship023Verdict::Fail
    }
}
#[cfg(test)]
mod ship_023_tests {
    use super::*;

    /// Falsification test for the SHIP-023 drift rule.
    ///
    /// Walks through, in order: the inclusive tolerance boundary, the first
    /// value above it, bands of passing and failing drifts, argument-order
    /// symmetry, rejection of non-finite / out-of-range / negative inputs,
    /// zero tolerance, the legal range endpoints, and finally the pinned
    /// value of the tolerance constant itself.
    #[test]
    fn falsify_ship_023_score_drift_threshold_logic() {
        const TOL: f32 = AC_SHIP1_023_MAX_HUMANEVAL_DRIFT_PP;
        const BASELINE: f32 = 86.0;
        // Shorthand: judge a score pair against the shipped tolerance.
        let judge = |first: f32, second: f32| verdict_from_score_drift(first, second, TOL);

        // --- Boundary: a drift equal to the tolerance is accepted. ---
        assert_eq!(
            judge(TOL, 0.0),
            Ship023Verdict::Pass,
            "drift exactly at tolerance boundary must Pass (inclusive)",
        );

        // --- Just past the boundary is rejected. ---
        let just_above: f32 = TOL + 1e-4;
        assert!(
            just_above > TOL,
            "harness sanity: just_above must really exceed tolerance",
        );
        assert_eq!(
            judge(just_above, 0.0),
            Ship023Verdict::Fail,
            "drift = 1.2 + 1e-4 must Fail (sharpest counter-example)",
        );

        // --- Band of drifts below the tolerance. ---
        for drift in [0.0_f32, 0.5, 1.0, 1.199] {
            assert_eq!(
                judge(BASELINE, BASELINE - drift),
                Ship023Verdict::Pass,
                "drift {drift} pp must Pass (below 1.2 tolerance)",
            );
        }

        // --- Band of drifts above the tolerance. ---
        for drift in [1.3_f32, 2.0, 10.0, 86.0] {
            assert_eq!(
                judge(BASELINE, BASELINE - drift),
                Ship023Verdict::Fail,
                "drift {drift} pp must Fail (above 1.2 tolerance)",
            );
        }

        // --- Symmetry: swapping the two scores never changes the verdict. ---
        let near_boundary = 84.8_f32;
        assert_eq!(
            judge(BASELINE, near_boundary),
            judge(near_boundary, BASELINE),
            "verdict must be order-invariant (Pass direction)",
        );

        let one_pp_lower = 85.0_f32;
        assert_eq!(
            judge(BASELINE, one_pp_lower),
            Ship023Verdict::Pass,
            "1.0 pp drift (within 1.2) must Pass in forward direction",
        );
        assert_eq!(
            judge(one_pp_lower, BASELINE),
            Ship023Verdict::Pass,
            "1.0 pp drift (within 1.2) must Pass in reverse direction (symmetry)",
        );

        let four_pp_lower = 82.0_f32;
        assert_eq!(
            judge(BASELINE, four_pp_lower),
            judge(four_pp_lower, BASELINE),
            "verdict must be order-invariant (Fail direction)",
        );
        assert_eq!(
            judge(BASELINE, four_pp_lower),
            Ship023Verdict::Fail,
            "4.0 pp drift must Fail regardless of order",
        );

        // --- Non-finite values are rejected in every argument position. ---
        for bad in [f32::NAN, f32::INFINITY, f32::NEG_INFINITY] {
            assert_eq!(
                judge(bad, BASELINE),
                Ship023Verdict::Fail,
                "non-finite day1 ({bad}) must Fail conservatively",
            );
            assert_eq!(
                judge(BASELINE, bad),
                Ship023Verdict::Fail,
                "non-finite day2 ({bad}) must Fail conservatively",
            );
            assert_eq!(
                verdict_from_score_drift(BASELINE, BASELINE, bad),
                Ship023Verdict::Fail,
                "non-finite tolerance ({bad}) must Fail conservatively",
            );
        }

        // --- Scores outside 0..=100 are rejected on either side. ---
        for oor in [-0.1_f32, -1.0, -86.0, 100.1, 101.0, 1_000.0] {
            assert_eq!(
                judge(oor, BASELINE),
                Ship023Verdict::Fail,
                "out-of-range day1 ({oor}) must Fail",
            );
            assert_eq!(
                judge(BASELINE, oor),
                Ship023Verdict::Fail,
                "out-of-range day2 ({oor}) must Fail",
            );
        }

        // --- A negative tolerance is rejected even for identical scores. ---
        for neg_tol in [-1e-6_f32, -0.001, -1.0, -100.0] {
            assert_eq!(
                verdict_from_score_drift(BASELINE, BASELINE, neg_tol),
                Ship023Verdict::Fail,
                "negative tolerance ({neg_tol}) must Fail (contract drift)",
            );
        }

        // --- Zero tolerance: only identical scores pass. ---
        assert_eq!(
            verdict_from_score_drift(BASELINE, BASELINE, 0.0),
            Ship023Verdict::Pass,
            "zero tolerance with identical scores must Pass",
        );
        assert_eq!(
            verdict_from_score_drift(BASELINE, 86.1, 0.0),
            Ship023Verdict::Fail,
            "zero tolerance with any drift must Fail",
        );

        // --- Both endpoints of the legal score range are accepted. ---
        assert_eq!(
            judge(0.0, 0.0),
            Ship023Verdict::Pass,
            "both at 0.0% must Pass (degenerate but legal)",
        );
        assert_eq!(
            judge(100.0, 100.0),
            Ship023Verdict::Pass,
            "both at 100.0% must Pass (degenerate but legal)",
        );

        // --- Pin the constant. Exact float equality is intended here: the
        // value is a literal, not the result of arithmetic. ---
        #[allow(clippy::float_cmp)]
        {
            assert_eq!(
                AC_SHIP1_023_MAX_HUMANEVAL_DRIFT_PP, 1.2_f32,
                "max HumanEval drift is 1.2 pp (spec §7.1 FALSIFY-SHIP-023)",
            );
        }
    }
}