Skip to main content

harn_vm/orchestration/
skill_gate.rs

1//! Contamination-safe held-out gates for skill and guidance candidates.
2//!
3//! The gate is deliberately data-driven: local or hosted runners can produce
4//! with/without observations however they execute models, then this module
5//! applies the reusable admission policy, contamination filter, context-cost
6//! accounting, grader checksum checks, and receipt shaping.
7
8use std::collections::{BTreeMap, BTreeSet};
9use std::fs;
10use std::path::{Path, PathBuf};
11
12use serde::{Deserialize, Serialize};
13use serde_json::Value as JsonValue;
14use sha2::{Digest, Sha256};
15use walkdir::WalkDir;
16
17use super::estimate_chunk_tokens;
18use crate::value::VmError;
19
/// Schema version a manifest must carry (a value of `0` is filled in with this
/// during normalization) and the version stamped onto reports.
pub const SKILL_GATE_SCHEMA_VERSION: u32 = 1;
/// `_type` tag a skill gate manifest must carry; an empty tag is filled in with
/// this value during normalization.
pub const SKILL_GATE_MANIFEST_TYPE: &str = "harn.skill_gate.manifest.v1";
/// `_type` tag written into every skill gate report.
pub const SKILL_GATE_REPORT_TYPE: &str = "harn.skill_gate.report.v1";
/// `_type` tag for skill gate receipts.
pub const SKILL_GATE_RECEIPT_TYPE: &str = "harn.skill_gate.receipt.v1";

/// Tolerance used when comparing floating-point scores, lifts, and rates
/// against policy thresholds.
const EPSILON: f64 = 0.000_000_1;
26
/// Input document for one skill gate run: target model, admission policy,
/// protected grader files, held-out tasks, and the candidate variants to judge.
///
/// Every field may be omitted on the wire (`#[serde(default)]`); required
/// values are enforced by `normalize_skill_gate_manifest`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct SkillGateManifest {
    /// Serialized as `_type`. Empty is replaced by [`SKILL_GATE_MANIFEST_TYPE`];
    /// any other value is rejected during normalization.
    #[serde(rename = "_type")]
    pub type_name: String,
    /// Schema version. `0` is replaced by [`SKILL_GATE_SCHEMA_VERSION`]; any
    /// other mismatch is rejected during normalization.
    pub version: u32,
    /// Manifest identifier; a blank id becomes `"skill-gate"` during normalization.
    pub id: String,
    /// Optional display name, copied into the report as `manifest_name`.
    pub name: Option<String>,
    /// Optional human-readable description.
    pub description: Option<String>,
    /// Directory against which manifest-relative paths are resolved. When the
    /// manifest is loaded from disk and this is unset, it defaults to the
    /// manifest file's parent directory. Also accepted as `base-dir`.
    #[serde(default, alias = "base-dir")]
    pub base_dir: Option<String>,
    /// Model the gate is evaluated for. Also accepted as `target-model`.
    #[serde(default, alias = "target-model")]
    pub target_model: SkillGateTargetModel,
    /// Admission thresholds applied to each variant.
    pub policy: SkillGatePolicy,
    /// Grader identity and the files whose checksums are verified.
    pub grader: SkillGateGrader,
    /// Held-out tasks; normalization requires at least one.
    pub tasks: Vec<SkillGateTask>,
    /// Candidate variants; normalization requires at least one.
    pub variants: Vec<SkillGateVariant>,
    /// Free-form metadata, copied unchanged into the report.
    pub metadata: BTreeMap<String, JsonValue>,
}
46
/// Description of the model a skill or guidance candidate is being gated for.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct SkillGateTargetModel {
    /// Model identifier; normalization rejects a blank value.
    pub id: String,
    /// Optional provider name.
    pub provider: Option<String>,
    /// Knowledge cutoff date. Only the leading `YYYY-MM-DD` prefix is parsed
    /// when comparing against a task's `created_at`. Also accepted as
    /// `knowledge-cutoff`.
    #[serde(default, alias = "knowledge-cutoff")]
    pub knowledge_cutoff: Option<String>,
    /// Context budget in tokens. Also accepted as `context-budget-tokens`.
    // NOTE(review): presumably the ceiling behind `within_target_budget`;
    // confirm against `measure_context`, which is not visible here.
    #[serde(default, alias = "context-budget-tokens")]
    pub context_budget_tokens: Option<usize>,
    /// Free-form key/value metadata.
    pub metadata: BTreeMap<String, JsonValue>,
}
58
/// Admission thresholds a variant must satisfy to be accepted.
///
/// Most fields are optional; the private accessor methods on this type supply
/// the default used when a field is unset. Each field also accepts a
/// kebab-case alias on the wire.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct SkillGatePolicy {
    /// Minimum number of included (contamination-safe) tasks; defaults to `1`.
    #[serde(default, alias = "min-included-tasks")]
    pub min_included_tasks: Option<usize>,
    /// Minimum mean score lift over the baseline; defaults to `0.0`.
    #[serde(default, alias = "min-score-lift")]
    pub min_score_lift: Option<f64>,
    /// Minimum mean gap recovery; defaults to `0.0`.
    #[serde(default, alias = "min-gap-recovery")]
    pub min_gap_recovery: Option<f64>,
    /// Minimum per-cluster gap recovery; falls back to the effective
    /// `min_gap_recovery` when unset.
    #[serde(default, alias = "min-cluster-gap-recovery")]
    pub min_cluster_gap_recovery: Option<f64>,
    /// When `true`, every cluster report that did not pass adds a failure.
    #[serde(default, alias = "require-cluster-lift")]
    pub require_cluster_lift: bool,
    /// Maximum tolerated regression rate; defaults to `0.0`.
    #[serde(default, alias = "max-regression-rate")]
    pub max_regression_rate: Option<f64>,
    /// Minimum candidate win rate; only checked when set.
    #[serde(default, alias = "min-win-rate")]
    pub min_win_rate: Option<f64>,
    /// Maximum allowed context growth in tokens.
    // NOTE(review): consumed by `measure_context`, which is not visible here —
    // confirm how an unset value is treated.
    #[serde(default, alias = "max-context-delta-tokens")]
    pub max_context_delta_tokens: Option<i64>,
    /// Score at or above which a task counts as passed when no explicit
    /// pass flag is given; defaults to `0.5`.
    #[serde(default, alias = "pass-score-threshold")]
    pub pass_score_threshold: Option<f64>,
    /// Whether a failed or missing grader integrity check rejects the variant;
    /// defaults to `true`.
    #[serde(default, alias = "require-no-tamper")]
    pub require_no_tamper: Option<bool>,
    /// Free-form key/value metadata.
    pub metadata: BTreeMap<String, JsonValue>,
}
84
85impl SkillGatePolicy {
86    fn min_included_tasks(&self) -> usize {
87        self.min_included_tasks.unwrap_or(1)
88    }
89
90    fn min_score_lift(&self) -> f64 {
91        self.min_score_lift.unwrap_or(0.0)
92    }
93
94    fn min_gap_recovery(&self) -> f64 {
95        self.min_gap_recovery.unwrap_or(0.0)
96    }
97
98    fn min_cluster_gap_recovery(&self) -> f64 {
99        self.min_cluster_gap_recovery
100            .unwrap_or_else(|| self.min_gap_recovery())
101    }
102
103    fn max_regression_rate(&self) -> f64 {
104        self.max_regression_rate.unwrap_or(0.0)
105    }
106
107    fn pass_score_threshold(&self) -> f64 {
108        self.pass_score_threshold.unwrap_or(0.5)
109    }
110
111    fn require_no_tamper(&self) -> bool {
112        self.require_no_tamper.unwrap_or(true)
113    }
114}
115
/// Grader identity plus the files that must remain unmodified for the gate's
/// results to be trusted.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct SkillGateGrader {
    /// Grader identifier.
    pub id: String,
    /// Paths whose SHA-256 is verified before variants are judged. Also
    /// accepted as `immutable-paths` or `protected-paths`. An empty list
    /// produces a warning, and a failure when the policy requires no tamper.
    #[serde(default, alias = "immutable-paths", alias = "protected-paths")]
    pub immutable_paths: Vec<SkillGateProtectedPath>,
    /// Free-form key/value metadata.
    pub metadata: BTreeMap<String, JsonValue>,
}
124
/// One grader path together with the checksum it is expected to have.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct SkillGateProtectedPath {
    /// Path to check, resolved against the manifest's base directory.
    pub path: String,
    /// Expected SHA-256 digest; compared ignoring ASCII case after trimming
    /// surrounding whitespace.
    pub sha256: String,
    /// Optional label, echoed in the tamper check.
    pub label: Option<String>,
}
132
/// A held-out task together with its reference baseline and frontier scores.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct SkillGateTask {
    /// Task identifier; blank ids become `task_<n>` (1-based position) during
    /// normalization, and duplicates are rejected.
    pub id: String,
    /// Optional display name.
    pub name: Option<String>,
    /// Cluster the task belongs to; blank becomes `"default"` during normalization.
    pub cluster: String,
    /// Optional description of where the task came from.
    pub source: Option<String>,
    /// Held-out provenance used to decide whether the task is contamination-safe.
    pub heldout: SkillGateHeldout,
    /// Score without the candidate; must lie in `[0, 1]`. Also accepted as
    /// `baseline-score`.
    #[serde(default, alias = "baseline-score")]
    pub baseline_score: f64,
    /// Reference frontier score; must lie in `[0, 1]`. Gap recovery is only
    /// computed when this exceeds the baseline. Also accepted as `frontier-score`.
    #[serde(default, alias = "frontier-score")]
    pub frontier_score: f64,
    /// Explicit baseline pass verdict; when unset it is derived by comparing
    /// `baseline_score` with the policy's pass threshold. Also accepted as
    /// `baseline-passed`.
    #[serde(default, alias = "baseline-passed")]
    pub baseline_passed: Option<bool>,
    /// Free-form key/value metadata.
    pub metadata: BTreeMap<String, JsonValue>,
}
149
/// Provenance of a held-out task, used by `task_safety_report` to decide
/// whether the task may count toward the gate.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct SkillGateHeldout {
    /// Held-out category. Matching is done on a normalized form (trimmed,
    /// ASCII-lowercased, `-` and spaces turned into `_`):
    /// `private` is always included; `public_static`, `static`, and
    /// `pre_cutoff` are always excluded; `post_cutoff`, `rolling`,
    /// `livecodebench`, `swe_mera`, and `swe_rebench` are included only when
    /// `created_at` is after the target model's knowledge cutoff; anything
    /// else is excluded as unrecognized.
    pub kind: String,
    /// Creation date of the task; only the leading `YYYY-MM-DD` is parsed.
    /// Also accepted as `created-at`.
    #[serde(default, alias = "created-at")]
    pub created_at: Option<String>,
    /// Marks the task as private, which includes it regardless of `kind`.
    pub private: bool,
    /// Optional name of the suite the task belongs to.
    pub suite: Option<String>,
    /// Free-form key/value metadata.
    pub metadata: BTreeMap<String, JsonValue>,
}
160
/// One candidate configuration being judged, together with its observed
/// per-task results.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct SkillGateVariant {
    /// Variant identifier; blank ids become `variant_<n>` (1-based position)
    /// during normalization, and duplicates are rejected.
    pub id: String,
    /// Optional display name, copied into the variant report.
    pub name: Option<String>,
    /// Optional human-readable description.
    pub description: Option<String>,
    /// Artifacts describing the "without" side of the comparison.
    pub baseline: SkillGateArtifact,
    /// Artifacts describing the "with" side of the comparison.
    pub candidate: SkillGateArtifact,
    /// Observed results, at most one per task and each naming a declared
    /// task. Also accepted as `case-results`.
    #[serde(default, alias = "case-results")]
    pub case_results: Vec<SkillGateCaseResult>,
    /// Free-form key/value metadata.
    pub metadata: BTreeMap<String, JsonValue>,
}
173
/// A set of artifact paths attached to the baseline or candidate side of a variant.
// NOTE(review): these fields are consumed by `measure_context`, which is not
// visible in this part of the file — confirm the details below against it.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct SkillGateArtifact {
    /// Artifact category label.
    pub kind: String,
    /// Artifact paths (presumably resolved against the manifest base directory).
    pub paths: Vec<String>,
    /// Optional token count; also accepted as `context-tokens`. Presumably
    /// overrides measuring the files — TODO confirm.
    #[serde(default, alias = "context-tokens")]
    pub context_tokens: Option<usize>,
    /// Free-form key/value metadata.
    pub metadata: BTreeMap<String, JsonValue>,
}
183
/// Observed outcome of running one variant on one task.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct SkillGateCaseResult {
    /// Id of the task this result belongs to; must be non-blank and name a
    /// declared task. Also accepted as `task-id`.
    #[serde(default, alias = "task-id")]
    pub task_id: String,
    /// Candidate score; when present it must lie in `[0, 1]`. An included task
    /// whose result has no score is reported as missing a result.
    pub score: Option<f64>,
    /// Explicit pass verdict; when unset it is derived by comparing `score`
    /// (treated as `0.0` if absent) with the policy's pass threshold.
    pub passed: Option<bool>,
    /// Optional notes, copied into the case report.
    pub notes: Option<String>,
    /// Free-form key/value metadata.
    pub metadata: BTreeMap<String, JsonValue>,
}
194
/// Full result of evaluating a manifest, as produced by
/// `evaluate_skill_gate_manifest`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct SkillGateReport {
    /// Serialized as `_type`; set to [`SKILL_GATE_REPORT_TYPE`].
    #[serde(rename = "_type")]
    pub type_name: String,
    /// Set to [`SKILL_GATE_SCHEMA_VERSION`].
    pub schema_version: u32,
    /// Id of the (normalized) manifest.
    pub manifest_id: String,
    /// Display name of the manifest, if any.
    pub manifest_name: Option<String>,
    /// Target model copied from the manifest.
    pub target_model: SkillGateTargetModel,
    /// `true` exactly when a variant was selected.
    pub pass: bool,
    /// Id of the selected variant, if any.
    pub selected_variant_id: Option<String>,
    /// Number of tasks judged contamination-safe.
    pub included_task_count: usize,
    /// Number of tasks excluded by the contamination filter.
    pub excluded_task_count: usize,
    /// Per-task inclusion decisions, in manifest order.
    pub task_safety: Vec<SkillGateTaskSafetyReport>,
    /// Outcome of the grader checksum verification.
    pub tamper: SkillGateTamperReport,
    /// Per-variant evaluation results, in manifest order.
    pub variants: Vec<SkillGateVariantReport>,
    /// Result of `pareto_frontier` over the variant reports.
    pub pareto_frontier: Vec<String>,
    /// Receipt built from this evaluation.
    pub receipt: SkillGateReceipt,
    /// Metadata copied unchanged from the manifest.
    pub metadata: BTreeMap<String, JsonValue>,
}
215
/// Contamination-filter decision for a single task.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct SkillGateTaskSafetyReport {
    /// Id of the task the decision applies to.
    pub task_id: String,
    /// Cluster of the task.
    pub cluster: String,
    /// Whether the task counts toward the gate.
    pub included: bool,
    /// The task's held-out kind exactly as declared (not normalized).
    pub heldout_kind: String,
    /// The task's declared creation date, if any.
    pub created_at: Option<String>,
    /// The task's declared `private` flag.
    pub private: bool,
    /// Why the task was excluded; `None` for included tasks.
    pub exclusion_reason: Option<String>,
}
227
/// Aggregate outcome of verifying the grader's protected paths.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct SkillGateTamperReport {
    /// `true` when no check recorded a failure.
    pub pass: bool,
    /// One entry per protected path, in declaration order.
    pub checks: Vec<SkillGateTamperCheck>,
    /// Failure messages collected from the individual checks.
    pub failures: Vec<String>,
}
235
/// Result of verifying one protected grader path.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct SkillGateTamperCheck {
    /// Path as declared in the manifest.
    pub path: String,
    /// Label as declared in the manifest, if any.
    pub label: Option<String>,
    /// Digest declared in the manifest.
    pub expected_sha256: String,
    /// Digest computed for the path; `None` when hashing failed.
    pub actual_sha256: Option<String>,
    /// `"pass"` or `"fail"`.
    pub status: String,
    /// Mismatch or hashing error message; `None` on success.
    pub failure: Option<String>,
}
246
/// Evaluation outcome for one variant.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct SkillGateVariantReport {
    /// Variant id.
    pub id: String,
    /// Variant display name, if any.
    pub name: Option<String>,
    /// `true` when `failures` is empty.
    pub accepted: bool,
    /// `"accepted"` or `"rejected"`, mirroring `accepted`.
    pub decision: String,
    /// Every reason the variant was rejected (per-case and policy-level).
    pub failures: Vec<String>,
    /// Non-fatal observations.
    pub warnings: Vec<String>,
    /// Aggregate score metrics over the cases.
    pub metrics: SkillGateVariantMetrics,
    /// Context-cost accounting; left at its default when measurement failed.
    pub context: SkillGateContextReport,
    /// Per-cluster aggregates.
    pub clusters: Vec<SkillGateClusterReport>,
    /// One entry per manifest task, in manifest order.
    pub cases: Vec<SkillGateCaseReport>,
}
261
/// Aggregate metrics for a variant, computed by `aggregate_variant_metrics`.
///
/// "Scored" cases are included cases that have a candidate score; the means
/// below are taken over scored cases and stay `0.0` when there are none.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct SkillGateVariantMetrics {
    /// Number of included cases.
    pub included_task_count: usize,
    /// Number of included cases that have a candidate score.
    pub scored_task_count: usize,
    /// Number of scored cases for which a gap recovery was computed.
    pub gap_task_count: usize,
    /// Mean baseline score over scored cases.
    pub mean_baseline_score: f64,
    /// Mean candidate score over scored cases.
    pub mean_candidate_score: f64,
    /// Mean frontier score over scored cases.
    pub mean_frontier_score: f64,
    /// Mean of (candidate − baseline) over scored cases.
    pub mean_score_lift: f64,
    /// Mean gap recovery.
    pub mean_gap_recovery: f64,
    /// Scored cases whose lift exceeds the comparison tolerance.
    pub candidate_win_count: usize,
    /// Scored cases that are neither wins nor losses.
    pub candidate_tie_count: usize,
    /// Scored cases whose lift is below the negated comparison tolerance.
    pub candidate_loss_count: usize,
    /// Candidate win rate, compared against the policy's `min_win_rate` when set.
    pub win_rate: f64,
    /// Number of regressions counted.
    pub regression_count: usize,
    /// Denominator used for `regression_rate`.
    pub regression_denominator: usize,
    /// Regression rate, compared against the policy's maximum.
    pub regression_rate: f64,
}
281
/// Context-cost accounting for one variant.
// NOTE(review): populated by `measure_context`, which is not visible in this
// part of the file; field notes below describe only how the values are
// consumed by `evaluate_variant`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct SkillGateContextReport {
    /// Token count attributed to the baseline artifacts.
    pub baseline_tokens: usize,
    /// Token count attributed to the candidate artifacts; quoted in the
    /// target-budget failure message.
    pub candidate_tokens: usize,
    /// Context growth in tokens; quoted in the delta-budget failure message.
    pub delta_tokens: i64,
    /// Allowed context growth, if limited.
    pub max_delta_tokens: Option<i64>,
    /// Target model context budget, if declared.
    pub target_context_budget_tokens: Option<usize>,
    /// `false` causes a "context delta ... exceeds allowed" failure.
    pub within_delta_budget: bool,
    /// `false` causes a "candidate context ... exceeds target budget" failure.
    pub within_target_budget: bool,
    /// Per-artifact hash and size records.
    pub artifact_hashes: Vec<SkillGateArtifactHash>,
}
294
/// Hash and size record for a single artifact.
// NOTE(review): the producer of these records is not visible in this part of
// the file; field meanings are inferred from names and types — confirm.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct SkillGateArtifactHash {
    /// Role of the artifact (presumably baseline vs. candidate).
    pub role: String,
    /// Artifact path.
    pub path: String,
    /// SHA-256 digest of the artifact.
    pub sha256: String,
    /// Token count recorded for the artifact.
    pub tokens: usize,
    /// Byte size recorded for the artifact.
    pub bytes: usize,
}
304
/// Aggregate metrics for the cases of one cluster.
// NOTE(review): computed by `aggregate_cluster_reports`, which is not visible
// in this part of the file — confirm exact definitions there.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct SkillGateClusterReport {
    /// Cluster name.
    pub cluster: String,
    /// Number of tasks counted for the cluster.
    pub task_count: usize,
    /// Number of those tasks with a gap recovery value.
    pub gap_task_count: usize,
    /// Mean baseline score for the cluster.
    pub mean_baseline_score: f64,
    /// Mean candidate score for the cluster.
    pub mean_candidate_score: f64,
    /// Mean frontier score for the cluster.
    pub mean_frontier_score: f64,
    /// Mean score lift for the cluster.
    pub mean_score_lift: f64,
    /// Mean gap recovery for the cluster; quoted in the cluster failure message.
    pub mean_gap_recovery: f64,
    /// When `false` and the policy requires cluster lift, the variant is rejected.
    pub pass: bool,
}
318
/// Per-task evaluation of a variant, produced by `evaluate_case`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct SkillGateCaseReport {
    /// Id of the task.
    pub task_id: String,
    /// Cluster of the task.
    pub cluster: String,
    /// Whether the task passed the contamination filter. Excluded cases carry
    /// no lift, gap recovery, regression flag, or failures.
    pub included: bool,
    /// Reason the task was excluded, if it was.
    pub exclusion_reason: Option<String>,
    /// Baseline score from the task.
    pub baseline_score: f64,
    /// Candidate score from the variant's result, if one was supplied.
    pub candidate_score: Option<f64>,
    /// Frontier score from the task.
    pub frontier_score: f64,
    /// Candidate minus baseline; set only for included cases with a score.
    pub score_lift: Option<f64>,
    /// Lift divided by (frontier − baseline); set only when the frontier
    /// exceeds the baseline by more than the comparison tolerance.
    pub gap_recovery: Option<f64>,
    /// Baseline pass verdict (explicit, or derived from the pass threshold).
    pub baseline_passed: bool,
    /// Candidate pass verdict; `None` when the variant has no result for the task.
    pub candidate_passed: Option<bool>,
    /// `true` when the baseline passed but the candidate did not.
    pub regression: bool,
    /// Failures attributable to this case (missing result, regression).
    pub failures: Vec<String>,
    /// Notes copied from the variant's result.
    pub notes: Option<String>,
}
337
/// Compact record of a gate decision, embedded in the report.
// NOTE(review): assembled by `build_receipt`, which is not visible in this
// part of the file; field notes are inferred from names, types, and the
// arguments passed to it — confirm against that function.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct SkillGateReceipt {
    /// Serialized as `_type` (presumably [`SKILL_GATE_RECEIPT_TYPE`]).
    #[serde(rename = "_type")]
    pub type_name: String,
    /// Schema version of the receipt.
    pub schema_version: u32,
    /// Id of the evaluated manifest.
    pub manifest_id: String,
    /// Id of the target model.
    pub target_model_id: String,
    /// Overall gate verdict.
    pub accepted: bool,
    /// Id of the selected variant, if any.
    pub selected_variant_id: Option<String>,
    /// Textual form of the verdict.
    pub decision: String,
    /// Metrics of the selected variant, if any.
    pub metrics: Option<SkillGateVariantMetrics>,
    /// Context accounting of the selected variant, if any.
    pub context: Option<SkillGateContextReport>,
    /// Grader checksum verification outcome.
    pub tamper: SkillGateTamperReport,
    /// Pareto frontier as computed for the report.
    pub pareto_frontier: Vec<String>,
    /// Ids of tasks excluded by the contamination filter.
    pub excluded_task_ids: Vec<String>,
    /// Condensed per-variant outcomes.
    pub variant_receipts: Vec<SkillGateVariantReceipt>,
    /// Free-form key/value metadata.
    pub metadata: BTreeMap<String, JsonValue>,
}
357
/// Condensed per-variant outcome stored in a [`SkillGateReceipt`].
// NOTE(review): populated by code not visible in this part of the file;
// presumably mirrors the matching `SkillGateVariantReport` — confirm.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct SkillGateVariantReceipt {
    /// Id of the variant.
    pub variant_id: String,
    /// Whether the variant was accepted.
    pub accepted: bool,
    /// Textual form of the verdict.
    pub decision: String,
    /// Aggregate metrics of the variant.
    pub metrics: SkillGateVariantMetrics,
    /// Context growth in tokens.
    pub context_delta_tokens: i64,
    /// Reasons the variant was rejected.
    pub failures: Vec<String>,
}
368
369pub fn load_skill_gate_manifest(path: &Path) -> Result<SkillGateManifest, VmError> {
370    let content = fs::read_to_string(path).map_err(|error| {
371        VmError::Runtime(format!("failed to read skill gate manifest: {error}"))
372    })?;
373    let mut manifest: SkillGateManifest =
374        if path.extension().and_then(|ext| ext.to_str()) == Some("toml") {
375            toml::from_str(&content).map_err(|error| {
376                VmError::Runtime(format!("failed to parse skill gate TOML: {error}"))
377            })?
378        } else {
379            serde_json::from_str(&content).map_err(|error| {
380                VmError::Runtime(format!("failed to parse skill gate JSON: {error}"))
381            })?
382        };
383    if manifest.base_dir.is_none() {
384        manifest.base_dir = path.parent().map(|parent| parent.display().to_string());
385    }
386    normalize_skill_gate_manifest(&mut manifest)?;
387    Ok(manifest)
388}
389
390pub fn evaluate_skill_gate_manifest(
391    manifest: &SkillGateManifest,
392) -> Result<SkillGateReport, VmError> {
393    let mut manifest = manifest.clone();
394    normalize_skill_gate_manifest(&mut manifest)?;
395    let base_dir = manifest.base_dir.as_deref().map(Path::new);
396    let task_safety = manifest
397        .tasks
398        .iter()
399        .map(|task| task_safety_report(task, &manifest.target_model))
400        .collect::<Vec<_>>();
401    let included_task_count = task_safety.iter().filter(|task| task.included).count();
402    let excluded_task_count = task_safety.len().saturating_sub(included_task_count);
403    let safety_by_id = task_safety
404        .iter()
405        .map(|task| (task.task_id.as_str(), task))
406        .collect::<BTreeMap<_, _>>();
407    let tamper = verify_immutable_grader(&manifest.grader, base_dir);
408    let variants = manifest
409        .variants
410        .iter()
411        .map(|variant| evaluate_variant(variant, &manifest, &safety_by_id, &tamper, base_dir))
412        .collect::<Vec<_>>();
413    let pareto_frontier = pareto_frontier(&variants);
414    let selected_variant_id = select_variant(&variants, &pareto_frontier);
415    let pass = selected_variant_id.is_some();
416    let receipt = build_receipt(
417        &manifest,
418        pass,
419        selected_variant_id.clone(),
420        &task_safety,
421        tamper.clone(),
422        &variants,
423        pareto_frontier.clone(),
424    );
425    Ok(SkillGateReport {
426        type_name: SKILL_GATE_REPORT_TYPE.to_string(),
427        schema_version: SKILL_GATE_SCHEMA_VERSION,
428        manifest_id: manifest.id,
429        manifest_name: manifest.name,
430        target_model: manifest.target_model,
431        pass,
432        selected_variant_id,
433        included_task_count,
434        excluded_task_count,
435        task_safety,
436        tamper,
437        variants,
438        pareto_frontier,
439        receipt,
440        metadata: manifest.metadata,
441    })
442}
443
/// Fills in manifest defaults in place and validates structural invariants.
///
/// Defaults applied: empty `_type`, zero `version`, blank manifest id
/// (`"skill-gate"`), blank task ids (`task_<n>`), blank clusters
/// (`"default"`), and blank variant ids (`variant_<n>`).
///
/// # Errors
///
/// Returns `VmError::Runtime` for the first violation found, checked in this
/// order: wrong `_type`, wrong `version`, missing `target_model.id`, no tasks,
/// no variants, then per task (duplicate id, out-of-range scores), then per
/// variant (duplicate id, and per case result: blank task id, unknown task,
/// duplicate result, out-of-range score).
fn normalize_skill_gate_manifest(manifest: &mut SkillGateManifest) -> Result<(), VmError> {
    if manifest.type_name.is_empty() {
        manifest.type_name = SKILL_GATE_MANIFEST_TYPE.to_string();
    }
    if manifest.type_name != SKILL_GATE_MANIFEST_TYPE {
        return Err(VmError::Runtime(format!(
            "skill gate manifest _type must be {SKILL_GATE_MANIFEST_TYPE}"
        )));
    }
    // A zero version is what `Default` yields for an omitted field.
    if manifest.version == 0 {
        manifest.version = SKILL_GATE_SCHEMA_VERSION;
    }
    if manifest.version != SKILL_GATE_SCHEMA_VERSION {
        return Err(VmError::Runtime(format!(
            "skill gate manifest version must be {SKILL_GATE_SCHEMA_VERSION}"
        )));
    }
    if manifest.id.trim().is_empty() {
        manifest.id = "skill-gate".to_string();
    }
    if manifest.target_model.id.trim().is_empty() {
        return Err(VmError::Runtime(
            "skill gate manifest target_model.id is required".to_string(),
        ));
    }
    if manifest.tasks.is_empty() {
        return Err(VmError::Runtime(
            "skill gate manifest must declare at least one task".to_string(),
        ));
    }
    if manifest.variants.is_empty() {
        return Err(VmError::Runtime(
            "skill gate manifest must declare at least one variant".to_string(),
        ));
    }
    // Collected after default ids are assigned, so generated ids take part in
    // the duplicate check and in the case-result lookups below.
    let mut task_ids = BTreeSet::new();
    for (index, task) in manifest.tasks.iter_mut().enumerate() {
        if task.id.trim().is_empty() {
            task.id = format!("task_{}", index + 1);
        }
        if !task_ids.insert(task.id.clone()) {
            return Err(VmError::Runtime(format!(
                "skill gate manifest has duplicate task id '{}'",
                task.id
            )));
        }
        if task.cluster.trim().is_empty() {
            task.cluster = "default".to_string();
        }
        validate_score("baseline_score", &task.id, task.baseline_score)?;
        validate_score("frontier_score", &task.id, task.frontier_score)?;
    }
    let mut variant_ids = BTreeSet::new();
    for (index, variant) in manifest.variants.iter_mut().enumerate() {
        if variant.id.trim().is_empty() {
            variant.id = format!("variant_{}", index + 1);
        }
        if !variant_ids.insert(variant.id.clone()) {
            return Err(VmError::Runtime(format!(
                "skill gate manifest has duplicate variant id '{}'",
                variant.id
            )));
        }
        // Each variant may report at most one result per declared task.
        let mut result_ids = BTreeSet::new();
        for result in &variant.case_results {
            if result.task_id.trim().is_empty() {
                return Err(VmError::Runtime(format!(
                    "skill gate variant '{}' has a case result with no task_id",
                    variant.id
                )));
            }
            if !task_ids.contains(&result.task_id) {
                return Err(VmError::Runtime(format!(
                    "skill gate variant '{}' references unknown task '{}'",
                    variant.id, result.task_id
                )));
            }
            if !result_ids.insert(result.task_id.clone()) {
                return Err(VmError::Runtime(format!(
                    "skill gate variant '{}' has duplicate result for task '{}'",
                    variant.id, result.task_id
                )));
            }
            if let Some(score) = result.score {
                validate_score("candidate score", &result.task_id, score)?;
            }
        }
    }
    Ok(())
}
534
535fn validate_score(label: &str, task_id: &str, score: f64) -> Result<(), VmError> {
536    if !(0.0..=1.0).contains(&score) {
537        return Err(VmError::Runtime(format!(
538            "skill gate task '{task_id}' {label} must be between 0 and 1"
539        )));
540    }
541    Ok(())
542}
543
544fn task_safety_report(
545    task: &SkillGateTask,
546    target_model: &SkillGateTargetModel,
547) -> SkillGateTaskSafetyReport {
548    let kind = normalize_kind(&task.heldout.kind);
549    let (included, exclusion_reason) = if task.heldout.private || kind == "private" {
550        (true, None)
551    } else if matches!(kind.as_str(), "public_static" | "static" | "pre_cutoff") {
552        (
553            false,
554            Some("static public or declared pre-cutoff task is contamination-prone".to_string()),
555        )
556    } else if matches!(
557        kind.as_str(),
558        "post_cutoff" | "rolling" | "livecodebench" | "swe_mera" | "swe_rebench"
559    ) {
560        match (
561            task.heldout.created_at.as_deref(),
562            target_model.knowledge_cutoff.as_deref(),
563        ) {
564            (Some(created_at), Some(cutoff)) if date_after(created_at, cutoff).unwrap_or(false) => {
565                (true, None)
566            }
567            (Some(_), Some(cutoff)) => (
568                false,
569                Some(format!(
570                    "task does not post-date target model cutoff {cutoff}"
571                )),
572            ),
573            (Some(_), None) => (
574                false,
575                Some(
576                    "target model knowledge_cutoff is required for non-private held-out tasks"
577                        .to_string(),
578                ),
579            ),
580            (None, _) => (
581                false,
582                Some("non-private held-out task must declare created_at".to_string()),
583            ),
584        }
585    } else {
586        (
587            false,
588            Some(format!(
589                "held-out kind '{}' is not recognized as contamination-safe",
590                task.heldout.kind
591            )),
592        )
593    };
594
595    SkillGateTaskSafetyReport {
596        task_id: task.id.clone(),
597        cluster: task.cluster.clone(),
598        included,
599        heldout_kind: task.heldout.kind.clone(),
600        created_at: task.heldout.created_at.clone(),
601        private: task.heldout.private,
602        exclusion_reason,
603    }
604}
605
/// Canonicalizes a held-out kind for matching: surrounding whitespace is
/// trimmed, ASCII letters are lowercased, and every `-` or space becomes `_`.
fn normalize_kind(kind: &str) -> String {
    kind.trim()
        .chars()
        .map(|ch| match ch {
            '-' | ' ' => '_',
            other => other.to_ascii_lowercase(),
        })
        .collect()
}
609
610fn date_after(created_at: &str, cutoff: &str) -> Option<bool> {
611    Some(parse_date_prefix(created_at)? > parse_date_prefix(cutoff)?)
612}
613
614fn parse_date_prefix(value: &str) -> Option<(u32, u32, u32)> {
615    let trimmed = value.trim();
616    let prefix = trimmed.get(..10)?;
617    let bytes = prefix.as_bytes();
618    if bytes.get(4) != Some(&b'-') || bytes.get(7) != Some(&b'-') {
619        return None;
620    }
621    for index in [0, 1, 2, 3, 5, 6, 8, 9] {
622        if !bytes[index].is_ascii_digit() {
623            return None;
624        }
625    }
626    let year = prefix[0..4].parse::<u32>().ok()?;
627    let month = prefix[5..7].parse::<u32>().ok()?;
628    let day = prefix[8..10].parse::<u32>().ok()?;
629    let max_day = match month {
630        1 | 3 | 5 | 7 | 8 | 10 | 12 => 31,
631        4 | 6 | 9 | 11 => 30,
632        2 if is_leap_year(year) => 29,
633        2 => 28,
634        _ => return None,
635    };
636    if day == 0 || day > max_day {
637        None
638    } else {
639        Some((year, month, day))
640    }
641}
642
/// Returns whether `year` is a leap year under the Gregorian rule: century
/// years must be divisible by 400, all other years by 4.
fn is_leap_year(year: u32) -> bool {
    if year.is_multiple_of(100) {
        year.is_multiple_of(400)
    } else {
        year.is_multiple_of(4)
    }
}
646
647fn verify_immutable_grader(
648    grader: &SkillGateGrader,
649    base_dir: Option<&Path>,
650) -> SkillGateTamperReport {
651    let mut checks = Vec::new();
652    let mut failures = Vec::new();
653    for protected in &grader.immutable_paths {
654        let resolved = resolve_manifest_path(base_dir, &protected.path);
655        let mut check = SkillGateTamperCheck {
656            path: protected.path.clone(),
657            label: protected.label.clone(),
658            expected_sha256: protected.sha256.clone(),
659            ..Default::default()
660        };
661        match sha256_path(&resolved) {
662            Ok(hash) => {
663                check.actual_sha256 = Some(hash.sha256.clone());
664                if hash.sha256.eq_ignore_ascii_case(protected.sha256.trim()) {
665                    check.status = "pass".to_string();
666                } else {
667                    check.status = "fail".to_string();
668                    check.failure = Some(format!(
669                        "checksum mismatch for immutable grader path {}",
670                        protected.path
671                    ));
672                }
673            }
674            Err(error) => {
675                check.status = "fail".to_string();
676                check.failure = Some(error);
677            }
678        }
679        if let Some(failure) = &check.failure {
680            failures.push(failure.clone());
681        }
682        checks.push(check);
683    }
684    SkillGateTamperReport {
685        pass: failures.is_empty(),
686        checks,
687        failures,
688    }
689}
690
/// Judges one variant against the manifest's policy and returns its report.
///
/// `safety_by_id` maps task ids to their contamination-filter decisions,
/// `tamper` is the shared grader checksum result, and `base_dir` is the root
/// for resolving artifact paths. The variant is accepted only when no failure
/// was recorded; failures are appended in a fixed order (context measurement
/// error, per-case failures, then each policy check below).
fn evaluate_variant(
    variant: &SkillGateVariant,
    manifest: &SkillGateManifest,
    safety_by_id: &BTreeMap<&str, &SkillGateTaskSafetyReport>,
    tamper: &SkillGateTamperReport,
    base_dir: Option<&Path>,
) -> SkillGateVariantReport {
    let mut failures = Vec::new();
    let mut warnings = Vec::new();
    // When context measurement fails, the error itself is the failure and the
    // budget checks further down are skipped (the default report would
    // otherwise look like a budget violation).
    let mut context_valid = true;
    let context = match measure_context(variant, manifest, base_dir) {
        Ok(context) => context,
        Err(error) => {
            context_valid = false;
            failures.push(error);
            SkillGateContextReport::default()
        }
    };
    let results_by_task = variant
        .case_results
        .iter()
        .map(|result| (result.task_id.as_str(), result))
        .collect::<BTreeMap<_, _>>();
    // One case report per manifest task, whether or not the variant supplied a result.
    let cases = manifest
        .tasks
        .iter()
        .map(|task| {
            evaluate_case(
                task,
                results_by_task.get(task.id.as_str()).copied(),
                safety_by_id.get(task.id.as_str()).copied(),
                manifest.policy.pass_score_threshold(),
            )
        })
        .collect::<Vec<_>>();
    for case in &cases {
        failures.extend(case.failures.iter().cloned());
    }
    let metrics = aggregate_variant_metrics(&cases);
    let clusters = aggregate_cluster_reports(&cases, manifest.policy.min_cluster_gap_recovery());
    if !tamper.pass && manifest.policy.require_no_tamper() {
        failures.push("immutable grader check failed".to_string());
    }
    if metrics.included_task_count < manifest.policy.min_included_tasks() {
        failures.push(format!(
            "included held-out task count {} is below required {}",
            metrics.included_task_count,
            manifest.policy.min_included_tasks()
        ));
    }
    if metrics.scored_task_count == 0 {
        failures.push("no contamination-safe scored tasks were available".to_string());
    }
    // Threshold comparisons allow EPSILON of slack so values that equal the
    // threshold up to floating-point noise still pass.
    if metrics.mean_score_lift + EPSILON < manifest.policy.min_score_lift() {
        failures.push(format!(
            "mean score lift {:.4} is below required {:.4}",
            metrics.mean_score_lift,
            manifest.policy.min_score_lift()
        ));
    }
    if metrics.mean_gap_recovery + EPSILON < manifest.policy.min_gap_recovery() {
        failures.push(format!(
            "mean gap recovery {:.4} is below required {:.4}",
            metrics.mean_gap_recovery,
            manifest.policy.min_gap_recovery()
        ));
    }
    if metrics.regression_rate > manifest.policy.max_regression_rate() + EPSILON {
        failures.push(format!(
            "regression rate {:.4} exceeds allowed {:.4}",
            metrics.regression_rate,
            manifest.policy.max_regression_rate()
        ));
    }
    // The win-rate floor is opt-in: it is only enforced when configured.
    if let Some(min_win_rate) = manifest.policy.min_win_rate {
        if metrics.win_rate + EPSILON < min_win_rate {
            failures.push(format!(
                "candidate win rate {:.4} is below required {:.4}",
                metrics.win_rate, min_win_rate
            ));
        }
    }
    if context_valid && !context.within_delta_budget {
        failures.push(format!(
            "context delta {} tokens exceeds allowed {}",
            context.delta_tokens,
            context.max_delta_tokens.unwrap_or_default()
        ));
    }
    if context_valid && !context.within_target_budget {
        failures.push(format!(
            "candidate context {} tokens exceeds target budget {}",
            context.candidate_tokens,
            context.target_context_budget_tokens.unwrap_or_default()
        ));
    }
    if manifest.policy.require_cluster_lift {
        for cluster in &clusters {
            if !cluster.pass {
                failures.push(format!(
                    "cluster '{}' gap recovery {:.4} is below required {:.4}",
                    cluster.cluster,
                    cluster.mean_gap_recovery,
                    manifest.policy.min_cluster_gap_recovery()
                ));
            }
        }
    }
    // Without protected paths there is nothing to verify: always warn, and
    // additionally reject when the policy demands tamper protection.
    if manifest.grader.immutable_paths.is_empty() {
        let warning = "no immutable grader paths were declared".to_string();
        if manifest.policy.require_no_tamper() {
            failures.push(warning.clone());
        }
        warnings.push(warning);
    }
    let accepted = failures.is_empty();
    SkillGateVariantReport {
        id: variant.id.clone(),
        name: variant.name.clone(),
        accepted,
        decision: if accepted {
            "accepted".to_string()
        } else {
            "rejected".to_string()
        },
        failures,
        warnings,
        metrics,
        context,
        clusters,
        cases,
    }
}
824
825fn evaluate_case(
826    task: &SkillGateTask,
827    result: Option<&SkillGateCaseResult>,
828    safety: Option<&SkillGateTaskSafetyReport>,
829    pass_score_threshold: f64,
830) -> SkillGateCaseReport {
831    let included = safety.is_none_or(|safety| safety.included);
832    let exclusion_reason = safety.and_then(|safety| safety.exclusion_reason.clone());
833    let baseline_passed = task
834        .baseline_passed
835        .unwrap_or(task.baseline_score >= pass_score_threshold);
836    let mut report = SkillGateCaseReport {
837        task_id: task.id.clone(),
838        cluster: task.cluster.clone(),
839        included,
840        exclusion_reason,
841        baseline_score: task.baseline_score,
842        candidate_score: result.and_then(|result| result.score),
843        frontier_score: task.frontier_score,
844        baseline_passed,
845        candidate_passed: result.map(|result| {
846            result
847                .passed
848                .unwrap_or_else(|| result.score.unwrap_or(0.0) >= pass_score_threshold)
849        }),
850        notes: result.and_then(|result| result.notes.clone()),
851        ..Default::default()
852    };
853    if !included {
854        return report;
855    }
856    let Some(candidate_score) = report.candidate_score else {
857        report
858            .failures
859            .push(format!("variant is missing result for task '{}'", task.id));
860        return report;
861    };
862    let score_lift = candidate_score - task.baseline_score;
863    report.score_lift = Some(score_lift);
864    if task.frontier_score > task.baseline_score + EPSILON {
865        report.gap_recovery = Some(score_lift / (task.frontier_score - task.baseline_score));
866    }
867    let candidate_passed = report.candidate_passed.unwrap_or(false);
868    report.regression = baseline_passed && !candidate_passed;
869    if report.regression {
870        report.failures.push(format!(
871            "task '{}' regressed from passing to failing",
872            task.id
873        ));
874    }
875    report
876}
877
878fn aggregate_variant_metrics(cases: &[SkillGateCaseReport]) -> SkillGateVariantMetrics {
879    let included_task_count = cases.iter().filter(|case| case.included).count();
880    let scored = cases
881        .iter()
882        .filter(|case| case.included && case.candidate_score.is_some())
883        .collect::<Vec<_>>();
884    let scored_task_count = scored.len();
885    let gap_cases = scored
886        .iter()
887        .filter(|case| case.gap_recovery.is_some())
888        .collect::<Vec<_>>();
889    let gap_task_count = gap_cases.len();
890    let mut metrics = SkillGateVariantMetrics {
891        included_task_count,
892        scored_task_count,
893        gap_task_count,
894        ..Default::default()
895    };
896    if scored_task_count > 0 {
897        metrics.mean_baseline_score =
898            scored.iter().map(|case| case.baseline_score).sum::<f64>() / scored_task_count as f64;
899        metrics.mean_candidate_score = scored
900            .iter()
901            .map(|case| case.candidate_score.unwrap_or_default())
902            .sum::<f64>()
903            / scored_task_count as f64;
904        metrics.mean_frontier_score =
905            scored.iter().map(|case| case.frontier_score).sum::<f64>() / scored_task_count as f64;
906        metrics.mean_score_lift = scored
907            .iter()
908            .map(|case| case.score_lift.unwrap_or_default())
909            .sum::<f64>()
910            / scored_task_count as f64;
911        metrics.candidate_win_count = scored
912            .iter()
913            .filter(|case| case.score_lift.unwrap_or_default() > EPSILON)
914            .count();
915        metrics.candidate_loss_count = scored
916            .iter()
917            .filter(|case| case.score_lift.unwrap_or_default() < -EPSILON)
918            .count();
919        metrics.candidate_tie_count = scored_task_count
920            .saturating_sub(metrics.candidate_win_count + metrics.candidate_loss_count);
921        metrics.win_rate = metrics.candidate_win_count as f64 / scored_task_count as f64;
922    }
923    if gap_task_count > 0 {
924        metrics.mean_gap_recovery = gap_cases
925            .iter()
926            .map(|case| case.gap_recovery.unwrap_or_default())
927            .sum::<f64>()
928            / gap_task_count as f64;
929    }
930    metrics.regression_denominator = cases
931        .iter()
932        .filter(|case| case.included && case.baseline_passed)
933        .count();
934    metrics.regression_count = cases
935        .iter()
936        .filter(|case| case.included && case.regression)
937        .count();
938    if metrics.regression_denominator > 0 {
939        metrics.regression_rate =
940            metrics.regression_count as f64 / metrics.regression_denominator as f64;
941    }
942    metrics
943}
944
945fn aggregate_cluster_reports(
946    cases: &[SkillGateCaseReport],
947    min_cluster_gap_recovery: f64,
948) -> Vec<SkillGateClusterReport> {
949    let mut grouped: BTreeMap<String, Vec<&SkillGateCaseReport>> = BTreeMap::new();
950    for case in cases
951        .iter()
952        .filter(|case| case.included && case.candidate_score.is_some())
953    {
954        grouped.entry(case.cluster.clone()).or_default().push(case);
955    }
956    grouped
957        .into_iter()
958        .map(|(cluster, cases)| {
959            let task_count = cases.len();
960            let gap_cases = cases
961                .iter()
962                .filter(|case| case.gap_recovery.is_some())
963                .copied()
964                .collect::<Vec<_>>();
965            let gap_task_count = gap_cases.len();
966            let mean_baseline_score =
967                cases.iter().map(|case| case.baseline_score).sum::<f64>() / task_count as f64;
968            let mean_candidate_score = cases
969                .iter()
970                .map(|case| case.candidate_score.unwrap_or_default())
971                .sum::<f64>()
972                / task_count as f64;
973            let mean_frontier_score =
974                cases.iter().map(|case| case.frontier_score).sum::<f64>() / task_count as f64;
975            let mean_score_lift = cases
976                .iter()
977                .map(|case| case.score_lift.unwrap_or_default())
978                .sum::<f64>()
979                / task_count as f64;
980            let mean_gap_recovery = if gap_task_count == 0 {
981                0.0
982            } else {
983                gap_cases
984                    .iter()
985                    .map(|case| case.gap_recovery.unwrap_or_default())
986                    .sum::<f64>()
987                    / gap_task_count as f64
988            };
989            SkillGateClusterReport {
990                cluster,
991                task_count,
992                gap_task_count,
993                mean_baseline_score,
994                mean_candidate_score,
995                mean_frontier_score,
996                mean_score_lift,
997                mean_gap_recovery,
998                pass: gap_task_count == 0
999                    || mean_gap_recovery + EPSILON >= min_cluster_gap_recovery,
1000            }
1001        })
1002        .collect()
1003}
1004
1005fn measure_context(
1006    variant: &SkillGateVariant,
1007    manifest: &SkillGateManifest,
1008    base_dir: Option<&Path>,
1009) -> Result<SkillGateContextReport, String> {
1010    let baseline = measure_artifact("baseline", &variant.baseline, base_dir)?;
1011    let candidate = measure_artifact("candidate", &variant.candidate, base_dir)?;
1012    let baseline_tokens = baseline.context_tokens;
1013    let candidate_tokens = candidate.context_tokens;
1014    let delta_tokens = candidate_tokens as i64 - baseline_tokens as i64;
1015    let max_delta_tokens = manifest.policy.max_context_delta_tokens;
1016    let target_context_budget_tokens = manifest.target_model.context_budget_tokens;
1017    let within_delta_budget = max_delta_tokens.is_none_or(|max| delta_tokens <= max);
1018    let within_target_budget =
1019        target_context_budget_tokens.is_none_or(|max| candidate_tokens <= max);
1020    let mut artifact_hashes = baseline.hashes;
1021    artifact_hashes.extend(candidate.hashes);
1022    Ok(SkillGateContextReport {
1023        baseline_tokens,
1024        candidate_tokens,
1025        delta_tokens,
1026        max_delta_tokens,
1027        target_context_budget_tokens,
1028        within_delta_budget,
1029        within_target_budget,
1030        artifact_hashes,
1031    })
1032}
1033
1034#[derive(Debug, Default)]
1035struct ArtifactMeasurement {
1036    context_tokens: usize,
1037    hashes: Vec<SkillGateArtifactHash>,
1038}
1039
1040fn measure_artifact(
1041    role: &str,
1042    artifact: &SkillGateArtifact,
1043    base_dir: Option<&Path>,
1044) -> Result<ArtifactMeasurement, String> {
1045    let mut measurement = ArtifactMeasurement::default();
1046    for path in &artifact.paths {
1047        let resolved = resolve_manifest_path(base_dir, path);
1048        let hash = sha256_path(&resolved)?;
1049        measurement.context_tokens += hash.tokens;
1050        measurement.hashes.push(SkillGateArtifactHash {
1051            role: role.to_string(),
1052            path: path.clone(),
1053            sha256: hash.sha256,
1054            tokens: hash.tokens,
1055            bytes: hash.bytes,
1056        });
1057    }
1058    if let Some(tokens) = artifact.context_tokens {
1059        measurement.context_tokens = tokens;
1060    }
1061    Ok(measurement)
1062}
1063
/// Digest and size figures for a single hashed file or directory.
#[derive(Debug)]
struct PathHash {
    /// Lowercase hexadecimal SHA-256 digest.
    sha256: String,
    /// Estimated token count of the hashed content.
    tokens: usize,
    /// Number of content bytes that were read.
    bytes: usize,
}
1070
1071fn sha256_path(path: &Path) -> Result<PathHash, String> {
1072    let metadata = fs::symlink_metadata(path)
1073        .map_err(|error| format!("failed to stat {}: {error}", path.display()))?;
1074    if metadata.file_type().is_symlink() {
1075        return Err(format!(
1076            "refusing to hash symlink protected path {}",
1077            path.display()
1078        ));
1079    }
1080    if metadata.is_file() {
1081        return sha256_file(path);
1082    }
1083    if metadata.is_dir() {
1084        return sha256_dir(path);
1085    }
1086    Err(format!(
1087        "protected path {} is neither a file nor a directory",
1088        path.display()
1089    ))
1090}
1091
1092fn sha256_file(path: &Path) -> Result<PathHash, String> {
1093    let bytes =
1094        fs::read(path).map_err(|error| format!("failed to read {}: {error}", path.display()))?;
1095    let sha256 = hex_digest(&bytes);
1096    let tokens = estimate_chunk_tokens(&String::from_utf8_lossy(&bytes));
1097    Ok(PathHash {
1098        sha256,
1099        tokens,
1100        bytes: bytes.len(),
1101    })
1102}
1103
1104fn sha256_dir(path: &Path) -> Result<PathHash, String> {
1105    let mut files = Vec::new();
1106    for entry in WalkDir::new(path).follow_links(false) {
1107        let entry = entry.map_err(|error| format!("failed to walk {}: {error}", path.display()))?;
1108        if entry.file_type().is_symlink() {
1109            return Err(format!(
1110                "refusing to hash symlink inside protected directory {}",
1111                entry.path().display()
1112            ));
1113        }
1114        if entry.file_type().is_file() {
1115            files.push(entry.path().to_path_buf());
1116        }
1117    }
1118    files.sort();
1119    let mut hasher = Sha256::new();
1120    let mut tokens = 0;
1121    let mut bytes_total = 0;
1122    for file in files {
1123        let rel = file
1124            .strip_prefix(path)
1125            .map_err(|error| format!("failed to relativize {}: {error}", file.display()))?;
1126        let rel = rel.to_string_lossy().replace('\\', "/");
1127        let bytes = fs::read(&file)
1128            .map_err(|error| format!("failed to read {}: {error}", file.display()))?;
1129        hasher.update(rel.as_bytes());
1130        hasher.update([0]);
1131        hasher.update(&bytes);
1132        hasher.update([0xff]);
1133        tokens += estimate_chunk_tokens(&String::from_utf8_lossy(&bytes));
1134        bytes_total += bytes.len();
1135    }
1136    Ok(PathHash {
1137        sha256: bytes_to_hex(hasher.finalize().as_ref()),
1138        tokens,
1139        bytes: bytes_total,
1140    })
1141}
1142
1143fn hex_digest(bytes: &[u8]) -> String {
1144    let mut hasher = Sha256::new();
1145    hasher.update(bytes);
1146    bytes_to_hex(hasher.finalize().as_ref())
1147}
1148
/// Encodes `bytes` as lowercase hexadecimal, two characters per byte.
///
/// The output buffer is sized up front and filled by nibble lookup, so no
/// temporary `String` is allocated per byte.
fn bytes_to_hex(bytes: &[u8]) -> String {
    const HEX_DIGITS: &[u8; 16] = b"0123456789abcdef";

    let mut hex = String::with_capacity(bytes.len() * 2);
    for &byte in bytes {
        // High nibble first, then low nibble; both indices are always < 16.
        hex.push(char::from(HEX_DIGITS[usize::from(byte >> 4)]));
        hex.push(char::from(HEX_DIGITS[usize::from(byte & 0x0f)]));
    }
    hex
}
1152
/// Resolves a manifest-relative path string to a filesystem path.
///
/// A relative `path` is joined onto `base_dir` when one is given. Absolute
/// paths, and any path when no base directory is available, are returned as
/// written.
fn resolve_manifest_path(base_dir: Option<&Path>, path: &str) -> PathBuf {
    let candidate = Path::new(path);
    match base_dir {
        Some(root) if !candidate.is_absolute() => root.join(candidate),
        _ => candidate.to_path_buf(),
    }
}
1163
1164fn pareto_frontier(variants: &[SkillGateVariantReport]) -> Vec<String> {
1165    variants
1166        .iter()
1167        .filter(|variant| variant.metrics.scored_task_count > 0)
1168        .filter(|variant| {
1169            !variants
1170                .iter()
1171                .any(|other| other.id != variant.id && dominates(other, variant))
1172        })
1173        .map(|variant| variant.id.clone())
1174        .collect()
1175}
1176
1177fn dominates(left: &SkillGateVariantReport, right: &SkillGateVariantReport) -> bool {
1178    if left.metrics.scored_task_count == 0 {
1179        return false;
1180    }
1181    let at_least_as_good = left.metrics.mean_gap_recovery + EPSILON
1182        >= right.metrics.mean_gap_recovery
1183        && left.metrics.mean_score_lift + EPSILON >= right.metrics.mean_score_lift
1184        && left.metrics.regression_rate <= right.metrics.regression_rate + EPSILON
1185        && left.context.delta_tokens <= right.context.delta_tokens;
1186    let strictly_better = left.metrics.mean_gap_recovery
1187        > right.metrics.mean_gap_recovery + EPSILON
1188        || left.metrics.mean_score_lift > right.metrics.mean_score_lift + EPSILON
1189        || left.metrics.regression_rate + EPSILON < right.metrics.regression_rate
1190        || left.context.delta_tokens < right.context.delta_tokens;
1191    at_least_as_good && strictly_better
1192}
1193
1194fn select_variant(
1195    variants: &[SkillGateVariantReport],
1196    pareto_frontier: &[String],
1197) -> Option<String> {
1198    let frontier = pareto_frontier.iter().collect::<BTreeSet<_>>();
1199    let mut accepted = variants
1200        .iter()
1201        .filter(|variant| variant.accepted && frontier.contains(&variant.id))
1202        .collect::<Vec<_>>();
1203    if accepted.is_empty() {
1204        accepted = variants.iter().filter(|variant| variant.accepted).collect();
1205    }
1206    accepted.sort_by(|left, right| {
1207        right
1208            .metrics
1209            .mean_gap_recovery
1210            .partial_cmp(&left.metrics.mean_gap_recovery)
1211            .unwrap_or(std::cmp::Ordering::Equal)
1212            .then_with(|| {
1213                right
1214                    .metrics
1215                    .mean_score_lift
1216                    .partial_cmp(&left.metrics.mean_score_lift)
1217                    .unwrap_or(std::cmp::Ordering::Equal)
1218            })
1219            .then_with(|| left.context.delta_tokens.cmp(&right.context.delta_tokens))
1220            .then_with(|| left.id.cmp(&right.id))
1221    });
1222    accepted.first().map(|variant| variant.id.clone())
1223}
1224
1225fn build_receipt(
1226    manifest: &SkillGateManifest,
1227    accepted: bool,
1228    selected_variant_id: Option<String>,
1229    task_safety: &[SkillGateTaskSafetyReport],
1230    tamper: SkillGateTamperReport,
1231    variants: &[SkillGateVariantReport],
1232    pareto_frontier: Vec<String>,
1233) -> SkillGateReceipt {
1234    let selected = selected_variant_id
1235        .as_ref()
1236        .and_then(|id| variants.iter().find(|variant| &variant.id == id));
1237    SkillGateReceipt {
1238        type_name: SKILL_GATE_RECEIPT_TYPE.to_string(),
1239        schema_version: SKILL_GATE_SCHEMA_VERSION,
1240        manifest_id: manifest.id.clone(),
1241        target_model_id: manifest.target_model.id.clone(),
1242        accepted,
1243        selected_variant_id,
1244        decision: if accepted {
1245            "accepted".to_string()
1246        } else {
1247            "rejected".to_string()
1248        },
1249        metrics: selected.map(|variant| variant.metrics.clone()),
1250        context: selected.map(|variant| variant.context.clone()),
1251        tamper,
1252        pareto_frontier,
1253        excluded_task_ids: task_safety
1254            .iter()
1255            .filter(|task| !task.included)
1256            .map(|task| task.task_id.clone())
1257            .collect(),
1258        variant_receipts: variants
1259            .iter()
1260            .map(|variant| SkillGateVariantReceipt {
1261                variant_id: variant.id.clone(),
1262                accepted: variant.accepted,
1263                decision: variant.decision.clone(),
1264                metrics: variant.metrics.clone(),
1265                context_delta_tokens: variant.context.delta_tokens,
1266                failures: variant.failures.clone(),
1267            })
1268            .collect(),
1269        metadata: manifest.metadata.clone(),
1270    }
1271}
1272
#[cfg(test)]
mod tests {
    use super::*;

    /// Fixture file locations, relative to the temporary root.
    const GRADER_FILE: &str = "grader/check.txt";
    const GOOD_SKILL_FILE: &str = "skills/good/SKILL.md";
    const BLOAT_SKILL_FILE: &str = "skills/bloat/SKILL.md";

    /// Writes `content` to `path`, creating the parent directories first.
    fn write(path: &Path, content: &str) {
        let parent = path.parent().unwrap();
        fs::create_dir_all(parent).unwrap();
        fs::write(path, content).unwrap();
    }

    /// Creates the grader file and both skill files under `root`.
    fn seed_fixture_files(root: &Path) {
        write(&root.join(GRADER_FILE), "stable grader\n");
        write(
            &root.join(GOOD_SKILL_FILE),
            "Use the post-cutoff API name and keep the answer scoped.\n",
        );
        write(
            &root.join(BLOAT_SKILL_FILE),
            &"repeat this irrelevant guidance for token bloat.\n".repeat(80),
        );
    }

    /// A held-out task whose frontier score is 1.0.
    fn heldout_task(
        id: &str,
        cluster: &str,
        heldout: SkillGateHeldout,
        baseline_score: f64,
        baseline_passed: bool,
    ) -> SkillGateTask {
        SkillGateTask {
            id: id.to_string(),
            cluster: cluster.to_string(),
            heldout,
            baseline_score,
            frontier_score: 1.0,
            baseline_passed: Some(baseline_passed),
            ..Default::default()
        }
    }

    /// A case result that is explicitly marked as passed.
    fn passing_result(task_id: &str, score: f64) -> SkillGateCaseResult {
        SkillGateCaseResult {
            task_id: task_id.to_string(),
            score: Some(score),
            passed: Some(true),
            ..Default::default()
        }
    }

    /// A variant whose candidate is a single skill file.
    fn skill_variant(
        id: &str,
        skill_path: &str,
        case_results: Vec<SkillGateCaseResult>,
    ) -> SkillGateVariant {
        SkillGateVariant {
            id: id.to_string(),
            candidate: SkillGateArtifact {
                kind: "skill".to_string(),
                paths: vec![skill_path.to_string()],
                ..Default::default()
            },
            case_results,
            ..Default::default()
        }
    }

    /// Manifest with two clean held-out tasks, one contaminated public task,
    /// a compact variant and a bloated variant.
    fn fixture_manifest(root: &Path, grader_hash: String) -> SkillGateManifest {
        let target_model = SkillGateTargetModel {
            id: "mock-cheap".to_string(),
            knowledge_cutoff: Some("2026-05-01".to_string()),
            context_budget_tokens: Some(220),
            ..Default::default()
        };
        let policy = SkillGatePolicy {
            min_included_tasks: Some(2),
            min_score_lift: Some(0.10),
            min_gap_recovery: Some(0.25),
            max_regression_rate: Some(0.0),
            max_context_delta_tokens: Some(120),
            min_win_rate: Some(0.5),
            ..Default::default()
        };
        let grader = SkillGateGrader {
            id: "immutable".to_string(),
            immutable_paths: vec![SkillGateProtectedPath {
                path: GRADER_FILE.to_string(),
                sha256: grader_hash,
                label: Some("grader".to_string()),
            }],
            ..Default::default()
        };
        let tasks = vec![
            heldout_task(
                "post-cutoff-failure",
                "api-drift",
                SkillGateHeldout {
                    kind: "post_cutoff".to_string(),
                    created_at: Some("2026-05-20".to_string()),
                    ..Default::default()
                },
                0.20,
                false,
            ),
            heldout_task(
                "private-regression-check",
                "regression",
                SkillGateHeldout {
                    kind: "private".to_string(),
                    private: true,
                    ..Default::default()
                },
                0.90,
                true,
            ),
            heldout_task(
                "old-public-benchmark",
                "contaminated",
                SkillGateHeldout {
                    kind: "public_static".to_string(),
                    created_at: Some("2024-01-01".to_string()),
                    ..Default::default()
                },
                0.0,
                false,
            ),
        ];
        let variants = vec![
            skill_variant(
                "known-good",
                GOOD_SKILL_FILE,
                vec![
                    passing_result("post-cutoff-failure", 0.80),
                    passing_result("private-regression-check", 0.92),
                    passing_result("old-public-benchmark", 1.0),
                ],
            ),
            skill_variant(
                "bloated",
                BLOAT_SKILL_FILE,
                vec![
                    passing_result("post-cutoff-failure", 0.85),
                    passing_result("private-regression-check", 0.91),
                ],
            ),
        ];

        SkillGateManifest {
            type_name: SKILL_GATE_MANIFEST_TYPE.to_string(),
            version: SKILL_GATE_SCHEMA_VERSION,
            id: "skill-gate-test".to_string(),
            base_dir: Some(root.display().to_string()),
            target_model,
            policy,
            grader,
            tasks,
            variants,
            ..Default::default()
        }
    }

    #[test]
    fn gate_accepts_compact_lift_rejects_bloat_and_excludes_contamination() {
        let temp = tempfile::tempdir().unwrap();
        seed_fixture_files(temp.path());
        let grader_hash = sha256_file(&temp.path().join(GRADER_FILE))
            .unwrap()
            .sha256;
        let manifest = fixture_manifest(temp.path(), grader_hash);
        let report = evaluate_skill_gate_manifest(&manifest).expect("gate evaluates");

        assert!(report.pass);
        assert_eq!(report.selected_variant_id.as_deref(), Some("known-good"));
        assert_eq!(report.included_task_count, 2);
        assert_eq!(report.excluded_task_count, 1);
        assert_eq!(
            report.receipt.excluded_task_ids,
            vec!["old-public-benchmark"]
        );

        let good = report
            .variants
            .iter()
            .find(|variant| variant.id == "known-good")
            .unwrap();
        assert!(good.accepted);
        assert!(good.metrics.mean_gap_recovery > 0.25);
        assert_eq!(good.metrics.regression_rate, 0.0);

        let bloat = report
            .variants
            .iter()
            .find(|variant| variant.id == "bloated")
            .unwrap();
        assert!(!bloat.accepted);
        assert!(bloat
            .failures
            .iter()
            .any(|failure| failure.contains("context delta")));
    }

    #[test]
    fn gate_fails_when_immutable_grader_checksum_changes() {
        let temp = tempfile::tempdir().unwrap();
        seed_fixture_files(temp.path());
        let mut manifest = fixture_manifest(temp.path(), "not-the-real-hash".to_string());
        manifest.variants.truncate(1);
        let report = evaluate_skill_gate_manifest(&manifest).expect("gate evaluates");

        assert!(!report.pass);
        assert!(!report.tamper.pass);
        let first_variant = report.variants.first().unwrap();
        assert!(first_variant
            .failures
            .iter()
            .any(|failure| failure.contains("immutable grader")));
    }
}