1use std::collections::{BTreeMap, BTreeSet};
9use std::fs;
10use std::path::{Path, PathBuf};
11
12use serde::{Deserialize, Serialize};
13use serde_json::Value as JsonValue;
14use sha2::{Digest, Sha256};
15use walkdir::WalkDir;
16
17use super::estimate_chunk_tokens;
18use crate::value::VmError;
19
/// Schema version required of manifests and stamped onto reports.
pub const SKILL_GATE_SCHEMA_VERSION: u32 = 1;
/// `_type` tag that a skill gate manifest must carry (filled in when empty).
pub const SKILL_GATE_MANIFEST_TYPE: &str = "harn.skill_gate.manifest.v1";
/// `_type` tag written onto skill gate reports.
pub const SKILL_GATE_REPORT_TYPE: &str = "harn.skill_gate.report.v1";
/// `_type` tag for skill gate receipts.
pub const SKILL_GATE_RECEIPT_TYPE: &str = "harn.skill_gate.receipt.v1";

/// Tolerance used when comparing floating-point scores and rates against thresholds.
const EPSILON: f64 = 0.000_000_1;
26
/// Top-level description of a skill gate run: the target model, the policy
/// thresholds, the protected grader files, the held-out tasks and the
/// candidate variants to evaluate.
///
/// Every field is optional on input (`#[serde(default)]`); normalization
/// fills in or rejects the values noted on the individual fields.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct SkillGateManifest {
    /// Serialized as `_type`. An empty value is replaced with
    /// `SKILL_GATE_MANIFEST_TYPE`; any other value is rejected.
    #[serde(rename = "_type")]
    pub type_name: String,
    /// Schema version. `0` is replaced with `SKILL_GATE_SCHEMA_VERSION`;
    /// any other mismatching value is rejected.
    pub version: u32,
    /// Manifest identifier; a blank id is replaced with `"skill-gate"`.
    pub id: String,
    pub name: Option<String>,
    pub description: Option<String>,
    /// Directory handed to path resolution for grader and artifact paths.
    /// The file loader defaults it to the manifest file's parent directory.
    #[serde(default, alias = "base-dir")]
    pub base_dir: Option<String>,
    /// Model the gate is evaluated for; its `id` is required.
    #[serde(default, alias = "target-model")]
    pub target_model: SkillGateTargetModel,
    /// Acceptance thresholds applied to every variant.
    pub policy: SkillGatePolicy,
    /// Grader files whose checksums are verified before accepting a variant.
    pub grader: SkillGateGrader,
    /// Held-out tasks; at least one is required.
    pub tasks: Vec<SkillGateTask>,
    /// Candidate variants; at least one is required.
    pub variants: Vec<SkillGateVariant>,
    /// Free-form metadata copied onto the report.
    pub metadata: BTreeMap<String, JsonValue>,
}
46
/// Identity and limits of the model a skill gate is evaluated against.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct SkillGateTargetModel {
    /// Model identifier; manifest normalization rejects a blank value.
    pub id: String,
    pub provider: Option<String>,
    /// Knowledge cutoff date. Non-private held-out tasks are only included
    /// when their `created_at` date is strictly after this date; both are
    /// read as a leading `YYYY-MM-DD` prefix.
    #[serde(default, alias = "knowledge-cutoff")]
    pub knowledge_cutoff: Option<String>,
    /// Upper bound on a candidate's context tokens; unset means no bound.
    #[serde(default, alias = "context-budget-tokens")]
    pub context_budget_tokens: Option<usize>,
    pub metadata: BTreeMap<String, JsonValue>,
}
58
/// Acceptance thresholds for a variant.
///
/// Unset thresholds fall back to the defaults implemented by the accessor
/// methods on this type, which are noted per field.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct SkillGatePolicy {
    /// Minimum number of included (contamination-safe) tasks; defaults to 1.
    #[serde(default, alias = "min-included-tasks")]
    pub min_included_tasks: Option<usize>,
    /// Minimum mean score lift over the baseline; defaults to 0.0.
    #[serde(default, alias = "min-score-lift")]
    pub min_score_lift: Option<f64>,
    /// Minimum mean gap recovery; defaults to 0.0.
    #[serde(default, alias = "min-gap-recovery")]
    pub min_gap_recovery: Option<f64>,
    /// Minimum per-cluster mean gap recovery; defaults to the effective
    /// `min_gap_recovery`.
    #[serde(default, alias = "min-cluster-gap-recovery")]
    pub min_cluster_gap_recovery: Option<f64>,
    /// When true, every cluster that fails its gap-recovery check adds a
    /// variant failure.
    #[serde(default, alias = "require-cluster-lift")]
    pub require_cluster_lift: bool,
    /// Maximum tolerated regression rate; defaults to 0.0.
    #[serde(default, alias = "max-regression-rate")]
    pub max_regression_rate: Option<f64>,
    /// Minimum candidate win rate; not enforced when unset.
    #[serde(default, alias = "min-win-rate")]
    pub min_win_rate: Option<f64>,
    /// Maximum allowed growth in context tokens (candidate minus baseline);
    /// not enforced when unset.
    #[serde(default, alias = "max-context-delta-tokens")]
    pub max_context_delta_tokens: Option<i64>,
    /// Score at or above which a task counts as passed when no explicit
    /// pass flag is given; defaults to 0.5.
    #[serde(default, alias = "pass-score-threshold")]
    pub pass_score_threshold: Option<f64>,
    /// Whether a failed immutable-grader check (or an empty list of
    /// protected paths) rejects a variant; defaults to true.
    #[serde(default, alias = "require-no-tamper")]
    pub require_no_tamper: Option<bool>,
    pub metadata: BTreeMap<String, JsonValue>,
}
84
85impl SkillGatePolicy {
86 fn min_included_tasks(&self) -> usize {
87 self.min_included_tasks.unwrap_or(1)
88 }
89
90 fn min_score_lift(&self) -> f64 {
91 self.min_score_lift.unwrap_or(0.0)
92 }
93
94 fn min_gap_recovery(&self) -> f64 {
95 self.min_gap_recovery.unwrap_or(0.0)
96 }
97
98 fn min_cluster_gap_recovery(&self) -> f64 {
99 self.min_cluster_gap_recovery
100 .unwrap_or_else(|| self.min_gap_recovery())
101 }
102
103 fn max_regression_rate(&self) -> f64 {
104 self.max_regression_rate.unwrap_or(0.0)
105 }
106
107 fn pass_score_threshold(&self) -> f64 {
108 self.pass_score_threshold.unwrap_or(0.5)
109 }
110
111 fn require_no_tamper(&self) -> bool {
112 self.require_no_tamper.unwrap_or(true)
113 }
114}
115
/// Grader identity plus the files that must stay unmodified for results to
/// be trusted.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct SkillGateGrader {
    pub id: String,
    /// Paths whose SHA-256 digests are checked against the declared values.
    /// Also accepted as `immutable-paths` or `protected-paths` on input.
    #[serde(default, alias = "immutable-paths", alias = "protected-paths")]
    pub immutable_paths: Vec<SkillGateProtectedPath>,
    pub metadata: BTreeMap<String, JsonValue>,
}
124
/// A grader path together with the checksum it is expected to have.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct SkillGateProtectedPath {
    /// Path to verify, resolved against the manifest base directory.
    pub path: String,
    /// Expected SHA-256 digest; compared case-insensitively after trimming.
    pub sha256: String,
    /// Optional human-readable label copied onto the tamper check.
    pub label: Option<String>,
}
132
/// A held-out task with its reference scores.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct SkillGateTask {
    /// Unique task id; a blank id is replaced with `task_<position>`.
    pub id: String,
    pub name: Option<String>,
    /// Cluster used for per-cluster aggregation; blank becomes `"default"`.
    pub cluster: String,
    pub source: Option<String>,
    /// Held-out provenance used to decide whether the task is
    /// contamination-safe.
    pub heldout: SkillGateHeldout,
    /// Score of the baseline on this task; must lie in `0..=1`.
    #[serde(default, alias = "baseline-score")]
    pub baseline_score: f64,
    /// Frontier reference score for this task; must lie in `0..=1`.
    #[serde(default, alias = "frontier-score")]
    pub frontier_score: f64,
    /// Explicit baseline pass flag; when unset it is derived by comparing
    /// `baseline_score` with the policy's pass threshold.
    #[serde(default, alias = "baseline-passed")]
    pub baseline_passed: Option<bool>,
    pub metadata: BTreeMap<String, JsonValue>,
}
149
/// Provenance of a held-out task, used for the contamination-safety check.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct SkillGateHeldout {
    /// Held-out kind. It is normalized (trimmed, ASCII-lowercased, `-` and
    /// spaces turned into `_`) before being matched against known kinds.
    pub kind: String,
    /// Creation date, compared against the target model's knowledge cutoff
    /// for non-private tasks.
    #[serde(default, alias = "created-at")]
    pub created_at: Option<String>,
    /// Private tasks are always included.
    pub private: bool,
    pub suite: Option<String>,
    pub metadata: BTreeMap<String, JsonValue>,
}
160
/// A candidate change to evaluate, described by its baseline and candidate
/// artifacts and the per-task results obtained with the candidate.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct SkillGateVariant {
    /// Unique variant id; a blank id is replaced with `variant_<position>`.
    pub id: String,
    pub name: Option<String>,
    pub description: Option<String>,
    /// Artifact measured as the baseline context.
    pub baseline: SkillGateArtifact,
    /// Artifact measured as the candidate context.
    pub candidate: SkillGateArtifact,
    /// Candidate results; each must reference a declared task and a task may
    /// appear at most once.
    #[serde(default, alias = "case-results")]
    pub case_results: Vec<SkillGateCaseResult>,
    pub metadata: BTreeMap<String, JsonValue>,
}
173
/// A set of files making up one side (baseline or candidate) of a variant.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct SkillGateArtifact {
    pub kind: String,
    /// Paths that are hashed and token-counted, resolved against the
    /// manifest base directory.
    pub paths: Vec<String>,
    /// Explicit token count; when set it replaces the total measured from
    /// `paths`.
    #[serde(default, alias = "context-tokens")]
    pub context_tokens: Option<usize>,
    pub metadata: BTreeMap<String, JsonValue>,
}
183
/// Result of running a variant's candidate on a single task.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct SkillGateCaseResult {
    /// Id of the task this result belongs to; must name a declared task.
    #[serde(default, alias = "task-id")]
    pub task_id: String,
    /// Candidate score; when present it must lie in `0..=1`.
    pub score: Option<f64>,
    /// Explicit pass flag; when unset it is derived by comparing the score
    /// (treated as 0.0 if missing) with the policy's pass threshold.
    pub passed: Option<bool>,
    /// Free-form notes copied onto the case report.
    pub notes: Option<String>,
    pub metadata: BTreeMap<String, JsonValue>,
}
194
/// Full outcome of evaluating a skill gate manifest.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct SkillGateReport {
    /// Serialized as `_type`; set to `SKILL_GATE_REPORT_TYPE`.
    #[serde(rename = "_type")]
    pub type_name: String,
    /// Set to `SKILL_GATE_SCHEMA_VERSION`.
    pub schema_version: u32,
    pub manifest_id: String,
    pub manifest_name: Option<String>,
    pub target_model: SkillGateTargetModel,
    /// True when a variant was selected.
    pub pass: bool,
    pub selected_variant_id: Option<String>,
    /// Number of tasks judged contamination-safe.
    pub included_task_count: usize,
    /// Number of tasks excluded by the safety check.
    pub excluded_task_count: usize,
    /// Per-task inclusion decisions, in manifest order.
    pub task_safety: Vec<SkillGateTaskSafetyReport>,
    /// Outcome of the immutable-grader checksum verification.
    pub tamper: SkillGateTamperReport,
    /// Per-variant evaluation, in manifest order.
    pub variants: Vec<SkillGateVariantReport>,
    pub pareto_frontier: Vec<String>,
    pub receipt: SkillGateReceipt,
    /// Metadata copied from the manifest.
    pub metadata: BTreeMap<String, JsonValue>,
}
215
/// Contamination-safety decision for one task.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct SkillGateTaskSafetyReport {
    pub task_id: String,
    pub cluster: String,
    /// Whether the task takes part in scoring.
    pub included: bool,
    /// Held-out kind exactly as declared (not normalized).
    pub heldout_kind: String,
    pub created_at: Option<String>,
    pub private: bool,
    /// Why the task was excluded; `None` for included tasks.
    pub exclusion_reason: Option<String>,
}
227
/// Aggregate result of verifying the grader's protected paths.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct SkillGateTamperReport {
    /// True when no check produced a failure.
    pub pass: bool,
    /// One entry per protected path, in declaration order.
    pub checks: Vec<SkillGateTamperCheck>,
    /// Failure messages collected from the checks.
    pub failures: Vec<String>,
}
235
/// Checksum verification result for a single protected path.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct SkillGateTamperCheck {
    /// Path as declared in the manifest.
    pub path: String,
    pub label: Option<String>,
    /// Digest declared in the manifest.
    pub expected_sha256: String,
    /// Digest actually computed; `None` when hashing failed.
    pub actual_sha256: Option<String>,
    /// `"pass"` or `"fail"`.
    pub status: String,
    /// Mismatch or hashing error message; `None` on success.
    pub failure: Option<String>,
}
246
/// Evaluation outcome for one variant.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct SkillGateVariantReport {
    pub id: String,
    pub name: Option<String>,
    /// True when no failure was recorded.
    pub accepted: bool,
    /// `"accepted"` or `"rejected"`, mirroring `accepted`.
    pub decision: String,
    /// Every reason the variant was rejected.
    pub failures: Vec<String>,
    /// Non-fatal observations.
    pub warnings: Vec<String>,
    pub metrics: SkillGateVariantMetrics,
    /// Context measurement; left at its default when measurement failed.
    pub context: SkillGateContextReport,
    pub clusters: Vec<SkillGateClusterReport>,
    /// One entry per manifest task, in manifest order.
    pub cases: Vec<SkillGateCaseReport>,
}
261
/// Aggregate scoring metrics for one variant.
///
/// "Scored" tasks are included tasks that have a candidate score; means are
/// taken over the scored tasks unless noted otherwise.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct SkillGateVariantMetrics {
    pub included_task_count: usize,
    pub scored_task_count: usize,
    /// Scored tasks that have a gap-recovery value.
    pub gap_task_count: usize,
    pub mean_baseline_score: f64,
    pub mean_candidate_score: f64,
    pub mean_frontier_score: f64,
    pub mean_score_lift: f64,
    /// Mean over the gap tasks only; 0.0 when there are none.
    pub mean_gap_recovery: f64,
    /// Scored tasks whose lift exceeds `EPSILON`.
    pub candidate_win_count: usize,
    /// Scored tasks that are neither wins nor losses.
    pub candidate_tie_count: usize,
    /// Scored tasks whose lift is below `-EPSILON`.
    pub candidate_loss_count: usize,
    /// Wins divided by scored tasks.
    pub win_rate: f64,
    /// Included tasks flagged as regressions.
    pub regression_count: usize,
    /// Included tasks that the baseline passed.
    pub regression_denominator: usize,
    /// `regression_count / regression_denominator`; 0.0 when the
    /// denominator is zero.
    pub regression_rate: f64,
}
281
/// Context-size measurement for one variant and its budget checks.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct SkillGateContextReport {
    pub baseline_tokens: usize,
    pub candidate_tokens: usize,
    /// Candidate tokens minus baseline tokens.
    pub delta_tokens: i64,
    /// Policy limit on `delta_tokens`, if any.
    pub max_delta_tokens: Option<i64>,
    /// Target model limit on `candidate_tokens`, if any.
    pub target_context_budget_tokens: Option<usize>,
    /// True when there is no delta limit or the delta does not exceed it.
    pub within_delta_budget: bool,
    /// True when there is no target budget or the candidate fits within it.
    pub within_target_budget: bool,
    /// Baseline hashes followed by candidate hashes.
    pub artifact_hashes: Vec<SkillGateArtifactHash>,
}
294
/// Digest and size measurements for one artifact path.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct SkillGateArtifactHash {
    /// Which side the path belongs to: `"baseline"` or `"candidate"`.
    pub role: String,
    /// Path as declared in the manifest.
    pub path: String,
    pub sha256: String,
    pub tokens: usize,
    pub bytes: usize,
}
304
/// Aggregate metrics for the scored tasks of one cluster.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct SkillGateClusterReport {
    pub cluster: String,
    /// Included tasks in the cluster that have a candidate score.
    pub task_count: usize,
    /// Of those, tasks that have a gap-recovery value.
    pub gap_task_count: usize,
    pub mean_baseline_score: f64,
    pub mean_candidate_score: f64,
    pub mean_frontier_score: f64,
    pub mean_score_lift: f64,
    /// Mean over the gap tasks; 0.0 when there are none.
    pub mean_gap_recovery: f64,
    /// True when the cluster has no gap tasks or its mean gap recovery
    /// reaches the cluster threshold (within `EPSILON`).
    pub pass: bool,
}
318
/// Per-task evaluation of one variant.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct SkillGateCaseReport {
    pub task_id: String,
    pub cluster: String,
    /// Whether the task passed the contamination-safety check.
    pub included: bool,
    pub exclusion_reason: Option<String>,
    pub baseline_score: f64,
    /// Candidate score, if the variant supplied one for this task.
    pub candidate_score: Option<f64>,
    pub frontier_score: f64,
    /// Candidate score minus baseline score; only set for included, scored
    /// tasks.
    pub score_lift: Option<f64>,
    /// Lift divided by the frontier-minus-baseline gap; only set when the
    /// frontier exceeds the baseline by more than `EPSILON`.
    pub gap_recovery: Option<f64>,
    pub baseline_passed: bool,
    /// `None` when the variant supplied no result for this task.
    pub candidate_passed: Option<bool>,
    /// True when the baseline passed and the candidate did not.
    pub regression: bool,
    pub failures: Vec<String>,
    pub notes: Option<String>,
}
337
/// Condensed record of a gate decision, embedded in the report.
///
/// NOTE(review): the code that fills this struct is outside this part of
/// the file, so field notes here are limited to what the types show.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct SkillGateReceipt {
    /// Serialized as `_type`.
    #[serde(rename = "_type")]
    pub type_name: String,
    pub schema_version: u32,
    pub manifest_id: String,
    pub target_model_id: String,
    pub accepted: bool,
    pub selected_variant_id: Option<String>,
    pub decision: String,
    pub metrics: Option<SkillGateVariantMetrics>,
    pub context: Option<SkillGateContextReport>,
    pub tamper: SkillGateTamperReport,
    pub pareto_frontier: Vec<String>,
    pub excluded_task_ids: Vec<String>,
    pub variant_receipts: Vec<SkillGateVariantReceipt>,
    pub metadata: BTreeMap<String, JsonValue>,
}
357
/// Per-variant summary stored inside a [`SkillGateReceipt`].
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct SkillGateVariantReceipt {
    pub variant_id: String,
    pub accepted: bool,
    pub decision: String,
    pub metrics: SkillGateVariantMetrics,
    pub context_delta_tokens: i64,
    pub failures: Vec<String>,
}
368
369pub fn load_skill_gate_manifest(path: &Path) -> Result<SkillGateManifest, VmError> {
370 let content = fs::read_to_string(path).map_err(|error| {
371 VmError::Runtime(format!("failed to read skill gate manifest: {error}"))
372 })?;
373 let mut manifest: SkillGateManifest =
374 if path.extension().and_then(|ext| ext.to_str()) == Some("toml") {
375 toml::from_str(&content).map_err(|error| {
376 VmError::Runtime(format!("failed to parse skill gate TOML: {error}"))
377 })?
378 } else {
379 serde_json::from_str(&content).map_err(|error| {
380 VmError::Runtime(format!("failed to parse skill gate JSON: {error}"))
381 })?
382 };
383 if manifest.base_dir.is_none() {
384 manifest.base_dir = path.parent().map(|parent| parent.display().to_string());
385 }
386 normalize_skill_gate_manifest(&mut manifest)?;
387 Ok(manifest)
388}
389
390pub fn evaluate_skill_gate_manifest(
391 manifest: &SkillGateManifest,
392) -> Result<SkillGateReport, VmError> {
393 let mut manifest = manifest.clone();
394 normalize_skill_gate_manifest(&mut manifest)?;
395 let base_dir = manifest.base_dir.as_deref().map(Path::new);
396 let task_safety = manifest
397 .tasks
398 .iter()
399 .map(|task| task_safety_report(task, &manifest.target_model))
400 .collect::<Vec<_>>();
401 let included_task_count = task_safety.iter().filter(|task| task.included).count();
402 let excluded_task_count = task_safety.len().saturating_sub(included_task_count);
403 let safety_by_id = task_safety
404 .iter()
405 .map(|task| (task.task_id.as_str(), task))
406 .collect::<BTreeMap<_, _>>();
407 let tamper = verify_immutable_grader(&manifest.grader, base_dir);
408 let variants = manifest
409 .variants
410 .iter()
411 .map(|variant| evaluate_variant(variant, &manifest, &safety_by_id, &tamper, base_dir))
412 .collect::<Vec<_>>();
413 let pareto_frontier = pareto_frontier(&variants);
414 let selected_variant_id = select_variant(&variants, &pareto_frontier);
415 let pass = selected_variant_id.is_some();
416 let receipt = build_receipt(
417 &manifest,
418 pass,
419 selected_variant_id.clone(),
420 &task_safety,
421 tamper.clone(),
422 &variants,
423 pareto_frontier.clone(),
424 );
425 Ok(SkillGateReport {
426 type_name: SKILL_GATE_REPORT_TYPE.to_string(),
427 schema_version: SKILL_GATE_SCHEMA_VERSION,
428 manifest_id: manifest.id,
429 manifest_name: manifest.name,
430 target_model: manifest.target_model,
431 pass,
432 selected_variant_id,
433 included_task_count,
434 excluded_task_count,
435 task_safety,
436 tamper,
437 variants,
438 pareto_frontier,
439 receipt,
440 metadata: manifest.metadata,
441 })
442}
443
/// Fills in manifest defaults in place and validates its structure.
///
/// Defaults applied: an empty `_type` becomes `SKILL_GATE_MANIFEST_TYPE`, a
/// zero `version` becomes `SKILL_GATE_SCHEMA_VERSION`, a blank manifest id
/// becomes `"skill-gate"`, blank task ids become `task_<position>`, blank
/// clusters become `"default"`, and blank variant ids become
/// `variant_<position>` (positions are 1-based).
///
/// # Errors
///
/// Returns `VmError::Runtime` for the first problem found, checked in this
/// order: wrong `_type`, wrong version, missing target model id, no tasks,
/// no variants, then per task (duplicate id, out-of-range scores) and per
/// variant (duplicate id, then for each case result: missing task id,
/// unknown task, duplicate result, out-of-range score).
fn normalize_skill_gate_manifest(manifest: &mut SkillGateManifest) -> Result<(), VmError> {
    if manifest.type_name.is_empty() {
        manifest.type_name = SKILL_GATE_MANIFEST_TYPE.to_string();
    }
    if manifest.type_name != SKILL_GATE_MANIFEST_TYPE {
        return Err(VmError::Runtime(format!(
            "skill gate manifest _type must be {SKILL_GATE_MANIFEST_TYPE}"
        )));
    }
    // Zero is the serde default, i.e. "version not specified".
    if manifest.version == 0 {
        manifest.version = SKILL_GATE_SCHEMA_VERSION;
    }
    if manifest.version != SKILL_GATE_SCHEMA_VERSION {
        return Err(VmError::Runtime(format!(
            "skill gate manifest version must be {SKILL_GATE_SCHEMA_VERSION}"
        )));
    }
    if manifest.id.trim().is_empty() {
        manifest.id = "skill-gate".to_string();
    }
    if manifest.target_model.id.trim().is_empty() {
        return Err(VmError::Runtime(
            "skill gate manifest target_model.id is required".to_string(),
        ));
    }
    if manifest.tasks.is_empty() {
        return Err(VmError::Runtime(
            "skill gate manifest must declare at least one task".to_string(),
        ));
    }
    if manifest.variants.is_empty() {
        return Err(VmError::Runtime(
            "skill gate manifest must declare at least one variant".to_string(),
        ));
    }
    // Task ids seen so far; also used below to validate case results.
    let mut task_ids = BTreeSet::new();
    for (index, task) in manifest.tasks.iter_mut().enumerate() {
        if task.id.trim().is_empty() {
            task.id = format!("task_{}", index + 1);
        }
        // The uniqueness check runs after defaulting, so a generated id can
        // collide with an explicitly declared one.
        if !task_ids.insert(task.id.clone()) {
            return Err(VmError::Runtime(format!(
                "skill gate manifest has duplicate task id '{}'",
                task.id
            )));
        }
        if task.cluster.trim().is_empty() {
            task.cluster = "default".to_string();
        }
        validate_score("baseline_score", &task.id, task.baseline_score)?;
        validate_score("frontier_score", &task.id, task.frontier_score)?;
    }
    let mut variant_ids = BTreeSet::new();
    for (index, variant) in manifest.variants.iter_mut().enumerate() {
        if variant.id.trim().is_empty() {
            variant.id = format!("variant_{}", index + 1);
        }
        if !variant_ids.insert(variant.id.clone()) {
            return Err(VmError::Runtime(format!(
                "skill gate manifest has duplicate variant id '{}'",
                variant.id
            )));
        }
        // A variant may report each task at most once.
        let mut result_ids = BTreeSet::new();
        for result in &variant.case_results {
            if result.task_id.trim().is_empty() {
                return Err(VmError::Runtime(format!(
                    "skill gate variant '{}' has a case result with no task_id",
                    variant.id
                )));
            }
            if !task_ids.contains(&result.task_id) {
                return Err(VmError::Runtime(format!(
                    "skill gate variant '{}' references unknown task '{}'",
                    variant.id, result.task_id
                )));
            }
            if !result_ids.insert(result.task_id.clone()) {
                return Err(VmError::Runtime(format!(
                    "skill gate variant '{}' has duplicate result for task '{}'",
                    variant.id, result.task_id
                )));
            }
            if let Some(score) = result.score {
                validate_score("candidate score", &result.task_id, score)?;
            }
        }
    }
    Ok(())
}
534
535fn validate_score(label: &str, task_id: &str, score: f64) -> Result<(), VmError> {
536 if !(0.0..=1.0).contains(&score) {
537 return Err(VmError::Runtime(format!(
538 "skill gate task '{task_id}' {label} must be between 0 and 1"
539 )));
540 }
541 Ok(())
542}
543
544fn task_safety_report(
545 task: &SkillGateTask,
546 target_model: &SkillGateTargetModel,
547) -> SkillGateTaskSafetyReport {
548 let kind = normalize_kind(&task.heldout.kind);
549 let (included, exclusion_reason) = if task.heldout.private || kind == "private" {
550 (true, None)
551 } else if matches!(kind.as_str(), "public_static" | "static" | "pre_cutoff") {
552 (
553 false,
554 Some("static public or declared pre-cutoff task is contamination-prone".to_string()),
555 )
556 } else if matches!(
557 kind.as_str(),
558 "post_cutoff" | "rolling" | "livecodebench" | "swe_mera" | "swe_rebench"
559 ) {
560 match (
561 task.heldout.created_at.as_deref(),
562 target_model.knowledge_cutoff.as_deref(),
563 ) {
564 (Some(created_at), Some(cutoff)) if date_after(created_at, cutoff).unwrap_or(false) => {
565 (true, None)
566 }
567 (Some(_), Some(cutoff)) => (
568 false,
569 Some(format!(
570 "task does not post-date target model cutoff {cutoff}"
571 )),
572 ),
573 (Some(_), None) => (
574 false,
575 Some(
576 "target model knowledge_cutoff is required for non-private held-out tasks"
577 .to_string(),
578 ),
579 ),
580 (None, _) => (
581 false,
582 Some("non-private held-out task must declare created_at".to_string()),
583 ),
584 }
585 } else {
586 (
587 false,
588 Some(format!(
589 "held-out kind '{}' is not recognized as contamination-safe",
590 task.heldout.kind
591 )),
592 )
593 };
594
595 SkillGateTaskSafetyReport {
596 task_id: task.id.clone(),
597 cluster: task.cluster.clone(),
598 included,
599 heldout_kind: task.heldout.kind.clone(),
600 created_at: task.heldout.created_at.clone(),
601 private: task.heldout.private,
602 exclusion_reason,
603 }
604}
605
/// Canonicalizes a held-out kind: surrounding whitespace is trimmed, ASCII
/// letters are lowercased, and every `-` or space becomes `_`.
fn normalize_kind(kind: &str) -> String {
    kind.trim()
        .chars()
        .map(|ch| match ch {
            '-' | ' ' => '_',
            other => other.to_ascii_lowercase(),
        })
        .collect()
}
609
610fn date_after(created_at: &str, cutoff: &str) -> Option<bool> {
611 Some(parse_date_prefix(created_at)? > parse_date_prefix(cutoff)?)
612}
613
614fn parse_date_prefix(value: &str) -> Option<(u32, u32, u32)> {
615 let trimmed = value.trim();
616 let prefix = trimmed.get(..10)?;
617 let bytes = prefix.as_bytes();
618 if bytes.get(4) != Some(&b'-') || bytes.get(7) != Some(&b'-') {
619 return None;
620 }
621 for index in [0, 1, 2, 3, 5, 6, 8, 9] {
622 if !bytes[index].is_ascii_digit() {
623 return None;
624 }
625 }
626 let year = prefix[0..4].parse::<u32>().ok()?;
627 let month = prefix[5..7].parse::<u32>().ok()?;
628 let day = prefix[8..10].parse::<u32>().ok()?;
629 let max_day = match month {
630 1 | 3 | 5 | 7 | 8 | 10 | 12 => 31,
631 4 | 6 | 9 | 11 => 30,
632 2 if is_leap_year(year) => 29,
633 2 => 28,
634 _ => return None,
635 };
636 if day == 0 || day > max_day {
637 None
638 } else {
639 Some((year, month, day))
640 }
641}
642
/// Returns whether `year` is a leap year under the Gregorian rules:
/// divisible by 400, or divisible by 4 but not by 100.
fn is_leap_year(year: u32) -> bool {
    if year.is_multiple_of(400) {
        return true;
    }
    if year.is_multiple_of(100) {
        return false;
    }
    year.is_multiple_of(4)
}
646
647fn verify_immutable_grader(
648 grader: &SkillGateGrader,
649 base_dir: Option<&Path>,
650) -> SkillGateTamperReport {
651 let mut checks = Vec::new();
652 let mut failures = Vec::new();
653 for protected in &grader.immutable_paths {
654 let resolved = resolve_manifest_path(base_dir, &protected.path);
655 let mut check = SkillGateTamperCheck {
656 path: protected.path.clone(),
657 label: protected.label.clone(),
658 expected_sha256: protected.sha256.clone(),
659 ..Default::default()
660 };
661 match sha256_path(&resolved) {
662 Ok(hash) => {
663 check.actual_sha256 = Some(hash.sha256.clone());
664 if hash.sha256.eq_ignore_ascii_case(protected.sha256.trim()) {
665 check.status = "pass".to_string();
666 } else {
667 check.status = "fail".to_string();
668 check.failure = Some(format!(
669 "checksum mismatch for immutable grader path {}",
670 protected.path
671 ));
672 }
673 }
674 Err(error) => {
675 check.status = "fail".to_string();
676 check.failure = Some(error);
677 }
678 }
679 if let Some(failure) = &check.failure {
680 failures.push(failure.clone());
681 }
682 checks.push(check);
683 }
684 SkillGateTamperReport {
685 pass: failures.is_empty(),
686 checks,
687 failures,
688 }
689}
690
/// Evaluates one variant against the manifest's tasks and policy.
///
/// Measures the variant's context, builds a case report per manifest task,
/// aggregates overall and per-cluster metrics, and then applies every
/// policy check. Each unmet check appends a message to `failures`; the
/// variant is accepted only when that list ends up empty.
///
/// `safety_by_id` maps task ids to their contamination-safety decision and
/// `tamper` is the shared immutable-grader verification result.
fn evaluate_variant(
    variant: &SkillGateVariant,
    manifest: &SkillGateManifest,
    safety_by_id: &BTreeMap<&str, &SkillGateTaskSafetyReport>,
    tamper: &SkillGateTamperReport,
    base_dir: Option<&Path>,
) -> SkillGateVariantReport {
    let mut failures = Vec::new();
    let mut warnings = Vec::new();
    // When measurement fails the error is recorded once and the budget
    // checks below are skipped, since the default report holds no real data.
    let mut context_valid = true;
    let context = match measure_context(variant, manifest, base_dir) {
        Ok(context) => context,
        Err(error) => {
            context_valid = false;
            failures.push(error);
            SkillGateContextReport::default()
        }
    };
    let results_by_task = variant
        .case_results
        .iter()
        .map(|result| (result.task_id.as_str(), result))
        .collect::<BTreeMap<_, _>>();
    let cases = manifest
        .tasks
        .iter()
        .map(|task| {
            evaluate_case(
                task,
                results_by_task.get(task.id.as_str()).copied(),
                safety_by_id.get(task.id.as_str()).copied(),
                manifest.policy.pass_score_threshold(),
            )
        })
        .collect::<Vec<_>>();
    // Case-level failures (missing results, regressions) reject the variant.
    // NOTE(review): every regressed case already contributes a failure here,
    // so a non-zero `max_regression_rate` appears unable to permit any
    // regression — confirm this is intended.
    for case in &cases {
        failures.extend(case.failures.iter().cloned());
    }
    let metrics = aggregate_variant_metrics(&cases);
    let clusters = aggregate_cluster_reports(&cases, manifest.policy.min_cluster_gap_recovery());
    if !tamper.pass && manifest.policy.require_no_tamper() {
        failures.push("immutable grader check failed".to_string());
    }
    if metrics.included_task_count < manifest.policy.min_included_tasks() {
        failures.push(format!(
            "included held-out task count {} is below required {}",
            metrics.included_task_count,
            manifest.policy.min_included_tasks()
        ));
    }
    if metrics.scored_task_count == 0 {
        failures.push("no contamination-safe scored tasks were available".to_string());
    }
    // Threshold comparisons allow an EPSILON tolerance in the variant's favor.
    if metrics.mean_score_lift + EPSILON < manifest.policy.min_score_lift() {
        failures.push(format!(
            "mean score lift {:.4} is below required {:.4}",
            metrics.mean_score_lift,
            manifest.policy.min_score_lift()
        ));
    }
    if metrics.mean_gap_recovery + EPSILON < manifest.policy.min_gap_recovery() {
        failures.push(format!(
            "mean gap recovery {:.4} is below required {:.4}",
            metrics.mean_gap_recovery,
            manifest.policy.min_gap_recovery()
        ));
    }
    if metrics.regression_rate > manifest.policy.max_regression_rate() + EPSILON {
        failures.push(format!(
            "regression rate {:.4} exceeds allowed {:.4}",
            metrics.regression_rate,
            manifest.policy.max_regression_rate()
        ));
    }
    // The win-rate check only applies when the policy sets a minimum.
    if let Some(min_win_rate) = manifest.policy.min_win_rate {
        if metrics.win_rate + EPSILON < min_win_rate {
            failures.push(format!(
                "candidate win rate {:.4} is below required {:.4}",
                metrics.win_rate, min_win_rate
            ));
        }
    }
    if context_valid && !context.within_delta_budget {
        failures.push(format!(
            "context delta {} tokens exceeds allowed {}",
            context.delta_tokens,
            context.max_delta_tokens.unwrap_or_default()
        ));
    }
    if context_valid && !context.within_target_budget {
        failures.push(format!(
            "candidate context {} tokens exceeds target budget {}",
            context.candidate_tokens,
            context.target_context_budget_tokens.unwrap_or_default()
        ));
    }
    if manifest.policy.require_cluster_lift {
        for cluster in &clusters {
            if !cluster.pass {
                failures.push(format!(
                    "cluster '{}' gap recovery {:.4} is below required {:.4}",
                    cluster.cluster,
                    cluster.mean_gap_recovery,
                    manifest.policy.min_cluster_gap_recovery()
                ));
            }
        }
    }
    // An empty protected-path list is always a warning, and additionally a
    // failure when the policy requires tamper protection.
    if manifest.grader.immutable_paths.is_empty() {
        let warning = "no immutable grader paths were declared".to_string();
        if manifest.policy.require_no_tamper() {
            failures.push(warning.clone());
        }
        warnings.push(warning);
    }
    let accepted = failures.is_empty();
    SkillGateVariantReport {
        id: variant.id.clone(),
        name: variant.name.clone(),
        accepted,
        decision: if accepted {
            "accepted".to_string()
        } else {
            "rejected".to_string()
        },
        failures,
        warnings,
        metrics,
        context,
        clusters,
        cases,
    }
}
824
825fn evaluate_case(
826 task: &SkillGateTask,
827 result: Option<&SkillGateCaseResult>,
828 safety: Option<&SkillGateTaskSafetyReport>,
829 pass_score_threshold: f64,
830) -> SkillGateCaseReport {
831 let included = safety.is_none_or(|safety| safety.included);
832 let exclusion_reason = safety.and_then(|safety| safety.exclusion_reason.clone());
833 let baseline_passed = task
834 .baseline_passed
835 .unwrap_or(task.baseline_score >= pass_score_threshold);
836 let mut report = SkillGateCaseReport {
837 task_id: task.id.clone(),
838 cluster: task.cluster.clone(),
839 included,
840 exclusion_reason,
841 baseline_score: task.baseline_score,
842 candidate_score: result.and_then(|result| result.score),
843 frontier_score: task.frontier_score,
844 baseline_passed,
845 candidate_passed: result.map(|result| {
846 result
847 .passed
848 .unwrap_or_else(|| result.score.unwrap_or(0.0) >= pass_score_threshold)
849 }),
850 notes: result.and_then(|result| result.notes.clone()),
851 ..Default::default()
852 };
853 if !included {
854 return report;
855 }
856 let Some(candidate_score) = report.candidate_score else {
857 report
858 .failures
859 .push(format!("variant is missing result for task '{}'", task.id));
860 return report;
861 };
862 let score_lift = candidate_score - task.baseline_score;
863 report.score_lift = Some(score_lift);
864 if task.frontier_score > task.baseline_score + EPSILON {
865 report.gap_recovery = Some(score_lift / (task.frontier_score - task.baseline_score));
866 }
867 let candidate_passed = report.candidate_passed.unwrap_or(false);
868 report.regression = baseline_passed && !candidate_passed;
869 if report.regression {
870 report.failures.push(format!(
871 "task '{}' regressed from passing to failing",
872 task.id
873 ));
874 }
875 report
876}
877
878fn aggregate_variant_metrics(cases: &[SkillGateCaseReport]) -> SkillGateVariantMetrics {
879 let included_task_count = cases.iter().filter(|case| case.included).count();
880 let scored = cases
881 .iter()
882 .filter(|case| case.included && case.candidate_score.is_some())
883 .collect::<Vec<_>>();
884 let scored_task_count = scored.len();
885 let gap_cases = scored
886 .iter()
887 .filter(|case| case.gap_recovery.is_some())
888 .collect::<Vec<_>>();
889 let gap_task_count = gap_cases.len();
890 let mut metrics = SkillGateVariantMetrics {
891 included_task_count,
892 scored_task_count,
893 gap_task_count,
894 ..Default::default()
895 };
896 if scored_task_count > 0 {
897 metrics.mean_baseline_score =
898 scored.iter().map(|case| case.baseline_score).sum::<f64>() / scored_task_count as f64;
899 metrics.mean_candidate_score = scored
900 .iter()
901 .map(|case| case.candidate_score.unwrap_or_default())
902 .sum::<f64>()
903 / scored_task_count as f64;
904 metrics.mean_frontier_score =
905 scored.iter().map(|case| case.frontier_score).sum::<f64>() / scored_task_count as f64;
906 metrics.mean_score_lift = scored
907 .iter()
908 .map(|case| case.score_lift.unwrap_or_default())
909 .sum::<f64>()
910 / scored_task_count as f64;
911 metrics.candidate_win_count = scored
912 .iter()
913 .filter(|case| case.score_lift.unwrap_or_default() > EPSILON)
914 .count();
915 metrics.candidate_loss_count = scored
916 .iter()
917 .filter(|case| case.score_lift.unwrap_or_default() < -EPSILON)
918 .count();
919 metrics.candidate_tie_count = scored_task_count
920 .saturating_sub(metrics.candidate_win_count + metrics.candidate_loss_count);
921 metrics.win_rate = metrics.candidate_win_count as f64 / scored_task_count as f64;
922 }
923 if gap_task_count > 0 {
924 metrics.mean_gap_recovery = gap_cases
925 .iter()
926 .map(|case| case.gap_recovery.unwrap_or_default())
927 .sum::<f64>()
928 / gap_task_count as f64;
929 }
930 metrics.regression_denominator = cases
931 .iter()
932 .filter(|case| case.included && case.baseline_passed)
933 .count();
934 metrics.regression_count = cases
935 .iter()
936 .filter(|case| case.included && case.regression)
937 .count();
938 if metrics.regression_denominator > 0 {
939 metrics.regression_rate =
940 metrics.regression_count as f64 / metrics.regression_denominator as f64;
941 }
942 metrics
943}
944
945fn aggregate_cluster_reports(
946 cases: &[SkillGateCaseReport],
947 min_cluster_gap_recovery: f64,
948) -> Vec<SkillGateClusterReport> {
949 let mut grouped: BTreeMap<String, Vec<&SkillGateCaseReport>> = BTreeMap::new();
950 for case in cases
951 .iter()
952 .filter(|case| case.included && case.candidate_score.is_some())
953 {
954 grouped.entry(case.cluster.clone()).or_default().push(case);
955 }
956 grouped
957 .into_iter()
958 .map(|(cluster, cases)| {
959 let task_count = cases.len();
960 let gap_cases = cases
961 .iter()
962 .filter(|case| case.gap_recovery.is_some())
963 .copied()
964 .collect::<Vec<_>>();
965 let gap_task_count = gap_cases.len();
966 let mean_baseline_score =
967 cases.iter().map(|case| case.baseline_score).sum::<f64>() / task_count as f64;
968 let mean_candidate_score = cases
969 .iter()
970 .map(|case| case.candidate_score.unwrap_or_default())
971 .sum::<f64>()
972 / task_count as f64;
973 let mean_frontier_score =
974 cases.iter().map(|case| case.frontier_score).sum::<f64>() / task_count as f64;
975 let mean_score_lift = cases
976 .iter()
977 .map(|case| case.score_lift.unwrap_or_default())
978 .sum::<f64>()
979 / task_count as f64;
980 let mean_gap_recovery = if gap_task_count == 0 {
981 0.0
982 } else {
983 gap_cases
984 .iter()
985 .map(|case| case.gap_recovery.unwrap_or_default())
986 .sum::<f64>()
987 / gap_task_count as f64
988 };
989 SkillGateClusterReport {
990 cluster,
991 task_count,
992 gap_task_count,
993 mean_baseline_score,
994 mean_candidate_score,
995 mean_frontier_score,
996 mean_score_lift,
997 mean_gap_recovery,
998 pass: gap_task_count == 0
999 || mean_gap_recovery + EPSILON >= min_cluster_gap_recovery,
1000 }
1001 })
1002 .collect()
1003}
1004
1005fn measure_context(
1006 variant: &SkillGateVariant,
1007 manifest: &SkillGateManifest,
1008 base_dir: Option<&Path>,
1009) -> Result<SkillGateContextReport, String> {
1010 let baseline = measure_artifact("baseline", &variant.baseline, base_dir)?;
1011 let candidate = measure_artifact("candidate", &variant.candidate, base_dir)?;
1012 let baseline_tokens = baseline.context_tokens;
1013 let candidate_tokens = candidate.context_tokens;
1014 let delta_tokens = candidate_tokens as i64 - baseline_tokens as i64;
1015 let max_delta_tokens = manifest.policy.max_context_delta_tokens;
1016 let target_context_budget_tokens = manifest.target_model.context_budget_tokens;
1017 let within_delta_budget = max_delta_tokens.is_none_or(|max| delta_tokens <= max);
1018 let within_target_budget =
1019 target_context_budget_tokens.is_none_or(|max| candidate_tokens <= max);
1020 let mut artifact_hashes = baseline.hashes;
1021 artifact_hashes.extend(candidate.hashes);
1022 Ok(SkillGateContextReport {
1023 baseline_tokens,
1024 candidate_tokens,
1025 delta_tokens,
1026 max_delta_tokens,
1027 target_context_budget_tokens,
1028 within_delta_budget,
1029 within_target_budget,
1030 artifact_hashes,
1031 })
1032}
1033
1034#[derive(Debug, Default)]
1035struct ArtifactMeasurement {
1036 context_tokens: usize,
1037 hashes: Vec<SkillGateArtifactHash>,
1038}
1039
1040fn measure_artifact(
1041 role: &str,
1042 artifact: &SkillGateArtifact,
1043 base_dir: Option<&Path>,
1044) -> Result<ArtifactMeasurement, String> {
1045 let mut measurement = ArtifactMeasurement::default();
1046 for path in &artifact.paths {
1047 let resolved = resolve_manifest_path(base_dir, path);
1048 let hash = sha256_path(&resolved)?;
1049 measurement.context_tokens += hash.tokens;
1050 measurement.hashes.push(SkillGateArtifactHash {
1051 role: role.to_string(),
1052 path: path.clone(),
1053 sha256: hash.sha256,
1054 tokens: hash.tokens,
1055 bytes: hash.bytes,
1056 });
1057 }
1058 if let Some(tokens) = artifact.context_tokens {
1059 measurement.context_tokens = tokens;
1060 }
1061 Ok(measurement)
1062}
1063
/// Digest and size figures for one hashed path (a single file or a directory tree).
#[derive(Debug)]
struct PathHash {
    /// Lowercase hexadecimal SHA-256 digest.
    sha256: String,
    /// Estimated token count of the content (summed over files for a directory).
    tokens: usize,
    /// Content size in bytes (summed over files for a directory).
    bytes: usize,
}
1070
1071fn sha256_path(path: &Path) -> Result<PathHash, String> {
1072 let metadata = fs::symlink_metadata(path)
1073 .map_err(|error| format!("failed to stat {}: {error}", path.display()))?;
1074 if metadata.file_type().is_symlink() {
1075 return Err(format!(
1076 "refusing to hash symlink protected path {}",
1077 path.display()
1078 ));
1079 }
1080 if metadata.is_file() {
1081 return sha256_file(path);
1082 }
1083 if metadata.is_dir() {
1084 return sha256_dir(path);
1085 }
1086 Err(format!(
1087 "protected path {} is neither a file nor a directory",
1088 path.display()
1089 ))
1090}
1091
1092fn sha256_file(path: &Path) -> Result<PathHash, String> {
1093 let bytes =
1094 fs::read(path).map_err(|error| format!("failed to read {}: {error}", path.display()))?;
1095 let sha256 = hex_digest(&bytes);
1096 let tokens = estimate_chunk_tokens(&String::from_utf8_lossy(&bytes));
1097 Ok(PathHash {
1098 sha256,
1099 tokens,
1100 bytes: bytes.len(),
1101 })
1102}
1103
1104fn sha256_dir(path: &Path) -> Result<PathHash, String> {
1105 let mut files = Vec::new();
1106 for entry in WalkDir::new(path).follow_links(false) {
1107 let entry = entry.map_err(|error| format!("failed to walk {}: {error}", path.display()))?;
1108 if entry.file_type().is_symlink() {
1109 return Err(format!(
1110 "refusing to hash symlink inside protected directory {}",
1111 entry.path().display()
1112 ));
1113 }
1114 if entry.file_type().is_file() {
1115 files.push(entry.path().to_path_buf());
1116 }
1117 }
1118 files.sort();
1119 let mut hasher = Sha256::new();
1120 let mut tokens = 0;
1121 let mut bytes_total = 0;
1122 for file in files {
1123 let rel = file
1124 .strip_prefix(path)
1125 .map_err(|error| format!("failed to relativize {}: {error}", file.display()))?;
1126 let rel = rel.to_string_lossy().replace('\\', "/");
1127 let bytes = fs::read(&file)
1128 .map_err(|error| format!("failed to read {}: {error}", file.display()))?;
1129 hasher.update(rel.as_bytes());
1130 hasher.update([0]);
1131 hasher.update(&bytes);
1132 hasher.update([0xff]);
1133 tokens += estimate_chunk_tokens(&String::from_utf8_lossy(&bytes));
1134 bytes_total += bytes.len();
1135 }
1136 Ok(PathHash {
1137 sha256: bytes_to_hex(hasher.finalize().as_ref()),
1138 tokens,
1139 bytes: bytes_total,
1140 })
1141}
1142
1143fn hex_digest(bytes: &[u8]) -> String {
1144 let mut hasher = Sha256::new();
1145 hasher.update(bytes);
1146 bytes_to_hex(hasher.finalize().as_ref())
1147}
1148
/// Encodes `bytes` as lowercase hexadecimal, two characters per byte.
///
/// An empty slice yields an empty string. The output buffer is sized up front and
/// filled from a nibble lookup table, so no temporary `String` is allocated per
/// byte.
fn bytes_to_hex(bytes: &[u8]) -> String {
    const HEX_DIGITS: &[u8; 16] = b"0123456789abcdef";
    let mut encoded = String::with_capacity(bytes.len() * 2);
    for &byte in bytes {
        // High nibble first, then low nibble; both indices are always in 0..16.
        encoded.push(char::from(HEX_DIGITS[usize::from(byte >> 4)]));
        encoded.push(char::from(HEX_DIGITS[usize::from(byte & 0x0f)]));
    }
    encoded
}
1152
/// Resolves a path written in a manifest to the path that should be opened.
///
/// A relative `path` is joined onto `base_dir` when one is given. An absolute
/// `path`, or any `path` when there is no `base_dir`, is returned unchanged.
fn resolve_manifest_path(base_dir: Option<&Path>, path: &str) -> PathBuf {
    let declared = Path::new(path);
    match base_dir {
        Some(root) if !declared.is_absolute() => root.join(declared),
        _ => declared.to_path_buf(),
    }
}
1163
1164fn pareto_frontier(variants: &[SkillGateVariantReport]) -> Vec<String> {
1165 variants
1166 .iter()
1167 .filter(|variant| variant.metrics.scored_task_count > 0)
1168 .filter(|variant| {
1169 !variants
1170 .iter()
1171 .any(|other| other.id != variant.id && dominates(other, variant))
1172 })
1173 .map(|variant| variant.id.clone())
1174 .collect()
1175}
1176
1177fn dominates(left: &SkillGateVariantReport, right: &SkillGateVariantReport) -> bool {
1178 if left.metrics.scored_task_count == 0 {
1179 return false;
1180 }
1181 let at_least_as_good = left.metrics.mean_gap_recovery + EPSILON
1182 >= right.metrics.mean_gap_recovery
1183 && left.metrics.mean_score_lift + EPSILON >= right.metrics.mean_score_lift
1184 && left.metrics.regression_rate <= right.metrics.regression_rate + EPSILON
1185 && left.context.delta_tokens <= right.context.delta_tokens;
1186 let strictly_better = left.metrics.mean_gap_recovery
1187 > right.metrics.mean_gap_recovery + EPSILON
1188 || left.metrics.mean_score_lift > right.metrics.mean_score_lift + EPSILON
1189 || left.metrics.regression_rate + EPSILON < right.metrics.regression_rate
1190 || left.context.delta_tokens < right.context.delta_tokens;
1191 at_least_as_good && strictly_better
1192}
1193
1194fn select_variant(
1195 variants: &[SkillGateVariantReport],
1196 pareto_frontier: &[String],
1197) -> Option<String> {
1198 let frontier = pareto_frontier.iter().collect::<BTreeSet<_>>();
1199 let mut accepted = variants
1200 .iter()
1201 .filter(|variant| variant.accepted && frontier.contains(&variant.id))
1202 .collect::<Vec<_>>();
1203 if accepted.is_empty() {
1204 accepted = variants.iter().filter(|variant| variant.accepted).collect();
1205 }
1206 accepted.sort_by(|left, right| {
1207 right
1208 .metrics
1209 .mean_gap_recovery
1210 .partial_cmp(&left.metrics.mean_gap_recovery)
1211 .unwrap_or(std::cmp::Ordering::Equal)
1212 .then_with(|| {
1213 right
1214 .metrics
1215 .mean_score_lift
1216 .partial_cmp(&left.metrics.mean_score_lift)
1217 .unwrap_or(std::cmp::Ordering::Equal)
1218 })
1219 .then_with(|| left.context.delta_tokens.cmp(&right.context.delta_tokens))
1220 .then_with(|| left.id.cmp(&right.id))
1221 });
1222 accepted.first().map(|variant| variant.id.clone())
1223}
1224
1225fn build_receipt(
1226 manifest: &SkillGateManifest,
1227 accepted: bool,
1228 selected_variant_id: Option<String>,
1229 task_safety: &[SkillGateTaskSafetyReport],
1230 tamper: SkillGateTamperReport,
1231 variants: &[SkillGateVariantReport],
1232 pareto_frontier: Vec<String>,
1233) -> SkillGateReceipt {
1234 let selected = selected_variant_id
1235 .as_ref()
1236 .and_then(|id| variants.iter().find(|variant| &variant.id == id));
1237 SkillGateReceipt {
1238 type_name: SKILL_GATE_RECEIPT_TYPE.to_string(),
1239 schema_version: SKILL_GATE_SCHEMA_VERSION,
1240 manifest_id: manifest.id.clone(),
1241 target_model_id: manifest.target_model.id.clone(),
1242 accepted,
1243 selected_variant_id,
1244 decision: if accepted {
1245 "accepted".to_string()
1246 } else {
1247 "rejected".to_string()
1248 },
1249 metrics: selected.map(|variant| variant.metrics.clone()),
1250 context: selected.map(|variant| variant.context.clone()),
1251 tamper,
1252 pareto_frontier,
1253 excluded_task_ids: task_safety
1254 .iter()
1255 .filter(|task| !task.included)
1256 .map(|task| task.task_id.clone())
1257 .collect(),
1258 variant_receipts: variants
1259 .iter()
1260 .map(|variant| SkillGateVariantReceipt {
1261 variant_id: variant.id.clone(),
1262 accepted: variant.accepted,
1263 decision: variant.decision.clone(),
1264 metrics: variant.metrics.clone(),
1265 context_delta_tokens: variant.context.delta_tokens,
1266 failures: variant.failures.clone(),
1267 })
1268 .collect(),
1269 metadata: manifest.metadata.clone(),
1270 }
1271}
1272
#[cfg(test)]
mod tests {
    use super::*;

    const GRADER_FILE: &str = "grader/check.txt";
    const GOOD_SKILL_FILE: &str = "skills/good/SKILL.md";
    const BLOATED_SKILL_FILE: &str = "skills/bloat/SKILL.md";

    /// Writes `content` to `path`, creating missing parent directories first.
    fn write(path: &Path, content: &str) {
        let parent = path.parent().unwrap();
        fs::create_dir_all(parent).unwrap();
        fs::write(path, content).unwrap();
    }

    /// Creates the grader file and both skill files that the fixture manifest
    /// refers to: a short "good" skill and a bloated one repeated 80 times.
    fn seed_fixture_files(root: &Path) {
        write(&root.join(GRADER_FILE), "stable grader\n");
        write(
            &root.join(GOOD_SKILL_FILE),
            "Use the post-cutoff API name and keep the answer scoped.\n",
        );
        let bloat = "repeat this irrelevant guidance for token bloat.\n".repeat(80);
        write(&root.join(BLOATED_SKILL_FILE), &bloat);
    }

    /// Three tasks: a post-cutoff failure, a private regression check, and an old
    /// public benchmark.
    fn fixture_tasks() -> Vec<SkillGateTask> {
        let post_cutoff = SkillGateTask {
            id: "post-cutoff-failure".to_string(),
            cluster: "api-drift".to_string(),
            heldout: SkillGateHeldout {
                kind: "post_cutoff".to_string(),
                created_at: Some("2026-05-20".to_string()),
                ..Default::default()
            },
            baseline_score: 0.20,
            frontier_score: 1.0,
            baseline_passed: Some(false),
            ..Default::default()
        };
        let private_regression = SkillGateTask {
            id: "private-regression-check".to_string(),
            cluster: "regression".to_string(),
            heldout: SkillGateHeldout {
                kind: "private".to_string(),
                private: true,
                ..Default::default()
            },
            baseline_score: 0.90,
            frontier_score: 1.0,
            baseline_passed: Some(true),
            ..Default::default()
        };
        let old_public = SkillGateTask {
            id: "old-public-benchmark".to_string(),
            cluster: "contaminated".to_string(),
            heldout: SkillGateHeldout {
                kind: "public_static".to_string(),
                created_at: Some("2024-01-01".to_string()),
                ..Default::default()
            },
            baseline_score: 0.0,
            frontier_score: 1.0,
            baseline_passed: Some(false),
            ..Default::default()
        };
        vec![post_cutoff, private_regression, old_public]
    }

    /// Two variants, in order: `known-good` (results for all three tasks) and
    /// `bloated` (results for the first two tasks only).
    fn fixture_variants() -> Vec<SkillGateVariant> {
        let known_good = SkillGateVariant {
            id: "known-good".to_string(),
            candidate: SkillGateArtifact {
                kind: "skill".to_string(),
                paths: vec![GOOD_SKILL_FILE.to_string()],
                ..Default::default()
            },
            case_results: vec![
                SkillGateCaseResult {
                    task_id: "post-cutoff-failure".to_string(),
                    score: Some(0.80),
                    passed: Some(true),
                    ..Default::default()
                },
                SkillGateCaseResult {
                    task_id: "private-regression-check".to_string(),
                    score: Some(0.92),
                    passed: Some(true),
                    ..Default::default()
                },
                SkillGateCaseResult {
                    task_id: "old-public-benchmark".to_string(),
                    score: Some(1.0),
                    passed: Some(true),
                    ..Default::default()
                },
            ],
            ..Default::default()
        };
        let bloated = SkillGateVariant {
            id: "bloated".to_string(),
            candidate: SkillGateArtifact {
                kind: "skill".to_string(),
                paths: vec![BLOATED_SKILL_FILE.to_string()],
                ..Default::default()
            },
            case_results: vec![
                SkillGateCaseResult {
                    task_id: "post-cutoff-failure".to_string(),
                    score: Some(0.85),
                    passed: Some(true),
                    ..Default::default()
                },
                SkillGateCaseResult {
                    task_id: "private-regression-check".to_string(),
                    score: Some(0.91),
                    passed: Some(true),
                    ..Default::default()
                },
            ],
            ..Default::default()
        };
        vec![known_good, bloated]
    }

    /// Builds the manifest shared by the tests, rooted at `root` and expecting the
    /// grader file to hash to `grader_hash`.
    fn fixture_manifest(root: &Path, grader_hash: String) -> SkillGateManifest {
        let target_model = SkillGateTargetModel {
            id: "mock-cheap".to_string(),
            knowledge_cutoff: Some("2026-05-01".to_string()),
            context_budget_tokens: Some(220),
            ..Default::default()
        };
        let policy = SkillGatePolicy {
            min_included_tasks: Some(2),
            min_score_lift: Some(0.10),
            min_gap_recovery: Some(0.25),
            max_regression_rate: Some(0.0),
            max_context_delta_tokens: Some(120),
            min_win_rate: Some(0.5),
            ..Default::default()
        };
        let grader = SkillGateGrader {
            id: "immutable".to_string(),
            immutable_paths: vec![SkillGateProtectedPath {
                path: GRADER_FILE.to_string(),
                sha256: grader_hash,
                label: Some("grader".to_string()),
            }],
            ..Default::default()
        };
        SkillGateManifest {
            type_name: SKILL_GATE_MANIFEST_TYPE.to_string(),
            version: SKILL_GATE_SCHEMA_VERSION,
            id: "skill-gate-test".to_string(),
            base_dir: Some(root.display().to_string()),
            target_model,
            policy,
            grader,
            tasks: fixture_tasks(),
            variants: fixture_variants(),
            ..Default::default()
        }
    }

    #[test]
    fn gate_accepts_compact_lift_rejects_bloat_and_excludes_contamination() {
        let temp = tempfile::tempdir().unwrap();
        let root = temp.path();
        seed_fixture_files(root);
        let grader_hash = sha256_file(&root.join(GRADER_FILE)).unwrap().sha256;

        let manifest = fixture_manifest(root, grader_hash);
        let report = evaluate_skill_gate_manifest(&manifest).expect("gate evaluates");

        assert!(report.pass);
        assert_eq!(report.selected_variant_id.as_deref(), Some("known-good"));
        assert_eq!(report.included_task_count, 2);
        assert_eq!(report.excluded_task_count, 1);
        assert_eq!(
            report.receipt.excluded_task_ids,
            vec!["old-public-benchmark"]
        );

        let good = report
            .variants
            .iter()
            .find(|variant| variant.id == "known-good")
            .unwrap();
        assert!(good.accepted);
        assert!(good.metrics.mean_gap_recovery > 0.25);
        assert_eq!(good.metrics.regression_rate, 0.0);

        let bloat = report
            .variants
            .iter()
            .find(|variant| variant.id == "bloated")
            .unwrap();
        assert!(!bloat.accepted);
        let mentions_context_delta = bloat
            .failures
            .iter()
            .any(|failure| failure.contains("context delta"));
        assert!(mentions_context_delta);
    }

    #[test]
    fn gate_fails_when_immutable_grader_checksum_changes() {
        let temp = tempfile::tempdir().unwrap();
        seed_fixture_files(temp.path());

        // The declared checksum deliberately does not match the grader file.
        let mut manifest = fixture_manifest(temp.path(), "not-the-real-hash".to_string());
        manifest.variants.truncate(1);
        let report = evaluate_skill_gate_manifest(&manifest).expect("gate evaluates");

        assert!(!report.pass);
        assert!(!report.tamper.pass);
        let first_variant = report.variants.first().unwrap();
        let mentions_grader = first_variant
            .failures
            .iter()
            .any(|failure| failure.contains("immutable grader"));
        assert!(mentions_grader);
    }
}