1use clap::{Args, Subcommand, ValueEnum};
2use serde::{Deserialize, Serialize};
3use sha2::{Digest, Sha256};
4use std::collections::{BTreeMap, HashMap};
5use std::fs;
6use std::path::{Path, PathBuf};
7
8use crate::core::error;
9use crate::core::store::Store;
10use crate::core::time;
11
// Argument container for the `eval` command group; the concrete action is
// selected by the nested subcommand and dispatched in `run_eval_cli`.
// (Plain `//` comments are used here because clap's derive turns `///` doc
// comments into user-visible help text.)
#[derive(Args, Debug)]
pub struct EvalCli {
    // The eval subcommand to execute.
    #[clap(subcommand)]
    pub command: EvalCommand,
}
17
// Subcommands of the eval workflow. Each later step loads artifacts written by
// an earlier one: plan -> ingest-run -> judge -> aggregate -> gate, with
// bucket-failures as a side analysis over runs and verdicts.
// (Plain `//` comments are used because clap's derive turns `///` doc comments
// into user-visible help text.)
#[derive(Subcommand, Debug)]
pub enum EvalCommand {
    // Build an EVAL_PLAN from the task set and settings, hash it, and persist it.
    Plan {
        // Identifier of the task set; part of the plan hash.
        #[clap(long)]
        task_set_id: String,
        // Repeatable `--task-ref`; values are sorted and de-duplicated before hashing.
        #[clap(long = "task-ref")]
        task_refs: Vec<String>,
        // Stored on the plan and included in the plan hash.
        #[clap(long, default_value_t = 5)]
        runs_per_variant: u32,
        #[clap(long)]
        model_id: String,
        #[clap(long, default_value = "unknown")]
        agent_version: String,
        #[clap(long, default_value = "unknown")]
        agent_id: String,
        #[clap(long)]
        prompt_hash: String,
        // Also used as the PRNG seed for the bootstrap in `aggregate`.
        #[clap(long, default_value_t = 42)]
        seed: u64,
        // Repeatable `--tool-version key=value` entries.
        #[clap(long = "tool-version")]
        tool_versions: Vec<String>,
        // Repeatable `--env key=value` entries describing the environment.
        #[clap(long = "env")]
        env_fingerprint: Vec<String>,
        // Copied into every verdict produced for this plan.
        #[clap(long)]
        judge_model_id: String,
        // Copied into every verdict produced for this plan.
        #[clap(long)]
        judge_prompt_hash: String,
        // Recorded in the plan settings.
        #[clap(long, default_value_t = 3000)]
        judge_timeout_ms: u64,
    },

    // Record one run of a variant against a task as an EVAL_RUN, optionally
    // converting a trace JSON file into a TRACE_BUNDLE.
    IngestRun {
        // Must reference an existing plan; the plan is loaded before ingesting.
        #[clap(long)]
        plan_id: String,
        #[clap(long)]
        run_id: String,
        #[clap(long)]
        variant: String,
        #[clap(long)]
        task_ref: String,
        #[clap(long, default_value_t = 1)]
        attempt_index: u32,
        // Must be exactly `pass` or `fail`.
        #[clap(long)]
        status: String,
        #[clap(long)]
        failure_reason: Option<String>,
        #[clap(long, default_value_t = 0)]
        duration_ms: u64,
        #[clap(long)]
        cost_usd: Option<f64>,
        // JSON file whose optional `events` / `attachments` arrays are imported.
        #[clap(long)]
        trace_file: Option<PathBuf>,
        // Overrides the default trace id `T_<run_id>`.
        #[clap(long)]
        trace_id: Option<String>,
    },

    // Attach a judge verdict (EVAL_VERDICT) to an ingested run.
    Judge {
        #[clap(long)]
        plan_id: String,
        // The run must belong to `plan_id`.
        #[clap(long)]
        run_id: String,
        // Judge payload read from a file; mutually exclusive with `--json`.
        #[clap(long)]
        json_file: Option<PathBuf>,
        // Inline judge payload; mutually exclusive with `--json-file`.
        #[clap(long)]
        json: Option<String>,
        // Upper bound that `simulate_delay_ms` is compared against.
        #[clap(long, default_value_t = 3000)]
        timeout_ms: u64,
        // Sleeps for this long; rejected with EVAL_JUDGE_TIMEOUT if above `timeout_ms`.
        #[clap(long)]
        simulate_delay_ms: Option<u64>,
    },

    // Compare judged runs of two variants and persist an EVAL_AGGREGATE with a
    // bootstrap confidence interval for the success-rate delta.
    Aggregate {
        #[clap(long)]
        plan_id: String,
        #[clap(long, default_value = "baseline")]
        baseline_variant: String,
        #[clap(long, default_value = "candidate")]
        candidate_variant: String,
        // Number of bootstrap resamples.
        #[clap(long, default_value_t = 400)]
        iterations: usize,
        // Explicit aggregate id; when absent an `A_<candidate>_vs_<baseline>_<hash>` id is generated.
        #[clap(long)]
        aggregate_id: Option<String>,
        // Previously stored aggregate whose plan hash must match this plan's hash.
        #[clap(long)]
        baseline_aggregate_id: Option<String>,
        // Allows the comparison to proceed despite a plan-hash mismatch.
        #[clap(long)]
        acknowledge_setting_drift: bool,
    },

    // Evaluate the promotion gate for a stored aggregate; fails the command when rejected.
    Gate {
        #[clap(long)]
        aggregate_id: String,
        // Minimum judged runs required for both baseline and candidate.
        #[clap(long, default_value_t = 5)]
        min_runs: u32,
        // The gate rejects when the CI upper bound is below `-max_regression`.
        #[clap(long, default_value_t = 0.0)]
        max_regression: f64,
        // Persist these thresholds as an EVAL_GATE_REQUIREMENT artifact.
        #[clap(long)]
        mark_required: bool,
    },

    // Group failed runs of one variant into coarse failure buckets.
    BucketFailures {
        #[clap(long)]
        plan_id: String,
        #[clap(long, default_value = "candidate")]
        variant: String,
        #[clap(long, value_enum, default_value_t = BucketMode::Deterministic)]
        mode: BucketMode,
        // Required when `mode` is agent-assisted.
        #[clap(long)]
        model_id: Option<String>,
        // Required when `mode` is agent-assisted.
        #[clap(long)]
        prompt_hash: Option<String>,
        // Recorded verbatim in the artifact.
        #[clap(long, default_value_t = 0.0)]
        temperature: f32,
    },
}
138
// How failure buckets are produced by `bucket-failures`.
// (Plain `//` comments are used because clap's `ValueEnum` derive turns `///`
// doc comments into help text for the possible values.)
#[derive(Copy, Clone, Debug, ValueEnum)]
pub enum BucketMode {
    // Recorded as "deterministic"; the resulting artifact is marked
    // `promotion_dependency_allowed = true`.
    Deterministic,
    // Recorded as "agent-assisted"; requires `--model-id` and `--prompt-hash`
    // and yields `promotion_dependency_allowed = false`.
    AgentAssisted,
}
144
/// Persisted `EVAL_PLAN` artifact describing what is evaluated and under which settings.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalPlan {
    /// Artifact schema version (written as "1.0.0").
    pub schema_version: String,
    /// Artifact discriminator (written as "EVAL_PLAN").
    pub kind: String,
    /// `P_` followed by the first 12 hex characters of `plan_hash`, upper-cased.
    pub plan_id: String,
    pub task_set_id: String,
    /// Task references, sorted and de-duplicated at plan creation.
    pub task_refs: Vec<String>,
    pub runs_per_variant: u32,
    /// Creation timestamp as produced by `time::now_epoch_z()`.
    pub created_at: String,
    pub settings: EvalSettings,
    /// SHA-256 hex digest over `task_set_id`, `task_refs`, `runs_per_variant` and `settings`.
    pub plan_hash: String,
}
157
/// Settings captured at plan time; the whole struct is part of the plan hash,
/// so any change yields a different plan.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalSettings {
    pub agent_id: String,
    pub model_id: String,
    pub agent_version: String,
    pub prompt_hash: String,
    /// Also seeds the bootstrap PRNG used when aggregating.
    pub seed: u64,
    /// Parsed from repeated `--tool-version key=value` flags.
    pub tool_versions: BTreeMap<String, String>,
    /// Parsed from repeated `--env key=value` flags.
    pub environment_fingerprint: BTreeMap<String, String>,
    /// Copied into each verdict created for the plan.
    pub judge_model_id: String,
    /// Copied into each verdict created for the plan.
    pub judge_prompt_hash: String,
    pub judge_timeout_ms: u64,
}
171
/// Persisted `TRACE_BUNDLE` artifact built from the trace file given to `ingest-run`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TraceBundle {
    /// Artifact schema version (written as "1.0.0").
    pub schema_version: String,
    /// Artifact discriminator (written as "TRACE_BUNDLE").
    pub kind: String,
    /// Caller-supplied trace id, or `T_<run_id>` when none was given.
    pub trace_id: String,
    pub run_id: String,
    /// Number of entries in `events` at ingestion time.
    pub event_count: usize,
    pub events: Vec<TraceEvent>,
    pub attachments: Vec<TraceAttachment>,
    /// SHA-256 hex digest of the bundle serialized with this field set to "".
    pub trace_hash: String,
}
183
/// One event imported from the `events` array of a trace file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TraceEvent {
    /// Taken from the event's `ts_ms`; 0 when missing or not an unsigned integer.
    pub ts_ms: u64,
    /// Taken from the event's `event_type`; "unknown" when missing or not a string.
    pub event_type: String,
    pub tool: Option<String>,
    pub token_in: Option<u64>,
    pub token_out: Option<u64>,
    pub detail: Option<String>,
}
193
/// One attachment imported from the `attachments` array of a trace file.
/// Entries without a string `content_address` are skipped during ingestion.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TraceAttachment {
    /// Taken from the entry's `kind`; "artifact" when missing or not a string.
    pub kind: String,
    pub content_address: String,
    pub media_type: Option<String>,
}
200
/// Persisted `EVAL_RUN` artifact: one attempt of a variant on a task.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalRun {
    /// Artifact schema version (written as "1.0.0").
    pub schema_version: String,
    /// Artifact discriminator (written as "EVAL_RUN").
    pub kind: String,
    pub run_id: String,
    /// Id of the plan that was loaded when the run was ingested.
    pub plan_id: String,
    pub variant: String,
    pub task_ref: String,
    pub attempt_index: u32,
    /// Either "pass" or "fail" (validated at ingestion).
    pub status: String,
    pub failure_reason: Option<String>,
    pub duration_ms: u64,
    pub cost_usd: Option<f64>,
    /// Path string of the written trace bundle, when a trace file was ingested.
    pub trace_bundle_ref: Option<String>,
    /// Path string of the verdict artifact; `None` until `judge` has run.
    pub verdict_ref: Option<String>,
    /// Ingestion timestamp as produced by `time::now_epoch_z()`.
    pub ingested_at: String,
    /// SHA-256 hex digest of the serialized run; set at ingestion and
    /// recomputed when a verdict is attached.
    pub run_hash: String,
}
219
/// Persisted `EVAL_VERDICT` artifact: the judge's decision for one run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalVerdict {
    /// Artifact schema version (written as "1.0.0").
    pub schema_version: String,
    /// Artifact discriminator (written as "EVAL_VERDICT").
    pub kind: String,
    /// `V_<run_id>`.
    pub verdict_id: String,
    pub plan_id: String,
    pub run_id: String,
    /// Judge outcome; this is the value scored when aggregating.
    pub success: bool,
    /// Non-empty explanation supplied by the judge payload.
    pub explanation: String,
    pub failure_reason: Option<String>,
    pub reached_captcha: bool,
    pub impossible_task: bool,
    /// Counted by `aggregate` as a judge timeout failure when true.
    pub timed_out: bool,
    /// Copied from the plan settings.
    pub judge_model_id: String,
    /// Copied from the plan settings.
    pub judge_prompt_hash: String,
    /// SHA-256 hex digest of the raw judge payload text.
    pub input_digest: String,
    /// Timestamp as produced by `time::now_epoch_z()`.
    pub judged_at: String,
    /// SHA-256 hex digest of the verdict serialized with this field set to "".
    pub verdict_hash: String,
}
239
/// Persisted `EVAL_AGGREGATE` artifact: baseline-vs-candidate comparison for a plan.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalAggregate {
    /// Artifact schema version (written as "1.0.0").
    pub schema_version: String,
    /// Artifact discriminator (written as "EVAL_AGGREGATE").
    pub kind: String,
    /// Caller-supplied id, or a generated `A_<candidate>_vs_<baseline>_<hash prefix>`.
    pub aggregate_id: String,
    pub plan_id: String,
    /// Hash of the plan the aggregate was computed for; compared for settings drift.
    pub plan_hash: String,
    pub baseline_variant: String,
    pub candidate_variant: String,
    /// Number of judged baseline runs.
    pub baseline_n: u32,
    /// Number of judged candidate runs.
    pub candidate_n: u32,
    pub baseline_success_rate: f64,
    pub candidate_success_rate: f64,
    /// `candidate_success_rate - baseline_success_rate`.
    pub delta_success_rate: f64,
    /// Lower bound of the bootstrap interval for the delta.
    pub ci_low: f64,
    /// Upper bound of the bootstrap interval for the delta.
    pub ci_high: f64,
    pub bootstrap_iterations: usize,
    /// True when `ci_high < 0.0`.
    pub regression_flag: bool,
    /// `baseline_n + candidate_n`.
    pub judged_runs: u32,
    /// Number of plan runs whose referenced verdict file reports `timed_out`.
    pub judge_timeout_failures: u32,
    /// Timestamp as produced by `time::now_epoch_z()`.
    pub computed_at: String,
    /// SHA-256 hex digest of the aggregate serialized with this field set to "".
    pub aggregate_hash: String,
}
263
/// Persisted `FAILURE_BUCKETS` artifact: failed runs of one variant grouped by bucket.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FailureBucketArtifact {
    /// Artifact schema version (written as "1.0.0").
    pub schema_version: String,
    /// Artifact discriminator (written as "FAILURE_BUCKETS").
    pub kind: String,
    pub plan_id: String,
    pub variant: String,
    /// "deterministic" or "agent-assisted".
    pub mode: String,
    pub model_id: Option<String>,
    pub prompt_hash: Option<String>,
    pub temperature: f32,
    /// True only when the buckets were produced in deterministic mode.
    pub promotion_dependency_allowed: bool,
    /// Sum of `count` over all buckets.
    pub total_failures: u32,
    /// Buckets sorted by `bucket_id`.
    pub buckets: Vec<FailureBucket>,
    /// Timestamp as produced by `time::now_epoch_z()`.
    pub computed_at: String,
    /// SHA-256 hex digest of the artifact serialized with this field set to "".
    pub artifact_hash: String,
}
280
/// One failure category and the runs that fell into it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FailureBucket {
    /// Category label produced by `classify_failure`.
    pub bucket_id: String,
    /// Number of failed runs in this bucket.
    pub count: u32,
    /// Up to three run ids from the bucket, taken from the sorted id list.
    pub sample_run_ids: Vec<String>,
}
287
/// Persisted `EVAL_GATE_REQUIREMENT` artifact written by `gate --mark-required`.
/// The publish/validate checks re-evaluate the named aggregate against the
/// thresholds stored here.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct EvalGateRequirement {
    /// Artifact schema version (written as "1.0.0").
    schema_version: String,
    /// Artifact discriminator (written as "EVAL_GATE_REQUIREMENT").
    kind: String,
    /// Aggregate that must pass the gate.
    aggregate_id: String,
    min_runs: u32,
    max_regression: f64,
    /// Gate outcome at the moment the requirement was marked.
    decision_at_mark: bool,
    /// Timestamp as produced by `time::now_epoch_z()`.
    marked_at: String,
}
298
/// Shape of the judge payload accepted by `judge` via `--json` / `--json-file`.
/// `success` and `explanation` are mandatory; the remaining fields fall back to
/// their defaults when absent.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct JudgeInput {
    success: bool,
    /// Must be non-empty after trimming, otherwise the payload is rejected.
    explanation: String,
    #[serde(default)]
    failure_reason: Option<String>,
    #[serde(default)]
    reached_captcha: bool,
    #[serde(default)]
    impossible_task: bool,
}
310
311pub fn run_eval_cli(store: &Store, cli: EvalCli) -> Result<(), error::DecapodError> {
312 match cli.command {
313 EvalCommand::Plan {
314 task_set_id,
315 task_refs,
316 runs_per_variant,
317 model_id,
318 agent_version,
319 agent_id,
320 prompt_hash,
321 seed,
322 tool_versions,
323 env_fingerprint,
324 judge_model_id,
325 judge_prompt_hash,
326 judge_timeout_ms,
327 } => {
328 let tool_versions = parse_kv_pairs(&tool_versions, "--tool-version")?;
329 let env_fingerprint = parse_kv_pairs(&env_fingerprint, "--env")?;
330 let settings = EvalSettings {
331 agent_id,
332 model_id,
333 agent_version,
334 prompt_hash,
335 seed,
336 tool_versions,
337 environment_fingerprint: env_fingerprint,
338 judge_model_id,
339 judge_prompt_hash,
340 judge_timeout_ms,
341 };
342
343 let mut task_refs_sorted = task_refs;
344 task_refs_sorted.sort();
345 task_refs_sorted.dedup();
346
347 let plan_hash = hash_json(&serde_json::json!({
348 "task_set_id": task_set_id,
349 "task_refs": task_refs_sorted,
350 "runs_per_variant": runs_per_variant,
351 "settings": settings,
352 }))?;
353 let plan_id = format!("P_{}", &plan_hash[..12].to_uppercase());
354
355 let plan = EvalPlan {
356 schema_version: "1.0.0".to_string(),
357 kind: "EVAL_PLAN".to_string(),
358 plan_id: plan_id.clone(),
359 task_set_id,
360 task_refs: task_refs_sorted,
361 runs_per_variant,
362 created_at: time::now_epoch_z(),
363 settings,
364 plan_hash,
365 };
366
367 let path = write_json(eval_plan_path(store, &plan_id), &plan)?;
368 println!(
369 "{}",
370 serde_json::to_string_pretty(&serde_json::json!({
371 "cmd": "eval.plan",
372 "status": "ok",
373 "plan_id": plan.plan_id,
374 "plan_hash": plan.plan_hash,
375 "path": path,
376 }))
377 .unwrap()
378 );
379 }
380 EvalCommand::IngestRun {
381 plan_id,
382 run_id,
383 variant,
384 task_ref,
385 attempt_index,
386 status,
387 failure_reason,
388 duration_ms,
389 cost_usd,
390 trace_file,
391 trace_id,
392 } => {
393 let plan = load_plan(store, &plan_id)?;
394 let status = normalize_status(&status)?;
395 let trace_bundle_ref = if let Some(trace_file) = trace_file {
396 let trace_seed =
397 fs::read_to_string(&trace_file).map_err(error::DecapodError::IoError)?;
398 let trace_payload: serde_json::Value =
399 serde_json::from_str(&trace_seed).map_err(|e| {
400 error::DecapodError::ValidationError(format!(
401 "invalid trace JSON '{}': {}",
402 trace_file.display(),
403 e
404 ))
405 })?;
406 let mut events = Vec::new();
407 if let Some(raw_events) = trace_payload.get("events").and_then(|v| v.as_array()) {
408 for ev in raw_events {
409 events.push(TraceEvent {
410 ts_ms: ev.get("ts_ms").and_then(|v| v.as_u64()).unwrap_or(0),
411 event_type: ev
412 .get("event_type")
413 .and_then(|v| v.as_str())
414 .unwrap_or("unknown")
415 .to_string(),
416 tool: ev
417 .get("tool")
418 .and_then(|v| v.as_str())
419 .map(|s| s.to_string()),
420 token_in: ev.get("token_in").and_then(|v| v.as_u64()),
421 token_out: ev.get("token_out").and_then(|v| v.as_u64()),
422 detail: ev
423 .get("detail")
424 .and_then(|v| v.as_str())
425 .map(|s| s.to_string()),
426 });
427 }
428 }
429 let mut attachments = Vec::new();
430 if let Some(raw_atts) = trace_payload.get("attachments").and_then(|v| v.as_array())
431 {
432 for a in raw_atts {
433 if let Some(addr) = a.get("content_address").and_then(|v| v.as_str()) {
434 attachments.push(TraceAttachment {
435 kind: a
436 .get("kind")
437 .and_then(|v| v.as_str())
438 .unwrap_or("artifact")
439 .to_string(),
440 content_address: addr.to_string(),
441 media_type: a
442 .get("media_type")
443 .and_then(|v| v.as_str())
444 .map(|s| s.to_string()),
445 });
446 }
447 }
448 }
449
450 let resolved_trace_id = trace_id.unwrap_or_else(|| format!("T_{}", run_id));
451 let mut trace = TraceBundle {
452 schema_version: "1.0.0".to_string(),
453 kind: "TRACE_BUNDLE".to_string(),
454 trace_id: resolved_trace_id.clone(),
455 run_id: run_id.clone(),
456 event_count: events.len(),
457 events,
458 attachments,
459 trace_hash: String::new(),
460 };
461 trace.trace_hash = hash_json(&serde_json::to_value(&trace).unwrap())?;
462 let trace_path = write_json(eval_trace_path(store, &resolved_trace_id), &trace)?;
463 Some(trace_path)
464 } else {
465 None
466 };
467
468 let mut run = EvalRun {
469 schema_version: "1.0.0".to_string(),
470 kind: "EVAL_RUN".to_string(),
471 run_id: run_id.clone(),
472 plan_id: plan.plan_id,
473 variant,
474 task_ref,
475 attempt_index,
476 status,
477 failure_reason,
478 duration_ms,
479 cost_usd,
480 trace_bundle_ref,
481 verdict_ref: None,
482 ingested_at: time::now_epoch_z(),
483 run_hash: String::new(),
484 };
485 run.run_hash = hash_json(&serde_json::to_value(&run).unwrap())?;
486
487 let path = write_json(eval_run_path(store, &run_id), &run)?;
488 println!(
489 "{}",
490 serde_json::to_string_pretty(&serde_json::json!({
491 "cmd": "eval.ingest-run",
492 "status": "ok",
493 "path": path,
494 "run_id": run_id,
495 "run_hash": run.run_hash,
496 }))
497 .unwrap()
498 );
499 }
500 EvalCommand::Judge {
501 plan_id,
502 run_id,
503 json_file,
504 json,
505 timeout_ms,
506 simulate_delay_ms,
507 } => {
508 let plan = load_plan(store, &plan_id)?;
509 let mut run = load_run(store, &run_id)?;
510 if run.plan_id != plan.plan_id {
511 return Err(error::DecapodError::ValidationError(format!(
512 "run '{}' belongs to plan '{}', not '{}'",
513 run_id, run.plan_id, plan.plan_id
514 )));
515 }
516
517 if let Some(delay_ms) = simulate_delay_ms {
518 if delay_ms > timeout_ms {
519 return Err(error::DecapodError::ValidationError(format!(
520 "EVAL_JUDGE_TIMEOUT: judge execution exceeded timeout ({}ms > {}ms)",
521 delay_ms, timeout_ms
522 )));
523 }
524 std::thread::sleep(std::time::Duration::from_millis(delay_ms));
525 }
526
527 let payload_text = match (json_file, json) {
528 (Some(path), None) => {
529 fs::read_to_string(&path).map_err(error::DecapodError::IoError)?
530 }
531 (None, Some(raw)) => raw,
532 (Some(_), Some(_)) => {
533 return Err(error::DecapodError::ValidationError(
534 "provide either --json-file or --json, not both".to_string(),
535 ));
536 }
537 (None, None) => {
538 return Err(error::DecapodError::ValidationError(
539 "judge requires --json-file or --json input".to_string(),
540 ));
541 }
542 };
543
544 let input_digest = hash_bytes(payload_text.as_bytes());
545 let input: JudgeInput = serde_json::from_str(&payload_text).map_err(|e| {
546 error::DecapodError::ValidationError(format!(
547 "EVAL_JUDGE_JSON_CONTRACT_ERROR: malformed judge JSON: {}",
548 e
549 ))
550 })?;
551 if input.explanation.trim().is_empty() {
552 return Err(error::DecapodError::ValidationError(
553 "EVAL_JUDGE_JSON_CONTRACT_ERROR: explanation must be non-empty".to_string(),
554 ));
555 }
556
557 let timed_out = false;
558 let mut verdict = EvalVerdict {
559 schema_version: "1.0.0".to_string(),
560 kind: "EVAL_VERDICT".to_string(),
561 verdict_id: format!("V_{}", run_id),
562 plan_id: plan.plan_id,
563 run_id: run_id.clone(),
564 success: input.success,
565 explanation: input.explanation,
566 failure_reason: input.failure_reason,
567 reached_captcha: input.reached_captcha,
568 impossible_task: input.impossible_task,
569 timed_out,
570 judge_model_id: plan.settings.judge_model_id,
571 judge_prompt_hash: plan.settings.judge_prompt_hash,
572 input_digest,
573 judged_at: time::now_epoch_z(),
574 verdict_hash: String::new(),
575 };
576 verdict.verdict_hash = hash_json(&serde_json::to_value(&verdict).unwrap())?;
577
578 let verdict_path = write_json(eval_verdict_path(store, &run_id), &verdict)?;
579 run.verdict_ref = Some(verdict_path.clone());
580 run.run_hash = hash_json(&serde_json::to_value(&run).unwrap())?;
581 write_json(eval_run_path(store, &run_id), &run)?;
582
583 println!(
584 "{}",
585 serde_json::to_string_pretty(&serde_json::json!({
586 "cmd": "eval.judge",
587 "status": "ok",
588 "verdict_id": verdict.verdict_id,
589 "path": verdict_path,
590 "timed_out": false,
591 }))
592 .unwrap()
593 );
594 }
595 EvalCommand::Aggregate {
596 plan_id,
597 baseline_variant,
598 candidate_variant,
599 iterations,
600 aggregate_id,
601 baseline_aggregate_id,
602 acknowledge_setting_drift,
603 } => {
604 let plan = load_plan(store, &plan_id)?;
605 let runs = load_all_runs_for_plan(store, &plan_id)?;
606 let verdicts = load_all_verdicts(store)?;
607
608 let baseline = variant_scores(&runs, &verdicts, &baseline_variant);
609 let candidate = variant_scores(&runs, &verdicts, &candidate_variant);
610
611 if baseline.is_empty() || candidate.is_empty() {
612 return Err(error::DecapodError::ValidationError(format!(
613 "aggregate requires judged runs for both variants (baseline={}, candidate={})",
614 baseline.len(),
615 candidate.len()
616 )));
617 }
618
619 if let Some(base_agg_id) = baseline_aggregate_id {
620 let base = load_aggregate(store, &base_agg_id)?;
621 if base.plan_hash != plan.plan_hash && !acknowledge_setting_drift {
622 return Err(error::DecapodError::ValidationError(format!(
623 "EVAL_SETTINGS_MISMATCH: baseline aggregate '{}' has different plan hash ({} != {}). Use --acknowledge-setting-drift to force comparison.",
624 base_agg_id, base.plan_hash, plan.plan_hash
625 )));
626 }
627 }
628
629 let (ci_low, ci_high) =
630 bootstrap_delta_ci(&baseline, &candidate, iterations, plan.settings.seed);
631 let baseline_rate = mean(&baseline);
632 let candidate_rate = mean(&candidate);
633 let delta = candidate_rate - baseline_rate;
634
635 let judged_runs = (baseline.len() + candidate.len()) as u32;
636 let judge_timeout_failures = runs
637 .iter()
638 .filter(|r| {
639 if let Some(verdict_path) = &r.verdict_ref
640 && let Ok(raw) = fs::read_to_string(verdict_path)
641 && let Ok(v) = serde_json::from_str::<EvalVerdict>(&raw)
642 {
643 return v.timed_out;
644 }
645 false
646 })
647 .count() as u32;
648
649 let regression_flag = ci_high < 0.0;
650 let computed_at = time::now_epoch_z();
651
652 let fallback_id = format!(
653 "A_{}_vs_{}_{}",
654 candidate_variant,
655 baseline_variant,
656 &hash_json(&serde_json::json!({
657 "plan_id": plan.plan_id,
658 "baseline": baseline_variant,
659 "candidate": candidate_variant,
660 "at": computed_at,
661 }))?[..10]
662 );
663
664 let mut agg = EvalAggregate {
665 schema_version: "1.0.0".to_string(),
666 kind: "EVAL_AGGREGATE".to_string(),
667 aggregate_id: aggregate_id.unwrap_or(fallback_id),
668 plan_id: plan.plan_id,
669 plan_hash: plan.plan_hash,
670 baseline_variant,
671 candidate_variant,
672 baseline_n: baseline.len() as u32,
673 candidate_n: candidate.len() as u32,
674 baseline_success_rate: baseline_rate,
675 candidate_success_rate: candidate_rate,
676 delta_success_rate: delta,
677 ci_low,
678 ci_high,
679 bootstrap_iterations: iterations,
680 regression_flag,
681 judged_runs,
682 judge_timeout_failures,
683 computed_at,
684 aggregate_hash: String::new(),
685 };
686 agg.aggregate_hash = hash_json(&serde_json::to_value(&agg).unwrap())?;
687
688 let path = write_json(eval_aggregate_path(store, &agg.aggregate_id), &agg)?;
689 println!(
690 "{}",
691 serde_json::to_string_pretty(&serde_json::json!({
692 "cmd": "eval.aggregate",
693 "status": "ok",
694 "path": path,
695 "aggregate_id": agg.aggregate_id,
696 "delta_success_rate": agg.delta_success_rate,
697 "ci": [agg.ci_low, agg.ci_high],
698 "baseline_n": agg.baseline_n,
699 "candidate_n": agg.candidate_n,
700 }))
701 .unwrap()
702 );
703 }
704 EvalCommand::Gate {
705 aggregate_id,
706 min_runs,
707 max_regression,
708 mark_required,
709 } => {
710 let agg = load_aggregate(store, &aggregate_id)?;
711 let (pass, reasons) = evaluate_gate_decision(&agg, min_runs, max_regression);
712
713 if mark_required {
714 let required = EvalGateRequirement {
715 schema_version: "1.0.0".to_string(),
716 kind: "EVAL_GATE_REQUIREMENT".to_string(),
717 aggregate_id: aggregate_id.clone(),
718 min_runs,
719 max_regression,
720 decision_at_mark: pass,
721 marked_at: time::now_epoch_z(),
722 };
723 write_json(eval_gate_requirement_path(store), &required)?;
724 }
725
726 println!(
727 "{}",
728 serde_json::to_string_pretty(&serde_json::json!({
729 "cmd": "eval.gate",
730 "status": if pass { "ok" } else { "failed" },
731 "aggregate_id": aggregate_id,
732 "pass": pass,
733 "reasons": reasons,
734 "min_runs": min_runs,
735 "max_regression": max_regression,
736 "marked_required": mark_required,
737 }))
738 .unwrap()
739 );
740
741 if !pass {
742 return Err(error::DecapodError::ValidationError(
743 "EVAL_GATE_FAILED: promotion gate rejected aggregate".to_string(),
744 ));
745 }
746 }
747 EvalCommand::BucketFailures {
748 plan_id,
749 variant,
750 mode,
751 model_id,
752 prompt_hash,
753 temperature,
754 } => {
755 let runs = load_all_runs_for_plan(store, &plan_id)?;
756 let verdicts = load_all_verdicts(store)?;
757
758 if matches!(mode, BucketMode::AgentAssisted)
759 && (model_id.is_none() || prompt_hash.is_none())
760 {
761 return Err(error::DecapodError::ValidationError(
762 "agent-assisted bucketing requires --model-id and --prompt-hash".to_string(),
763 ));
764 }
765
766 let mut reasons: Vec<(String, String)> = Vec::new();
767 for run in runs.iter().filter(|r| r.variant == variant) {
768 let verdict = verdicts.get(&run.run_id);
769 let success = verdict
770 .map(|v| v.success)
771 .unwrap_or_else(|| run.status == "pass");
772 if success {
773 continue;
774 }
775 let reason = verdict
776 .and_then(|v| v.failure_reason.clone())
777 .or_else(|| run.failure_reason.clone())
778 .unwrap_or_else(|| "unspecified_failure".to_string());
779 reasons.push((run.run_id.clone(), reason));
780 }
781
782 let mut grouped: HashMap<String, Vec<String>> = HashMap::new();
783 for (run_id, reason) in reasons {
784 let bucket = classify_failure(&reason);
785 grouped.entry(bucket).or_default().push(run_id);
786 }
787
788 let mut buckets: Vec<FailureBucket> = grouped
789 .into_iter()
790 .map(|(bucket_id, mut run_ids)| {
791 run_ids.sort();
792 let count = run_ids.len() as u32;
793 let sample_run_ids = run_ids.into_iter().take(3).collect();
794 FailureBucket {
795 bucket_id,
796 count,
797 sample_run_ids,
798 }
799 })
800 .collect();
801 buckets.sort_by(|a, b| a.bucket_id.cmp(&b.bucket_id));
802
803 let mut artifact = FailureBucketArtifact {
804 schema_version: "1.0.0".to_string(),
805 kind: "FAILURE_BUCKETS".to_string(),
806 plan_id: plan_id.clone(),
807 variant: variant.clone(),
808 mode: match mode {
809 BucketMode::Deterministic => "deterministic".to_string(),
810 BucketMode::AgentAssisted => "agent-assisted".to_string(),
811 },
812 model_id,
813 prompt_hash,
814 temperature,
815 promotion_dependency_allowed: matches!(mode, BucketMode::Deterministic),
816 total_failures: buckets.iter().map(|b| b.count).sum(),
817 buckets,
818 computed_at: time::now_epoch_z(),
819 artifact_hash: String::new(),
820 };
821 artifact.artifact_hash = hash_json(&serde_json::to_value(&artifact).unwrap())?;
822
823 let path = write_json(eval_bucket_path(store, &plan_id, &variant), &artifact)?;
824 println!(
825 "{}",
826 serde_json::to_string_pretty(&serde_json::json!({
827 "cmd": "eval.bucket-failures",
828 "status": "ok",
829 "path": path,
830 "total_failures": artifact.total_failures,
831 "promotion_dependency_allowed": artifact.promotion_dependency_allowed,
832 }))
833 .unwrap()
834 );
835 }
836 }
837 Ok(())
838}
839
840pub fn schema() -> serde_json::Value {
841 serde_json::json!({
842 "name": "eval",
843 "version": "0.1.0",
844 "description": "Variance-aware evaluation artifacts and promotion gates",
845 "commands": [
846 {"name": "plan", "parameters": ["task_set_id", "task_refs", "runs_per_variant", "settings"]},
847 {"name": "ingest-run", "parameters": ["plan_id", "run_id", "variant", "task_ref", "status", "trace"]},
848 {"name": "judge", "parameters": ["plan_id", "run_id", "json", "timeout_ms"]},
849 {"name": "aggregate", "parameters": ["plan_id", "baseline_variant", "candidate_variant", "iterations"]},
850 {"name": "gate", "parameters": ["aggregate_id", "min_runs", "max_regression", "mark_required"]},
851 {"name": "bucket-failures", "parameters": ["plan_id", "variant", "mode"]}
852 ],
853 "artifacts": ["EVAL_PLAN", "EVAL_RUN", "EVAL_VERDICT", "EVAL_AGGREGATE", "TRACE_BUNDLE", "FAILURE_BUCKETS"],
854 "storage": ["eval/plans", "eval/runs", "eval/verdicts", "eval/aggregates", "eval/traces", "eval/failure_buckets"]
855 })
856}
857
858pub fn verify_eval_gate_for_publish(store_root: &Path) -> Result<(), error::DecapodError> {
859 let req_path = eval_gate_requirement_path_from_store_root(store_root);
860 if !req_path.exists() {
861 return Ok(());
862 }
863
864 let raw = fs::read_to_string(&req_path).map_err(error::DecapodError::IoError)?;
865 let req: EvalGateRequirement = serde_json::from_str(&raw).map_err(|e| {
866 error::DecapodError::ValidationError(format!(
867 "Invalid eval gate requirement artifact {}: {}",
868 req_path.display(),
869 e
870 ))
871 })?;
872
873 let agg_path = eval_aggregate_path_from_store_root(store_root, &req.aggregate_id);
874 if !agg_path.exists() {
875 return Err(error::DecapodError::ValidationError(format!(
876 "Cannot publish: required eval aggregate '{}' missing at {}",
877 req.aggregate_id,
878 agg_path.display()
879 )));
880 }
881 let raw = fs::read_to_string(&agg_path).map_err(error::DecapodError::IoError)?;
882 let agg: EvalAggregate = serde_json::from_str(&raw).map_err(|e| {
883 error::DecapodError::ValidationError(format!(
884 "Invalid eval aggregate artifact {}: {}",
885 agg_path.display(),
886 e
887 ))
888 })?;
889
890 let (pass, reasons) = evaluate_gate_decision(&agg, req.min_runs, req.max_regression);
891 if !pass {
892 return Err(error::DecapodError::ValidationError(format!(
893 "Cannot publish: eval gate failed for aggregate '{}': {}",
894 req.aggregate_id,
895 reasons.join(" | ")
896 )));
897 }
898 Ok(())
899}
900
901pub fn validate_eval_gate_if_required(
902 store_root: &Path,
903) -> Result<Vec<String>, error::DecapodError> {
904 let req_path = eval_gate_requirement_path_from_store_root(store_root);
905 if !req_path.exists() {
906 return Ok(vec![]);
907 }
908
909 let raw = fs::read_to_string(&req_path).map_err(error::DecapodError::IoError)?;
910 let req: EvalGateRequirement = serde_json::from_str(&raw).map_err(|e| {
911 error::DecapodError::ValidationError(format!(
912 "Invalid eval gate requirement artifact {}: {}",
913 req_path.display(),
914 e
915 ))
916 })?;
917
918 let agg = load_aggregate_from_store_root(store_root, &req.aggregate_id)?;
919 let (pass, reasons) = evaluate_gate_decision(&agg, req.min_runs, req.max_regression);
920 if pass {
921 Ok(vec![])
922 } else {
923 Ok(vec![format!(
924 "Required eval gate failed (aggregate={}): {}",
925 req.aggregate_id,
926 reasons.join(" | ")
927 )])
928 }
929}
930
931fn evaluate_gate_decision(
932 aggregate: &EvalAggregate,
933 min_runs: u32,
934 max_regression: f64,
935) -> (bool, Vec<String>) {
936 let mut reasons = Vec::new();
937 if aggregate.baseline_n < min_runs {
938 reasons.push(format!(
939 "baseline_n {} is below minimum {}",
940 aggregate.baseline_n, min_runs
941 ));
942 }
943 if aggregate.candidate_n < min_runs {
944 reasons.push(format!(
945 "candidate_n {} is below minimum {}",
946 aggregate.candidate_n, min_runs
947 ));
948 }
949 if aggregate.bootstrap_iterations == 0 {
950 reasons.push("bootstrap_iterations must be > 0".to_string());
951 }
952 if aggregate.judge_timeout_failures > 0 {
953 reasons.push(format!(
954 "judge_timeout_failures must be 0 (got {})",
955 aggregate.judge_timeout_failures
956 ));
957 }
958 if aggregate.ci_high < -max_regression {
959 reasons.push(format!(
960 "regression detected: CI upper {:.4} < -max_regression {:.4}",
961 aggregate.ci_high, max_regression
962 ));
963 }
964 (reasons.is_empty(), reasons)
965}
966
967fn variant_scores(
968 runs: &[EvalRun],
969 verdicts: &HashMap<String, EvalVerdict>,
970 variant: &str,
971) -> Vec<f64> {
972 let mut out = Vec::new();
973 for run in runs.iter().filter(|r| r.variant == variant) {
974 if let Some(verdict) = verdicts.get(&run.run_id) {
975 out.push(if verdict.success { 1.0 } else { 0.0 });
976 }
977 }
978 out
979}
980
/// Arithmetic mean of `values`; an empty slice yields 0.0 instead of NaN.
fn mean(values: &[f64]) -> f64 {
    match values.len() {
        0 => 0.0,
        count => values.iter().sum::<f64>() / count as f64,
    }
}
988
/// Percentile-bootstrap interval for the difference of means
/// `mean(candidate) - mean(baseline)`.
///
/// Each iteration resamples both slices with replacement (baseline first, then
/// candidate) using a xorshift64 generator seeded with `seed`, and records the
/// difference of the resampled means. The returned pair is taken from the
/// sorted differences at indices `floor(0.025 * iterations)` and
/// `ceil(0.975 * iterations)`, the latter clamped to the last sample.
///
/// Returns `(0.0, 0.0)` when either slice is empty or `iterations` is 0. A
/// seed of 0 is treated as 1 because xorshift never leaves the all-zero state.
/// The result is fully determined by the inputs, independent of the target's
/// pointer width.
fn bootstrap_delta_ci(
    baseline: &[f64],
    candidate: &[f64],
    iterations: usize,
    seed: u64,
) -> (f64, f64) {
    if baseline.is_empty() || candidate.is_empty() || iterations == 0 {
        return (0.0, 0.0);
    }

    let mut state = seed.max(1);
    // Mean of one with-replacement resample of `values`, advancing the PRNG.
    let mut resampled_mean = |values: &[f64]| -> f64 {
        let len = values.len() as u64;
        let mut sum = 0.0;
        for _ in 0..values.len() {
            state = xorshift64(state);
            // Reduce in 64 bits before narrowing: casting the state to `usize`
            // first would drop its high half on 32-bit targets and change the
            // draw sequence there.
            sum += values[(state % len) as usize];
        }
        sum / values.len() as f64
    };

    let mut samples = Vec::with_capacity(iterations);
    for _ in 0..iterations {
        let baseline_mean = resampled_mean(baseline);
        let candidate_mean = resampled_mean(candidate);
        samples.push(candidate_mean - baseline_mean);
    }

    // `total_cmp` is a total order even if a NaN slips in, which `sort_by`
    // requires of its comparator.
    samples.sort_by(|a, b| a.total_cmp(b));
    let low_idx = ((iterations as f64) * 0.025).floor() as usize;
    let high_idx = ((iterations as f64) * 0.975).ceil() as usize;
    let hi = high_idx.min(iterations.saturating_sub(1));
    (samples[low_idx.min(hi)], samples[hi])
}

/// One step of the xorshift64 generator (shift triple 13, 7, 17).
/// The zero state maps to zero, so callers must seed with a non-zero value.
fn xorshift64(mut x: u64) -> u64 {
    x ^= x << 13;
    x ^= x >> 7;
    x ^= x << 17;
    x
}
1033
/// Maps a free-form failure reason to a coarse bucket label.
///
/// Matching is ASCII-case-insensitive substring search; rules are tried in the
/// order listed and the first hit wins. Reasons matching no rule fall into
/// "other".
fn classify_failure(reason: &str) -> String {
    // (keywords, bucket) pairs in priority order.
    const RULES: [(&[&str], &str); 5] = [
        (&["captcha", "cloudflare"], "captcha_or_bot_protection"),
        (&["timeout", "timed out"], "timeout_or_latency"),
        (&["selector", "element", "dom"], "selector_or_dom_drift"),
        (&["auth", "login", "permission"], "auth_or_permission"),
        (&["network", "dns", "connection"], "network_or_service"),
    ];

    let lowered = reason.to_ascii_lowercase();
    for (keywords, bucket) in RULES {
        if keywords.iter().any(|keyword| lowered.contains(*keyword)) {
            return bucket.to_string();
        }
    }
    "other".to_string()
}
1053
1054fn normalize_status(status: &str) -> Result<String, error::DecapodError> {
1055 match status {
1056 "pass" | "fail" => Ok(status.to_string()),
1057 _ => Err(error::DecapodError::ValidationError(format!(
1058 "invalid status '{}': expected pass|fail",
1059 status
1060 ))),
1061 }
1062}
1063
1064fn parse_kv_pairs(
1065 raw: &[String],
1066 flag: &str,
1067) -> Result<BTreeMap<String, String>, error::DecapodError> {
1068 let mut out = BTreeMap::new();
1069 for entry in raw {
1070 let mut parts = entry.splitn(2, '=');
1071 let key = parts.next().unwrap_or_default().trim();
1072 let val = parts.next().unwrap_or_default().trim();
1073 if key.is_empty() || val.is_empty() {
1074 return Err(error::DecapodError::ValidationError(format!(
1075 "invalid {} entry '{}': expected key=value",
1076 flag, entry
1077 )));
1078 }
1079 out.insert(key.to_string(), val.to_string());
1080 }
1081 Ok(out)
1082}
1083
1084fn write_json<T: Serialize>(path: PathBuf, value: &T) -> Result<String, error::DecapodError> {
1085 if let Some(parent) = path.parent() {
1086 fs::create_dir_all(parent).map_err(error::DecapodError::IoError)?;
1087 }
1088 let bytes = serde_json::to_vec_pretty(value).map_err(|e| {
1089 error::DecapodError::ValidationError(format!("failed to serialize eval artifact: {}", e))
1090 })?;
1091 fs::write(&path, bytes).map_err(error::DecapodError::IoError)?;
1092 Ok(path.to_string_lossy().to_string())
1093}
1094
1095fn hash_json(value: &serde_json::Value) -> Result<String, error::DecapodError> {
1096 let bytes = serde_json::to_vec(value).map_err(|e| {
1097 error::DecapodError::ValidationError(format!("failed to canonicalize eval JSON: {}", e))
1098 })?;
1099 Ok(hash_bytes(&bytes))
1100}
1101
1102fn hash_bytes(bytes: &[u8]) -> String {
1103 let mut hasher = Sha256::new();
1104 hasher.update(bytes);
1105 format!("{:x}", hasher.finalize())
1106}
1107
1108fn load_plan(store: &Store, plan_id: &str) -> Result<EvalPlan, error::DecapodError> {
1109 load_json(eval_plan_path(store, plan_id), "EVAL_PLAN")
1110}
1111
1112fn load_run(store: &Store, run_id: &str) -> Result<EvalRun, error::DecapodError> {
1113 load_json(eval_run_path(store, run_id), "EVAL_RUN")
1114}
1115
1116fn load_aggregate(store: &Store, aggregate_id: &str) -> Result<EvalAggregate, error::DecapodError> {
1117 load_json(eval_aggregate_path(store, aggregate_id), "EVAL_AGGREGATE")
1118}
1119
1120fn load_aggregate_from_store_root(
1121 store_root: &Path,
1122 aggregate_id: &str,
1123) -> Result<EvalAggregate, error::DecapodError> {
1124 load_json(
1125 eval_aggregate_path_from_store_root(store_root, aggregate_id),
1126 "EVAL_AGGREGATE",
1127 )
1128}
1129
1130fn load_json<T: for<'de> Deserialize<'de>>(
1131 path: PathBuf,
1132 kind: &str,
1133) -> Result<T, error::DecapodError> {
1134 let raw = fs::read_to_string(&path).map_err(error::DecapodError::IoError)?;
1135 serde_json::from_str(&raw).map_err(|e| {
1136 error::DecapodError::ValidationError(format!(
1137 "invalid {} artifact {}: {}",
1138 kind,
1139 path.display(),
1140 e
1141 ))
1142 })
1143}
1144
1145fn load_all_runs_for_plan(
1146 store: &Store,
1147 plan_id: &str,
1148) -> Result<Vec<EvalRun>, error::DecapodError> {
1149 let mut runs = Vec::new();
1150 let dir = eval_runs_dir(store);
1151 if !dir.exists() {
1152 return Ok(runs);
1153 }
1154 for entry in fs::read_dir(dir).map_err(error::DecapodError::IoError)? {
1155 let entry = entry.map_err(error::DecapodError::IoError)?;
1156 let path = entry.path();
1157 if !path.is_file() || path.extension().and_then(|s| s.to_str()) != Some("json") {
1158 continue;
1159 }
1160 let run: EvalRun = load_json(path, "EVAL_RUN")?;
1161 if run.plan_id == plan_id {
1162 runs.push(run);
1163 }
1164 }
1165 runs.sort_by(|a, b| a.run_id.cmp(&b.run_id));
1166 Ok(runs)
1167}
1168
1169fn load_all_verdicts(store: &Store) -> Result<HashMap<String, EvalVerdict>, error::DecapodError> {
1170 let mut verdicts = HashMap::new();
1171 let dir = eval_verdicts_dir(store);
1172 if !dir.exists() {
1173 return Ok(verdicts);
1174 }
1175 for entry in fs::read_dir(dir).map_err(error::DecapodError::IoError)? {
1176 let entry = entry.map_err(error::DecapodError::IoError)?;
1177 let path = entry.path();
1178 if !path.is_file() || path.extension().and_then(|s| s.to_str()) != Some("json") {
1179 continue;
1180 }
1181 let v: EvalVerdict = load_json(path, "EVAL_VERDICT")?;
1182 verdicts.insert(v.run_id.clone(), v);
1183 }
1184 Ok(verdicts)
1185}
1186
1187fn eval_root(store: &Store) -> PathBuf {
1188 store.root.join("eval")
1189}
1190
/// Returns the `eval` directory directly under `store_root`.
///
/// This only builds the path; it does not touch the filesystem.
fn eval_root_from_store_root(store_root: &Path) -> PathBuf {
    let mut root = store_root.to_path_buf();
    root.push("eval");
    root
}
1194
1195fn eval_plan_path(store: &Store, plan_id: &str) -> PathBuf {
1196 eval_root(store)
1197 .join("plans")
1198 .join(format!("{}.json", plan_id))
1199}
1200
1201fn eval_runs_dir(store: &Store) -> PathBuf {
1202 eval_root(store).join("runs")
1203}
1204
1205fn eval_run_path(store: &Store, run_id: &str) -> PathBuf {
1206 eval_runs_dir(store).join(format!("{}.json", run_id))
1207}
1208
1209fn eval_trace_path(store: &Store, trace_id: &str) -> PathBuf {
1210 eval_root(store)
1211 .join("traces")
1212 .join(format!("{}.json", trace_id))
1213}
1214
1215fn eval_verdicts_dir(store: &Store) -> PathBuf {
1216 eval_root(store).join("verdicts")
1217}
1218
1219fn eval_verdict_path(store: &Store, run_id: &str) -> PathBuf {
1220 eval_verdicts_dir(store).join(format!("{}.json", run_id))
1221}
1222
1223fn eval_aggregate_path(store: &Store, aggregate_id: &str) -> PathBuf {
1224 eval_root(store)
1225 .join("aggregates")
1226 .join(format!("{}.json", aggregate_id))
1227}
1228
1229fn eval_aggregate_path_from_store_root(store_root: &Path, aggregate_id: &str) -> PathBuf {
1230 eval_root_from_store_root(store_root)
1231 .join("aggregates")
1232 .join(format!("{}.json", aggregate_id))
1233}
1234
1235fn eval_bucket_path(store: &Store, plan_id: &str, variant: &str) -> PathBuf {
1236 eval_root(store)
1237 .join("failure_buckets")
1238 .join(format!("{}_{}.json", plan_id, variant))
1239}
1240
1241fn eval_gate_requirement_path(store: &Store) -> PathBuf {
1242 eval_root(store).join("gate.required.json")
1243}
1244
1245fn eval_gate_requirement_path_from_store_root(store_root: &Path) -> PathBuf {
1246 eval_root_from_store_root(store_root).join("gate.required.json")
1247}