1use std::collections::{BTreeMap, BTreeSet};
10use std::fs;
11use std::path::{Path, PathBuf};
12use std::time::Instant;
13
14use serde::{Deserialize, Serialize};
15
16use crate::value::VmError;
17
18use super::{
19 load_merge_captain_golden, new_id, MergeCaptainDriverBackend, MergeCaptainDriverMode,
20 MergeCaptainDriverOptions, MergeCaptainRunSummary,
21};
22
/// `_type` tag assigned to iteration manifests that do not declare one.
const MANIFEST_TYPE: &str = "merge_captain_iteration_manifest";
/// `_type` tag stamped on reports built by `run_merge_captain_iteration`.
const REPORT_TYPE: &str = "merge_captain_iteration_report";
/// `_type` tag stamped on reports built by `diff_merge_captain_iterations`.
const DIFF_TYPE: &str = "merge_captain_iteration_diff";
26
27#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
28#[serde(default)]
29pub struct MergeCaptainIterationManifest {
30 #[serde(rename = "_type")]
31 pub type_name: String,
32 pub version: u32,
33 pub id: String,
34 pub name: Option<String>,
35 pub description: Option<String>,
36 pub base_dir: Option<String>,
37 #[serde(alias = "artifact-root")]
38 pub artifact_root: Option<String>,
39 pub scenarios: Vec<MergeCaptainIterationScenario>,
40 pub variants: Vec<MergeCaptainIterationVariant>,
41 pub budget: MergeCaptainIterationBudget,
42 pub metadata: BTreeMap<String, serde_json::Value>,
43}
44
45#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
46#[serde(default)]
47pub struct MergeCaptainIterationScenario {
48 pub id: String,
49 pub description: Option<String>,
50 pub backend: MergeCaptainIterationBackendSpec,
51 pub metadata: BTreeMap<String, serde_json::Value>,
52}
53
54#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
55#[serde(default)]
56pub struct MergeCaptainIterationBackendSpec {
57 pub kind: String,
58 pub path: Option<String>,
59 pub scenario: Option<String>,
60}
61
62#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
63#[serde(default)]
64pub struct MergeCaptainIterationVariant {
65 pub id: String,
66 #[serde(alias = "model-route")]
67 pub model_route: Option<String>,
68 #[serde(alias = "timeout-tier")]
69 pub timeout_tier: Option<String>,
70 #[serde(alias = "package-revision")]
71 pub package_revision: Option<String>,
72 #[serde(alias = "prompt-asset-revision")]
73 pub prompt_asset_revision: Option<String>,
74 #[serde(alias = "max-cost-usd")]
75 pub max_cost_usd: Option<f64>,
76 #[serde(alias = "max-model-calls")]
77 pub max_model_calls: Option<u64>,
78 #[serde(alias = "max-tool-calls")]
79 pub max_tool_calls: Option<u64>,
80 #[serde(alias = "max-latency-ms")]
81 pub max_latency_ms: Option<u64>,
82 #[serde(alias = "timeout-ms")]
83 pub timeout_ms: Option<u64>,
84 #[serde(alias = "max-sweeps")]
85 pub max_sweeps: Option<u32>,
86 #[serde(alias = "watch-backoff-ms")]
87 pub watch_backoff_ms: Option<u64>,
88 pub metadata: BTreeMap<String, serde_json::Value>,
89}
90
91#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
92#[serde(default)]
93pub struct MergeCaptainIterationBudget {
94 #[serde(alias = "max-cost-usd")]
95 pub max_cost_usd: Option<f64>,
96 #[serde(alias = "max-wallclock-ms")]
97 pub max_wallclock_ms: Option<u64>,
98 #[serde(alias = "max-runs")]
99 pub max_runs: Option<usize>,
100}
101
102#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
103#[serde(default)]
104pub struct MergeCaptainIterationReport {
105 #[serde(rename = "_type")]
106 pub type_name: String,
107 pub version: u32,
108 pub id: String,
109 pub name: Option<String>,
110 pub artifact_root: String,
111 pub summary_json_path: String,
112 pub summary_markdown_path: String,
113 pub pass: bool,
114 pub total: usize,
115 pub completed: usize,
116 pub skipped: usize,
117 pub budget_exhausted: bool,
118 pub budget_exhausted_reason: Option<String>,
119 pub total_cost_usd: f64,
120 pub total_latency_ms: u64,
121 pub runs: Vec<MergeCaptainIterationRunReport>,
122 pub rankings: Vec<MergeCaptainIterationRanking>,
123 pub metadata: BTreeMap<String, serde_json::Value>,
124}
125
126#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
127#[serde(default)]
128pub struct MergeCaptainIterationRunReport {
129 pub id: String,
130 pub scenario_id: String,
131 pub variant_id: String,
132 pub backend: String,
133 pub backend_source: Option<String>,
134 pub model_route: Option<String>,
135 pub timeout_tier: Option<String>,
136 pub package_revision: Option<String>,
137 pub prompt_asset_revision: Option<String>,
138 pub pass: bool,
139 pub skipped: bool,
140 pub skip_reason: Option<String>,
141 pub drift_score: u64,
142 pub degradation_reasons: Vec<String>,
143 pub transcript_path: Option<String>,
144 pub receipt_path: Option<String>,
145 pub summary_path: Option<String>,
146 pub oracle_error_findings: usize,
147 pub oracle_warn_findings: usize,
148 pub cost_usd: f64,
149 pub latency_ms: u64,
150 pub tool_calls: u64,
151 pub model_calls: u64,
152 pub event_count: u64,
153}
154
155#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
156#[serde(default)]
157pub struct MergeCaptainIterationRanking {
158 pub variant_id: String,
159 pub package_revision: Option<String>,
160 pub prompt_asset_revision: Option<String>,
161 pub scenarios_completed: usize,
162 pub scenarios_passed: usize,
163 pub skipped: usize,
164 pub drift_score: u64,
165 pub cost_usd: f64,
166 pub latency_ms: u64,
167}
168
169#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
170#[serde(default)]
171pub struct MergeCaptainIterationDiffReport {
172 #[serde(rename = "_type")]
173 pub type_name: String,
174 pub version: u32,
175 pub baseline_id: String,
176 pub candidate_id: String,
177 pub baseline_path: String,
178 pub candidate_path: String,
179 pub improved: usize,
180 pub regressed: usize,
181 pub unchanged: usize,
182 pub missing: usize,
183 pub entries: Vec<MergeCaptainIterationDiffEntry>,
184}
185
186#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
187#[serde(default)]
188pub struct MergeCaptainIterationDiffEntry {
189 pub scenario_id: String,
190 pub variant_id: String,
191 pub baseline_drift_score: Option<u64>,
192 pub candidate_drift_score: Option<u64>,
193 pub delta: Option<i64>,
194 pub status: String,
195 pub baseline_pass: Option<bool>,
196 pub candidate_pass: Option<bool>,
197 pub baseline_prompt_asset_revision: Option<String>,
198 pub candidate_prompt_asset_revision: Option<String>,
199}
200
201pub fn load_merge_captain_iteration_manifest(
202 path: &Path,
203) -> Result<MergeCaptainIterationManifest, VmError> {
204 let content = fs::read_to_string(path).map_err(|error| {
205 VmError::Runtime(format!(
206 "failed to read merge-captain iteration manifest {}: {error}",
207 path.display()
208 ))
209 })?;
210 let mut manifest: MergeCaptainIterationManifest =
211 if path.extension().and_then(|ext| ext.to_str()) == Some("json") {
212 serde_json::from_str(&content).map_err(|error| {
213 VmError::Runtime(format!(
214 "failed to parse merge-captain iteration JSON {}: {error}",
215 path.display()
216 ))
217 })?
218 } else {
219 toml::from_str(&content).map_err(|error| {
220 VmError::Runtime(format!(
221 "failed to parse merge-captain iteration TOML {}: {error}",
222 path.display()
223 ))
224 })?
225 };
226 normalize_merge_captain_iteration_manifest(&mut manifest);
227 if manifest.base_dir.is_none() {
228 manifest.base_dir = path.parent().map(|parent| parent.display().to_string());
229 }
230 Ok(manifest)
231}
232
233pub fn normalize_merge_captain_iteration_manifest(manifest: &mut MergeCaptainIterationManifest) {
234 if manifest.type_name.is_empty() {
235 manifest.type_name = MANIFEST_TYPE.to_string();
236 }
237 if manifest.version == 0 {
238 manifest.version = 1;
239 }
240 if manifest.id.trim().is_empty() {
241 manifest.id = manifest
242 .name
243 .clone()
244 .filter(|name| !name.trim().is_empty())
245 .unwrap_or_else(|| new_id("merge_captain_iteration"));
246 }
247 for (index, scenario) in manifest.scenarios.iter_mut().enumerate() {
248 if scenario.id.trim().is_empty() {
249 scenario.id = scenario
250 .backend
251 .scenario
252 .clone()
253 .or_else(|| {
254 scenario
255 .backend
256 .path
257 .as_deref()
258 .and_then(|path| Path::new(path).file_stem())
259 .and_then(|stem| stem.to_str())
260 .map(str::to_string)
261 })
262 .unwrap_or_else(|| format!("scenario_{}", index + 1));
263 }
264 if scenario.backend.kind.trim().is_empty() {
265 scenario.backend.kind = "replay".to_string();
266 }
267 }
268 if manifest.variants.is_empty() {
269 manifest.variants.push(MergeCaptainIterationVariant {
270 id: "default".to_string(),
271 ..Default::default()
272 });
273 }
274 for (index, variant) in manifest.variants.iter_mut().enumerate() {
275 if variant.id.trim().is_empty() {
276 variant.id = format!("variant_{}", index + 1);
277 }
278 }
279}
280
281pub fn run_merge_captain_iteration(
282 manifest: &MergeCaptainIterationManifest,
283) -> Result<MergeCaptainIterationReport, VmError> {
284 let mut manifest = manifest.clone();
285 normalize_merge_captain_iteration_manifest(&mut manifest);
286 if manifest.scenarios.is_empty() {
287 return Err(VmError::Runtime(format!(
288 "merge-captain iteration '{}' must declare at least one scenario",
289 manifest.id
290 )));
291 }
292
293 let base_dir = manifest.base_dir.as_deref().map(Path::new);
294 let artifact_root = resolve_artifact_root(&manifest, base_dir);
295 fs::create_dir_all(&artifact_root).map_err(|error| {
296 VmError::Runtime(format!(
297 "failed to create merge-captain iteration artifact root {}: {error}",
298 artifact_root.display()
299 ))
300 })?;
301 write_json_file(&artifact_root.join("iteration.json"), &manifest)?;
302
303 let total = manifest.scenarios.len() * manifest.variants.len();
304 let started = Instant::now();
305 let mut total_cost_usd = 0.0;
306 let mut total_latency_ms: u64 = 0;
307 let mut completed = 0;
308 let mut budget_exhausted_reason = None;
309 let mut runs = Vec::new();
310
311 for scenario in &manifest.scenarios {
312 for variant in &manifest.variants {
313 if budget_exhausted_reason.is_none() {
314 budget_exhausted_reason =
315 budget_exhausted(&manifest.budget, completed, total_cost_usd, started);
316 }
317 if let Some(reason) = &budget_exhausted_reason {
318 runs.push(skipped_run_report(
319 scenario,
320 variant,
321 &artifact_root,
322 reason.clone(),
323 ));
324 continue;
325 }
326
327 let run = run_iteration_cell(&artifact_root, base_dir, scenario, variant)?;
328 if !run.skipped {
329 completed += 1;
330 total_cost_usd += run.cost_usd;
331 total_latency_ms = total_latency_ms.saturating_add(run.latency_ms);
332 }
333 runs.push(run);
334 }
335 }
336
337 let rankings = rank_variants(&manifest.variants, &runs);
338 let skipped = runs.iter().filter(|run| run.skipped).count();
339 let best_drift = rankings.first().map(|ranking| ranking.drift_score);
340 let pass = !runs.is_empty()
341 && budget_exhausted_reason.is_none()
342 && best_drift == Some(0)
343 && rankings
344 .first()
345 .is_some_and(|ranking| ranking.scenarios_completed == manifest.scenarios.len());
346 let summary_json_path = artifact_root.join("summary.json");
347 let summary_markdown_path = artifact_root.join("summary.md");
348 let mut report = MergeCaptainIterationReport {
349 type_name: REPORT_TYPE.to_string(),
350 version: 1,
351 id: manifest.id,
352 name: manifest.name,
353 artifact_root: artifact_root.display().to_string(),
354 summary_json_path: summary_json_path.display().to_string(),
355 summary_markdown_path: summary_markdown_path.display().to_string(),
356 pass,
357 total,
358 completed,
359 skipped,
360 budget_exhausted: budget_exhausted_reason.is_some(),
361 budget_exhausted_reason,
362 total_cost_usd,
363 total_latency_ms,
364 runs,
365 rankings,
366 metadata: manifest.metadata,
367 };
368 write_json_file(&summary_json_path, &report)?;
369 let markdown = render_iteration_markdown(&report);
370 write_text_file(&summary_markdown_path, &markdown)?;
371 report.summary_json_path = summary_json_path.display().to_string();
372 report.summary_markdown_path = summary_markdown_path.display().to_string();
373 Ok(report)
374}
375
376fn run_iteration_cell(
377 artifact_root: &Path,
378 base_dir: Option<&Path>,
379 scenario: &MergeCaptainIterationScenario,
380 variant: &MergeCaptainIterationVariant,
381) -> Result<MergeCaptainIterationRunReport, VmError> {
382 let cell_dir = artifact_root
383 .join("runs")
384 .join(safe_path_segment(&scenario.id))
385 .join(safe_path_segment(&variant.id));
386 fs::create_dir_all(&cell_dir).map_err(|error| {
387 VmError::Runtime(format!(
388 "failed to create iteration run dir {}: {error}",
389 cell_dir.display()
390 ))
391 })?;
392 let backend = resolve_iteration_backend(artifact_root, base_dir, scenario, variant)?;
393 let transcript_path = cell_dir.join("event_log.jsonl");
394 let receipt_path = cell_dir.join("receipt.json");
395 let summary_path = cell_dir.join("summary.json");
396 let max_sweeps = variant.max_sweeps.unwrap_or(1).max(1);
397 let output = super::run_merge_captain_driver(MergeCaptainDriverOptions {
398 backend: backend.clone(),
399 mode: if max_sweeps > 1 {
400 MergeCaptainDriverMode::Watch
401 } else {
402 MergeCaptainDriverMode::Once
403 },
404 model_route: variant
405 .model_route
406 .clone()
407 .or_else(|| Some(variant.id.clone())),
408 timeout_tier: variant.timeout_tier.clone(),
409 transcript_out: Some(transcript_path.clone()),
410 receipt_out: Some(receipt_path.clone()),
411 run_root: cell_dir.join("driver-runs"),
412 max_sweeps,
413 watch_backoff_ms: variant.watch_backoff_ms.unwrap_or(0),
414 stream_stdout: false,
415 })?;
416 write_json_file(&summary_path, &output.summary)?;
417
418 let degradation_reasons = degradation_reasons(&output.summary, variant);
419 let drift_score = drift_score(&output.summary, °radation_reasons);
420 let pass = output.summary.pass && degradation_reasons.is_empty();
421 let report = MergeCaptainIterationRunReport {
422 id: format!("{}::{}", scenario.id, variant.id),
423 scenario_id: scenario.id.clone(),
424 variant_id: variant.id.clone(),
425 backend: backend.kind().to_string(),
426 backend_source: output.summary.backend_source.clone(),
427 model_route: output.summary.model_route.clone(),
428 timeout_tier: output.summary.timeout_tier.clone(),
429 package_revision: variant.package_revision.clone(),
430 prompt_asset_revision: variant.prompt_asset_revision.clone(),
431 pass,
432 skipped: false,
433 skip_reason: None,
434 drift_score,
435 degradation_reasons,
436 transcript_path: Some(relative_display(artifact_root, &transcript_path)),
437 receipt_path: Some(relative_display(artifact_root, &receipt_path)),
438 summary_path: Some(relative_display(artifact_root, &summary_path)),
439 oracle_error_findings: output.summary.oracle_error_findings,
440 oracle_warn_findings: output.summary.oracle_warn_findings,
441 cost_usd: output.summary.cost_usd,
442 latency_ms: output.summary.latency_ms,
443 tool_calls: output.summary.tool_calls,
444 model_calls: output.summary.model_calls,
445 event_count: output.summary.event_count,
446 };
447 write_json_file(&cell_dir.join("run-report.json"), &report)?;
448 Ok(report)
449}
450
451fn resolve_iteration_backend(
452 artifact_root: &Path,
453 base_dir: Option<&Path>,
454 scenario: &MergeCaptainIterationScenario,
455 variant: &MergeCaptainIterationVariant,
456) -> Result<MergeCaptainDriverBackend, VmError> {
457 match scenario.backend.kind.trim().to_ascii_lowercase().as_str() {
458 "replay" => {
459 let path = scenario.backend.path.as_deref().ok_or_else(|| {
460 VmError::Runtime(format!(
461 "iteration scenario '{}' replay backend requires path",
462 scenario.id
463 ))
464 })?;
465 let source = resolve_manifest_path(base_dir, path);
466 Ok(MergeCaptainDriverBackend::Replay {
467 fixture: copy_replay_fixture(artifact_root, &scenario.id, &source)?,
468 })
469 }
470 "mock" => {
471 let playground_dir = artifact_root
472 .join("playgrounds")
473 .join(safe_path_segment(&scenario.id))
474 .join(safe_path_segment(&variant.id));
475 let manifest = if let Some(name) = scenario.backend.scenario.as_deref() {
476 Some(super::playground::load_builtin(name)?)
477 } else if let Some(path) = scenario.backend.path.as_deref() {
478 let source = resolve_manifest_path(base_dir, path);
479 if super::playground::playground_marker_path(&source).exists() {
480 return Ok(MergeCaptainDriverBackend::Mock {
481 playground_dir: source,
482 });
483 }
484 super::playground::ScenarioManifest::load(&source).ok()
485 } else {
486 Some(super::playground::load_builtin(&scenario.id)?)
487 };
488 if let Some(manifest) = manifest {
489 let _ = super::playground::cleanup_playground_at(&playground_dir)?;
490 super::playground::init_playground_at(super::playground::InitOptions {
491 dir: &playground_dir,
492 manifest: &manifest,
493 allow_existing: false,
494 })?;
495 Ok(MergeCaptainDriverBackend::Mock { playground_dir })
496 } else {
497 let path = scenario.backend.path.as_deref().ok_or_else(|| {
498 VmError::Runtime(format!(
499 "iteration scenario '{}' mock backend requires path or scenario",
500 scenario.id
501 ))
502 })?;
503 Ok(MergeCaptainDriverBackend::Mock {
504 playground_dir: resolve_manifest_path(base_dir, path),
505 })
506 }
507 }
508 "live" => Ok(MergeCaptainDriverBackend::Live),
509 other => Err(VmError::Runtime(format!(
510 "unsupported merge-captain iteration backend '{other}'"
511 ))),
512 }
513}
514
515fn copy_replay_fixture(
516 artifact_root: &Path,
517 scenario_id: &str,
518 source: &Path,
519) -> Result<PathBuf, VmError> {
520 let stem = source
521 .file_stem()
522 .and_then(|stem| stem.to_str())
523 .unwrap_or("event_log");
524 let dest_dir = artifact_root
525 .join("fixtures")
526 .join(safe_path_segment(scenario_id))
527 .join("transcripts");
528 fs::create_dir_all(&dest_dir).map_err(|error| {
529 VmError::Runtime(format!(
530 "failed to create replay fixture dir {}: {error}",
531 dest_dir.display()
532 ))
533 })?;
534 let dest = dest_dir.join(format!("{stem}.jsonl"));
535 fs::copy(source, &dest).map_err(|error| {
536 VmError::Runtime(format!(
537 "failed to copy replay fixture {} to {}: {error}",
538 source.display(),
539 dest.display()
540 ))
541 })?;
542 if let Some(golden) = find_replay_golden(source)? {
543 let golden_dir = artifact_root
544 .join("fixtures")
545 .join(safe_path_segment(scenario_id))
546 .join("goldens");
547 fs::create_dir_all(&golden_dir).map_err(|error| {
548 VmError::Runtime(format!(
549 "failed to create replay golden dir {}: {error}",
550 golden_dir.display()
551 ))
552 })?;
553 let golden_dest = golden_dir.join(format!("{stem}.json"));
554 fs::copy(&golden, &golden_dest).map_err(|error| {
555 VmError::Runtime(format!(
556 "failed to copy replay golden {} to {}: {error}",
557 golden.display(),
558 golden_dest.display()
559 ))
560 })?;
561 }
562 Ok(dest)
563}
564
565fn find_replay_golden(source: &Path) -> Result<Option<PathBuf>, VmError> {
566 let Some(stem) = source.file_stem().and_then(|stem| stem.to_str()) else {
567 return Ok(None);
568 };
569 let mut candidates = Vec::new();
570 if let Some(parent) = source.parent() {
571 candidates.push(parent.join(format!("{stem}.golden.json")));
572 if parent.file_name().and_then(|name| name.to_str()) == Some("transcripts") {
573 if let Some(root) = parent.parent() {
574 candidates.push(root.join("goldens").join(format!("{stem}.json")));
575 }
576 }
577 }
578 for candidate in candidates {
579 if candidate.exists() {
580 let _ = load_merge_captain_golden(&candidate)?;
581 return Ok(Some(candidate));
582 }
583 }
584 Ok(None)
585}
586
587fn skipped_run_report(
588 scenario: &MergeCaptainIterationScenario,
589 variant: &MergeCaptainIterationVariant,
590 _artifact_root: &Path,
591 reason: String,
592) -> MergeCaptainIterationRunReport {
593 MergeCaptainIterationRunReport {
594 id: format!("{}::{}", scenario.id, variant.id),
595 scenario_id: scenario.id.clone(),
596 variant_id: variant.id.clone(),
597 model_route: variant.model_route.clone(),
598 timeout_tier: variant.timeout_tier.clone(),
599 package_revision: variant.package_revision.clone(),
600 prompt_asset_revision: variant.prompt_asset_revision.clone(),
601 skipped: true,
602 skip_reason: Some(reason),
603 drift_score: 10_000,
604 ..Default::default()
605 }
606}
607
608fn budget_exhausted(
609 budget: &MergeCaptainIterationBudget,
610 completed: usize,
611 total_cost_usd: f64,
612 started: Instant,
613) -> Option<String> {
614 if let Some(max_runs) = budget.max_runs {
615 if completed >= max_runs {
616 return Some(format!("completed run cap {max_runs} reached"));
617 }
618 }
619 if let Some(max_cost_usd) = budget.max_cost_usd {
620 if total_cost_usd > max_cost_usd {
621 return Some(format!(
622 "cost budget ${max_cost_usd:.6} reached (spent ${total_cost_usd:.6})"
623 ));
624 }
625 }
626 if let Some(max_wallclock_ms) = budget.max_wallclock_ms {
627 if started.elapsed().as_millis() >= u128::from(max_wallclock_ms) {
628 return Some(format!("wallclock budget {max_wallclock_ms}ms reached"));
629 }
630 }
631 None
632}
633
634fn degradation_reasons(
635 summary: &MergeCaptainRunSummary,
636 variant: &MergeCaptainIterationVariant,
637) -> Vec<String> {
638 let mut reasons = Vec::new();
639 if !summary.pass {
640 reasons.push(format!(
641 "oracle reported {} error finding(s) and {} warning finding(s)",
642 summary.oracle_error_findings, summary.oracle_warn_findings
643 ));
644 }
645 if let Some(max_tool_calls) = variant.max_tool_calls {
646 if summary.tool_calls > max_tool_calls {
647 reasons.push(format!(
648 "tool calls {} exceeded variant budget {}",
649 summary.tool_calls, max_tool_calls
650 ));
651 }
652 }
653 if let Some(max_model_calls) = variant.max_model_calls {
654 if summary.model_calls > max_model_calls {
655 reasons.push(format!(
656 "model calls {} exceeded variant budget {}",
657 summary.model_calls, max_model_calls
658 ));
659 }
660 }
661 if let Some(max_cost_usd) = variant.max_cost_usd {
662 if summary.cost_usd > max_cost_usd {
663 reasons.push(format!(
664 "cost ${:.6} exceeded variant budget ${:.6}",
665 summary.cost_usd, max_cost_usd
666 ));
667 }
668 }
669 if let Some(max_latency_ms) = variant.max_latency_ms.or(variant.timeout_ms) {
670 if summary.latency_ms > max_latency_ms {
671 reasons.push(format!(
672 "latency {}ms exceeded variant timeout {}ms",
673 summary.latency_ms, max_latency_ms
674 ));
675 }
676 }
677 reasons
678}
679
680fn drift_score(summary: &MergeCaptainRunSummary, degradation_reasons: &[String]) -> u64 {
681 let failed_penalty = if summary.pass { 0 } else { 1_000 };
682 failed_penalty
683 + (summary.oracle_error_findings as u64 * 100)
684 + (summary.oracle_warn_findings as u64 * 10)
685 + (degradation_reasons.len() as u64 * 25)
686}
687
688fn rank_variants(
689 variants: &[MergeCaptainIterationVariant],
690 runs: &[MergeCaptainIterationRunReport],
691) -> Vec<MergeCaptainIterationRanking> {
692 let mut rankings = Vec::new();
693 for variant in variants {
694 let matching: Vec<_> = runs
695 .iter()
696 .filter(|run| run.variant_id == variant.id)
697 .collect();
698 rankings.push(MergeCaptainIterationRanking {
699 variant_id: variant.id.clone(),
700 package_revision: variant.package_revision.clone(),
701 prompt_asset_revision: variant.prompt_asset_revision.clone(),
702 scenarios_completed: matching.iter().filter(|run| !run.skipped).count(),
703 scenarios_passed: matching.iter().filter(|run| run.pass).count(),
704 skipped: matching.iter().filter(|run| run.skipped).count(),
705 drift_score: matching.iter().map(|run| run.drift_score).sum(),
706 cost_usd: matching.iter().map(|run| run.cost_usd).sum(),
707 latency_ms: matching.iter().map(|run| run.latency_ms).sum(),
708 });
709 }
710 rankings.sort_by(|left, right| {
711 left.drift_score
712 .cmp(&right.drift_score)
713 .then_with(|| {
714 left.cost_usd
715 .partial_cmp(&right.cost_usd)
716 .unwrap_or(std::cmp::Ordering::Equal)
717 })
718 .then_with(|| left.variant_id.cmp(&right.variant_id))
719 });
720 rankings
721}
722
723pub fn load_merge_captain_iteration_report(
724 path: &Path,
725) -> Result<MergeCaptainIterationReport, VmError> {
726 let report_path = if path.is_dir() {
727 path.join("summary.json")
728 } else {
729 path.to_path_buf()
730 };
731 let bytes = fs::read(&report_path).map_err(|error| {
732 VmError::Runtime(format!(
733 "failed to read merge-captain iteration report {}: {error}",
734 report_path.display()
735 ))
736 })?;
737 serde_json::from_slice(&bytes).map_err(|error| {
738 VmError::Runtime(format!(
739 "failed to parse merge-captain iteration report {}: {error}",
740 report_path.display()
741 ))
742 })
743}
744
745pub fn diff_merge_captain_iterations(
746 baseline_path: &Path,
747 candidate_path: &Path,
748) -> Result<MergeCaptainIterationDiffReport, VmError> {
749 let baseline = load_merge_captain_iteration_report(baseline_path)?;
750 let candidate = load_merge_captain_iteration_report(candidate_path)?;
751 let mut keys = BTreeSet::new();
752 for run in &baseline.runs {
753 keys.insert((run.scenario_id.clone(), run.variant_id.clone()));
754 }
755 for run in &candidate.runs {
756 keys.insert((run.scenario_id.clone(), run.variant_id.clone()));
757 }
758
759 let mut entries = Vec::new();
760 let mut improved = 0;
761 let mut regressed = 0;
762 let mut unchanged = 0;
763 let mut missing = 0;
764 for (scenario_id, variant_id) in keys {
765 let before = baseline
766 .runs
767 .iter()
768 .find(|run| run.scenario_id == scenario_id && run.variant_id == variant_id);
769 let after = candidate
770 .runs
771 .iter()
772 .find(|run| run.scenario_id == scenario_id && run.variant_id == variant_id);
773 let delta = before
774 .zip(after)
775 .map(|(before, after)| after.drift_score as i64 - before.drift_score as i64);
776 let status = match delta {
777 Some(value) if value < 0 => {
778 improved += 1;
779 "improved"
780 }
781 Some(value) if value > 0 => {
782 regressed += 1;
783 "regressed"
784 }
785 Some(_) => {
786 unchanged += 1;
787 "unchanged"
788 }
789 None => {
790 missing += 1;
791 "missing"
792 }
793 };
794 entries.push(MergeCaptainIterationDiffEntry {
795 scenario_id,
796 variant_id,
797 baseline_drift_score: before.map(|run| run.drift_score),
798 candidate_drift_score: after.map(|run| run.drift_score),
799 delta,
800 status: status.to_string(),
801 baseline_pass: before.map(|run| run.pass),
802 candidate_pass: after.map(|run| run.pass),
803 baseline_prompt_asset_revision: before
804 .and_then(|run| run.prompt_asset_revision.clone()),
805 candidate_prompt_asset_revision: after
806 .and_then(|run| run.prompt_asset_revision.clone()),
807 });
808 }
809
810 Ok(MergeCaptainIterationDiffReport {
811 type_name: DIFF_TYPE.to_string(),
812 version: 1,
813 baseline_id: baseline.id,
814 candidate_id: candidate.id,
815 baseline_path: baseline_path.display().to_string(),
816 candidate_path: candidate_path.display().to_string(),
817 improved,
818 regressed,
819 unchanged,
820 missing,
821 entries,
822 })
823}
824
825pub fn render_iteration_markdown(report: &MergeCaptainIterationReport) -> String {
826 let mut out = String::new();
827 out.push_str(&format!(
828 "# Merge Captain iteration: {}\n\n",
829 report.name.as_deref().unwrap_or(&report.id)
830 ));
831 out.push_str(&format!(
832 "- pass: {}\n- completed: {}/{}\n- skipped: {}\n- budget_exhausted: {}\n\n",
833 report.pass, report.completed, report.total, report.skipped, report.budget_exhausted
834 ));
835 out.push_str("## Variant ranking\n\n");
836 out.push_str(
837 "| rank | variant | package | prompt assets | passed | drift | cost | latency ms |\n",
838 );
839 out.push_str("|---:|---|---|---|---:|---:|---:|---:|\n");
840 for (index, ranking) in report.rankings.iter().enumerate() {
841 out.push_str(&format!(
842 "| {} | {} | {} | {} | {}/{} | {} | {:.6} | {} |\n",
843 index + 1,
844 ranking.variant_id,
845 ranking.package_revision.as_deref().unwrap_or("-"),
846 ranking.prompt_asset_revision.as_deref().unwrap_or("-"),
847 ranking.scenarios_passed,
848 ranking.scenarios_completed,
849 ranking.drift_score,
850 ranking.cost_usd,
851 ranking.latency_ms
852 ));
853 }
854 out.push_str("\n## Scenario runs\n\n");
855 out.push_str(
856 "| scenario | variant | pass | drift | errors | warnings | tools | models | artifact |\n",
857 );
858 out.push_str("|---|---|---:|---:|---:|---:|---:|---:|---|\n");
859 for run in &report.runs {
860 out.push_str(&format!(
861 "| {} | {} | {} | {} | {} | {} | {} | {} | {} |\n",
862 run.scenario_id,
863 run.variant_id,
864 if run.skipped {
865 "skipped".to_string()
866 } else {
867 run.pass.to_string()
868 },
869 run.drift_score,
870 run.oracle_error_findings,
871 run.oracle_warn_findings,
872 run.tool_calls,
873 run.model_calls,
874 run.summary_path.as_deref().unwrap_or("-")
875 ));
876 }
877 out
878}
879
880pub fn render_iteration_diff_markdown(report: &MergeCaptainIterationDiffReport) -> String {
881 let mut out = String::new();
882 out.push_str(&format!(
883 "# Merge Captain iteration diff: {} -> {}\n\n",
884 report.baseline_id, report.candidate_id
885 ));
886 out.push_str(&format!(
887 "- improved: {}\n- regressed: {}\n- unchanged: {}\n- missing: {}\n\n",
888 report.improved, report.regressed, report.unchanged, report.missing
889 ));
890 out.push_str(
891 "| scenario | variant | baseline | candidate | delta | status | prompt assets |\n",
892 );
893 out.push_str("|---|---|---:|---:|---:|---|---|\n");
894 for entry in &report.entries {
895 out.push_str(&format!(
896 "| {} | {} | {} | {} | {} | {} | {} -> {} |\n",
897 entry.scenario_id,
898 entry.variant_id,
899 optional_u64(entry.baseline_drift_score),
900 optional_u64(entry.candidate_drift_score),
901 entry
902 .delta
903 .map(|delta| delta.to_string())
904 .unwrap_or_else(|| "-".to_string()),
905 entry.status,
906 entry
907 .baseline_prompt_asset_revision
908 .as_deref()
909 .unwrap_or("-"),
910 entry
911 .candidate_prompt_asset_revision
912 .as_deref()
913 .unwrap_or("-")
914 ));
915 }
916 out
917}
918
/// Formats an optional number for a table cell: the decimal value when
/// present, `-` when absent.
fn optional_u64(value: Option<u64>) -> String {
    value.map_or_else(|| "-".to_string(), |number| number.to_string())
}
924
925fn resolve_artifact_root(
926 manifest: &MergeCaptainIterationManifest,
927 base_dir: Option<&Path>,
928) -> PathBuf {
929 let root = manifest
930 .artifact_root
931 .clone()
932 .unwrap_or_else(|| format!(".harn-runs/merge-captain-iterations/{}", manifest.id));
933 let resolved = resolve_manifest_path(base_dir, &root);
934 if resolved.is_absolute() {
935 resolved
936 } else {
937 let relative = resolved.strip_prefix(".").unwrap_or(&resolved);
938 std::env::current_dir()
939 .unwrap_or_else(|_| PathBuf::from("."))
940 .join(relative)
941 }
942}
943
/// Resolves a path taken from a manifest.
///
/// Absolute paths are returned unchanged. Relative paths are joined onto
/// `base_dir` when one is given and returned as-is otherwise.
fn resolve_manifest_path(base_dir: Option<&Path>, path: &str) -> PathBuf {
    let candidate = PathBuf::from(path);
    match base_dir {
        Some(base) if !candidate.is_absolute() => base.join(candidate),
        _ => candidate,
    }
}
954
/// Formats `path` for a report: relative to `root` when `path` lies inside
/// it, otherwise in full.
fn relative_display(root: &Path, path: &Path) -> String {
    let shown = path.strip_prefix(root).unwrap_or(path);
    shown.display().to_string()
}
960
/// Turns an arbitrary identifier into a single safe path component.
///
/// ASCII letters, ASCII digits, `-` and `_` are kept; every other character
/// (including `/`, `.` and non-ASCII characters) becomes `_`. An empty input
/// yields `unnamed`.
fn safe_path_segment(value: &str) -> String {
    let sanitized: String = value
        .chars()
        .map(|ch| {
            if ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_') {
                ch
            } else {
                '_'
            }
        })
        .collect();
    if sanitized.is_empty() {
        "unnamed".to_string()
    } else {
        sanitized
    }
}
976
977fn write_json_file<T: Serialize>(path: &Path, value: &T) -> Result<(), VmError> {
978 let mut bytes = serde_json::to_vec_pretty(value)
979 .map_err(|error| VmError::Runtime(format!("failed to serialize JSON artifact: {error}")))?;
980 bytes.push(b'\n');
981 write_bytes_file(path, &bytes)
982}
983
984fn write_text_file(path: &Path, value: &str) -> Result<(), VmError> {
985 write_bytes_file(path, value.as_bytes())
986}
987
988fn write_bytes_file(path: &Path, bytes: &[u8]) -> Result<(), VmError> {
989 if let Some(parent) = path.parent() {
990 fs::create_dir_all(parent).map_err(|error| {
991 VmError::Runtime(format!(
992 "failed to create artifact directory {}: {error}",
993 parent.display()
994 ))
995 })?;
996 }
997 fs::write(path, bytes).map_err(|error| {
998 VmError::Runtime(format!(
999 "failed to write artifact {}: {error}",
1000 path.display()
1001 ))
1002 })
1003}
1004
#[cfg(test)]
mod tests {
    use super::*;
    use crate::orchestration::load_transcript_jsonl;

    /// Replay transcript used by the replay-backed scenarios in these tests.
    const GREEN_PR_TRANSCRIPT: &str =
        "examples/personas/merge_captain/transcripts/green_pr.jsonl";

    /// Directory two levels above this crate's `CARGO_MANIFEST_DIR`.
    fn repo_root() -> PathBuf {
        Path::new(env!("CARGO_MANIFEST_DIR"))
            .ancestors()
            .nth(2)
            .unwrap()
            .to_path_buf()
    }

    /// Builds a scenario whose backend has the given `kind` and `path`; all
    /// other fields keep their defaults.
    fn scenario(id: &str, kind: &str, path: &str) -> MergeCaptainIterationScenario {
        MergeCaptainIterationScenario {
            id: id.to_string(),
            backend: MergeCaptainIterationBackendSpec {
                kind: kind.to_string(),
                path: Some(path.to_string()),
                ..Default::default()
            },
            ..Default::default()
        }
    }

    /// Builds a variant that only sets its `id`.
    fn bare_variant(id: &str) -> MergeCaptainIterationVariant {
        MergeCaptainIterationVariant {
            id: id.to_string(),
            ..Default::default()
        }
    }

    /// Builds a manifest based at the repository root whose artifacts are
    /// written to `<artifact_parent>/iteration`.
    fn manifest_under(
        artifact_parent: &Path,
        id: &str,
        scenarios: Vec<MergeCaptainIterationScenario>,
        variants: Vec<MergeCaptainIterationVariant>,
    ) -> MergeCaptainIterationManifest {
        MergeCaptainIterationManifest {
            id: id.to_string(),
            base_dir: Some(repo_root().display().to_string()),
            artifact_root: Some(artifact_parent.join("iteration").display().to_string()),
            scenarios,
            variants,
            ..Default::default()
        }
    }

    #[test]
    fn iteration_runs_matrix_and_ranks_by_drift() {
        let temp = tempfile::tempdir().unwrap();
        let variants = vec![
            MergeCaptainIterationVariant {
                prompt_asset_revision: Some("prompt/v1".to_string()),
                max_tool_calls: Some(1),
                ..bare_variant("prompt-v1")
            },
            MergeCaptainIterationVariant {
                prompt_asset_revision: Some("prompt/v2".to_string()),
                max_tool_calls: Some(4),
                ..bare_variant("prompt-v2")
            },
        ];
        let manifest = manifest_under(
            temp.path(),
            "issue-1021-smoke",
            vec![scenario("green-pr", "replay", GREEN_PR_TRANSCRIPT)],
            variants,
        );

        let report = run_merge_captain_iteration(&manifest).unwrap();

        assert!(report.pass);
        assert_eq!(report.completed, 2);
        // The top-ranked entry is expected to be the drift-free variant.
        let best = &report.rankings[0];
        assert_eq!(best.variant_id, "prompt-v2");
        assert_eq!(best.drift_score, 0);
        assert!(Path::new(&report.summary_markdown_path).exists());
        let fixture_copy = Path::new(&report.artifact_root)
            .join("fixtures/green-pr/transcripts/green_pr.jsonl");
        assert!(fixture_copy.exists());
    }

    #[test]
    fn iteration_budget_cap_skips_remaining_runs() {
        let temp = tempfile::tempdir().unwrap();
        let mut manifest = manifest_under(
            temp.path(),
            "issue-1021-budget",
            vec![scenario("green-pr", "replay", GREEN_PR_TRANSCRIPT)],
            vec![bare_variant("one"), bare_variant("two")],
        );
        // Allow a single run so that the second variant must be skipped.
        manifest.budget = MergeCaptainIterationBudget {
            max_runs: Some(1),
            ..Default::default()
        };

        let report = run_merge_captain_iteration(&manifest).unwrap();

        assert!(report.budget_exhausted);
        assert_eq!(report.completed, 1);
        assert_eq!(report.skipped, 1);
        assert!(report.runs[1].skipped);
    }

    #[test]
    fn diff_marks_prompt_candidate_improvement() {
        let temp = tempfile::tempdir().unwrap();
        let baseline_path = temp.path().join("baseline.json");
        let candidate_path = temp.path().join("candidate.json");

        let baseline = MergeCaptainIterationReport {
            type_name: REPORT_TYPE.to_string(),
            version: 1,
            id: "baseline".to_string(),
            runs: vec![MergeCaptainIterationRunReport {
                scenario_id: "green-pr".to_string(),
                variant_id: "value-route".to_string(),
                drift_score: 25,
                prompt_asset_revision: Some("prompt/v1".to_string()),
                ..Default::default()
            }],
            ..Default::default()
        };

        // The candidate is the baseline with a new prompt revision and no drift.
        let mut candidate = baseline.clone();
        candidate.id = "candidate".to_string();
        let candidate_run = &mut candidate.runs[0];
        candidate_run.drift_score = 0;
        candidate_run.prompt_asset_revision = Some("prompt/v2".to_string());

        write_json_file(&baseline_path, &baseline).unwrap();
        write_json_file(&candidate_path, &candidate).unwrap();

        let diff = diff_merge_captain_iterations(&baseline_path, &candidate_path).unwrap();

        assert_eq!(diff.improved, 1);
        let entry = &diff.entries[0];
        assert_eq!(entry.delta, Some(-25));
        assert_eq!(entry.status, "improved");
    }

    #[test]
    fn mock_scenario_manifest_materializes_playground() {
        let temp = tempfile::tempdir().unwrap();
        let manifest = manifest_under(
            temp.path(),
            "issue-1021-mock",
            vec![scenario(
                "single-green",
                "mock",
                "examples/merge_captain/scenarios/single_green.json",
            )],
            vec![bare_variant("smoke")],
        );

        let report = run_merge_captain_iteration(&manifest).unwrap();

        assert_eq!(report.completed, 1);
        let artifact_root = Path::new(&report.artifact_root);
        assert!(artifact_root
            .join("playgrounds/single-green/smoke/playground.json")
            .exists());
        // The run's transcript path is joined onto the artifact root to load it.
        let transcript = artifact_root.join(report.runs[0].transcript_path.as_ref().unwrap());
        let loaded = load_transcript_jsonl(&transcript).unwrap();
        assert!(!loaded.events.is_empty());
    }
}