1use std::collections::BTreeMap;
10use std::fs;
11use std::path::{Path, PathBuf};
12use std::time::Instant;
13
14use serde::{Deserialize, Serialize};
15
16use crate::value::VmError;
17
18use super::{
19 load_merge_captain_golden, new_id, MergeCaptainDriverBackend, MergeCaptainDriverMode,
20 MergeCaptainDriverOptions, MergeCaptainRunSummary,
21};
22
/// `_type` tag assigned to a manifest during normalization when it declares none.
const MANIFEST_TYPE: &str = "merge_captain_iteration_manifest";
/// `_type` tag written into every iteration report.
const REPORT_TYPE: &str = "merge_captain_iteration_report";
/// `_type` tag written into every iteration diff report.
const DIFF_TYPE: &str = "merge_captain_iteration_diff";
26
/// Declarative description of an iteration: the scenarios to run, the variants
/// to try against each scenario, and an optional overall budget.
///
/// Every field may be omitted on disk (`#[serde(default)]`); missing identity
/// fields are filled in by [`normalize_merge_captain_iteration_manifest`].
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct MergeCaptainIterationManifest {
    /// Serialized as `_type`; normalization sets the manifest tag when empty.
    #[serde(rename = "_type")]
    pub type_name: String,
    /// Schema version; normalization turns `0` into `1`.
    pub version: u32,
    /// Iteration identifier; when blank, normalization uses a non-blank `name`
    /// or else a generated id.
    pub id: String,
    /// Optional display name, copied into the report.
    pub name: Option<String>,
    /// Free-form description; not interpreted by the code visible in this file.
    pub description: Option<String>,
    /// Directory that relative manifest paths are resolved against. The loader
    /// defaults it to the manifest file's parent directory.
    pub base_dir: Option<String>,
    /// Where artifacts are written (also accepted as `artifact-root`). Defaults
    /// to `.harn-runs/merge-captain-iterations/<id>` when unset.
    #[serde(alias = "artifact-root")]
    pub artifact_root: Option<String>,
    /// Scenarios to execute; running an iteration requires at least one.
    pub scenarios: Vec<MergeCaptainIterationScenario>,
    /// Variants run against every scenario; normalization inserts a single
    /// `default` variant when the list is empty.
    pub variants: Vec<MergeCaptainIterationVariant>,
    /// Limits that stop the iteration early (remaining cells are skipped).
    pub budget: MergeCaptainIterationBudget,
    /// Arbitrary extra data, copied unchanged into the report.
    pub metadata: BTreeMap<String, serde_json::Value>,
}
44
/// One scenario (one row of the scenario × variant matrix).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct MergeCaptainIterationScenario {
    /// Scenario identifier; when blank, normalization derives it from the
    /// backend scenario name, the backend path's file stem, or the position.
    pub id: String,
    /// Free-form description; not interpreted by the code visible in this file.
    pub description: Option<String>,
    /// Which backend drives this scenario and where its inputs live.
    pub backend: MergeCaptainIterationBackendSpec,
    /// Arbitrary extra data; not interpreted by the code visible in this file.
    pub metadata: BTreeMap<String, serde_json::Value>,
}
53
/// Backend selection for a scenario.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct MergeCaptainIterationBackendSpec {
    /// Backend kind, matched trimmed and ASCII-case-insensitively against
    /// `replay`, `mock`, and `live`. Normalization sets `replay` when blank.
    pub kind: String,
    /// Replay: path of the transcript fixture (required). Mock: path of an
    /// existing playground directory or of a scenario manifest.
    pub path: Option<String>,
    /// Mock: name of a built-in playground scenario; takes precedence over `path`.
    pub scenario: Option<String>,
}
61
/// One variant (one column of the scenario × variant matrix).
///
/// The `max_*` limits do not stop a run; exceeding one adds a degradation
/// reason to the run report. Each multi-word field also accepts its
/// kebab-case spelling as an alias.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct MergeCaptainIterationVariant {
    /// Variant identifier; normalization assigns `variant_<n>` when blank.
    pub id: String,
    /// Model route handed to the driver; the variant id is used when unset.
    #[serde(alias = "model-route")]
    pub model_route: Option<String>,
    /// Timeout tier handed to the driver.
    #[serde(alias = "timeout-tier")]
    pub timeout_tier: Option<String>,
    /// Package revision label, recorded in run reports and rankings.
    #[serde(alias = "package-revision")]
    pub package_revision: Option<String>,
    /// Prompt-asset revision label, recorded in run reports, rankings and diffs.
    #[serde(alias = "prompt-asset-revision")]
    pub prompt_asset_revision: Option<String>,
    /// Per-run cost ceiling in USD.
    #[serde(alias = "max-cost-usd")]
    pub max_cost_usd: Option<f64>,
    /// Per-run ceiling on model calls.
    #[serde(alias = "max-model-calls")]
    pub max_model_calls: Option<u64>,
    /// Per-run ceiling on tool calls.
    #[serde(alias = "max-tool-calls")]
    pub max_tool_calls: Option<u64>,
    /// Per-run latency ceiling in milliseconds.
    #[serde(alias = "max-latency-ms")]
    pub max_latency_ms: Option<u64>,
    /// Used as the latency ceiling when `max_latency_ms` is unset.
    #[serde(alias = "timeout-ms")]
    pub timeout_ms: Option<u64>,
    /// Sweep count handed to the driver (at least 1); values above 1 select
    /// the driver's watch mode.
    #[serde(alias = "max-sweeps")]
    pub max_sweeps: Option<u32>,
    /// Watch backoff handed to the driver; `0` when unset.
    #[serde(alias = "watch-backoff-ms")]
    pub watch_backoff_ms: Option<u64>,
    /// Arbitrary extra data; not interpreted by the code visible in this file.
    pub metadata: BTreeMap<String, serde_json::Value>,
}
90
/// Iteration-wide limits, checked before each cell is started. Once a limit
/// trips, every remaining cell is reported as skipped.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct MergeCaptainIterationBudget {
    /// Trips when the accumulated cost is strictly greater than this value.
    #[serde(alias = "max-cost-usd")]
    pub max_cost_usd: Option<f64>,
    /// Trips when the elapsed wall-clock time reaches this many milliseconds.
    #[serde(alias = "max-wallclock-ms")]
    pub max_wallclock_ms: Option<u64>,
    /// Trips when this many runs have completed.
    #[serde(alias = "max-runs")]
    pub max_runs: Option<usize>,
}
101
/// Result of running a whole iteration; written to `summary.json` under the
/// artifact root.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct MergeCaptainIterationReport {
    /// Serialized as `_type`.
    #[serde(rename = "_type")]
    pub type_name: String,
    /// Report schema version.
    pub version: u32,
    /// Identifier of the (normalized) manifest.
    pub id: String,
    /// Display name of the manifest, if any.
    pub name: Option<String>,
    /// Resolved artifact root directory.
    pub artifact_root: String,
    /// Path of the JSON summary inside the artifact root.
    pub summary_json_path: String,
    /// Path of the Markdown summary inside the artifact root.
    pub summary_markdown_path: String,
    /// True when the budget was not exhausted and the top-ranked variant
    /// completed every scenario with a total drift score of zero.
    pub pass: bool,
    /// Number of cells in the matrix (scenarios × variants).
    pub total: usize,
    /// Number of cells that ran and were not skipped.
    pub completed: usize,
    /// Number of cells reported as skipped.
    pub skipped: usize,
    /// Whether a budget limit tripped during the iteration.
    pub budget_exhausted: bool,
    /// Description of the limit that tripped, if any.
    pub budget_exhausted_reason: Option<String>,
    /// Sum of `cost_usd` over completed runs.
    pub total_cost_usd: f64,
    /// Saturating sum of `latency_ms` over completed runs.
    pub total_latency_ms: u64,
    /// One entry per cell, scenarios outer, variants inner.
    pub runs: Vec<MergeCaptainIterationRunReport>,
    /// Variants ordered best first (lowest drift, then cost, then id).
    pub rankings: Vec<MergeCaptainIterationRanking>,
    /// Metadata copied from the manifest.
    pub metadata: BTreeMap<String, serde_json::Value>,
}
125
/// Outcome of a single scenario × variant cell.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct MergeCaptainIterationRunReport {
    /// `<scenario id>::<variant id>`.
    pub id: String,
    /// Scenario this run belongs to.
    pub scenario_id: String,
    /// Variant this run belongs to.
    pub variant_id: String,
    /// Backend kind as reported by the resolved driver backend.
    pub backend: String,
    /// Backend source taken from the driver's run summary.
    pub backend_source: Option<String>,
    /// Model route (from the run summary, or from the variant for skipped runs).
    pub model_route: Option<String>,
    /// Timeout tier (from the run summary, or from the variant for skipped runs).
    pub timeout_tier: Option<String>,
    /// Package revision label copied from the variant.
    pub package_revision: Option<String>,
    /// Prompt-asset revision label copied from the variant.
    pub prompt_asset_revision: Option<String>,
    /// True when the driver summary passed and no degradation reason was recorded.
    pub pass: bool,
    /// True when the cell was not executed because the budget was exhausted.
    pub skipped: bool,
    /// Why the cell was skipped, when it was.
    pub skip_reason: Option<String>,
    /// Penalty score; lower is better, skipped runs get a fixed 10 000.
    pub drift_score: u64,
    /// Human-readable reasons this run counts as degraded.
    pub degradation_reasons: Vec<String>,
    /// Transcript path, relative to the artifact root when it lies inside it.
    pub transcript_path: Option<String>,
    /// Receipt path, relative to the artifact root when it lies inside it.
    pub receipt_path: Option<String>,
    /// Per-run summary path, relative to the artifact root when it lies inside it.
    pub summary_path: Option<String>,
    /// Oracle error findings reported by the run summary.
    pub oracle_error_findings: usize,
    /// Oracle warning findings reported by the run summary.
    pub oracle_warn_findings: usize,
    /// Cost in USD reported by the run summary.
    pub cost_usd: f64,
    /// Latency in milliseconds reported by the run summary.
    pub latency_ms: u64,
    /// Tool calls reported by the run summary.
    pub tool_calls: u64,
    /// Model calls reported by the run summary.
    pub model_calls: u64,
    /// Event count reported by the run summary.
    pub event_count: u64,
}
154
/// Per-variant aggregate over all of that variant's runs.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct MergeCaptainIterationRanking {
    /// Variant being ranked.
    pub variant_id: String,
    /// Package revision label copied from the variant.
    pub package_revision: Option<String>,
    /// Prompt-asset revision label copied from the variant.
    pub prompt_asset_revision: Option<String>,
    /// Number of this variant's runs that were not skipped.
    pub scenarios_completed: usize,
    /// Number of this variant's runs that passed.
    pub scenarios_passed: usize,
    /// Number of this variant's runs that were skipped.
    pub skipped: usize,
    /// Total drift score across this variant's runs (skipped runs included).
    pub drift_score: u64,
    /// Total cost in USD across this variant's runs.
    pub cost_usd: f64,
    /// Total latency in milliseconds across this variant's runs.
    pub latency_ms: u64,
}
168
/// Comparison of two iteration reports, cell by cell.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct MergeCaptainIterationDiffReport {
    /// Serialized as `_type`.
    #[serde(rename = "_type")]
    pub type_name: String,
    /// Diff schema version.
    pub version: u32,
    /// `id` of the baseline report.
    pub baseline_id: String,
    /// `id` of the candidate report.
    pub candidate_id: String,
    /// Path the baseline was loaded from, as given by the caller.
    pub baseline_path: String,
    /// Path the candidate was loaded from, as given by the caller.
    pub candidate_path: String,
    /// Cells whose drift score went down.
    pub improved: usize,
    /// Cells whose drift score went up.
    pub regressed: usize,
    /// Cells whose drift score did not change.
    pub unchanged: usize,
    /// Cells present in only one of the two reports.
    pub missing: usize,
    /// One entry per (scenario, variant) key, in sorted key order.
    pub entries: Vec<MergeCaptainIterationDiffEntry>,
}
185
/// Comparison of one (scenario, variant) cell between baseline and candidate.
/// Fields prefixed `baseline_`/`candidate_` are `None` when that side has no
/// run for the cell.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct MergeCaptainIterationDiffEntry {
    /// Scenario part of the cell key.
    pub scenario_id: String,
    /// Variant part of the cell key.
    pub variant_id: String,
    /// Drift score of the baseline run.
    pub baseline_drift_score: Option<u64>,
    /// Drift score of the candidate run.
    pub candidate_drift_score: Option<u64>,
    /// Candidate drift minus baseline drift; `None` when either side is missing.
    pub delta: Option<i64>,
    /// One of `improved`, `regressed`, `unchanged`, `missing`.
    pub status: String,
    /// Pass flag of the baseline run.
    pub baseline_pass: Option<bool>,
    /// Pass flag of the candidate run.
    pub candidate_pass: Option<bool>,
    /// Prompt-asset revision of the baseline run.
    pub baseline_prompt_asset_revision: Option<String>,
    /// Prompt-asset revision of the candidate run.
    pub candidate_prompt_asset_revision: Option<String>,
}
200
201pub fn load_merge_captain_iteration_manifest(
202 path: &Path,
203) -> Result<MergeCaptainIterationManifest, VmError> {
204 let content = fs::read_to_string(path).map_err(|error| {
205 VmError::Runtime(format!(
206 "failed to read merge-captain iteration manifest {}: {error}",
207 path.display()
208 ))
209 })?;
210 let mut manifest: MergeCaptainIterationManifest =
211 if path.extension().and_then(|ext| ext.to_str()) == Some("json") {
212 serde_json::from_str(&content).map_err(|error| {
213 VmError::Runtime(format!(
214 "failed to parse merge-captain iteration JSON {}: {error}",
215 path.display()
216 ))
217 })?
218 } else {
219 toml::from_str(&content).map_err(|error| {
220 VmError::Runtime(format!(
221 "failed to parse merge-captain iteration TOML {}: {error}",
222 path.display()
223 ))
224 })?
225 };
226 normalize_merge_captain_iteration_manifest(&mut manifest);
227 if manifest.base_dir.is_none() {
228 manifest.base_dir = path.parent().map(|parent| parent.display().to_string());
229 }
230 Ok(manifest)
231}
232
233pub fn normalize_merge_captain_iteration_manifest(manifest: &mut MergeCaptainIterationManifest) {
234 if manifest.type_name.is_empty() {
235 manifest.type_name = MANIFEST_TYPE.to_string();
236 }
237 if manifest.version == 0 {
238 manifest.version = 1;
239 }
240 if manifest.id.trim().is_empty() {
241 manifest.id = manifest
242 .name
243 .clone()
244 .filter(|name| !name.trim().is_empty())
245 .unwrap_or_else(|| new_id("merge_captain_iteration"));
246 }
247 for (index, scenario) in manifest.scenarios.iter_mut().enumerate() {
248 if scenario.id.trim().is_empty() {
249 scenario.id = scenario
250 .backend
251 .scenario
252 .clone()
253 .or_else(|| {
254 scenario
255 .backend
256 .path
257 .as_deref()
258 .and_then(|path| Path::new(path).file_stem())
259 .and_then(|stem| stem.to_str())
260 .map(str::to_string)
261 })
262 .unwrap_or_else(|| format!("scenario_{}", index + 1));
263 }
264 if scenario.backend.kind.trim().is_empty() {
265 scenario.backend.kind = "replay".to_string();
266 }
267 }
268 if manifest.variants.is_empty() {
269 manifest.variants.push(MergeCaptainIterationVariant {
270 id: "default".to_string(),
271 ..Default::default()
272 });
273 }
274 for (index, variant) in manifest.variants.iter_mut().enumerate() {
275 if variant.id.trim().is_empty() {
276 variant.id = format!("variant_{}", index + 1);
277 }
278 }
279}
280
281pub fn run_merge_captain_iteration(
282 manifest: &MergeCaptainIterationManifest,
283) -> Result<MergeCaptainIterationReport, VmError> {
284 let mut manifest = manifest.clone();
285 normalize_merge_captain_iteration_manifest(&mut manifest);
286 if manifest.scenarios.is_empty() {
287 return Err(VmError::Runtime(format!(
288 "merge-captain iteration '{}' must declare at least one scenario",
289 manifest.id
290 )));
291 }
292
293 let base_dir = manifest.base_dir.as_deref().map(Path::new);
294 let artifact_root = resolve_artifact_root(&manifest, base_dir);
295 fs::create_dir_all(&artifact_root).map_err(|error| {
296 VmError::Runtime(format!(
297 "failed to create merge-captain iteration artifact root {}: {error}",
298 artifact_root.display()
299 ))
300 })?;
301 write_json_file(&artifact_root.join("iteration.json"), &manifest)?;
302
303 let total = manifest.scenarios.len() * manifest.variants.len();
304 let started = Instant::now();
305 let mut total_cost_usd = 0.0;
306 let mut total_latency_ms: u64 = 0;
307 let mut completed = 0;
308 let mut budget_exhausted_reason = None;
309 let mut runs = Vec::new();
310
311 for scenario in &manifest.scenarios {
312 for variant in &manifest.variants {
313 if budget_exhausted_reason.is_none() {
314 budget_exhausted_reason =
315 budget_exhausted(&manifest.budget, completed, total_cost_usd, started);
316 }
317 if let Some(reason) = &budget_exhausted_reason {
318 runs.push(skipped_run_report(
319 scenario,
320 variant,
321 &artifact_root,
322 reason.clone(),
323 ));
324 continue;
325 }
326
327 let run = run_iteration_cell(&artifact_root, base_dir, scenario, variant)?;
328 if !run.skipped {
329 completed += 1;
330 total_cost_usd += run.cost_usd;
331 total_latency_ms = total_latency_ms.saturating_add(run.latency_ms);
332 }
333 runs.push(run);
334 }
335 }
336
337 let rankings = rank_variants(&manifest.variants, &runs);
338 let skipped = runs.iter().filter(|run| run.skipped).count();
339 let best_drift = rankings.first().map(|ranking| ranking.drift_score);
340 let pass = !runs.is_empty()
341 && budget_exhausted_reason.is_none()
342 && best_drift == Some(0)
343 && rankings
344 .first()
345 .is_some_and(|ranking| ranking.scenarios_completed == manifest.scenarios.len());
346 let summary_json_path = artifact_root.join("summary.json");
347 let summary_markdown_path = artifact_root.join("summary.md");
348 let mut report = MergeCaptainIterationReport {
349 type_name: REPORT_TYPE.to_string(),
350 version: 1,
351 id: manifest.id,
352 name: manifest.name,
353 artifact_root: artifact_root.display().to_string(),
354 summary_json_path: summary_json_path.display().to_string(),
355 summary_markdown_path: summary_markdown_path.display().to_string(),
356 pass,
357 total,
358 completed,
359 skipped,
360 budget_exhausted: budget_exhausted_reason.is_some(),
361 budget_exhausted_reason,
362 total_cost_usd,
363 total_latency_ms,
364 runs,
365 rankings,
366 metadata: manifest.metadata,
367 };
368 write_json_file(&summary_json_path, &report)?;
369 let markdown = render_iteration_markdown(&report);
370 write_text_file(&summary_markdown_path, &markdown)?;
371 report.summary_json_path = summary_json_path.display().to_string();
372 report.summary_markdown_path = summary_markdown_path.display().to_string();
373 Ok(report)
374}
375
376fn run_iteration_cell(
377 artifact_root: &Path,
378 base_dir: Option<&Path>,
379 scenario: &MergeCaptainIterationScenario,
380 variant: &MergeCaptainIterationVariant,
381) -> Result<MergeCaptainIterationRunReport, VmError> {
382 let cell_dir = artifact_root
383 .join("runs")
384 .join(safe_path_segment(&scenario.id))
385 .join(safe_path_segment(&variant.id));
386 fs::create_dir_all(&cell_dir).map_err(|error| {
387 VmError::Runtime(format!(
388 "failed to create iteration run dir {}: {error}",
389 cell_dir.display()
390 ))
391 })?;
392 let backend = resolve_iteration_backend(artifact_root, base_dir, scenario, variant)?;
393 let transcript_path = cell_dir.join("event_log.jsonl");
394 let receipt_path = cell_dir.join("receipt.json");
395 let summary_path = cell_dir.join("summary.json");
396 let max_sweeps = variant.max_sweeps.unwrap_or(1).max(1);
397 let output = super::run_merge_captain_driver(MergeCaptainDriverOptions {
398 backend: backend.clone(),
399 mode: if max_sweeps > 1 {
400 MergeCaptainDriverMode::Watch
401 } else {
402 MergeCaptainDriverMode::Once
403 },
404 model_route: variant
405 .model_route
406 .clone()
407 .or_else(|| Some(variant.id.clone())),
408 timeout_tier: variant.timeout_tier.clone(),
409 transcript_out: Some(transcript_path.clone()),
410 receipt_out: Some(receipt_path.clone()),
411 run_root: cell_dir.join("driver-runs"),
412 max_sweeps,
413 watch_backoff_ms: variant.watch_backoff_ms.unwrap_or(0),
414 stream_stdout: false,
415 })?;
416 write_json_file(&summary_path, &output.summary)?;
417
418 let degradation_reasons = degradation_reasons(&output.summary, variant);
419 let drift_score = drift_score(&output.summary, °radation_reasons);
420 let pass = output.summary.pass && degradation_reasons.is_empty();
421 let report = MergeCaptainIterationRunReport {
422 id: format!("{}::{}", scenario.id, variant.id),
423 scenario_id: scenario.id.clone(),
424 variant_id: variant.id.clone(),
425 backend: backend.kind().to_string(),
426 backend_source: output.summary.backend_source.clone(),
427 model_route: output.summary.model_route.clone(),
428 timeout_tier: output.summary.timeout_tier.clone(),
429 package_revision: variant.package_revision.clone(),
430 prompt_asset_revision: variant.prompt_asset_revision.clone(),
431 pass,
432 skipped: false,
433 skip_reason: None,
434 drift_score,
435 degradation_reasons,
436 transcript_path: Some(relative_display(artifact_root, &transcript_path)),
437 receipt_path: Some(relative_display(artifact_root, &receipt_path)),
438 summary_path: Some(relative_display(artifact_root, &summary_path)),
439 oracle_error_findings: output.summary.oracle_error_findings,
440 oracle_warn_findings: output.summary.oracle_warn_findings,
441 cost_usd: output.summary.cost_usd,
442 latency_ms: output.summary.latency_ms,
443 tool_calls: output.summary.tool_calls,
444 model_calls: output.summary.model_calls,
445 event_count: output.summary.event_count,
446 };
447 write_json_file(&cell_dir.join("run-report.json"), &report)?;
448 Ok(report)
449}
450
451fn resolve_iteration_backend(
452 artifact_root: &Path,
453 base_dir: Option<&Path>,
454 scenario: &MergeCaptainIterationScenario,
455 variant: &MergeCaptainIterationVariant,
456) -> Result<MergeCaptainDriverBackend, VmError> {
457 match scenario.backend.kind.trim().to_ascii_lowercase().as_str() {
458 "replay" => {
459 let path = scenario.backend.path.as_deref().ok_or_else(|| {
460 VmError::Runtime(format!(
461 "iteration scenario '{}' replay backend requires path",
462 scenario.id
463 ))
464 })?;
465 let source = resolve_manifest_path(base_dir, path);
466 Ok(MergeCaptainDriverBackend::Replay {
467 fixture: copy_replay_fixture(artifact_root, &scenario.id, &source)?,
468 })
469 }
470 "mock" => {
471 let playground_dir = artifact_root
472 .join("playgrounds")
473 .join(safe_path_segment(&scenario.id))
474 .join(safe_path_segment(&variant.id));
475 let manifest = if let Some(name) = scenario.backend.scenario.as_deref() {
476 Some(super::playground::load_builtin(name)?)
477 } else if let Some(path) = scenario.backend.path.as_deref() {
478 let source = resolve_manifest_path(base_dir, path);
479 if super::playground::playground_marker_path(&source).exists() {
480 return Ok(MergeCaptainDriverBackend::Mock {
481 playground_dir: source,
482 });
483 }
484 super::playground::ScenarioManifest::load(&source).ok()
485 } else {
486 Some(super::playground::load_builtin(&scenario.id)?)
487 };
488 if let Some(manifest) = manifest {
489 let _ = super::playground::cleanup_playground_at(&playground_dir)?;
490 super::playground::init_playground_at(super::playground::InitOptions {
491 dir: &playground_dir,
492 manifest: &manifest,
493 allow_existing: false,
494 })?;
495 Ok(MergeCaptainDriverBackend::Mock { playground_dir })
496 } else {
497 let path = scenario.backend.path.as_deref().ok_or_else(|| {
498 VmError::Runtime(format!(
499 "iteration scenario '{}' mock backend requires path or scenario",
500 scenario.id
501 ))
502 })?;
503 Ok(MergeCaptainDriverBackend::Mock {
504 playground_dir: resolve_manifest_path(base_dir, path),
505 })
506 }
507 }
508 "live" => Ok(MergeCaptainDriverBackend::Live),
509 other => Err(VmError::Runtime(format!(
510 "unsupported merge-captain iteration backend '{}'",
511 other
512 ))),
513 }
514}
515
516fn copy_replay_fixture(
517 artifact_root: &Path,
518 scenario_id: &str,
519 source: &Path,
520) -> Result<PathBuf, VmError> {
521 let stem = source
522 .file_stem()
523 .and_then(|stem| stem.to_str())
524 .unwrap_or("event_log");
525 let dest_dir = artifact_root
526 .join("fixtures")
527 .join(safe_path_segment(scenario_id))
528 .join("transcripts");
529 fs::create_dir_all(&dest_dir).map_err(|error| {
530 VmError::Runtime(format!(
531 "failed to create replay fixture dir {}: {error}",
532 dest_dir.display()
533 ))
534 })?;
535 let dest = dest_dir.join(format!("{stem}.jsonl"));
536 fs::copy(source, &dest).map_err(|error| {
537 VmError::Runtime(format!(
538 "failed to copy replay fixture {} to {}: {error}",
539 source.display(),
540 dest.display()
541 ))
542 })?;
543 if let Some(golden) = find_replay_golden(source)? {
544 let golden_dir = artifact_root
545 .join("fixtures")
546 .join(safe_path_segment(scenario_id))
547 .join("goldens");
548 fs::create_dir_all(&golden_dir).map_err(|error| {
549 VmError::Runtime(format!(
550 "failed to create replay golden dir {}: {error}",
551 golden_dir.display()
552 ))
553 })?;
554 let golden_dest = golden_dir.join(format!("{stem}.json"));
555 fs::copy(&golden, &golden_dest).map_err(|error| {
556 VmError::Runtime(format!(
557 "failed to copy replay golden {} to {}: {error}",
558 golden.display(),
559 golden_dest.display()
560 ))
561 })?;
562 }
563 Ok(dest)
564}
565
566fn find_replay_golden(source: &Path) -> Result<Option<PathBuf>, VmError> {
567 let Some(stem) = source.file_stem().and_then(|stem| stem.to_str()) else {
568 return Ok(None);
569 };
570 let mut candidates = Vec::new();
571 if let Some(parent) = source.parent() {
572 candidates.push(parent.join(format!("{stem}.golden.json")));
573 if parent.file_name().and_then(|name| name.to_str()) == Some("transcripts") {
574 if let Some(root) = parent.parent() {
575 candidates.push(root.join("goldens").join(format!("{stem}.json")));
576 }
577 }
578 }
579 for candidate in candidates {
580 if candidate.exists() {
581 let _ = load_merge_captain_golden(&candidate)?;
582 return Ok(Some(candidate));
583 }
584 }
585 Ok(None)
586}
587
588fn skipped_run_report(
589 scenario: &MergeCaptainIterationScenario,
590 variant: &MergeCaptainIterationVariant,
591 _artifact_root: &Path,
592 reason: String,
593) -> MergeCaptainIterationRunReport {
594 MergeCaptainIterationRunReport {
595 id: format!("{}::{}", scenario.id, variant.id),
596 scenario_id: scenario.id.clone(),
597 variant_id: variant.id.clone(),
598 model_route: variant.model_route.clone(),
599 timeout_tier: variant.timeout_tier.clone(),
600 package_revision: variant.package_revision.clone(),
601 prompt_asset_revision: variant.prompt_asset_revision.clone(),
602 skipped: true,
603 skip_reason: Some(reason),
604 drift_score: 10_000,
605 ..Default::default()
606 }
607}
608
609fn budget_exhausted(
610 budget: &MergeCaptainIterationBudget,
611 completed: usize,
612 total_cost_usd: f64,
613 started: Instant,
614) -> Option<String> {
615 if let Some(max_runs) = budget.max_runs {
616 if completed >= max_runs {
617 return Some(format!("completed run cap {max_runs} reached"));
618 }
619 }
620 if let Some(max_cost_usd) = budget.max_cost_usd {
621 if total_cost_usd > max_cost_usd {
622 return Some(format!(
623 "cost budget ${:.6} reached (spent ${:.6})",
624 max_cost_usd, total_cost_usd
625 ));
626 }
627 }
628 if let Some(max_wallclock_ms) = budget.max_wallclock_ms {
629 if started.elapsed().as_millis() >= u128::from(max_wallclock_ms) {
630 return Some(format!("wallclock budget {max_wallclock_ms}ms reached"));
631 }
632 }
633 None
634}
635
636fn degradation_reasons(
637 summary: &MergeCaptainRunSummary,
638 variant: &MergeCaptainIterationVariant,
639) -> Vec<String> {
640 let mut reasons = Vec::new();
641 if !summary.pass {
642 reasons.push(format!(
643 "oracle reported {} error finding(s) and {} warning finding(s)",
644 summary.oracle_error_findings, summary.oracle_warn_findings
645 ));
646 }
647 if let Some(max_tool_calls) = variant.max_tool_calls {
648 if summary.tool_calls > max_tool_calls {
649 reasons.push(format!(
650 "tool calls {} exceeded variant budget {}",
651 summary.tool_calls, max_tool_calls
652 ));
653 }
654 }
655 if let Some(max_model_calls) = variant.max_model_calls {
656 if summary.model_calls > max_model_calls {
657 reasons.push(format!(
658 "model calls {} exceeded variant budget {}",
659 summary.model_calls, max_model_calls
660 ));
661 }
662 }
663 if let Some(max_cost_usd) = variant.max_cost_usd {
664 if summary.cost_usd > max_cost_usd {
665 reasons.push(format!(
666 "cost ${:.6} exceeded variant budget ${:.6}",
667 summary.cost_usd, max_cost_usd
668 ));
669 }
670 }
671 if let Some(max_latency_ms) = variant.max_latency_ms.or(variant.timeout_ms) {
672 if summary.latency_ms > max_latency_ms {
673 reasons.push(format!(
674 "latency {}ms exceeded variant timeout {}ms",
675 summary.latency_ms, max_latency_ms
676 ));
677 }
678 }
679 reasons
680}
681
682fn drift_score(summary: &MergeCaptainRunSummary, degradation_reasons: &[String]) -> u64 {
683 let failed_penalty = if summary.pass { 0 } else { 1_000 };
684 failed_penalty
685 + (summary.oracle_error_findings as u64 * 100)
686 + (summary.oracle_warn_findings as u64 * 10)
687 + (degradation_reasons.len() as u64 * 25)
688}
689
690fn rank_variants(
691 variants: &[MergeCaptainIterationVariant],
692 runs: &[MergeCaptainIterationRunReport],
693) -> Vec<MergeCaptainIterationRanking> {
694 let mut rankings = Vec::new();
695 for variant in variants {
696 let matching: Vec<_> = runs
697 .iter()
698 .filter(|run| run.variant_id == variant.id)
699 .collect();
700 rankings.push(MergeCaptainIterationRanking {
701 variant_id: variant.id.clone(),
702 package_revision: variant.package_revision.clone(),
703 prompt_asset_revision: variant.prompt_asset_revision.clone(),
704 scenarios_completed: matching.iter().filter(|run| !run.skipped).count(),
705 scenarios_passed: matching.iter().filter(|run| run.pass).count(),
706 skipped: matching.iter().filter(|run| run.skipped).count(),
707 drift_score: matching.iter().map(|run| run.drift_score).sum(),
708 cost_usd: matching.iter().map(|run| run.cost_usd).sum(),
709 latency_ms: matching.iter().map(|run| run.latency_ms).sum(),
710 });
711 }
712 rankings.sort_by(|left, right| {
713 left.drift_score
714 .cmp(&right.drift_score)
715 .then_with(|| {
716 left.cost_usd
717 .partial_cmp(&right.cost_usd)
718 .unwrap_or(std::cmp::Ordering::Equal)
719 })
720 .then_with(|| left.variant_id.cmp(&right.variant_id))
721 });
722 rankings
723}
724
725pub fn load_merge_captain_iteration_report(
726 path: &Path,
727) -> Result<MergeCaptainIterationReport, VmError> {
728 let report_path = if path.is_dir() {
729 path.join("summary.json")
730 } else {
731 path.to_path_buf()
732 };
733 let bytes = fs::read(&report_path).map_err(|error| {
734 VmError::Runtime(format!(
735 "failed to read merge-captain iteration report {}: {error}",
736 report_path.display()
737 ))
738 })?;
739 serde_json::from_slice(&bytes).map_err(|error| {
740 VmError::Runtime(format!(
741 "failed to parse merge-captain iteration report {}: {error}",
742 report_path.display()
743 ))
744 })
745}
746
747pub fn diff_merge_captain_iterations(
748 baseline_path: &Path,
749 candidate_path: &Path,
750) -> Result<MergeCaptainIterationDiffReport, VmError> {
751 let baseline = load_merge_captain_iteration_report(baseline_path)?;
752 let candidate = load_merge_captain_iteration_report(candidate_path)?;
753 let mut keys = BTreeMap::new();
754 for run in &baseline.runs {
755 keys.insert((run.scenario_id.clone(), run.variant_id.clone()), ());
756 }
757 for run in &candidate.runs {
758 keys.insert((run.scenario_id.clone(), run.variant_id.clone()), ());
759 }
760
761 let mut entries = Vec::new();
762 let mut improved = 0;
763 let mut regressed = 0;
764 let mut unchanged = 0;
765 let mut missing = 0;
766 for ((scenario_id, variant_id), ()) in keys {
767 let before = baseline
768 .runs
769 .iter()
770 .find(|run| run.scenario_id == scenario_id && run.variant_id == variant_id);
771 let after = candidate
772 .runs
773 .iter()
774 .find(|run| run.scenario_id == scenario_id && run.variant_id == variant_id);
775 let delta = before
776 .zip(after)
777 .map(|(before, after)| after.drift_score as i64 - before.drift_score as i64);
778 let status = match delta {
779 Some(value) if value < 0 => {
780 improved += 1;
781 "improved"
782 }
783 Some(value) if value > 0 => {
784 regressed += 1;
785 "regressed"
786 }
787 Some(_) => {
788 unchanged += 1;
789 "unchanged"
790 }
791 None => {
792 missing += 1;
793 "missing"
794 }
795 };
796 entries.push(MergeCaptainIterationDiffEntry {
797 scenario_id,
798 variant_id,
799 baseline_drift_score: before.map(|run| run.drift_score),
800 candidate_drift_score: after.map(|run| run.drift_score),
801 delta,
802 status: status.to_string(),
803 baseline_pass: before.map(|run| run.pass),
804 candidate_pass: after.map(|run| run.pass),
805 baseline_prompt_asset_revision: before
806 .and_then(|run| run.prompt_asset_revision.clone()),
807 candidate_prompt_asset_revision: after
808 .and_then(|run| run.prompt_asset_revision.clone()),
809 });
810 }
811
812 Ok(MergeCaptainIterationDiffReport {
813 type_name: DIFF_TYPE.to_string(),
814 version: 1,
815 baseline_id: baseline.id,
816 candidate_id: candidate.id,
817 baseline_path: baseline_path.display().to_string(),
818 candidate_path: candidate_path.display().to_string(),
819 improved,
820 regressed,
821 unchanged,
822 missing,
823 entries,
824 })
825}
826
827pub fn render_iteration_markdown(report: &MergeCaptainIterationReport) -> String {
828 let mut out = String::new();
829 out.push_str(&format!(
830 "# Merge Captain iteration: {}\n\n",
831 report.name.as_deref().unwrap_or(&report.id)
832 ));
833 out.push_str(&format!(
834 "- pass: {}\n- completed: {}/{}\n- skipped: {}\n- budget_exhausted: {}\n\n",
835 report.pass, report.completed, report.total, report.skipped, report.budget_exhausted
836 ));
837 out.push_str("## Variant ranking\n\n");
838 out.push_str(
839 "| rank | variant | package | prompt assets | passed | drift | cost | latency ms |\n",
840 );
841 out.push_str("|---:|---|---|---|---:|---:|---:|---:|\n");
842 for (index, ranking) in report.rankings.iter().enumerate() {
843 out.push_str(&format!(
844 "| {} | {} | {} | {} | {}/{} | {} | {:.6} | {} |\n",
845 index + 1,
846 ranking.variant_id,
847 ranking.package_revision.as_deref().unwrap_or("-"),
848 ranking.prompt_asset_revision.as_deref().unwrap_or("-"),
849 ranking.scenarios_passed,
850 ranking.scenarios_completed,
851 ranking.drift_score,
852 ranking.cost_usd,
853 ranking.latency_ms
854 ));
855 }
856 out.push_str("\n## Scenario runs\n\n");
857 out.push_str(
858 "| scenario | variant | pass | drift | errors | warnings | tools | models | artifact |\n",
859 );
860 out.push_str("|---|---|---:|---:|---:|---:|---:|---:|---|\n");
861 for run in &report.runs {
862 out.push_str(&format!(
863 "| {} | {} | {} | {} | {} | {} | {} | {} | {} |\n",
864 run.scenario_id,
865 run.variant_id,
866 if run.skipped {
867 "skipped".to_string()
868 } else {
869 run.pass.to_string()
870 },
871 run.drift_score,
872 run.oracle_error_findings,
873 run.oracle_warn_findings,
874 run.tool_calls,
875 run.model_calls,
876 run.summary_path.as_deref().unwrap_or("-")
877 ));
878 }
879 out
880}
881
882pub fn render_iteration_diff_markdown(report: &MergeCaptainIterationDiffReport) -> String {
883 let mut out = String::new();
884 out.push_str(&format!(
885 "# Merge Captain iteration diff: {} -> {}\n\n",
886 report.baseline_id, report.candidate_id
887 ));
888 out.push_str(&format!(
889 "- improved: {}\n- regressed: {}\n- unchanged: {}\n- missing: {}\n\n",
890 report.improved, report.regressed, report.unchanged, report.missing
891 ));
892 out.push_str(
893 "| scenario | variant | baseline | candidate | delta | status | prompt assets |\n",
894 );
895 out.push_str("|---|---|---:|---:|---:|---|---|\n");
896 for entry in &report.entries {
897 out.push_str(&format!(
898 "| {} | {} | {} | {} | {} | {} | {} -> {} |\n",
899 entry.scenario_id,
900 entry.variant_id,
901 optional_u64(entry.baseline_drift_score),
902 optional_u64(entry.candidate_drift_score),
903 entry
904 .delta
905 .map(|delta| delta.to_string())
906 .unwrap_or_else(|| "-".to_string()),
907 entry.status,
908 entry
909 .baseline_prompt_asset_revision
910 .as_deref()
911 .unwrap_or("-"),
912 entry
913 .candidate_prompt_asset_revision
914 .as_deref()
915 .unwrap_or("-")
916 ));
917 }
918 out
919}
920
/// Formats an optional number for a table cell: the decimal value, or `-`
/// when absent.
fn optional_u64(value: Option<u64>) -> String {
    match value {
        Some(number) => number.to_string(),
        None => String::from("-"),
    }
}
926
927fn resolve_artifact_root(
928 manifest: &MergeCaptainIterationManifest,
929 base_dir: Option<&Path>,
930) -> PathBuf {
931 let root = manifest
932 .artifact_root
933 .clone()
934 .unwrap_or_else(|| format!(".harn-runs/merge-captain-iterations/{}", manifest.id));
935 let resolved = resolve_manifest_path(base_dir, &root);
936 if resolved.is_absolute() {
937 resolved
938 } else {
939 let relative = resolved.strip_prefix(".").unwrap_or(&resolved);
940 std::env::current_dir()
941 .unwrap_or_else(|_| PathBuf::from("."))
942 .join(relative)
943 }
944}
945
/// Resolves a path taken from a manifest.
///
/// Absolute paths are returned unchanged. Relative paths are joined onto
/// `base_dir` when one is given and returned as-is otherwise.
fn resolve_manifest_path(base_dir: Option<&Path>, path: &str) -> PathBuf {
    let candidate = Path::new(path);
    match base_dir {
        Some(base) if !candidate.is_absolute() => base.join(candidate),
        _ => candidate.to_path_buf(),
    }
}
956
/// Displays `path` relative to `root` when it lies under `root`, and in full
/// otherwise.
fn relative_display(root: &Path, path: &Path) -> String {
    let shown = match path.strip_prefix(root) {
        Ok(remainder) => remainder,
        Err(_) => path,
    };
    shown.display().to_string()
}
962
/// Turns an arbitrary identifier into a single safe path component.
///
/// ASCII letters, digits, `-` and `_` are kept; every other character is
/// replaced by `_`. An empty input yields `unnamed`.
fn safe_path_segment(value: &str) -> String {
    if value.is_empty() {
        return String::from("unnamed");
    }
    value
        .chars()
        .map(|ch| match ch {
            '-' | '_' => ch,
            _ if ch.is_ascii_alphanumeric() => ch,
            _ => '_',
        })
        .collect()
}
978
979fn write_json_file<T: Serialize>(path: &Path, value: &T) -> Result<(), VmError> {
980 let mut bytes = serde_json::to_vec_pretty(value)
981 .map_err(|error| VmError::Runtime(format!("failed to serialize JSON artifact: {error}")))?;
982 bytes.push(b'\n');
983 write_bytes_file(path, &bytes)
984}
985
986fn write_text_file(path: &Path, value: &str) -> Result<(), VmError> {
987 write_bytes_file(path, value.as_bytes())
988}
989
990fn write_bytes_file(path: &Path, bytes: &[u8]) -> Result<(), VmError> {
991 if let Some(parent) = path.parent() {
992 fs::create_dir_all(parent).map_err(|error| {
993 VmError::Runtime(format!(
994 "failed to create artifact directory {}: {error}",
995 parent.display()
996 ))
997 })?;
998 }
999 fs::write(path, bytes).map_err(|error| {
1000 VmError::Runtime(format!(
1001 "failed to write artifact {}: {error}",
1002 path.display()
1003 ))
1004 })
1005}
1006
#[cfg(test)]
mod tests {
    use super::*;
    use crate::orchestration::load_transcript_jsonl;

    /// Directory two levels above this crate's `CARGO_MANIFEST_DIR`.
    fn repo_root() -> PathBuf {
        // `ancestors()` yields the path itself first, so index 2 is the grandparent.
        Path::new(env!("CARGO_MANIFEST_DIR"))
            .ancestors()
            .nth(2)
            .unwrap()
            .to_path_buf()
    }

    /// Manifest with only `id`, `base_dir` (the repo root) and `artifact_root`
    /// (`<temp_root>/iteration`) filled in; every other field is defaulted.
    fn manifest_skeleton(id: &str, temp_root: &Path) -> MergeCaptainIterationManifest {
        MergeCaptainIterationManifest {
            id: id.to_string(),
            base_dir: Some(repo_root().display().to_string()),
            artifact_root: Some(temp_root.join("iteration").display().to_string()),
            ..Default::default()
        }
    }

    /// Scenario whose backend has the given `kind` and `path`; the rest is defaulted.
    fn scenario_with_backend(id: &str, kind: &str, path: &str) -> MergeCaptainIterationScenario {
        MergeCaptainIterationScenario {
            id: id.to_string(),
            backend: MergeCaptainIterationBackendSpec {
                kind: kind.to_string(),
                path: Some(path.to_string()),
                ..Default::default()
            },
            ..Default::default()
        }
    }

    /// The `green-pr` replay scenario shared by the matrix and budget tests.
    fn green_pr_replay_scenario() -> MergeCaptainIterationScenario {
        scenario_with_backend(
            "green-pr",
            "replay",
            "examples/personas/merge_captain/transcripts/green_pr.jsonl",
        )
    }

    /// Variant carrying nothing but an id.
    fn variant_named(id: &str) -> MergeCaptainIterationVariant {
        MergeCaptainIterationVariant {
            id: id.to_string(),
            ..Default::default()
        }
    }

    #[test]
    fn iteration_runs_matrix_and_ranks_by_drift() {
        let temp = tempfile::tempdir().unwrap();
        let first_variant = MergeCaptainIterationVariant {
            id: "prompt-v1".to_string(),
            prompt_asset_revision: Some("prompt/v1".to_string()),
            max_tool_calls: Some(1),
            ..Default::default()
        };
        let second_variant = MergeCaptainIterationVariant {
            id: "prompt-v2".to_string(),
            prompt_asset_revision: Some("prompt/v2".to_string()),
            max_tool_calls: Some(4),
            ..Default::default()
        };
        let manifest = MergeCaptainIterationManifest {
            scenarios: vec![green_pr_replay_scenario()],
            variants: vec![first_variant, second_variant],
            ..manifest_skeleton("issue-1021-smoke", temp.path())
        };

        let report = run_merge_captain_iteration(&manifest).unwrap();

        assert!(report.pass);
        assert_eq!(report.completed, 2);
        let top_ranked = &report.rankings[0];
        assert_eq!(top_ranked.variant_id, "prompt-v2");
        assert_eq!(top_ranked.drift_score, 0);
        assert!(Path::new(&report.summary_markdown_path).exists());
        let copied_fixture = Path::new(&report.artifact_root)
            .join("fixtures/green-pr/transcripts/green_pr.jsonl");
        assert!(copied_fixture.exists());
    }

    #[test]
    fn iteration_budget_cap_skips_remaining_runs() {
        let temp = tempfile::tempdir().unwrap();
        let manifest = MergeCaptainIterationManifest {
            scenarios: vec![green_pr_replay_scenario()],
            variants: vec![variant_named("one"), variant_named("two")],
            // Two variants are requested but only a single run is allowed.
            budget: MergeCaptainIterationBudget {
                max_runs: Some(1),
                ..Default::default()
            },
            ..manifest_skeleton("issue-1021-budget", temp.path())
        };

        let report = run_merge_captain_iteration(&manifest).unwrap();

        assert!(report.budget_exhausted);
        assert_eq!(report.completed, 1);
        assert_eq!(report.skipped, 1);
        assert!(report.runs[1].skipped);
    }

    #[test]
    fn diff_marks_prompt_candidate_improvement() {
        let temp = tempfile::tempdir().unwrap();
        let baseline_path = temp.path().join("baseline.json");
        let candidate_path = temp.path().join("candidate.json");

        let baseline = MergeCaptainIterationReport {
            type_name: REPORT_TYPE.to_string(),
            version: 1,
            id: "baseline".to_string(),
            runs: vec![MergeCaptainIterationRunReport {
                scenario_id: "green-pr".to_string(),
                variant_id: "value-route".to_string(),
                drift_score: 25,
                prompt_asset_revision: Some("prompt/v1".to_string()),
                ..Default::default()
            }],
            ..Default::default()
        };
        // The candidate is the same run with a newer prompt revision and no drift.
        let mut candidate = baseline.clone();
        candidate.id = "candidate".to_string();
        {
            let run = &mut candidate.runs[0];
            run.drift_score = 0;
            run.prompt_asset_revision = Some("prompt/v2".to_string());
        }
        write_json_file(&baseline_path, &baseline).unwrap();
        write_json_file(&candidate_path, &candidate).unwrap();

        let diff = diff_merge_captain_iterations(&baseline_path, &candidate_path).unwrap();

        assert_eq!(diff.improved, 1);
        let entry = &diff.entries[0];
        assert_eq!(entry.delta, Some(-25));
        assert_eq!(entry.status, "improved");
    }

    #[test]
    fn mock_scenario_manifest_materializes_playground() {
        let temp = tempfile::tempdir().unwrap();
        let manifest = MergeCaptainIterationManifest {
            scenarios: vec![scenario_with_backend(
                "single-green",
                "mock",
                "examples/merge_captain/scenarios/single_green.json",
            )],
            variants: vec![variant_named("smoke")],
            ..manifest_skeleton("issue-1021-mock", temp.path())
        };

        let report = run_merge_captain_iteration(&manifest).unwrap();

        assert_eq!(report.completed, 1);
        let playground = Path::new(&report.artifact_root)
            .join("playgrounds/single-green/smoke/playground.json");
        assert!(playground.exists());
        // The recorded transcript path is joined onto the artifact root before loading.
        let transcript_path = Path::new(&report.artifact_root)
            .join(report.runs[0].transcript_path.as_ref().unwrap());
        let loaded = load_transcript_jsonl(&transcript_path).unwrap();
        assert!(!loaded.events.is_empty());
    }
}