Skip to main content

harn_vm/orchestration/
merge_captain_iteration.rs

1//! Agent-iteration harness for Merge Captain eval sweeps (#1021).
2//!
3//! This layer deliberately composes the existing mock/replay driver and oracle:
4//! scenarios define the deterministic backend, variants define model plus Harn
5//! package/prompt-asset revision metadata, and each matrix cell writes the
6//! normal transcript/receipt/summary artifacts under one shareable iteration
7//! directory.
8
9use std::collections::{BTreeMap, BTreeSet};
10use std::fs;
11use std::path::{Path, PathBuf};
12use std::time::Instant;
13
14use serde::{Deserialize, Serialize};
15
16use crate::value::VmError;
17
18use super::{
19    load_merge_captain_golden, new_id, MergeCaptainDriverBackend, MergeCaptainDriverMode,
20    MergeCaptainDriverOptions, MergeCaptainRunSummary,
21};
22
/// `_type` discriminator stamped onto manifests that do not declare one.
const MANIFEST_TYPE: &str = "merge_captain_iteration_manifest";
/// `_type` discriminator written into iteration reports.
const REPORT_TYPE: &str = "merge_captain_iteration_report";
/// `_type` discriminator written into iteration diff reports.
const DIFF_TYPE: &str = "merge_captain_iteration_diff";
26
27#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
28#[serde(default)]
29pub struct MergeCaptainIterationManifest {
30    #[serde(rename = "_type")]
31    pub type_name: String,
32    pub version: u32,
33    pub id: String,
34    pub name: Option<String>,
35    pub description: Option<String>,
36    pub base_dir: Option<String>,
37    #[serde(alias = "artifact-root")]
38    pub artifact_root: Option<String>,
39    pub scenarios: Vec<MergeCaptainIterationScenario>,
40    pub variants: Vec<MergeCaptainIterationVariant>,
41    pub budget: MergeCaptainIterationBudget,
42    pub metadata: BTreeMap<String, serde_json::Value>,
43}
44
45#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
46#[serde(default)]
47pub struct MergeCaptainIterationScenario {
48    pub id: String,
49    pub description: Option<String>,
50    pub backend: MergeCaptainIterationBackendSpec,
51    pub metadata: BTreeMap<String, serde_json::Value>,
52}
53
54#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
55#[serde(default)]
56pub struct MergeCaptainIterationBackendSpec {
57    pub kind: String,
58    pub path: Option<String>,
59    pub scenario: Option<String>,
60}
61
62#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
63#[serde(default)]
64pub struct MergeCaptainIterationVariant {
65    pub id: String,
66    #[serde(alias = "model-route")]
67    pub model_route: Option<String>,
68    #[serde(alias = "timeout-tier")]
69    pub timeout_tier: Option<String>,
70    #[serde(alias = "package-revision")]
71    pub package_revision: Option<String>,
72    #[serde(alias = "prompt-asset-revision")]
73    pub prompt_asset_revision: Option<String>,
74    #[serde(alias = "max-cost-usd")]
75    pub max_cost_usd: Option<f64>,
76    #[serde(alias = "max-model-calls")]
77    pub max_model_calls: Option<u64>,
78    #[serde(alias = "max-tool-calls")]
79    pub max_tool_calls: Option<u64>,
80    #[serde(alias = "max-latency-ms")]
81    pub max_latency_ms: Option<u64>,
82    #[serde(alias = "timeout-ms")]
83    pub timeout_ms: Option<u64>,
84    #[serde(alias = "max-sweeps")]
85    pub max_sweeps: Option<u32>,
86    #[serde(alias = "watch-backoff-ms")]
87    pub watch_backoff_ms: Option<u64>,
88    pub metadata: BTreeMap<String, serde_json::Value>,
89}
90
91#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
92#[serde(default)]
93pub struct MergeCaptainIterationBudget {
94    #[serde(alias = "max-cost-usd")]
95    pub max_cost_usd: Option<f64>,
96    #[serde(alias = "max-wallclock-ms")]
97    pub max_wallclock_ms: Option<u64>,
98    #[serde(alias = "max-runs")]
99    pub max_runs: Option<usize>,
100}
101
102#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
103#[serde(default)]
104pub struct MergeCaptainIterationReport {
105    #[serde(rename = "_type")]
106    pub type_name: String,
107    pub version: u32,
108    pub id: String,
109    pub name: Option<String>,
110    pub artifact_root: String,
111    pub summary_json_path: String,
112    pub summary_markdown_path: String,
113    pub pass: bool,
114    pub total: usize,
115    pub completed: usize,
116    pub skipped: usize,
117    pub budget_exhausted: bool,
118    pub budget_exhausted_reason: Option<String>,
119    pub total_cost_usd: f64,
120    pub total_latency_ms: u64,
121    pub runs: Vec<MergeCaptainIterationRunReport>,
122    pub rankings: Vec<MergeCaptainIterationRanking>,
123    pub metadata: BTreeMap<String, serde_json::Value>,
124}
125
126#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
127#[serde(default)]
128pub struct MergeCaptainIterationRunReport {
129    pub id: String,
130    pub scenario_id: String,
131    pub variant_id: String,
132    pub backend: String,
133    pub backend_source: Option<String>,
134    pub model_route: Option<String>,
135    pub timeout_tier: Option<String>,
136    pub package_revision: Option<String>,
137    pub prompt_asset_revision: Option<String>,
138    pub pass: bool,
139    pub skipped: bool,
140    pub skip_reason: Option<String>,
141    pub drift_score: u64,
142    pub degradation_reasons: Vec<String>,
143    pub transcript_path: Option<String>,
144    pub receipt_path: Option<String>,
145    pub summary_path: Option<String>,
146    pub oracle_error_findings: usize,
147    pub oracle_warn_findings: usize,
148    pub cost_usd: f64,
149    pub latency_ms: u64,
150    pub tool_calls: u64,
151    pub model_calls: u64,
152    pub event_count: u64,
153}
154
155#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
156#[serde(default)]
157pub struct MergeCaptainIterationRanking {
158    pub variant_id: String,
159    pub package_revision: Option<String>,
160    pub prompt_asset_revision: Option<String>,
161    pub scenarios_completed: usize,
162    pub scenarios_passed: usize,
163    pub skipped: usize,
164    pub drift_score: u64,
165    pub cost_usd: f64,
166    pub latency_ms: u64,
167}
168
169#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
170#[serde(default)]
171pub struct MergeCaptainIterationDiffReport {
172    #[serde(rename = "_type")]
173    pub type_name: String,
174    pub version: u32,
175    pub baseline_id: String,
176    pub candidate_id: String,
177    pub baseline_path: String,
178    pub candidate_path: String,
179    pub improved: usize,
180    pub regressed: usize,
181    pub unchanged: usize,
182    pub missing: usize,
183    pub entries: Vec<MergeCaptainIterationDiffEntry>,
184}
185
186#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
187#[serde(default)]
188pub struct MergeCaptainIterationDiffEntry {
189    pub scenario_id: String,
190    pub variant_id: String,
191    pub baseline_drift_score: Option<u64>,
192    pub candidate_drift_score: Option<u64>,
193    pub delta: Option<i64>,
194    pub status: String,
195    pub baseline_pass: Option<bool>,
196    pub candidate_pass: Option<bool>,
197    pub baseline_prompt_asset_revision: Option<String>,
198    pub candidate_prompt_asset_revision: Option<String>,
199}
200
201pub fn load_merge_captain_iteration_manifest(
202    path: &Path,
203) -> Result<MergeCaptainIterationManifest, VmError> {
204    let content = fs::read_to_string(path).map_err(|error| {
205        VmError::Runtime(format!(
206            "failed to read merge-captain iteration manifest {}: {error}",
207            path.display()
208        ))
209    })?;
210    let mut manifest: MergeCaptainIterationManifest =
211        if path.extension().and_then(|ext| ext.to_str()) == Some("json") {
212            serde_json::from_str(&content).map_err(|error| {
213                VmError::Runtime(format!(
214                    "failed to parse merge-captain iteration JSON {}: {error}",
215                    path.display()
216                ))
217            })?
218        } else {
219            toml::from_str(&content).map_err(|error| {
220                VmError::Runtime(format!(
221                    "failed to parse merge-captain iteration TOML {}: {error}",
222                    path.display()
223                ))
224            })?
225        };
226    normalize_merge_captain_iteration_manifest(&mut manifest);
227    if manifest.base_dir.is_none() {
228        manifest.base_dir = path.parent().map(|parent| parent.display().to_string());
229    }
230    Ok(manifest)
231}
232
233pub fn normalize_merge_captain_iteration_manifest(manifest: &mut MergeCaptainIterationManifest) {
234    if manifest.type_name.is_empty() {
235        manifest.type_name = MANIFEST_TYPE.to_string();
236    }
237    if manifest.version == 0 {
238        manifest.version = 1;
239    }
240    if manifest.id.trim().is_empty() {
241        manifest.id = manifest
242            .name
243            .clone()
244            .filter(|name| !name.trim().is_empty())
245            .unwrap_or_else(|| new_id("merge_captain_iteration"));
246    }
247    for (index, scenario) in manifest.scenarios.iter_mut().enumerate() {
248        if scenario.id.trim().is_empty() {
249            scenario.id = scenario
250                .backend
251                .scenario
252                .clone()
253                .or_else(|| {
254                    scenario
255                        .backend
256                        .path
257                        .as_deref()
258                        .and_then(|path| Path::new(path).file_stem())
259                        .and_then(|stem| stem.to_str())
260                        .map(str::to_string)
261                })
262                .unwrap_or_else(|| format!("scenario_{}", index + 1));
263        }
264        if scenario.backend.kind.trim().is_empty() {
265            scenario.backend.kind = "replay".to_string();
266        }
267    }
268    if manifest.variants.is_empty() {
269        manifest.variants.push(MergeCaptainIterationVariant {
270            id: "default".to_string(),
271            ..Default::default()
272        });
273    }
274    for (index, variant) in manifest.variants.iter_mut().enumerate() {
275        if variant.id.trim().is_empty() {
276            variant.id = format!("variant_{}", index + 1);
277        }
278    }
279}
280
281pub fn run_merge_captain_iteration(
282    manifest: &MergeCaptainIterationManifest,
283) -> Result<MergeCaptainIterationReport, VmError> {
284    let mut manifest = manifest.clone();
285    normalize_merge_captain_iteration_manifest(&mut manifest);
286    if manifest.scenarios.is_empty() {
287        return Err(VmError::Runtime(format!(
288            "merge-captain iteration '{}' must declare at least one scenario",
289            manifest.id
290        )));
291    }
292
293    let base_dir = manifest.base_dir.as_deref().map(Path::new);
294    let artifact_root = resolve_artifact_root(&manifest, base_dir);
295    fs::create_dir_all(&artifact_root).map_err(|error| {
296        VmError::Runtime(format!(
297            "failed to create merge-captain iteration artifact root {}: {error}",
298            artifact_root.display()
299        ))
300    })?;
301    write_json_file(&artifact_root.join("iteration.json"), &manifest)?;
302
303    let total = manifest.scenarios.len() * manifest.variants.len();
304    let started = Instant::now();
305    let mut total_cost_usd = 0.0;
306    let mut total_latency_ms: u64 = 0;
307    let mut completed = 0;
308    let mut budget_exhausted_reason = None;
309    let mut runs = Vec::new();
310
311    for scenario in &manifest.scenarios {
312        for variant in &manifest.variants {
313            if budget_exhausted_reason.is_none() {
314                budget_exhausted_reason =
315                    budget_exhausted(&manifest.budget, completed, total_cost_usd, started);
316            }
317            if let Some(reason) = &budget_exhausted_reason {
318                runs.push(skipped_run_report(
319                    scenario,
320                    variant,
321                    &artifact_root,
322                    reason.clone(),
323                ));
324                continue;
325            }
326
327            let run = run_iteration_cell(&artifact_root, base_dir, scenario, variant)?;
328            if !run.skipped {
329                completed += 1;
330                total_cost_usd += run.cost_usd;
331                total_latency_ms = total_latency_ms.saturating_add(run.latency_ms);
332            }
333            runs.push(run);
334        }
335    }
336
337    let rankings = rank_variants(&manifest.variants, &runs);
338    let skipped = runs.iter().filter(|run| run.skipped).count();
339    let best_drift = rankings.first().map(|ranking| ranking.drift_score);
340    let pass = !runs.is_empty()
341        && budget_exhausted_reason.is_none()
342        && best_drift == Some(0)
343        && rankings
344            .first()
345            .is_some_and(|ranking| ranking.scenarios_completed == manifest.scenarios.len());
346    let summary_json_path = artifact_root.join("summary.json");
347    let summary_markdown_path = artifact_root.join("summary.md");
348    let mut report = MergeCaptainIterationReport {
349        type_name: REPORT_TYPE.to_string(),
350        version: 1,
351        id: manifest.id,
352        name: manifest.name,
353        artifact_root: artifact_root.display().to_string(),
354        summary_json_path: summary_json_path.display().to_string(),
355        summary_markdown_path: summary_markdown_path.display().to_string(),
356        pass,
357        total,
358        completed,
359        skipped,
360        budget_exhausted: budget_exhausted_reason.is_some(),
361        budget_exhausted_reason,
362        total_cost_usd,
363        total_latency_ms,
364        runs,
365        rankings,
366        metadata: manifest.metadata,
367    };
368    write_json_file(&summary_json_path, &report)?;
369    let markdown = render_iteration_markdown(&report);
370    write_text_file(&summary_markdown_path, &markdown)?;
371    report.summary_json_path = summary_json_path.display().to_string();
372    report.summary_markdown_path = summary_markdown_path.display().to_string();
373    Ok(report)
374}
375
376fn run_iteration_cell(
377    artifact_root: &Path,
378    base_dir: Option<&Path>,
379    scenario: &MergeCaptainIterationScenario,
380    variant: &MergeCaptainIterationVariant,
381) -> Result<MergeCaptainIterationRunReport, VmError> {
382    let cell_dir = artifact_root
383        .join("runs")
384        .join(safe_path_segment(&scenario.id))
385        .join(safe_path_segment(&variant.id));
386    fs::create_dir_all(&cell_dir).map_err(|error| {
387        VmError::Runtime(format!(
388            "failed to create iteration run dir {}: {error}",
389            cell_dir.display()
390        ))
391    })?;
392    let backend = resolve_iteration_backend(artifact_root, base_dir, scenario, variant)?;
393    let transcript_path = cell_dir.join("event_log.jsonl");
394    let receipt_path = cell_dir.join("receipt.json");
395    let summary_path = cell_dir.join("summary.json");
396    let max_sweeps = variant.max_sweeps.unwrap_or(1).max(1);
397    let output = super::run_merge_captain_driver(MergeCaptainDriverOptions {
398        backend: backend.clone(),
399        mode: if max_sweeps > 1 {
400            MergeCaptainDriverMode::Watch
401        } else {
402            MergeCaptainDriverMode::Once
403        },
404        model_route: variant
405            .model_route
406            .clone()
407            .or_else(|| Some(variant.id.clone())),
408        timeout_tier: variant.timeout_tier.clone(),
409        transcript_out: Some(transcript_path.clone()),
410        receipt_out: Some(receipt_path.clone()),
411        run_root: cell_dir.join("driver-runs"),
412        max_sweeps,
413        watch_backoff_ms: variant.watch_backoff_ms.unwrap_or(0),
414        stream_stdout: false,
415    })?;
416    write_json_file(&summary_path, &output.summary)?;
417
418    let degradation_reasons = degradation_reasons(&output.summary, variant);
419    let drift_score = drift_score(&output.summary, &degradation_reasons);
420    let pass = output.summary.pass && degradation_reasons.is_empty();
421    let report = MergeCaptainIterationRunReport {
422        id: format!("{}::{}", scenario.id, variant.id),
423        scenario_id: scenario.id.clone(),
424        variant_id: variant.id.clone(),
425        backend: backend.kind().to_string(),
426        backend_source: output.summary.backend_source.clone(),
427        model_route: output.summary.model_route.clone(),
428        timeout_tier: output.summary.timeout_tier.clone(),
429        package_revision: variant.package_revision.clone(),
430        prompt_asset_revision: variant.prompt_asset_revision.clone(),
431        pass,
432        skipped: false,
433        skip_reason: None,
434        drift_score,
435        degradation_reasons,
436        transcript_path: Some(relative_display(artifact_root, &transcript_path)),
437        receipt_path: Some(relative_display(artifact_root, &receipt_path)),
438        summary_path: Some(relative_display(artifact_root, &summary_path)),
439        oracle_error_findings: output.summary.oracle_error_findings,
440        oracle_warn_findings: output.summary.oracle_warn_findings,
441        cost_usd: output.summary.cost_usd,
442        latency_ms: output.summary.latency_ms,
443        tool_calls: output.summary.tool_calls,
444        model_calls: output.summary.model_calls,
445        event_count: output.summary.event_count,
446    };
447    write_json_file(&cell_dir.join("run-report.json"), &report)?;
448    Ok(report)
449}
450
451fn resolve_iteration_backend(
452    artifact_root: &Path,
453    base_dir: Option<&Path>,
454    scenario: &MergeCaptainIterationScenario,
455    variant: &MergeCaptainIterationVariant,
456) -> Result<MergeCaptainDriverBackend, VmError> {
457    match scenario.backend.kind.trim().to_ascii_lowercase().as_str() {
458        "replay" => {
459            let path = scenario.backend.path.as_deref().ok_or_else(|| {
460                VmError::Runtime(format!(
461                    "iteration scenario '{}' replay backend requires path",
462                    scenario.id
463                ))
464            })?;
465            let source = resolve_manifest_path(base_dir, path);
466            Ok(MergeCaptainDriverBackend::Replay {
467                fixture: copy_replay_fixture(artifact_root, &scenario.id, &source)?,
468            })
469        }
470        "mock" => {
471            let playground_dir = artifact_root
472                .join("playgrounds")
473                .join(safe_path_segment(&scenario.id))
474                .join(safe_path_segment(&variant.id));
475            let manifest = if let Some(name) = scenario.backend.scenario.as_deref() {
476                Some(super::playground::load_builtin(name)?)
477            } else if let Some(path) = scenario.backend.path.as_deref() {
478                let source = resolve_manifest_path(base_dir, path);
479                if super::playground::playground_marker_path(&source).exists() {
480                    return Ok(MergeCaptainDriverBackend::Mock {
481                        playground_dir: source,
482                    });
483                }
484                super::playground::ScenarioManifest::load(&source).ok()
485            } else {
486                Some(super::playground::load_builtin(&scenario.id)?)
487            };
488            if let Some(manifest) = manifest {
489                let _ = super::playground::cleanup_playground_at(&playground_dir)?;
490                super::playground::init_playground_at(super::playground::InitOptions {
491                    dir: &playground_dir,
492                    manifest: &manifest,
493                    allow_existing: false,
494                })?;
495                Ok(MergeCaptainDriverBackend::Mock { playground_dir })
496            } else {
497                let path = scenario.backend.path.as_deref().ok_or_else(|| {
498                    VmError::Runtime(format!(
499                        "iteration scenario '{}' mock backend requires path or scenario",
500                        scenario.id
501                    ))
502                })?;
503                Ok(MergeCaptainDriverBackend::Mock {
504                    playground_dir: resolve_manifest_path(base_dir, path),
505                })
506            }
507        }
508        "live" => Ok(MergeCaptainDriverBackend::Live),
509        other => Err(VmError::Runtime(format!(
510            "unsupported merge-captain iteration backend '{other}'"
511        ))),
512    }
513}
514
515fn copy_replay_fixture(
516    artifact_root: &Path,
517    scenario_id: &str,
518    source: &Path,
519) -> Result<PathBuf, VmError> {
520    let stem = source
521        .file_stem()
522        .and_then(|stem| stem.to_str())
523        .unwrap_or("event_log");
524    let dest_dir = artifact_root
525        .join("fixtures")
526        .join(safe_path_segment(scenario_id))
527        .join("transcripts");
528    fs::create_dir_all(&dest_dir).map_err(|error| {
529        VmError::Runtime(format!(
530            "failed to create replay fixture dir {}: {error}",
531            dest_dir.display()
532        ))
533    })?;
534    let dest = dest_dir.join(format!("{stem}.jsonl"));
535    fs::copy(source, &dest).map_err(|error| {
536        VmError::Runtime(format!(
537            "failed to copy replay fixture {} to {}: {error}",
538            source.display(),
539            dest.display()
540        ))
541    })?;
542    if let Some(golden) = find_replay_golden(source)? {
543        let golden_dir = artifact_root
544            .join("fixtures")
545            .join(safe_path_segment(scenario_id))
546            .join("goldens");
547        fs::create_dir_all(&golden_dir).map_err(|error| {
548            VmError::Runtime(format!(
549                "failed to create replay golden dir {}: {error}",
550                golden_dir.display()
551            ))
552        })?;
553        let golden_dest = golden_dir.join(format!("{stem}.json"));
554        fs::copy(&golden, &golden_dest).map_err(|error| {
555            VmError::Runtime(format!(
556                "failed to copy replay golden {} to {}: {error}",
557                golden.display(),
558                golden_dest.display()
559            ))
560        })?;
561    }
562    Ok(dest)
563}
564
565fn find_replay_golden(source: &Path) -> Result<Option<PathBuf>, VmError> {
566    let Some(stem) = source.file_stem().and_then(|stem| stem.to_str()) else {
567        return Ok(None);
568    };
569    let mut candidates = Vec::new();
570    if let Some(parent) = source.parent() {
571        candidates.push(parent.join(format!("{stem}.golden.json")));
572        if parent.file_name().and_then(|name| name.to_str()) == Some("transcripts") {
573            if let Some(root) = parent.parent() {
574                candidates.push(root.join("goldens").join(format!("{stem}.json")));
575            }
576        }
577    }
578    for candidate in candidates {
579        if candidate.exists() {
580            let _ = load_merge_captain_golden(&candidate)?;
581            return Ok(Some(candidate));
582        }
583    }
584    Ok(None)
585}
586
587fn skipped_run_report(
588    scenario: &MergeCaptainIterationScenario,
589    variant: &MergeCaptainIterationVariant,
590    _artifact_root: &Path,
591    reason: String,
592) -> MergeCaptainIterationRunReport {
593    MergeCaptainIterationRunReport {
594        id: format!("{}::{}", scenario.id, variant.id),
595        scenario_id: scenario.id.clone(),
596        variant_id: variant.id.clone(),
597        model_route: variant.model_route.clone(),
598        timeout_tier: variant.timeout_tier.clone(),
599        package_revision: variant.package_revision.clone(),
600        prompt_asset_revision: variant.prompt_asset_revision.clone(),
601        skipped: true,
602        skip_reason: Some(reason),
603        drift_score: 10_000,
604        ..Default::default()
605    }
606}
607
608fn budget_exhausted(
609    budget: &MergeCaptainIterationBudget,
610    completed: usize,
611    total_cost_usd: f64,
612    started: Instant,
613) -> Option<String> {
614    if let Some(max_runs) = budget.max_runs {
615        if completed >= max_runs {
616            return Some(format!("completed run cap {max_runs} reached"));
617        }
618    }
619    if let Some(max_cost_usd) = budget.max_cost_usd {
620        if total_cost_usd > max_cost_usd {
621            return Some(format!(
622                "cost budget ${max_cost_usd:.6} reached (spent ${total_cost_usd:.6})"
623            ));
624        }
625    }
626    if let Some(max_wallclock_ms) = budget.max_wallclock_ms {
627        if started.elapsed().as_millis() >= u128::from(max_wallclock_ms) {
628            return Some(format!("wallclock budget {max_wallclock_ms}ms reached"));
629        }
630    }
631    None
632}
633
634fn degradation_reasons(
635    summary: &MergeCaptainRunSummary,
636    variant: &MergeCaptainIterationVariant,
637) -> Vec<String> {
638    let mut reasons = Vec::new();
639    if !summary.pass {
640        reasons.push(format!(
641            "oracle reported {} error finding(s) and {} warning finding(s)",
642            summary.oracle_error_findings, summary.oracle_warn_findings
643        ));
644    }
645    if let Some(max_tool_calls) = variant.max_tool_calls {
646        if summary.tool_calls > max_tool_calls {
647            reasons.push(format!(
648                "tool calls {} exceeded variant budget {}",
649                summary.tool_calls, max_tool_calls
650            ));
651        }
652    }
653    if let Some(max_model_calls) = variant.max_model_calls {
654        if summary.model_calls > max_model_calls {
655            reasons.push(format!(
656                "model calls {} exceeded variant budget {}",
657                summary.model_calls, max_model_calls
658            ));
659        }
660    }
661    if let Some(max_cost_usd) = variant.max_cost_usd {
662        if summary.cost_usd > max_cost_usd {
663            reasons.push(format!(
664                "cost ${:.6} exceeded variant budget ${:.6}",
665                summary.cost_usd, max_cost_usd
666            ));
667        }
668    }
669    if let Some(max_latency_ms) = variant.max_latency_ms.or(variant.timeout_ms) {
670        if summary.latency_ms > max_latency_ms {
671            reasons.push(format!(
672                "latency {}ms exceeded variant timeout {}ms",
673                summary.latency_ms, max_latency_ms
674            ));
675        }
676    }
677    reasons
678}
679
680fn drift_score(summary: &MergeCaptainRunSummary, degradation_reasons: &[String]) -> u64 {
681    let failed_penalty = if summary.pass { 0 } else { 1_000 };
682    failed_penalty
683        + (summary.oracle_error_findings as u64 * 100)
684        + (summary.oracle_warn_findings as u64 * 10)
685        + (degradation_reasons.len() as u64 * 25)
686}
687
688fn rank_variants(
689    variants: &[MergeCaptainIterationVariant],
690    runs: &[MergeCaptainIterationRunReport],
691) -> Vec<MergeCaptainIterationRanking> {
692    let mut rankings = Vec::new();
693    for variant in variants {
694        let matching: Vec<_> = runs
695            .iter()
696            .filter(|run| run.variant_id == variant.id)
697            .collect();
698        rankings.push(MergeCaptainIterationRanking {
699            variant_id: variant.id.clone(),
700            package_revision: variant.package_revision.clone(),
701            prompt_asset_revision: variant.prompt_asset_revision.clone(),
702            scenarios_completed: matching.iter().filter(|run| !run.skipped).count(),
703            scenarios_passed: matching.iter().filter(|run| run.pass).count(),
704            skipped: matching.iter().filter(|run| run.skipped).count(),
705            drift_score: matching.iter().map(|run| run.drift_score).sum(),
706            cost_usd: matching.iter().map(|run| run.cost_usd).sum(),
707            latency_ms: matching.iter().map(|run| run.latency_ms).sum(),
708        });
709    }
710    rankings.sort_by(|left, right| {
711        left.drift_score
712            .cmp(&right.drift_score)
713            .then_with(|| {
714                left.cost_usd
715                    .partial_cmp(&right.cost_usd)
716                    .unwrap_or(std::cmp::Ordering::Equal)
717            })
718            .then_with(|| left.variant_id.cmp(&right.variant_id))
719    });
720    rankings
721}
722
723pub fn load_merge_captain_iteration_report(
724    path: &Path,
725) -> Result<MergeCaptainIterationReport, VmError> {
726    let report_path = if path.is_dir() {
727        path.join("summary.json")
728    } else {
729        path.to_path_buf()
730    };
731    let bytes = fs::read(&report_path).map_err(|error| {
732        VmError::Runtime(format!(
733            "failed to read merge-captain iteration report {}: {error}",
734            report_path.display()
735        ))
736    })?;
737    serde_json::from_slice(&bytes).map_err(|error| {
738        VmError::Runtime(format!(
739            "failed to parse merge-captain iteration report {}: {error}",
740            report_path.display()
741        ))
742    })
743}
744
745pub fn diff_merge_captain_iterations(
746    baseline_path: &Path,
747    candidate_path: &Path,
748) -> Result<MergeCaptainIterationDiffReport, VmError> {
749    let baseline = load_merge_captain_iteration_report(baseline_path)?;
750    let candidate = load_merge_captain_iteration_report(candidate_path)?;
751    let mut keys = BTreeSet::new();
752    for run in &baseline.runs {
753        keys.insert((run.scenario_id.clone(), run.variant_id.clone()));
754    }
755    for run in &candidate.runs {
756        keys.insert((run.scenario_id.clone(), run.variant_id.clone()));
757    }
758
759    let mut entries = Vec::new();
760    let mut improved = 0;
761    let mut regressed = 0;
762    let mut unchanged = 0;
763    let mut missing = 0;
764    for (scenario_id, variant_id) in keys {
765        let before = baseline
766            .runs
767            .iter()
768            .find(|run| run.scenario_id == scenario_id && run.variant_id == variant_id);
769        let after = candidate
770            .runs
771            .iter()
772            .find(|run| run.scenario_id == scenario_id && run.variant_id == variant_id);
773        let delta = before
774            .zip(after)
775            .map(|(before, after)| after.drift_score as i64 - before.drift_score as i64);
776        let status = match delta {
777            Some(value) if value < 0 => {
778                improved += 1;
779                "improved"
780            }
781            Some(value) if value > 0 => {
782                regressed += 1;
783                "regressed"
784            }
785            Some(_) => {
786                unchanged += 1;
787                "unchanged"
788            }
789            None => {
790                missing += 1;
791                "missing"
792            }
793        };
794        entries.push(MergeCaptainIterationDiffEntry {
795            scenario_id,
796            variant_id,
797            baseline_drift_score: before.map(|run| run.drift_score),
798            candidate_drift_score: after.map(|run| run.drift_score),
799            delta,
800            status: status.to_string(),
801            baseline_pass: before.map(|run| run.pass),
802            candidate_pass: after.map(|run| run.pass),
803            baseline_prompt_asset_revision: before
804                .and_then(|run| run.prompt_asset_revision.clone()),
805            candidate_prompt_asset_revision: after
806                .and_then(|run| run.prompt_asset_revision.clone()),
807        });
808    }
809
810    Ok(MergeCaptainIterationDiffReport {
811        type_name: DIFF_TYPE.to_string(),
812        version: 1,
813        baseline_id: baseline.id,
814        candidate_id: candidate.id,
815        baseline_path: baseline_path.display().to_string(),
816        candidate_path: candidate_path.display().to_string(),
817        improved,
818        regressed,
819        unchanged,
820        missing,
821        entries,
822    })
823}
824
825pub fn render_iteration_markdown(report: &MergeCaptainIterationReport) -> String {
826    let mut out = String::new();
827    out.push_str(&format!(
828        "# Merge Captain iteration: {}\n\n",
829        report.name.as_deref().unwrap_or(&report.id)
830    ));
831    out.push_str(&format!(
832        "- pass: {}\n- completed: {}/{}\n- skipped: {}\n- budget_exhausted: {}\n\n",
833        report.pass, report.completed, report.total, report.skipped, report.budget_exhausted
834    ));
835    out.push_str("## Variant ranking\n\n");
836    out.push_str(
837        "| rank | variant | package | prompt assets | passed | drift | cost | latency ms |\n",
838    );
839    out.push_str("|---:|---|---|---|---:|---:|---:|---:|\n");
840    for (index, ranking) in report.rankings.iter().enumerate() {
841        out.push_str(&format!(
842            "| {} | {} | {} | {} | {}/{} | {} | {:.6} | {} |\n",
843            index + 1,
844            ranking.variant_id,
845            ranking.package_revision.as_deref().unwrap_or("-"),
846            ranking.prompt_asset_revision.as_deref().unwrap_or("-"),
847            ranking.scenarios_passed,
848            ranking.scenarios_completed,
849            ranking.drift_score,
850            ranking.cost_usd,
851            ranking.latency_ms
852        ));
853    }
854    out.push_str("\n## Scenario runs\n\n");
855    out.push_str(
856        "| scenario | variant | pass | drift | errors | warnings | tools | models | artifact |\n",
857    );
858    out.push_str("|---|---|---:|---:|---:|---:|---:|---:|---|\n");
859    for run in &report.runs {
860        out.push_str(&format!(
861            "| {} | {} | {} | {} | {} | {} | {} | {} | {} |\n",
862            run.scenario_id,
863            run.variant_id,
864            if run.skipped {
865                "skipped".to_string()
866            } else {
867                run.pass.to_string()
868            },
869            run.drift_score,
870            run.oracle_error_findings,
871            run.oracle_warn_findings,
872            run.tool_calls,
873            run.model_calls,
874            run.summary_path.as_deref().unwrap_or("-")
875        ));
876    }
877    out
878}
879
880pub fn render_iteration_diff_markdown(report: &MergeCaptainIterationDiffReport) -> String {
881    let mut out = String::new();
882    out.push_str(&format!(
883        "# Merge Captain iteration diff: {} -> {}\n\n",
884        report.baseline_id, report.candidate_id
885    ));
886    out.push_str(&format!(
887        "- improved: {}\n- regressed: {}\n- unchanged: {}\n- missing: {}\n\n",
888        report.improved, report.regressed, report.unchanged, report.missing
889    ));
890    out.push_str(
891        "| scenario | variant | baseline | candidate | delta | status | prompt assets |\n",
892    );
893    out.push_str("|---|---|---:|---:|---:|---|---|\n");
894    for entry in &report.entries {
895        out.push_str(&format!(
896            "| {} | {} | {} | {} | {} | {} | {} -> {} |\n",
897            entry.scenario_id,
898            entry.variant_id,
899            optional_u64(entry.baseline_drift_score),
900            optional_u64(entry.candidate_drift_score),
901            entry
902                .delta
903                .map(|delta| delta.to_string())
904                .unwrap_or_else(|| "-".to_string()),
905            entry.status,
906            entry
907                .baseline_prompt_asset_revision
908                .as_deref()
909                .unwrap_or("-"),
910            entry
911                .candidate_prompt_asset_revision
912                .as_deref()
913                .unwrap_or("-")
914        ));
915    }
916    out
917}
918
/// Formats an optional number for a table cell: the decimal value when
/// present, `-` when absent.
fn optional_u64(value: Option<u64>) -> String {
    match value {
        Some(number) => number.to_string(),
        None => String::from("-"),
    }
}
924
925fn resolve_artifact_root(
926    manifest: &MergeCaptainIterationManifest,
927    base_dir: Option<&Path>,
928) -> PathBuf {
929    let root = manifest
930        .artifact_root
931        .clone()
932        .unwrap_or_else(|| format!(".harn-runs/merge-captain-iterations/{}", manifest.id));
933    let resolved = resolve_manifest_path(base_dir, &root);
934    if resolved.is_absolute() {
935        resolved
936    } else {
937        let relative = resolved.strip_prefix(".").unwrap_or(&resolved);
938        std::env::current_dir()
939            .unwrap_or_else(|_| PathBuf::from("."))
940            .join(relative)
941    }
942}
943
/// Resolves a manifest-supplied path string.
///
/// Absolute paths are returned as given. Relative paths are joined onto
/// `base_dir` when one is provided and otherwise returned unchanged.
fn resolve_manifest_path(base_dir: Option<&Path>, path: &str) -> PathBuf {
    let candidate = PathBuf::from(path);
    match base_dir {
        Some(base) if candidate.is_relative() => base.join(candidate),
        _ => candidate,
    }
}
954
/// Displays `path` relative to `root` when `path` lies under `root`;
/// otherwise displays `path` unchanged.
fn relative_display(root: &Path, path: &Path) -> String {
    let shown = path.strip_prefix(root).unwrap_or(path);
    shown.display().to_string()
}
960
/// Turns an arbitrary identifier into a single path segment.
///
/// ASCII letters, ASCII digits, `-` and `_` are kept; every other character is
/// replaced with `_`. An empty input yields `unnamed`.
fn safe_path_segment(value: &str) -> String {
    let sanitized: String = value
        .chars()
        .map(|ch| {
            if ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_') {
                ch
            } else {
                '_'
            }
        })
        .collect();

    if sanitized.is_empty() {
        String::from("unnamed")
    } else {
        sanitized
    }
}
976
977fn write_json_file<T: Serialize>(path: &Path, value: &T) -> Result<(), VmError> {
978    let mut bytes = serde_json::to_vec_pretty(value)
979        .map_err(|error| VmError::Runtime(format!("failed to serialize JSON artifact: {error}")))?;
980    bytes.push(b'\n');
981    write_bytes_file(path, &bytes)
982}
983
984fn write_text_file(path: &Path, value: &str) -> Result<(), VmError> {
985    write_bytes_file(path, value.as_bytes())
986}
987
988fn write_bytes_file(path: &Path, bytes: &[u8]) -> Result<(), VmError> {
989    if let Some(parent) = path.parent() {
990        fs::create_dir_all(parent).map_err(|error| {
991            VmError::Runtime(format!(
992                "failed to create artifact directory {}: {error}",
993                parent.display()
994            ))
995        })?;
996    }
997    fs::write(path, bytes).map_err(|error| {
998        VmError::Runtime(format!(
999            "failed to write artifact {}: {error}",
1000            path.display()
1001        ))
1002    })
1003}
1004
#[cfg(test)]
mod tests {
    use super::*;
    use crate::orchestration::load_transcript_jsonl;

    /// Replay transcript fixture, relative to the repository root.
    const GREEN_PR_TRANSCRIPT: &str = "examples/personas/merge_captain/transcripts/green_pr.jsonl";

    /// Returns the directory two levels above this crate's manifest directory.
    fn repo_root() -> PathBuf {
        Path::new(env!("CARGO_MANIFEST_DIR"))
            .ancestors()
            .nth(2)
            .unwrap()
            .to_path_buf()
    }

    /// Builds a scenario whose backend has the given kind and fixture path.
    fn scenario(id: &str, kind: &str, path: &str) -> MergeCaptainIterationScenario {
        MergeCaptainIterationScenario {
            id: id.to_string(),
            backend: MergeCaptainIterationBackendSpec {
                kind: kind.to_string(),
                path: Some(path.to_string()),
                ..Default::default()
            },
            ..Default::default()
        }
    }

    /// Builds a variant that only sets its id.
    fn variant(id: &str) -> MergeCaptainIterationVariant {
        MergeCaptainIterationVariant {
            id: id.to_string(),
            ..Default::default()
        }
    }

    /// Builds a manifest rooted at the repository, writing artifacts to
    /// `<temp_dir>/iteration`.
    fn manifest_with(
        temp_dir: &Path,
        id: &str,
        scenarios: Vec<MergeCaptainIterationScenario>,
        variants: Vec<MergeCaptainIterationVariant>,
    ) -> MergeCaptainIterationManifest {
        MergeCaptainIterationManifest {
            id: id.to_string(),
            base_dir: Some(repo_root().display().to_string()),
            artifact_root: Some(temp_dir.join("iteration").display().to_string()),
            scenarios,
            variants,
            ..Default::default()
        }
    }

    #[test]
    fn iteration_runs_matrix_and_ranks_by_drift() {
        let temp = tempfile::tempdir().unwrap();
        let variants = vec![
            MergeCaptainIterationVariant {
                prompt_asset_revision: Some("prompt/v1".to_string()),
                max_tool_calls: Some(1),
                ..variant("prompt-v1")
            },
            MergeCaptainIterationVariant {
                prompt_asset_revision: Some("prompt/v2".to_string()),
                max_tool_calls: Some(4),
                ..variant("prompt-v2")
            },
        ];
        let manifest = manifest_with(
            temp.path(),
            "issue-1021-smoke",
            vec![scenario("green-pr", "replay", GREEN_PR_TRANSCRIPT)],
            variants,
        );

        let report = run_merge_captain_iteration(&manifest).unwrap();

        assert!(report.pass);
        assert_eq!(report.completed, 2);
        assert_eq!(report.rankings[0].variant_id, "prompt-v2");
        assert_eq!(report.rankings[0].drift_score, 0);
        assert!(Path::new(&report.summary_markdown_path).exists());
        let copied_fixture = Path::new(&report.artifact_root)
            .join("fixtures/green-pr/transcripts/green_pr.jsonl");
        assert!(copied_fixture.exists());
    }

    #[test]
    fn iteration_budget_cap_skips_remaining_runs() {
        let temp = tempfile::tempdir().unwrap();
        let manifest = MergeCaptainIterationManifest {
            budget: MergeCaptainIterationBudget {
                max_runs: Some(1),
                ..Default::default()
            },
            ..manifest_with(
                temp.path(),
                "issue-1021-budget",
                vec![scenario("green-pr", "replay", GREEN_PR_TRANSCRIPT)],
                vec![variant("one"), variant("two")],
            )
        };

        let report = run_merge_captain_iteration(&manifest).unwrap();

        assert!(report.budget_exhausted);
        assert_eq!(report.completed, 1);
        assert_eq!(report.skipped, 1);
        assert!(report.runs[1].skipped);
    }

    #[test]
    fn diff_marks_prompt_candidate_improvement() {
        let temp = tempfile::tempdir().unwrap();
        let baseline_path = temp.path().join("baseline.json");
        let candidate_path = temp.path().join("candidate.json");

        let baseline = MergeCaptainIterationReport {
            type_name: REPORT_TYPE.to_string(),
            version: 1,
            id: "baseline".to_string(),
            runs: vec![MergeCaptainIterationRunReport {
                scenario_id: "green-pr".to_string(),
                variant_id: "value-route".to_string(),
                drift_score: 25,
                prompt_asset_revision: Some("prompt/v1".to_string()),
                ..Default::default()
            }],
            ..Default::default()
        };
        // The candidate is the same cell with zero drift and a newer prompt.
        let mut candidate = baseline.clone();
        candidate.id = "candidate".to_string();
        candidate.runs[0].drift_score = 0;
        candidate.runs[0].prompt_asset_revision = Some("prompt/v2".to_string());

        write_json_file(&baseline_path, &baseline).unwrap();
        write_json_file(&candidate_path, &candidate).unwrap();

        let diff = diff_merge_captain_iterations(&baseline_path, &candidate_path).unwrap();

        assert_eq!(diff.improved, 1);
        assert_eq!(diff.entries[0].delta, Some(-25));
        assert_eq!(diff.entries[0].status, "improved");
    }

    #[test]
    fn mock_scenario_manifest_materializes_playground() {
        let temp = tempfile::tempdir().unwrap();
        let manifest = manifest_with(
            temp.path(),
            "issue-1021-mock",
            vec![scenario(
                "single-green",
                "mock",
                "examples/merge_captain/scenarios/single_green.json",
            )],
            vec![variant("smoke")],
        );

        let report = run_merge_captain_iteration(&manifest).unwrap();

        assert_eq!(report.completed, 1);
        let artifact_root = Path::new(&report.artifact_root);
        assert!(artifact_root
            .join("playgrounds/single-green/smoke/playground.json")
            .exists());
        let transcript = artifact_root.join(report.runs[0].transcript_path.as_ref().unwrap());
        let loaded = load_transcript_jsonl(&transcript).unwrap();
        assert!(!loaded.events.is_empty());
    }
}