Skip to main content

harn_vm/orchestration/
merge_captain_iteration.rs

1//! Agent-iteration harness for Merge Captain eval sweeps (#1021).
2//!
3//! This layer deliberately composes the existing mock/replay driver and oracle:
4//! scenarios define the deterministic backend, variants define model plus Harn
5//! package/prompt-asset revision metadata, and each matrix cell writes the
6//! normal transcript/receipt/summary artifacts under one shareable iteration
7//! directory.
8
9use std::collections::BTreeMap;
10use std::fs;
11use std::path::{Path, PathBuf};
12use std::time::Instant;
13
14use serde::{Deserialize, Serialize};
15
16use crate::value::VmError;
17
18use super::{
19    load_merge_captain_golden, new_id, MergeCaptainDriverBackend, MergeCaptainDriverMode,
20    MergeCaptainDriverOptions, MergeCaptainRunSummary,
21};
22
/// `_type` tag written into iteration manifests that do not already carry one.
const MANIFEST_TYPE: &str = "merge_captain_iteration_manifest";
/// `_type` tag written into iteration reports.
const REPORT_TYPE: &str = "merge_captain_iteration_report";
/// `_type` tag written into iteration diff reports.
const DIFF_TYPE: &str = "merge_captain_iteration_diff";
26
/// Declarative input for one Merge Captain iteration sweep.
///
/// `run_merge_captain_iteration` runs every scenario against every variant.
/// Every field may be omitted on input (`#[serde(default)]`); blanks are filled
/// in by `normalize_merge_captain_iteration_manifest`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct MergeCaptainIterationManifest {
    /// Serialized as `_type`; normalization sets it to the manifest tag when empty.
    #[serde(rename = "_type")]
    pub type_name: String,
    /// Schema version; normalization replaces `0` with `1`.
    pub version: u32,
    /// Iteration identifier; when blank, normalization uses `name` or a generated id.
    pub id: String,
    /// Optional display name, also used as the fallback for a blank `id`.
    pub name: Option<String>,
    /// Optional free-form description.
    pub description: Option<String>,
    /// Base directory passed to the artifact-root and backend path resolvers.
    /// The manifest loader defaults it to the manifest file's parent directory.
    pub base_dir: Option<String>,
    /// Requested artifact root (also accepted as `artifact-root`).
    #[serde(alias = "artifact-root")]
    pub artifact_root: Option<String>,
    /// Scenarios forming one axis of the run matrix; at least one is required to run.
    pub scenarios: Vec<MergeCaptainIterationScenario>,
    /// Variants forming the other axis; normalization adds a `default` variant when empty.
    pub variants: Vec<MergeCaptainIterationVariant>,
    /// Sweep-wide limits checked before each matrix cell is started.
    pub budget: MergeCaptainIterationBudget,
    /// Arbitrary metadata, copied unchanged into the iteration report.
    pub metadata: BTreeMap<String, serde_json::Value>,
}
44
/// One scenario (matrix row) of an iteration manifest.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct MergeCaptainIterationScenario {
    /// Scenario identifier; when blank, normalization derives it from the backend
    /// scenario name, the backend path's file stem, or `scenario_<n>`.
    pub id: String,
    /// Optional free-form description.
    pub description: Option<String>,
    /// Backend the driver runs this scenario against.
    pub backend: MergeCaptainIterationBackendSpec,
    /// Arbitrary per-scenario metadata.
    pub metadata: BTreeMap<String, serde_json::Value>,
}
53
/// Backend selection for a scenario.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct MergeCaptainIterationBackendSpec {
    /// Backend kind: `replay`, `mock`, or `live` (matched after trimming and
    /// ASCII-lowercasing). Normalization sets a blank kind to `replay`.
    pub kind: String,
    /// Path to the backend input: required for `replay`, optional for `mock`.
    pub path: Option<String>,
    /// Name of a built-in playground scenario; for `mock` it takes precedence over `path`.
    pub scenario: Option<String>,
}
61
/// One variant (matrix column): model route, revision metadata, and per-run limits.
///
/// Each kebab-case `alias` lets the field be spelled with dashes in the manifest.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct MergeCaptainIterationVariant {
    /// Variant identifier; normalization sets a blank id to `variant_<n>`.
    pub id: String,
    /// Model route handed to the driver; the variant id is used when absent.
    #[serde(alias = "model-route")]
    pub model_route: Option<String>,
    /// Timeout tier handed to the driver unchanged.
    #[serde(alias = "timeout-tier")]
    pub timeout_tier: Option<String>,
    /// Package revision label, copied into run reports and rankings.
    #[serde(alias = "package-revision")]
    pub package_revision: Option<String>,
    /// Prompt-asset revision label, copied into run reports, rankings, and diffs.
    #[serde(alias = "prompt-asset-revision")]
    pub prompt_asset_revision: Option<String>,
    /// Per-run cost ceiling; exceeding it is recorded as a degradation reason.
    #[serde(alias = "max-cost-usd")]
    pub max_cost_usd: Option<f64>,
    /// Per-run model-call ceiling; exceeding it is recorded as a degradation reason.
    #[serde(alias = "max-model-calls")]
    pub max_model_calls: Option<u64>,
    /// Per-run tool-call ceiling; exceeding it is recorded as a degradation reason.
    #[serde(alias = "max-tool-calls")]
    pub max_tool_calls: Option<u64>,
    /// Per-run latency ceiling in milliseconds; takes precedence over `timeout_ms`.
    #[serde(alias = "max-latency-ms")]
    pub max_latency_ms: Option<u64>,
    /// Fallback latency ceiling used when `max_latency_ms` is absent.
    #[serde(alias = "timeout-ms")]
    pub timeout_ms: Option<u64>,
    /// Driver sweep count (minimum 1); values above 1 select watch mode.
    #[serde(alias = "max-sweeps")]
    pub max_sweeps: Option<u32>,
    /// Backoff between watch sweeps in milliseconds; defaults to 0.
    #[serde(alias = "watch-backoff-ms")]
    pub watch_backoff_ms: Option<u64>,
    /// Arbitrary per-variant metadata.
    pub metadata: BTreeMap<String, serde_json::Value>,
}
90
/// Sweep-wide limits; once one trips, the remaining matrix cells are reported as skipped.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct MergeCaptainIterationBudget {
    /// Trips once the accumulated cost of completed runs is strictly greater than this.
    #[serde(alias = "max-cost-usd")]
    pub max_cost_usd: Option<f64>,
    /// Trips once this many milliseconds have elapsed since the sweep started.
    #[serde(alias = "max-wallclock-ms")]
    pub max_wallclock_ms: Option<u64>,
    /// Trips once this many runs have completed.
    #[serde(alias = "max-runs")]
    pub max_runs: Option<usize>,
}
101
/// Result of a whole iteration sweep, written to `summary.json` under the artifact root.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct MergeCaptainIterationReport {
    /// Serialized as `_type`; set to the report type tag by the runner.
    #[serde(rename = "_type")]
    pub type_name: String,
    /// Report schema version (the runner writes `1`).
    pub version: u32,
    /// Identifier of the (normalized) manifest that produced this report.
    pub id: String,
    /// Manifest display name, if any.
    pub name: Option<String>,
    /// Directory holding all artifacts of the sweep.
    pub artifact_root: String,
    /// Location of the JSON form of this report.
    pub summary_json_path: String,
    /// Location of the Markdown rendering of this report.
    pub summary_markdown_path: String,
    /// True only when there are runs, no budget tripped, and the top-ranked
    /// variant has drift 0 and completed every scenario.
    pub pass: bool,
    /// Number of matrix cells (scenarios × variants).
    pub total: usize,
    /// Number of cells that actually ran.
    pub completed: usize,
    /// Number of cells reported as skipped.
    pub skipped: usize,
    /// Whether a sweep budget tripped.
    pub budget_exhausted: bool,
    /// Human-readable description of the budget that tripped.
    pub budget_exhausted_reason: Option<String>,
    /// Sum of `cost_usd` over completed runs.
    pub total_cost_usd: f64,
    /// Saturating sum of `latency_ms` over completed runs.
    pub total_latency_ms: u64,
    /// One entry per matrix cell, in scenario-major order.
    pub runs: Vec<MergeCaptainIterationRunReport>,
    /// Variants ordered best-first (lowest drift, then cost, then id).
    pub rankings: Vec<MergeCaptainIterationRanking>,
    /// Metadata copied from the manifest.
    pub metadata: BTreeMap<String, serde_json::Value>,
}
125
/// Outcome of one matrix cell (one scenario run under one variant).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct MergeCaptainIterationRunReport {
    /// `<scenario_id>::<variant_id>`.
    pub id: String,
    /// Scenario this run belongs to.
    pub scenario_id: String,
    /// Variant this run belongs to.
    pub variant_id: String,
    /// Kind of the resolved driver backend; left empty for skipped runs.
    pub backend: String,
    /// Backend source as reported by the driver summary.
    pub backend_source: Option<String>,
    /// Model route (from the driver summary, or from the variant for skipped runs).
    pub model_route: Option<String>,
    /// Timeout tier (from the driver summary, or from the variant for skipped runs).
    pub timeout_tier: Option<String>,
    /// Package revision label copied from the variant.
    pub package_revision: Option<String>,
    /// Prompt-asset revision label copied from the variant.
    pub prompt_asset_revision: Option<String>,
    /// True when the driver summary passed and no degradation reason was recorded.
    pub pass: bool,
    /// True when the cell was not executed because a sweep budget had tripped.
    pub skipped: bool,
    /// Budget message explaining why the cell was skipped.
    pub skip_reason: Option<String>,
    /// Penalty score, higher is worse; skipped runs are assigned 10_000.
    pub drift_score: u64,
    /// Human-readable reasons the run counts as degraded.
    pub degradation_reasons: Vec<String>,
    /// Transcript location, rendered against the artifact root.
    pub transcript_path: Option<String>,
    /// Receipt location, rendered against the artifact root.
    pub receipt_path: Option<String>,
    /// Driver summary location, rendered against the artifact root.
    pub summary_path: Option<String>,
    /// Error-level oracle findings reported by the driver summary.
    pub oracle_error_findings: usize,
    /// Warning-level oracle findings reported by the driver summary.
    pub oracle_warn_findings: usize,
    /// Cost reported by the driver summary.
    pub cost_usd: f64,
    /// Latency reported by the driver summary, in milliseconds.
    pub latency_ms: u64,
    /// Tool-call count reported by the driver summary.
    pub tool_calls: u64,
    /// Model-call count reported by the driver summary.
    pub model_calls: u64,
    /// Event count reported by the driver summary.
    pub event_count: u64,
}
154
/// Per-variant aggregate over all of that variant's runs, used to order variants.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct MergeCaptainIterationRanking {
    /// Variant being ranked.
    pub variant_id: String,
    /// Package revision label copied from the variant.
    pub package_revision: Option<String>,
    /// Prompt-asset revision label copied from the variant.
    pub prompt_asset_revision: Option<String>,
    /// Number of this variant's runs that were not skipped.
    pub scenarios_completed: usize,
    /// Number of this variant's runs that passed.
    pub scenarios_passed: usize,
    /// Number of this variant's runs that were skipped.
    pub skipped: usize,
    /// Total drift over this variant's runs (skipped runs included); lower ranks first.
    pub drift_score: u64,
    /// Total cost over this variant's runs; used as the first tie-breaker.
    pub cost_usd: f64,
    /// Total latency over this variant's runs, in milliseconds.
    pub latency_ms: u64,
}
168
/// Cell-by-cell comparison of two iteration reports.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct MergeCaptainIterationDiffReport {
    /// Serialized as `_type`; set to the diff type tag.
    #[serde(rename = "_type")]
    pub type_name: String,
    /// Diff schema version (written as `1`).
    pub version: u32,
    /// `id` of the baseline report.
    pub baseline_id: String,
    /// `id` of the candidate report.
    pub candidate_id: String,
    /// Path the baseline was requested from, as given by the caller.
    pub baseline_path: String,
    /// Path the candidate was requested from, as given by the caller.
    pub candidate_path: String,
    /// Cells whose drift score decreased.
    pub improved: usize,
    /// Cells whose drift score increased.
    pub regressed: usize,
    /// Cells whose drift score is identical.
    pub unchanged: usize,
    /// Cells present in only one of the two reports.
    pub missing: usize,
    /// One entry per (scenario, variant) pair found in either report.
    pub entries: Vec<MergeCaptainIterationDiffEntry>,
}
185
/// Comparison of a single (scenario, variant) cell between baseline and candidate.
///
/// `baseline_*` / `candidate_*` fields are `None` when that side has no such cell.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct MergeCaptainIterationDiffEntry {
    /// Scenario of the compared cell.
    pub scenario_id: String,
    /// Variant of the compared cell.
    pub variant_id: String,
    /// Drift score of the baseline run.
    pub baseline_drift_score: Option<u64>,
    /// Drift score of the candidate run.
    pub candidate_drift_score: Option<u64>,
    /// Candidate drift minus baseline drift; `None` unless both sides exist.
    pub delta: Option<i64>,
    /// One of `improved`, `regressed`, `unchanged`, or `missing`.
    pub status: String,
    /// Pass flag of the baseline run.
    pub baseline_pass: Option<bool>,
    /// Pass flag of the candidate run.
    pub candidate_pass: Option<bool>,
    /// Prompt-asset revision recorded on the baseline run.
    pub baseline_prompt_asset_revision: Option<String>,
    /// Prompt-asset revision recorded on the candidate run.
    pub candidate_prompt_asset_revision: Option<String>,
}
200
201pub fn load_merge_captain_iteration_manifest(
202    path: &Path,
203) -> Result<MergeCaptainIterationManifest, VmError> {
204    let content = fs::read_to_string(path).map_err(|error| {
205        VmError::Runtime(format!(
206            "failed to read merge-captain iteration manifest {}: {error}",
207            path.display()
208        ))
209    })?;
210    let mut manifest: MergeCaptainIterationManifest =
211        if path.extension().and_then(|ext| ext.to_str()) == Some("json") {
212            serde_json::from_str(&content).map_err(|error| {
213                VmError::Runtime(format!(
214                    "failed to parse merge-captain iteration JSON {}: {error}",
215                    path.display()
216                ))
217            })?
218        } else {
219            toml::from_str(&content).map_err(|error| {
220                VmError::Runtime(format!(
221                    "failed to parse merge-captain iteration TOML {}: {error}",
222                    path.display()
223                ))
224            })?
225        };
226    normalize_merge_captain_iteration_manifest(&mut manifest);
227    if manifest.base_dir.is_none() {
228        manifest.base_dir = path.parent().map(|parent| parent.display().to_string());
229    }
230    Ok(manifest)
231}
232
233pub fn normalize_merge_captain_iteration_manifest(manifest: &mut MergeCaptainIterationManifest) {
234    if manifest.type_name.is_empty() {
235        manifest.type_name = MANIFEST_TYPE.to_string();
236    }
237    if manifest.version == 0 {
238        manifest.version = 1;
239    }
240    if manifest.id.trim().is_empty() {
241        manifest.id = manifest
242            .name
243            .clone()
244            .filter(|name| !name.trim().is_empty())
245            .unwrap_or_else(|| new_id("merge_captain_iteration"));
246    }
247    for (index, scenario) in manifest.scenarios.iter_mut().enumerate() {
248        if scenario.id.trim().is_empty() {
249            scenario.id = scenario
250                .backend
251                .scenario
252                .clone()
253                .or_else(|| {
254                    scenario
255                        .backend
256                        .path
257                        .as_deref()
258                        .and_then(|path| Path::new(path).file_stem())
259                        .and_then(|stem| stem.to_str())
260                        .map(str::to_string)
261                })
262                .unwrap_or_else(|| format!("scenario_{}", index + 1));
263        }
264        if scenario.backend.kind.trim().is_empty() {
265            scenario.backend.kind = "replay".to_string();
266        }
267    }
268    if manifest.variants.is_empty() {
269        manifest.variants.push(MergeCaptainIterationVariant {
270            id: "default".to_string(),
271            ..Default::default()
272        });
273    }
274    for (index, variant) in manifest.variants.iter_mut().enumerate() {
275        if variant.id.trim().is_empty() {
276            variant.id = format!("variant_{}", index + 1);
277        }
278    }
279}
280
281pub fn run_merge_captain_iteration(
282    manifest: &MergeCaptainIterationManifest,
283) -> Result<MergeCaptainIterationReport, VmError> {
284    let mut manifest = manifest.clone();
285    normalize_merge_captain_iteration_manifest(&mut manifest);
286    if manifest.scenarios.is_empty() {
287        return Err(VmError::Runtime(format!(
288            "merge-captain iteration '{}' must declare at least one scenario",
289            manifest.id
290        )));
291    }
292
293    let base_dir = manifest.base_dir.as_deref().map(Path::new);
294    let artifact_root = resolve_artifact_root(&manifest, base_dir);
295    fs::create_dir_all(&artifact_root).map_err(|error| {
296        VmError::Runtime(format!(
297            "failed to create merge-captain iteration artifact root {}: {error}",
298            artifact_root.display()
299        ))
300    })?;
301    write_json_file(&artifact_root.join("iteration.json"), &manifest)?;
302
303    let total = manifest.scenarios.len() * manifest.variants.len();
304    let started = Instant::now();
305    let mut total_cost_usd = 0.0;
306    let mut total_latency_ms: u64 = 0;
307    let mut completed = 0;
308    let mut budget_exhausted_reason = None;
309    let mut runs = Vec::new();
310
311    for scenario in &manifest.scenarios {
312        for variant in &manifest.variants {
313            if budget_exhausted_reason.is_none() {
314                budget_exhausted_reason =
315                    budget_exhausted(&manifest.budget, completed, total_cost_usd, started);
316            }
317            if let Some(reason) = &budget_exhausted_reason {
318                runs.push(skipped_run_report(
319                    scenario,
320                    variant,
321                    &artifact_root,
322                    reason.clone(),
323                ));
324                continue;
325            }
326
327            let run = run_iteration_cell(&artifact_root, base_dir, scenario, variant)?;
328            if !run.skipped {
329                completed += 1;
330                total_cost_usd += run.cost_usd;
331                total_latency_ms = total_latency_ms.saturating_add(run.latency_ms);
332            }
333            runs.push(run);
334        }
335    }
336
337    let rankings = rank_variants(&manifest.variants, &runs);
338    let skipped = runs.iter().filter(|run| run.skipped).count();
339    let best_drift = rankings.first().map(|ranking| ranking.drift_score);
340    let pass = !runs.is_empty()
341        && budget_exhausted_reason.is_none()
342        && best_drift == Some(0)
343        && rankings
344            .first()
345            .is_some_and(|ranking| ranking.scenarios_completed == manifest.scenarios.len());
346    let summary_json_path = artifact_root.join("summary.json");
347    let summary_markdown_path = artifact_root.join("summary.md");
348    let mut report = MergeCaptainIterationReport {
349        type_name: REPORT_TYPE.to_string(),
350        version: 1,
351        id: manifest.id,
352        name: manifest.name,
353        artifact_root: artifact_root.display().to_string(),
354        summary_json_path: summary_json_path.display().to_string(),
355        summary_markdown_path: summary_markdown_path.display().to_string(),
356        pass,
357        total,
358        completed,
359        skipped,
360        budget_exhausted: budget_exhausted_reason.is_some(),
361        budget_exhausted_reason,
362        total_cost_usd,
363        total_latency_ms,
364        runs,
365        rankings,
366        metadata: manifest.metadata,
367    };
368    write_json_file(&summary_json_path, &report)?;
369    let markdown = render_iteration_markdown(&report);
370    write_text_file(&summary_markdown_path, &markdown)?;
371    report.summary_json_path = summary_json_path.display().to_string();
372    report.summary_markdown_path = summary_markdown_path.display().to_string();
373    Ok(report)
374}
375
376fn run_iteration_cell(
377    artifact_root: &Path,
378    base_dir: Option<&Path>,
379    scenario: &MergeCaptainIterationScenario,
380    variant: &MergeCaptainIterationVariant,
381) -> Result<MergeCaptainIterationRunReport, VmError> {
382    let cell_dir = artifact_root
383        .join("runs")
384        .join(safe_path_segment(&scenario.id))
385        .join(safe_path_segment(&variant.id));
386    fs::create_dir_all(&cell_dir).map_err(|error| {
387        VmError::Runtime(format!(
388            "failed to create iteration run dir {}: {error}",
389            cell_dir.display()
390        ))
391    })?;
392    let backend = resolve_iteration_backend(artifact_root, base_dir, scenario, variant)?;
393    let transcript_path = cell_dir.join("event_log.jsonl");
394    let receipt_path = cell_dir.join("receipt.json");
395    let summary_path = cell_dir.join("summary.json");
396    let max_sweeps = variant.max_sweeps.unwrap_or(1).max(1);
397    let output = super::run_merge_captain_driver(MergeCaptainDriverOptions {
398        backend: backend.clone(),
399        mode: if max_sweeps > 1 {
400            MergeCaptainDriverMode::Watch
401        } else {
402            MergeCaptainDriverMode::Once
403        },
404        model_route: variant
405            .model_route
406            .clone()
407            .or_else(|| Some(variant.id.clone())),
408        timeout_tier: variant.timeout_tier.clone(),
409        transcript_out: Some(transcript_path.clone()),
410        receipt_out: Some(receipt_path.clone()),
411        run_root: cell_dir.join("driver-runs"),
412        max_sweeps,
413        watch_backoff_ms: variant.watch_backoff_ms.unwrap_or(0),
414        stream_stdout: false,
415    })?;
416    write_json_file(&summary_path, &output.summary)?;
417
418    let degradation_reasons = degradation_reasons(&output.summary, variant);
419    let drift_score = drift_score(&output.summary, &degradation_reasons);
420    let pass = output.summary.pass && degradation_reasons.is_empty();
421    let report = MergeCaptainIterationRunReport {
422        id: format!("{}::{}", scenario.id, variant.id),
423        scenario_id: scenario.id.clone(),
424        variant_id: variant.id.clone(),
425        backend: backend.kind().to_string(),
426        backend_source: output.summary.backend_source.clone(),
427        model_route: output.summary.model_route.clone(),
428        timeout_tier: output.summary.timeout_tier.clone(),
429        package_revision: variant.package_revision.clone(),
430        prompt_asset_revision: variant.prompt_asset_revision.clone(),
431        pass,
432        skipped: false,
433        skip_reason: None,
434        drift_score,
435        degradation_reasons,
436        transcript_path: Some(relative_display(artifact_root, &transcript_path)),
437        receipt_path: Some(relative_display(artifact_root, &receipt_path)),
438        summary_path: Some(relative_display(artifact_root, &summary_path)),
439        oracle_error_findings: output.summary.oracle_error_findings,
440        oracle_warn_findings: output.summary.oracle_warn_findings,
441        cost_usd: output.summary.cost_usd,
442        latency_ms: output.summary.latency_ms,
443        tool_calls: output.summary.tool_calls,
444        model_calls: output.summary.model_calls,
445        event_count: output.summary.event_count,
446    };
447    write_json_file(&cell_dir.join("run-report.json"), &report)?;
448    Ok(report)
449}
450
/// Turns a scenario's backend spec into a concrete driver backend for one cell.
///
/// The kind is matched after trimming and ASCII-lowercasing:
///
/// * `replay` — requires `backend.path`; the fixture is copied under
///   `artifact_root` and the driver replays the copy.
/// * `mock` — prepares (or reuses) a playground directory, see inline notes.
/// * `live` — needs no preparation.
///
/// # Errors
///
/// Returns `VmError::Runtime` for an unknown kind or a `replay` backend without
/// a path, and propagates errors from fixture copying and playground setup.
fn resolve_iteration_backend(
    artifact_root: &Path,
    base_dir: Option<&Path>,
    scenario: &MergeCaptainIterationScenario,
    variant: &MergeCaptainIterationVariant,
) -> Result<MergeCaptainDriverBackend, VmError> {
    match scenario.backend.kind.trim().to_ascii_lowercase().as_str() {
        "replay" => {
            let path = scenario.backend.path.as_deref().ok_or_else(|| {
                VmError::Runtime(format!(
                    "iteration scenario '{}' replay backend requires path",
                    scenario.id
                ))
            })?;
            let source = resolve_manifest_path(base_dir, path);
            Ok(MergeCaptainDriverBackend::Replay {
                fixture: copy_replay_fixture(artifact_root, &scenario.id, &source)?,
            })
        }
        "mock" => {
            // Each (scenario, variant) cell gets its own playground directory.
            let playground_dir = artifact_root
                .join("playgrounds")
                .join(safe_path_segment(&scenario.id))
                .join(safe_path_segment(&variant.id));
            // Manifest source, in priority order: explicit built-in name, then
            // `path`, then a built-in named after the scenario id.
            let manifest = if let Some(name) = scenario.backend.scenario.as_deref() {
                Some(super::playground::load_builtin(name)?)
            } else if let Some(path) = scenario.backend.path.as_deref() {
                let source = resolve_manifest_path(base_dir, path);
                // A path whose playground marker already exists is used in
                // place: nothing is cleaned up or initialized for it.
                if super::playground::playground_marker_path(&source).exists() {
                    return Ok(MergeCaptainDriverBackend::Mock {
                        playground_dir: source,
                    });
                }
                // A load failure is deliberately discarded here; the `else`
                // branch below then treats the path as a playground directory.
                super::playground::ScenarioManifest::load(&source).ok()
            } else {
                Some(super::playground::load_builtin(&scenario.id)?)
            };
            if let Some(manifest) = manifest {
                // Remove whatever an earlier run left behind, then initialize
                // from scratch (`allow_existing: false`).
                let _ = super::playground::cleanup_playground_at(&playground_dir)?;
                super::playground::init_playground_at(super::playground::InitOptions {
                    dir: &playground_dir,
                    manifest: &manifest,
                    allow_existing: false,
                })?;
                Ok(MergeCaptainDriverBackend::Mock { playground_dir })
            } else {
                // `manifest` can only be `None` via the `path` branch above, so
                // `path` is present here; the error below is a defensive guard.
                let path = scenario.backend.path.as_deref().ok_or_else(|| {
                    VmError::Runtime(format!(
                        "iteration scenario '{}' mock backend requires path or scenario",
                        scenario.id
                    ))
                })?;
                Ok(MergeCaptainDriverBackend::Mock {
                    playground_dir: resolve_manifest_path(base_dir, path),
                })
            }
        }
        "live" => Ok(MergeCaptainDriverBackend::Live),
        // `other` is the trimmed, lowercased kind, not the raw manifest value.
        other => Err(VmError::Runtime(format!(
            "unsupported merge-captain iteration backend '{}'",
            other
        ))),
    }
}
515
516fn copy_replay_fixture(
517    artifact_root: &Path,
518    scenario_id: &str,
519    source: &Path,
520) -> Result<PathBuf, VmError> {
521    let stem = source
522        .file_stem()
523        .and_then(|stem| stem.to_str())
524        .unwrap_or("event_log");
525    let dest_dir = artifact_root
526        .join("fixtures")
527        .join(safe_path_segment(scenario_id))
528        .join("transcripts");
529    fs::create_dir_all(&dest_dir).map_err(|error| {
530        VmError::Runtime(format!(
531            "failed to create replay fixture dir {}: {error}",
532            dest_dir.display()
533        ))
534    })?;
535    let dest = dest_dir.join(format!("{stem}.jsonl"));
536    fs::copy(source, &dest).map_err(|error| {
537        VmError::Runtime(format!(
538            "failed to copy replay fixture {} to {}: {error}",
539            source.display(),
540            dest.display()
541        ))
542    })?;
543    if let Some(golden) = find_replay_golden(source)? {
544        let golden_dir = artifact_root
545            .join("fixtures")
546            .join(safe_path_segment(scenario_id))
547            .join("goldens");
548        fs::create_dir_all(&golden_dir).map_err(|error| {
549            VmError::Runtime(format!(
550                "failed to create replay golden dir {}: {error}",
551                golden_dir.display()
552            ))
553        })?;
554        let golden_dest = golden_dir.join(format!("{stem}.json"));
555        fs::copy(&golden, &golden_dest).map_err(|error| {
556            VmError::Runtime(format!(
557                "failed to copy replay golden {} to {}: {error}",
558                golden.display(),
559                golden_dest.display()
560            ))
561        })?;
562    }
563    Ok(dest)
564}
565
566fn find_replay_golden(source: &Path) -> Result<Option<PathBuf>, VmError> {
567    let Some(stem) = source.file_stem().and_then(|stem| stem.to_str()) else {
568        return Ok(None);
569    };
570    let mut candidates = Vec::new();
571    if let Some(parent) = source.parent() {
572        candidates.push(parent.join(format!("{stem}.golden.json")));
573        if parent.file_name().and_then(|name| name.to_str()) == Some("transcripts") {
574            if let Some(root) = parent.parent() {
575                candidates.push(root.join("goldens").join(format!("{stem}.json")));
576            }
577        }
578    }
579    for candidate in candidates {
580        if candidate.exists() {
581            let _ = load_merge_captain_golden(&candidate)?;
582            return Ok(Some(candidate));
583        }
584    }
585    Ok(None)
586}
587
588fn skipped_run_report(
589    scenario: &MergeCaptainIterationScenario,
590    variant: &MergeCaptainIterationVariant,
591    _artifact_root: &Path,
592    reason: String,
593) -> MergeCaptainIterationRunReport {
594    MergeCaptainIterationRunReport {
595        id: format!("{}::{}", scenario.id, variant.id),
596        scenario_id: scenario.id.clone(),
597        variant_id: variant.id.clone(),
598        model_route: variant.model_route.clone(),
599        timeout_tier: variant.timeout_tier.clone(),
600        package_revision: variant.package_revision.clone(),
601        prompt_asset_revision: variant.prompt_asset_revision.clone(),
602        skipped: true,
603        skip_reason: Some(reason),
604        drift_score: 10_000,
605        ..Default::default()
606    }
607}
608
609fn budget_exhausted(
610    budget: &MergeCaptainIterationBudget,
611    completed: usize,
612    total_cost_usd: f64,
613    started: Instant,
614) -> Option<String> {
615    if let Some(max_runs) = budget.max_runs {
616        if completed >= max_runs {
617            return Some(format!("completed run cap {max_runs} reached"));
618        }
619    }
620    if let Some(max_cost_usd) = budget.max_cost_usd {
621        if total_cost_usd > max_cost_usd {
622            return Some(format!(
623                "cost budget ${:.6} reached (spent ${:.6})",
624                max_cost_usd, total_cost_usd
625            ));
626        }
627    }
628    if let Some(max_wallclock_ms) = budget.max_wallclock_ms {
629        if started.elapsed().as_millis() >= u128::from(max_wallclock_ms) {
630            return Some(format!("wallclock budget {max_wallclock_ms}ms reached"));
631        }
632    }
633    None
634}
635
636fn degradation_reasons(
637    summary: &MergeCaptainRunSummary,
638    variant: &MergeCaptainIterationVariant,
639) -> Vec<String> {
640    let mut reasons = Vec::new();
641    if !summary.pass {
642        reasons.push(format!(
643            "oracle reported {} error finding(s) and {} warning finding(s)",
644            summary.oracle_error_findings, summary.oracle_warn_findings
645        ));
646    }
647    if let Some(max_tool_calls) = variant.max_tool_calls {
648        if summary.tool_calls > max_tool_calls {
649            reasons.push(format!(
650                "tool calls {} exceeded variant budget {}",
651                summary.tool_calls, max_tool_calls
652            ));
653        }
654    }
655    if let Some(max_model_calls) = variant.max_model_calls {
656        if summary.model_calls > max_model_calls {
657            reasons.push(format!(
658                "model calls {} exceeded variant budget {}",
659                summary.model_calls, max_model_calls
660            ));
661        }
662    }
663    if let Some(max_cost_usd) = variant.max_cost_usd {
664        if summary.cost_usd > max_cost_usd {
665            reasons.push(format!(
666                "cost ${:.6} exceeded variant budget ${:.6}",
667                summary.cost_usd, max_cost_usd
668            ));
669        }
670    }
671    if let Some(max_latency_ms) = variant.max_latency_ms.or(variant.timeout_ms) {
672        if summary.latency_ms > max_latency_ms {
673            reasons.push(format!(
674                "latency {}ms exceeded variant timeout {}ms",
675                summary.latency_ms, max_latency_ms
676            ));
677        }
678    }
679    reasons
680}
681
682fn drift_score(summary: &MergeCaptainRunSummary, degradation_reasons: &[String]) -> u64 {
683    let failed_penalty = if summary.pass { 0 } else { 1_000 };
684    failed_penalty
685        + (summary.oracle_error_findings as u64 * 100)
686        + (summary.oracle_warn_findings as u64 * 10)
687        + (degradation_reasons.len() as u64 * 25)
688}
689
690fn rank_variants(
691    variants: &[MergeCaptainIterationVariant],
692    runs: &[MergeCaptainIterationRunReport],
693) -> Vec<MergeCaptainIterationRanking> {
694    let mut rankings = Vec::new();
695    for variant in variants {
696        let matching: Vec<_> = runs
697            .iter()
698            .filter(|run| run.variant_id == variant.id)
699            .collect();
700        rankings.push(MergeCaptainIterationRanking {
701            variant_id: variant.id.clone(),
702            package_revision: variant.package_revision.clone(),
703            prompt_asset_revision: variant.prompt_asset_revision.clone(),
704            scenarios_completed: matching.iter().filter(|run| !run.skipped).count(),
705            scenarios_passed: matching.iter().filter(|run| run.pass).count(),
706            skipped: matching.iter().filter(|run| run.skipped).count(),
707            drift_score: matching.iter().map(|run| run.drift_score).sum(),
708            cost_usd: matching.iter().map(|run| run.cost_usd).sum(),
709            latency_ms: matching.iter().map(|run| run.latency_ms).sum(),
710        });
711    }
712    rankings.sort_by(|left, right| {
713        left.drift_score
714            .cmp(&right.drift_score)
715            .then_with(|| {
716                left.cost_usd
717                    .partial_cmp(&right.cost_usd)
718                    .unwrap_or(std::cmp::Ordering::Equal)
719            })
720            .then_with(|| left.variant_id.cmp(&right.variant_id))
721    });
722    rankings
723}
724
725pub fn load_merge_captain_iteration_report(
726    path: &Path,
727) -> Result<MergeCaptainIterationReport, VmError> {
728    let report_path = if path.is_dir() {
729        path.join("summary.json")
730    } else {
731        path.to_path_buf()
732    };
733    let bytes = fs::read(&report_path).map_err(|error| {
734        VmError::Runtime(format!(
735            "failed to read merge-captain iteration report {}: {error}",
736            report_path.display()
737        ))
738    })?;
739    serde_json::from_slice(&bytes).map_err(|error| {
740        VmError::Runtime(format!(
741            "failed to parse merge-captain iteration report {}: {error}",
742            report_path.display()
743        ))
744    })
745}
746
747pub fn diff_merge_captain_iterations(
748    baseline_path: &Path,
749    candidate_path: &Path,
750) -> Result<MergeCaptainIterationDiffReport, VmError> {
751    let baseline = load_merge_captain_iteration_report(baseline_path)?;
752    let candidate = load_merge_captain_iteration_report(candidate_path)?;
753    let mut keys = BTreeMap::new();
754    for run in &baseline.runs {
755        keys.insert((run.scenario_id.clone(), run.variant_id.clone()), ());
756    }
757    for run in &candidate.runs {
758        keys.insert((run.scenario_id.clone(), run.variant_id.clone()), ());
759    }
760
761    let mut entries = Vec::new();
762    let mut improved = 0;
763    let mut regressed = 0;
764    let mut unchanged = 0;
765    let mut missing = 0;
766    for ((scenario_id, variant_id), ()) in keys {
767        let before = baseline
768            .runs
769            .iter()
770            .find(|run| run.scenario_id == scenario_id && run.variant_id == variant_id);
771        let after = candidate
772            .runs
773            .iter()
774            .find(|run| run.scenario_id == scenario_id && run.variant_id == variant_id);
775        let delta = before
776            .zip(after)
777            .map(|(before, after)| after.drift_score as i64 - before.drift_score as i64);
778        let status = match delta {
779            Some(value) if value < 0 => {
780                improved += 1;
781                "improved"
782            }
783            Some(value) if value > 0 => {
784                regressed += 1;
785                "regressed"
786            }
787            Some(_) => {
788                unchanged += 1;
789                "unchanged"
790            }
791            None => {
792                missing += 1;
793                "missing"
794            }
795        };
796        entries.push(MergeCaptainIterationDiffEntry {
797            scenario_id,
798            variant_id,
799            baseline_drift_score: before.map(|run| run.drift_score),
800            candidate_drift_score: after.map(|run| run.drift_score),
801            delta,
802            status: status.to_string(),
803            baseline_pass: before.map(|run| run.pass),
804            candidate_pass: after.map(|run| run.pass),
805            baseline_prompt_asset_revision: before
806                .and_then(|run| run.prompt_asset_revision.clone()),
807            candidate_prompt_asset_revision: after
808                .and_then(|run| run.prompt_asset_revision.clone()),
809        });
810    }
811
812    Ok(MergeCaptainIterationDiffReport {
813        type_name: DIFF_TYPE.to_string(),
814        version: 1,
815        baseline_id: baseline.id,
816        candidate_id: candidate.id,
817        baseline_path: baseline_path.display().to_string(),
818        candidate_path: candidate_path.display().to_string(),
819        improved,
820        regressed,
821        unchanged,
822        missing,
823        entries,
824    })
825}
826
827pub fn render_iteration_markdown(report: &MergeCaptainIterationReport) -> String {
828    let mut out = String::new();
829    out.push_str(&format!(
830        "# Merge Captain iteration: {}\n\n",
831        report.name.as_deref().unwrap_or(&report.id)
832    ));
833    out.push_str(&format!(
834        "- pass: {}\n- completed: {}/{}\n- skipped: {}\n- budget_exhausted: {}\n\n",
835        report.pass, report.completed, report.total, report.skipped, report.budget_exhausted
836    ));
837    out.push_str("## Variant ranking\n\n");
838    out.push_str(
839        "| rank | variant | package | prompt assets | passed | drift | cost | latency ms |\n",
840    );
841    out.push_str("|---:|---|---|---|---:|---:|---:|---:|\n");
842    for (index, ranking) in report.rankings.iter().enumerate() {
843        out.push_str(&format!(
844            "| {} | {} | {} | {} | {}/{} | {} | {:.6} | {} |\n",
845            index + 1,
846            ranking.variant_id,
847            ranking.package_revision.as_deref().unwrap_or("-"),
848            ranking.prompt_asset_revision.as_deref().unwrap_or("-"),
849            ranking.scenarios_passed,
850            ranking.scenarios_completed,
851            ranking.drift_score,
852            ranking.cost_usd,
853            ranking.latency_ms
854        ));
855    }
856    out.push_str("\n## Scenario runs\n\n");
857    out.push_str(
858        "| scenario | variant | pass | drift | errors | warnings | tools | models | artifact |\n",
859    );
860    out.push_str("|---|---|---:|---:|---:|---:|---:|---:|---|\n");
861    for run in &report.runs {
862        out.push_str(&format!(
863            "| {} | {} | {} | {} | {} | {} | {} | {} | {} |\n",
864            run.scenario_id,
865            run.variant_id,
866            if run.skipped {
867                "skipped".to_string()
868            } else {
869                run.pass.to_string()
870            },
871            run.drift_score,
872            run.oracle_error_findings,
873            run.oracle_warn_findings,
874            run.tool_calls,
875            run.model_calls,
876            run.summary_path.as_deref().unwrap_or("-")
877        ));
878    }
879    out
880}
881
882pub fn render_iteration_diff_markdown(report: &MergeCaptainIterationDiffReport) -> String {
883    let mut out = String::new();
884    out.push_str(&format!(
885        "# Merge Captain iteration diff: {} -> {}\n\n",
886        report.baseline_id, report.candidate_id
887    ));
888    out.push_str(&format!(
889        "- improved: {}\n- regressed: {}\n- unchanged: {}\n- missing: {}\n\n",
890        report.improved, report.regressed, report.unchanged, report.missing
891    ));
892    out.push_str(
893        "| scenario | variant | baseline | candidate | delta | status | prompt assets |\n",
894    );
895    out.push_str("|---|---|---:|---:|---:|---|---|\n");
896    for entry in &report.entries {
897        out.push_str(&format!(
898            "| {} | {} | {} | {} | {} | {} | {} -> {} |\n",
899            entry.scenario_id,
900            entry.variant_id,
901            optional_u64(entry.baseline_drift_score),
902            optional_u64(entry.candidate_drift_score),
903            entry
904                .delta
905                .map(|delta| delta.to_string())
906                .unwrap_or_else(|| "-".to_string()),
907            entry.status,
908            entry
909                .baseline_prompt_asset_revision
910                .as_deref()
911                .unwrap_or("-"),
912            entry
913                .candidate_prompt_asset_revision
914                .as_deref()
915                .unwrap_or("-")
916        ));
917    }
918    out
919}
920
/// Formats an optional counter for table output: the decimal value when
/// present, `-` when absent.
fn optional_u64(value: Option<u64>) -> String {
    match value {
        Some(number) => number.to_string(),
        None => "-".to_string(),
    }
}
926
927fn resolve_artifact_root(
928    manifest: &MergeCaptainIterationManifest,
929    base_dir: Option<&Path>,
930) -> PathBuf {
931    let root = manifest
932        .artifact_root
933        .clone()
934        .unwrap_or_else(|| format!(".harn-runs/merge-captain-iterations/{}", manifest.id));
935    let resolved = resolve_manifest_path(base_dir, &root);
936    if resolved.is_absolute() {
937        resolved
938    } else {
939        let relative = resolved.strip_prefix(".").unwrap_or(&resolved);
940        std::env::current_dir()
941            .unwrap_or_else(|_| PathBuf::from("."))
942            .join(relative)
943    }
944}
945
/// Resolves a path string taken from a manifest.
///
/// Absolute paths are returned unchanged. Relative paths are joined onto
/// `base_dir` when one is supplied, and returned as-is otherwise.
fn resolve_manifest_path(base_dir: Option<&Path>, path: &str) -> PathBuf {
    let candidate = Path::new(path);
    match base_dir {
        Some(base) if !candidate.is_absolute() => base.join(candidate),
        _ => candidate.to_path_buf(),
    }
}
956
/// Returns `path` as display text relative to `root`.
///
/// When `path` does not start with `root`, the full `path` is shown instead.
fn relative_display(root: &Path, path: &Path) -> String {
    let shown = path.strip_prefix(root).unwrap_or(path);
    shown.display().to_string()
}
962
/// Turns an arbitrary identifier into a single filesystem-safe path segment.
///
/// ASCII letters, ASCII digits, `-` and `_` are kept; every other character
/// (including `.`, `/` and non-ASCII characters) is replaced by `_`. An empty
/// input yields `unnamed`.
fn safe_path_segment(value: &str) -> String {
    let sanitized: String = value
        .chars()
        .map(|ch| match ch {
            'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_' => ch,
            _ => '_',
        })
        .collect();
    if sanitized.is_empty() {
        "unnamed".to_string()
    } else {
        sanitized
    }
}
978
979fn write_json_file<T: Serialize>(path: &Path, value: &T) -> Result<(), VmError> {
980    let mut bytes = serde_json::to_vec_pretty(value)
981        .map_err(|error| VmError::Runtime(format!("failed to serialize JSON artifact: {error}")))?;
982    bytes.push(b'\n');
983    write_bytes_file(path, &bytes)
984}
985
986fn write_text_file(path: &Path, value: &str) -> Result<(), VmError> {
987    write_bytes_file(path, value.as_bytes())
988}
989
990fn write_bytes_file(path: &Path, bytes: &[u8]) -> Result<(), VmError> {
991    if let Some(parent) = path.parent() {
992        fs::create_dir_all(parent).map_err(|error| {
993            VmError::Runtime(format!(
994                "failed to create artifact directory {}: {error}",
995                parent.display()
996            ))
997        })?;
998    }
999    fs::write(path, bytes).map_err(|error| {
1000        VmError::Runtime(format!(
1001            "failed to write artifact {}: {error}",
1002            path.display()
1003        ))
1004    })
1005}
1006
#[cfg(test)]
mod tests {
    use super::*;
    use crate::orchestration::load_transcript_jsonl;

    /// Replay transcript used by the replay-backed tests, relative to the repo root.
    const GREEN_PR_TRANSCRIPT: &str =
        "examples/personas/merge_captain/transcripts/green_pr.jsonl";

    /// Returns the directory two levels above this crate's manifest directory.
    fn repo_root() -> PathBuf {
        Path::new(env!("CARGO_MANIFEST_DIR"))
            .ancestors()
            .nth(2)
            .unwrap()
            .to_path_buf()
    }

    /// Builds a scenario whose backend has the given kind and fixture path.
    fn scenario(id: &str, kind: &str, path: &str) -> MergeCaptainIterationScenario {
        MergeCaptainIterationScenario {
            id: id.to_string(),
            backend: MergeCaptainIterationBackendSpec {
                kind: kind.to_string(),
                path: Some(path.to_string()),
                ..Default::default()
            },
            ..Default::default()
        }
    }

    /// Builds a variant that only sets its id.
    fn variant(id: &str) -> MergeCaptainIterationVariant {
        MergeCaptainIterationVariant {
            id: id.to_string(),
            ..Default::default()
        }
    }

    /// Builds a manifest rooted at the repo, writing artifacts to `<scratch>/iteration`.
    fn manifest(
        id: &str,
        scratch: &Path,
        scenarios: Vec<MergeCaptainIterationScenario>,
        variants: Vec<MergeCaptainIterationVariant>,
    ) -> MergeCaptainIterationManifest {
        MergeCaptainIterationManifest {
            id: id.to_string(),
            base_dir: Some(repo_root().display().to_string()),
            artifact_root: Some(scratch.join("iteration").display().to_string()),
            scenarios,
            variants,
            ..Default::default()
        }
    }

    #[test]
    fn iteration_runs_matrix_and_ranks_by_drift() {
        let temp = tempfile::tempdir().unwrap();
        let variants = vec![
            MergeCaptainIterationVariant {
                prompt_asset_revision: Some("prompt/v1".to_string()),
                max_tool_calls: Some(1),
                ..variant("prompt-v1")
            },
            MergeCaptainIterationVariant {
                prompt_asset_revision: Some("prompt/v2".to_string()),
                max_tool_calls: Some(4),
                ..variant("prompt-v2")
            },
        ];
        let manifest = manifest(
            "issue-1021-smoke",
            temp.path(),
            vec![scenario("green-pr", "replay", GREEN_PR_TRANSCRIPT)],
            variants,
        );

        let report = run_merge_captain_iteration(&manifest).unwrap();

        assert!(report.pass);
        assert_eq!(report.completed, 2);
        assert_eq!(report.rankings[0].variant_id, "prompt-v2");
        assert_eq!(report.rankings[0].drift_score, 0);
        assert!(Path::new(&report.summary_markdown_path).exists());
        let copied_fixture = Path::new(&report.artifact_root)
            .join("fixtures/green-pr/transcripts/green_pr.jsonl");
        assert!(copied_fixture.exists());
    }

    #[test]
    fn iteration_budget_cap_skips_remaining_runs() {
        let temp = tempfile::tempdir().unwrap();
        let manifest = MergeCaptainIterationManifest {
            budget: MergeCaptainIterationBudget {
                max_runs: Some(1),
                ..Default::default()
            },
            ..manifest(
                "issue-1021-budget",
                temp.path(),
                vec![scenario("green-pr", "replay", GREEN_PR_TRANSCRIPT)],
                vec![variant("one"), variant("two")],
            )
        };

        let report = run_merge_captain_iteration(&manifest).unwrap();

        assert!(report.budget_exhausted);
        assert_eq!(report.completed, 1);
        assert_eq!(report.skipped, 1);
        assert!(report.runs[1].skipped);
    }

    #[test]
    fn diff_marks_prompt_candidate_improvement() {
        let temp = tempfile::tempdir().unwrap();
        let baseline_path = temp.path().join("baseline.json");
        let candidate_path = temp.path().join("candidate.json");

        let baseline = MergeCaptainIterationReport {
            type_name: REPORT_TYPE.to_string(),
            version: 1,
            id: "baseline".to_string(),
            runs: vec![MergeCaptainIterationRunReport {
                scenario_id: "green-pr".to_string(),
                variant_id: "value-route".to_string(),
                drift_score: 25,
                prompt_asset_revision: Some("prompt/v1".to_string()),
                ..Default::default()
            }],
            ..Default::default()
        };
        // The candidate is the same run with zero drift and a newer prompt revision.
        let mut candidate = baseline.clone();
        candidate.id = "candidate".to_string();
        candidate.runs[0].drift_score = 0;
        candidate.runs[0].prompt_asset_revision = Some("prompt/v2".to_string());

        write_json_file(&baseline_path, &baseline).unwrap();
        write_json_file(&candidate_path, &candidate).unwrap();

        let diff = diff_merge_captain_iterations(&baseline_path, &candidate_path).unwrap();

        assert_eq!(diff.improved, 1);
        assert_eq!(diff.entries[0].delta, Some(-25));
        assert_eq!(diff.entries[0].status, "improved");
    }

    #[test]
    fn mock_scenario_manifest_materializes_playground() {
        let temp = tempfile::tempdir().unwrap();
        let manifest = manifest(
            "issue-1021-mock",
            temp.path(),
            vec![scenario(
                "single-green",
                "mock",
                "examples/merge_captain/scenarios/single_green.json",
            )],
            vec![variant("smoke")],
        );

        let report = run_merge_captain_iteration(&manifest).unwrap();

        assert_eq!(report.completed, 1);
        let artifact_root = Path::new(&report.artifact_root);
        assert!(artifact_root
            .join("playgrounds/single-green/smoke/playground.json")
            .exists());
        let transcript = report.runs[0].transcript_path.as_ref().unwrap();
        let loaded = load_transcript_jsonl(&artifact_root.join(transcript)).unwrap();
        assert!(!loaded.events.is_empty());
    }
}