Skip to main content

harn_vm/orchestration/
merge_captain_driver.rs

1//! Merge Captain end-to-end driver surface (#1019).
2//!
3//! The driver is intentionally thin: it resolves a backend-specific scenario
4//! to the canonical JSONL transcript envelope, runs the same Merge Captain
5//! oracle used by `harn merge-captain audit`, and writes a deterministic
6//! receipt plus run summary. Backend adapters can later replace the
7//! transcript resolver with live connector execution without changing the
8//! artifact contract.
9
10use std::collections::BTreeSet;
11use std::fs;
12use std::io::{self, Write};
13use std::path::{Path, PathBuf};
14
15use serde::{Deserialize, Serialize};
16use sha2::{Digest, Sha256};
17
18use crate::agent_events::{AgentEvent, PersistedAgentEvent};
19use crate::value::VmError;
20
21use super::{
22    audit_transcript, load_merge_captain_golden, load_transcript_jsonl, AuditReport,
23    LoadedTranscript, MergeCaptainGolden, StateTransition,
24};
25
26const RECEIPT_TYPE: &str = "merge_captain_run_receipt";
27const SUMMARY_TYPE: &str = "merge_captain_run_summary";
28
/// Where the driver obtains the transcript it audits.
///
/// Variant names are (de)serialized in snake_case.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum MergeCaptainDriverBackend {
    /// Live connector execution. `resolve_backend` currently rejects this
    /// variant with a runtime error.
    Live,
    /// Mock run rooted at `playground_dir`: either an on-disk playground
    /// (detected via its marker file) or a legacy scenario manifest. A path
    /// pointing directly at a manifest file is also accepted.
    Mock { playground_dir: PathBuf },
    /// Replay of a recorded JSONL transcript stored at `fixture`.
    Replay { fixture: PathBuf },
}
36
37impl MergeCaptainDriverBackend {
38    pub fn kind(&self) -> &'static str {
39        match self {
40            Self::Live => "live",
41            Self::Mock { .. } => "mock",
42            Self::Replay { .. } => "replay",
43        }
44    }
45}
46
/// How many sweeps a driver run covers.
///
/// Variant names are (de)serialized in snake_case.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum MergeCaptainDriverMode {
    /// A single sweep; `sweep_count` reports 1.
    Once,
    /// Repeated sweeps; the resolved transcript is replicated up to
    /// `max_sweeps` times (at least once) by `expand_watch_events`.
    Watch,
}
53
/// Caller-supplied configuration for `run_merge_captain_driver`.
#[derive(Clone, Debug)]
pub struct MergeCaptainDriverOptions {
    /// Transcript source to resolve.
    pub backend: MergeCaptainDriverBackend,
    /// Single-sweep or watch (multi-sweep) execution.
    pub mode: MergeCaptainDriverMode,
    /// Copied verbatim into the receipt and summary.
    pub model_route: Option<String>,
    /// Copied verbatim into the receipt and summary.
    pub timeout_tier: Option<String>,
    /// Transcript destination; defaults to `event_log.jsonl` in the run directory.
    pub transcript_out: Option<PathBuf>,
    /// Receipt destination; defaults to `receipt.json` in the run directory.
    pub receipt_out: Option<PathBuf>,
    /// Root under which `merge-captain/<run_id>` run directories are created.
    pub run_root: PathBuf,
    /// Sweep count for watch mode; values below 1 are treated as 1.
    pub max_sweeps: u32,
    /// Extra milliseconds added between replicated sweeps' timestamps in watch mode.
    pub watch_backoff_ms: u64,
    /// When true, the JSONL transcript is also written to stdout.
    pub stream_stdout: bool,
}
67
/// Receipt artifact written as pretty-printed JSON for every driver run.
///
/// It carries no backend or path information; the in-file test
/// `mock_and_replay_receipts_match_for_same_transcript` relies on mock and
/// replay runs of the same transcript producing identical receipts.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct MergeCaptainRunReceipt {
    /// Artifact discriminator, serialized as `_type`.
    #[serde(rename = "_type")]
    pub type_name: String,
    /// Receipt schema version (`build_receipt` writes 1).
    pub version: u32,
    /// Persona name (`build_receipt` writes `"merge_captain"`).
    pub persona: String,
    /// Identifier derived from a hash of the transcript events.
    pub run_id: String,
    /// Scenario name: the golden's, else the audit report's, else the resolved one.
    pub scenario: Option<String>,
    pub mode: MergeCaptainDriverMode,
    pub model_route: Option<String>,
    pub timeout_tier: Option<String>,
    /// Number of sweeps as computed by `sweep_count`.
    pub sweeps: u32,
    /// Taken from the audit report.
    pub event_count: u64,
    /// Taken from the audit report.
    pub model_calls: u64,
    /// Taken from the audit report.
    pub tool_calls: u64,
    /// Currently always written as 0.0 by `build_receipt`.
    pub cost_usd: f64,
    /// Span between the smallest and largest event `emitted_at_ms`.
    pub latency_ms: u64,
    /// Count of plan events whose `approval_required` field is `true`.
    pub approvals_requested: u64,
    /// Count of tool calls that `is_merge_captain_write_tool` classifies as write tools.
    pub unsafe_action_attempts: u64,
    /// Distinct (repo, PR number) pairs seen in the transcript, in sorted order.
    pub prs_touched: Vec<MergeCaptainPrTouch>,
    /// Copied from the audit report.
    pub state_transitions: Vec<StateTransition>,
    pub oracle_error_findings: u64,
    pub oracle_warn_findings: u64,
    /// Overall verdict copied from the audit report.
    pub pass: bool,
}
93
/// A pull request referenced by at least one transcript event.
///
/// The derived ordering compares `repo` first and then `pr_number`, which
/// is what keeps `prs_touched` lists sorted when collected from a `BTreeSet`.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
pub struct MergeCaptainPrTouch {
    /// Value of the event payload's `repo` string field.
    pub repo: String,
    /// Value of the payload's `pr_number` field, or `number` when the former is absent.
    pub pr_number: u64,
}
99
/// Run summary returned to the caller inside `MergeCaptainDriverOutput`.
///
/// Unlike the receipt, it records which backend ran and where the
/// artifacts were written.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct MergeCaptainRunSummary {
    /// Artifact discriminator, serialized as `_type`.
    #[serde(rename = "_type")]
    pub type_name: String,
    /// Summary schema version (`build_summary` writes 1).
    pub version: u32,
    pub run_id: String,
    /// Backend label from `MergeCaptainDriverBackend::kind`.
    pub backend: String,
    /// Display form of the fixture or playground path the backend read from.
    pub backend_source: Option<String>,
    /// Same scenario value as the receipt.
    pub scenario: Option<String>,
    pub mode: MergeCaptainDriverMode,
    pub model_route: Option<String>,
    pub timeout_tier: Option<String>,
    pub sweeps: u32,
    /// Display form of the transcript path, when one was written.
    pub transcript_path: Option<String>,
    /// Display form of the receipt path.
    pub receipt_path: String,
    pub event_count: u64,
    pub prs_touched: Vec<MergeCaptainPrTouch>,
    pub state_transitions: Vec<StateTransition>,
    pub approvals_requested: u64,
    pub model_calls: u64,
    pub tool_calls: u64,
    pub cost_usd: f64,
    pub latency_ms: u64,
    /// Total number of findings in the audit report.
    pub oracle_findings: usize,
    pub oracle_error_findings: usize,
    pub oracle_warn_findings: usize,
    /// Overall verdict copied from the audit report.
    pub pass: bool,
}
128
/// Everything `run_merge_captain_driver` hands back to its caller.
#[derive(Clone, Debug)]
pub struct MergeCaptainDriverOutput {
    /// Summary describing the run and its artifacts.
    pub summary: MergeCaptainRunSummary,
    /// Location the receipt JSON was written to.
    pub receipt_path: PathBuf,
    /// Location the JSONL transcript was written to.
    pub transcript_path: Option<PathBuf>,
    /// Full oracle report for the audited transcript.
    pub audit_report: AuditReport,
}
136
/// Legacy mock-scenario manifest read from JSON by `resolve_backend`.
///
/// `#[serde(default)]` fills any missing field from the `Default` impl, so
/// an absent `transcript` deserializes as an empty path (which
/// `resolve_backend` then rejects).
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(default)]
struct MockScenarioManifest {
    /// Artifact discriminator, serialized as `_type`.
    #[serde(rename = "_type")]
    type_name: String,
    /// Optional scenario name.
    scenario: Option<String>,
    /// Transcript path; relative paths are resolved against the manifest's directory.
    transcript: PathBuf,
    /// Optional golden path; relative paths are resolved against the manifest's directory.
    golden: Option<PathBuf>,
}
146
147impl Default for MockScenarioManifest {
148    fn default() -> Self {
149        Self {
150            type_name: "merge_captain_mock_scenario".to_string(),
151            scenario: None,
152            transcript: PathBuf::new(),
153            golden: None,
154        }
155    }
156}
157
/// Backend-independent result of `resolve_backend`.
#[derive(Clone, Debug)]
struct ResolvedScenario {
    /// Transcript events to audit.
    events: Vec<PersistedAgentEvent>,
    /// Golden expectations, when one was found or referenced.
    golden: Option<MergeCaptainGolden>,
    /// Scenario name supplied by the backend, if any.
    scenario: Option<String>,
    /// Display form of the path the backend read from.
    backend_source: Option<String>,
}
165
166pub fn run_merge_captain_driver(
167    options: MergeCaptainDriverOptions,
168) -> Result<MergeCaptainDriverOutput, VmError> {
169    let mut resolved = resolve_backend(&options.backend)?;
170    expand_watch_events(&options, &mut resolved.events);
171    if resolved.scenario.is_none() {
172        resolved.scenario = resolved
173            .golden
174            .as_ref()
175            .map(|golden| golden.scenario.clone());
176    }
177
178    let audit_report = audit_transcript(&resolved.events, resolved.golden.as_ref());
179    let run_id = deterministic_run_id(&resolved.events)?;
180    let run_dir = options.run_root.join("merge-captain").join(&run_id);
181    fs::create_dir_all(&run_dir).map_err(|error| {
182        VmError::Runtime(format!(
183            "failed to create merge-captain run directory {}: {error}",
184            run_dir.display()
185        ))
186    })?;
187
188    let transcript_path = write_transcript(&options, &run_dir, &resolved.events)?;
189    let receipt_path = options
190        .receipt_out
191        .clone()
192        .unwrap_or_else(|| run_dir.join("receipt.json"));
193    let receipt = build_receipt(&options, &run_id, &resolved, &audit_report);
194    write_json_file(&receipt_path, &receipt)?;
195
196    let summary = build_summary(
197        &options,
198        &run_id,
199        &resolved,
200        &audit_report,
201        &receipt_path,
202        transcript_path.as_deref(),
203        &receipt,
204    );
205
206    Ok(MergeCaptainDriverOutput {
207        summary,
208        receipt_path,
209        transcript_path,
210        audit_report,
211    })
212}
213
214fn resolve_backend(backend: &MergeCaptainDriverBackend) -> Result<ResolvedScenario, VmError> {
215    match backend {
216        MergeCaptainDriverBackend::Live => Err(VmError::Runtime(
217            "merge-captain live backend requires the production connector runtime; use --backend mock <dir> or --backend replay <fixture> in this checkout".to_string(),
218        )),
219        MergeCaptainDriverBackend::Replay { fixture } => {
220            let loaded = load_transcript_jsonl(fixture)?;
221            let golden = load_replay_golden_if_present(fixture)?;
222            Ok(ResolvedScenario {
223                events: loaded.events,
224                golden,
225                scenario: None,
226                backend_source: Some(fixture.display().to_string()),
227            })
228        }
229        MergeCaptainDriverBackend::Mock { playground_dir } => {
230            // Prefer the on-disk playground (presence of `playground.json`)
231            // — that's the #1020 surface with real bare git repos. Fall
232            // back to the legacy transcript-replay manifest for backwards
233            // compatibility with examples/merge_captain/playground_3repos.
234            if super::playground::playground_marker_path(playground_dir).exists() {
235                let (state, manifest) = super::playground::load_playground(playground_dir)?;
236                let events = super::playground::synthesize_sweep(
237                    &state,
238                    &super::playground::TranscriptOptions::default(),
239                );
240                let golden = load_playground_golden_if_present(playground_dir)?;
241                return Ok(ResolvedScenario {
242                    events,
243                    golden,
244                    scenario: Some(manifest.scenario),
245                    backend_source: Some(playground_dir.display().to_string()),
246                });
247            }
248            let manifest_path = find_mock_manifest(playground_dir)?;
249            let bytes = fs::read(&manifest_path).map_err(|error| {
250                VmError::Runtime(format!(
251                    "failed to read mock scenario manifest {}: {error}",
252                    manifest_path.display()
253                ))
254            })?;
255            let manifest: MockScenarioManifest = serde_json::from_slice(&bytes).map_err(|error| {
256                VmError::Runtime(format!(
257                    "failed to parse mock scenario manifest {}: {error}",
258                    manifest_path.display()
259                ))
260            })?;
261            if manifest.transcript.as_os_str().is_empty() {
262                return Err(VmError::Runtime(format!(
263                    "mock scenario manifest {} must set transcript",
264                    manifest_path.display()
265                )));
266            }
267            let base = manifest_path.parent().unwrap_or_else(|| Path::new("."));
268            let transcript = resolve_relative(base, &manifest.transcript);
269            let LoadedTranscript { events, .. } = load_transcript_jsonl(&transcript)?;
270            let golden = match manifest.golden {
271                Some(path) => Some(load_merge_captain_golden(&resolve_relative(base, &path))?),
272                None => None,
273            };
274            Ok(ResolvedScenario {
275                events,
276                golden,
277                scenario: manifest.scenario,
278                backend_source: Some(playground_dir.display().to_string()),
279            })
280        }
281    }
282}
283
284fn load_playground_golden_if_present(
285    playground_dir: &Path,
286) -> Result<Option<MergeCaptainGolden>, VmError> {
287    let candidate = playground_dir.join("golden.json");
288    if candidate.exists() {
289        return load_merge_captain_golden(&candidate).map(Some);
290    }
291    Ok(None)
292}
293
294fn find_mock_manifest(playground_dir: &Path) -> Result<PathBuf, VmError> {
295    if playground_dir.is_file() {
296        return Ok(playground_dir.to_path_buf());
297    }
298    let candidates = [
299        playground_dir.join("merge_captain.scenario.json"),
300        playground_dir.join("scenario.json"),
301        playground_dir.join("harn.merge-captain.json"),
302    ];
303    candidates
304        .into_iter()
305        .find(|path| path.exists())
306        .ok_or_else(|| {
307            VmError::Runtime(format!(
308                "mock backend expected a scenario manifest at {}/merge_captain.scenario.json",
309                playground_dir.display()
310            ))
311        })
312}
313
/// Anchors `path` at `base` unless it is already absolute, in which case it
/// is returned as-is.
fn resolve_relative(base: &Path, path: &Path) -> PathBuf {
    if path.is_relative() {
        return base.join(path);
    }
    path.to_path_buf()
}
321
322fn load_replay_golden_if_present(fixture: &Path) -> Result<Option<MergeCaptainGolden>, VmError> {
323    let Some(stem) = fixture.file_stem().and_then(|stem| stem.to_str()) else {
324        return Ok(None);
325    };
326    let mut candidates = Vec::new();
327    if let Some(parent) = fixture.parent() {
328        candidates.push(parent.join(format!("{stem}.golden.json")));
329        if parent.file_name().and_then(|name| name.to_str()) == Some("transcripts") {
330            if let Some(root) = parent.parent() {
331                candidates.push(root.join("goldens").join(format!("{stem}.json")));
332            }
333        }
334    }
335    for candidate in candidates {
336        if candidate.exists() {
337            return load_merge_captain_golden(&candidate).map(Some);
338        }
339    }
340    Ok(None)
341}
342
343fn write_transcript(
344    options: &MergeCaptainDriverOptions,
345    run_dir: &Path,
346    events: &[PersistedAgentEvent],
347) -> Result<Option<PathBuf>, VmError> {
348    let mut line_buffer = Vec::new();
349    for event in events {
350        serde_json::to_writer(&mut line_buffer, event).map_err(|error| {
351            VmError::Runtime(format!("failed to serialize transcript event: {error}"))
352        })?;
353        line_buffer.push(b'\n');
354    }
355
356    if options.stream_stdout {
357        io::stdout().write_all(&line_buffer).map_err(|error| {
358            VmError::Runtime(format!(
359                "failed to stream merge-captain JSONL to stdout: {error}"
360            ))
361        })?;
362    }
363
364    let transcript_path = options
365        .transcript_out
366        .clone()
367        .or_else(|| Some(run_dir.join("event_log.jsonl")));
368    if let Some(path) = &transcript_path {
369        write_bytes_file(path, &line_buffer)?;
370    }
371    Ok(transcript_path)
372}
373
374fn build_receipt(
375    options: &MergeCaptainDriverOptions,
376    run_id: &str,
377    resolved: &ResolvedScenario,
378    report: &AuditReport,
379) -> MergeCaptainRunReceipt {
380    let stats = collect_stats(&resolved.events);
381    MergeCaptainRunReceipt {
382        type_name: RECEIPT_TYPE.to_string(),
383        version: 1,
384        persona: "merge_captain".to_string(),
385        run_id: run_id.to_string(),
386        scenario: resolved
387            .golden
388            .as_ref()
389            .map(|golden| golden.scenario.clone())
390            .or_else(|| report.scenario.clone())
391            .or_else(|| resolved.scenario.clone()),
392        mode: options.mode.clone(),
393        model_route: options.model_route.clone(),
394        timeout_tier: options.timeout_tier.clone(),
395        sweeps: sweep_count(options),
396        event_count: report.event_count,
397        model_calls: report.model_call_count,
398        tool_calls: report.tool_call_count,
399        cost_usd: 0.0,
400        latency_ms: transcript_latency_ms(&resolved.events),
401        approvals_requested: stats.approvals_requested,
402        unsafe_action_attempts: stats.unsafe_action_attempts,
403        prs_touched: stats.prs_touched,
404        state_transitions: report.state_transitions.clone(),
405        oracle_error_findings: report.error_findings() as u64,
406        oracle_warn_findings: report.warn_findings() as u64,
407        pass: report.pass,
408    }
409}
410
411fn build_summary(
412    options: &MergeCaptainDriverOptions,
413    run_id: &str,
414    resolved: &ResolvedScenario,
415    report: &AuditReport,
416    receipt_path: &Path,
417    transcript_path: Option<&Path>,
418    receipt: &MergeCaptainRunReceipt,
419) -> MergeCaptainRunSummary {
420    MergeCaptainRunSummary {
421        type_name: SUMMARY_TYPE.to_string(),
422        version: 1,
423        run_id: run_id.to_string(),
424        backend: options.backend.kind().to_string(),
425        backend_source: resolved.backend_source.clone(),
426        scenario: receipt.scenario.clone(),
427        mode: options.mode.clone(),
428        model_route: options.model_route.clone(),
429        timeout_tier: options.timeout_tier.clone(),
430        sweeps: receipt.sweeps,
431        transcript_path: transcript_path.map(|path| path.display().to_string()),
432        receipt_path: receipt_path.display().to_string(),
433        event_count: report.event_count,
434        prs_touched: receipt.prs_touched.clone(),
435        state_transitions: report.state_transitions.clone(),
436        approvals_requested: receipt.approvals_requested,
437        model_calls: report.model_call_count,
438        tool_calls: report.tool_call_count,
439        cost_usd: receipt.cost_usd,
440        latency_ms: receipt.latency_ms,
441        oracle_findings: report.findings.len(),
442        oracle_error_findings: report.error_findings(),
443        oracle_warn_findings: report.warn_findings(),
444        pass: report.pass,
445    }
446}
447
/// Counters gathered from a transcript by `collect_stats`.
#[derive(Default)]
struct TranscriptStats {
    /// Plan events whose `approval_required` field is `true`.
    approvals_requested: u64,
    /// Tool calls whose tool name `is_merge_captain_write_tool` accepts.
    unsafe_action_attempts: u64,
    /// Distinct pull requests referenced by plans, tool inputs or tool outputs, sorted.
    prs_touched: Vec<MergeCaptainPrTouch>,
}
454
455fn collect_stats(events: &[PersistedAgentEvent]) -> TranscriptStats {
456    let mut stats = TranscriptStats::default();
457    let mut prs = BTreeSet::new();
458
459    for env in events {
460        match &env.event {
461            AgentEvent::Plan { plan, .. } => {
462                if plan
463                    .get("approval_required")
464                    .and_then(|value| value.as_bool())
465                    .unwrap_or(false)
466                {
467                    stats.approvals_requested += 1;
468                }
469                insert_pr_touch(plan, &mut prs);
470            }
471            AgentEvent::ToolCall {
472                tool_name,
473                raw_input,
474                ..
475            } => {
476                if super::is_merge_captain_write_tool(tool_name) {
477                    stats.unsafe_action_attempts += 1;
478                }
479                insert_pr_touch(raw_input, &mut prs);
480            }
481            AgentEvent::ToolCallUpdate {
482                raw_output: Some(output),
483                ..
484            } => {
485                insert_pr_touch(output, &mut prs);
486            }
487            _ => {}
488        }
489    }
490
491    stats.prs_touched = prs.into_iter().collect();
492    stats
493}
494
495fn transcript_latency_ms(events: &[PersistedAgentEvent]) -> u64 {
496    let Some(first) = events.first() else {
497        return 0;
498    };
499    let min = events
500        .iter()
501        .map(|event| event.emitted_at_ms)
502        .min()
503        .unwrap_or(first.emitted_at_ms);
504    let max = events
505        .iter()
506        .map(|event| event.emitted_at_ms)
507        .max()
508        .unwrap_or(first.emitted_at_ms);
509    max.saturating_sub(min) as u64
510}
511
512fn insert_pr_touch(value: &serde_json::Value, prs: &mut BTreeSet<MergeCaptainPrTouch>) {
513    let repo = value
514        .get("repo")
515        .and_then(|value| value.as_str())
516        .map(str::to_string);
517    let pr_number = value
518        .get("pr_number")
519        .or_else(|| value.get("number"))
520        .and_then(|value| value.as_u64());
521    if let (Some(repo), Some(pr_number)) = (repo, pr_number) {
522        prs.insert(MergeCaptainPrTouch { repo, pr_number });
523    }
524}
525
526fn sweep_count(options: &MergeCaptainDriverOptions) -> u32 {
527    match options.mode {
528        MergeCaptainDriverMode::Once => 1,
529        MergeCaptainDriverMode::Watch => options.max_sweeps.max(1),
530    }
531}
532
533fn expand_watch_events(options: &MergeCaptainDriverOptions, events: &mut Vec<PersistedAgentEvent>) {
534    if options.mode != MergeCaptainDriverMode::Watch {
535        return;
536    }
537    let sweeps = options.max_sweeps.max(1);
538    if sweeps <= 1 || events.is_empty() {
539        return;
540    }
541    let template = events.clone();
542    let stride = template
543        .iter()
544        .map(|event| event.index)
545        .max()
546        .unwrap_or(0)
547        .saturating_add(1);
548    let time_stride = template
549        .last()
550        .and_then(|last| {
551            template
552                .first()
553                .map(|first| last.emitted_at_ms - first.emitted_at_ms)
554        })
555        .unwrap_or(0)
556        .max(1);
557    for sweep in 1..sweeps {
558        let index_offset = stride.saturating_mul(sweep as u64);
559        let watch_backoff_ms = i64::try_from(options.watch_backoff_ms).unwrap_or(i64::MAX);
560        let time_offset = time_stride
561            .saturating_add(watch_backoff_ms)
562            .saturating_mul(sweep as i64);
563        for event in &template {
564            let mut next = event.clone();
565            next.index = next.index.saturating_add(index_offset);
566            next.emitted_at_ms = next.emitted_at_ms.saturating_add(time_offset);
567            events.push(next);
568        }
569    }
570}
571
572fn deterministic_run_id(events: &[PersistedAgentEvent]) -> Result<String, VmError> {
573    let mut hasher = Sha256::new();
574    for event in events {
575        let bytes = serde_json::to_vec(event).map_err(|error| {
576            VmError::Runtime(format!("failed to hash merge-captain transcript: {error}"))
577        })?;
578        hasher.update((bytes.len() as u64).to_le_bytes());
579        hasher.update(bytes);
580    }
581    let digest = hasher.finalize();
582    let mut suffix = String::with_capacity(16);
583    for byte in &digest[..8] {
584        suffix.push_str(&format!("{byte:02x}"));
585    }
586    Ok(format!("merge-captain-{suffix}"))
587}
588
589fn write_json_file<T: Serialize>(path: &Path, value: &T) -> Result<(), VmError> {
590    let mut bytes = serde_json::to_vec_pretty(value)
591        .map_err(|error| VmError::Runtime(format!("failed to serialize JSON artifact: {error}")))?;
592    bytes.push(b'\n');
593    write_bytes_file(path, &bytes)
594}
595
596fn write_bytes_file(path: &Path, bytes: &[u8]) -> Result<(), VmError> {
597    if let Some(parent) = path.parent() {
598        fs::create_dir_all(parent).map_err(|error| {
599            VmError::Runtime(format!(
600                "failed to create artifact directory {}: {error}",
601                parent.display()
602            ))
603        })?;
604    }
605    fs::write(path, bytes).map_err(|error| {
606        VmError::Runtime(format!(
607            "failed to write artifact {}: {error}",
608            path.display()
609        ))
610    })
611}
612
#[cfg(test)]
mod tests {
    use super::*;

    /// Workspace root: two directories above this crate's manifest directory.
    fn repo_root() -> PathBuf {
        Path::new(env!("CARGO_MANIFEST_DIR"))
            .ancestors()
            .nth(2)
            .unwrap()
            .to_path_buf()
    }

    /// Single-sweep options that keep every artifact inside `out_dir`, with
    /// no model route or timeout tier set.
    fn once_options(
        backend: MergeCaptainDriverBackend,
        out_dir: &Path,
        transcript_name: &str,
        receipt_name: &str,
    ) -> MergeCaptainDriverOptions {
        MergeCaptainDriverOptions {
            backend,
            mode: MergeCaptainDriverMode::Once,
            model_route: None,
            timeout_tier: None,
            transcript_out: Some(out_dir.join(transcript_name)),
            receipt_out: Some(out_dir.join(receipt_name)),
            run_root: out_dir.join("runs"),
            max_sweeps: 1,
            watch_backoff_ms: 0,
            stream_stdout: false,
        }
    }

    /// Mock backend pointing at the legacy three-repo example manifest.
    fn legacy_mock_backend() -> MergeCaptainDriverBackend {
        MergeCaptainDriverBackend::Mock {
            playground_dir: repo_root().join("examples/merge_captain/playground_3repos"),
        }
    }

    #[test]
    fn mock_backend_resolves_manifest_and_builds_receipt() {
        let temp = tempfile::tempdir().unwrap();
        let options = MergeCaptainDriverOptions {
            model_route: Some("mock/value".to_string()),
            timeout_tier: Some("smoke".to_string()),
            ..once_options(
                legacy_mock_backend(),
                temp.path(),
                "event_log.jsonl",
                "receipt.json",
            )
        };

        let output = run_merge_captain_driver(options).unwrap();

        assert!(output.summary.pass);
        assert_eq!(output.summary.backend, "mock");
        assert_eq!(output.summary.scenario.as_deref(), Some("green_pr"));
        assert!(!output.summary.prs_touched.is_empty());
        assert!(output.receipt_path.exists());
        assert!(output.transcript_path.unwrap().exists());
    }

    #[test]
    fn mock_backend_drives_real_on_disk_playground() {
        let git_available = matches!(
            std::process::Command::new("git").arg("--version").output(),
            Ok(probe) if probe.status.success()
        );
        if !git_available {
            eprintln!("(skipping — git not on PATH)");
            return;
        }

        let temp = tempfile::tempdir().unwrap();
        let playground = temp.path().join("pg");
        let manifest = super::super::playground::load_builtin("single_green").unwrap();
        super::super::playground::init_playground_at(super::super::playground::InitOptions {
            dir: &playground,
            manifest: &manifest,
            allow_existing: false,
        })
        .unwrap();

        let backend = MergeCaptainDriverBackend::Mock {
            playground_dir: playground.clone(),
        };
        let output = run_merge_captain_driver(once_options(
            backend,
            temp.path(),
            "event_log.jsonl",
            "receipt.json",
        ))
        .unwrap();

        assert_eq!(output.summary.backend, "mock");
        assert_eq!(output.summary.scenario.as_deref(), Some("single_green"));
        assert!(output.summary.event_count > 0);
        // PR coverage gets populated from the synthesized transcript.
        assert!(!output.summary.prs_touched.is_empty());
    }

    #[test]
    fn replay_backend_surfaces_oracle_failure() {
        let temp = tempfile::tempdir().unwrap();
        let fixture =
            repo_root().join("examples/personas/merge_captain/transcripts/bad_unsafe_merge.jsonl");

        let output = run_merge_captain_driver(once_options(
            MergeCaptainDriverBackend::Replay { fixture },
            temp.path(),
            "event_log.jsonl",
            "receipt.json",
        ))
        .unwrap();

        assert!(!output.summary.pass);
        assert!(output.summary.oracle_error_findings > 0);
    }

    #[test]
    fn mock_and_replay_receipts_match_for_same_transcript() {
        let temp = tempfile::tempdir().unwrap();
        let mock_receipt = temp.path().join("mock-receipt.json");
        let replay_receipt = temp.path().join("replay-receipt.json");
        let common_model_route = Some("mock/value".to_string());
        let common_timeout_tier = Some("smoke".to_string());

        run_merge_captain_driver(MergeCaptainDriverOptions {
            model_route: common_model_route.clone(),
            timeout_tier: common_timeout_tier.clone(),
            ..once_options(
                legacy_mock_backend(),
                temp.path(),
                "mock-event_log.jsonl",
                "mock-receipt.json",
            )
        })
        .unwrap();

        let replay_backend = MergeCaptainDriverBackend::Replay {
            fixture: repo_root()
                .join("examples/personas/merge_captain/transcripts/green_pr.jsonl"),
        };
        run_merge_captain_driver(MergeCaptainDriverOptions {
            model_route: common_model_route,
            timeout_tier: common_timeout_tier,
            ..once_options(
                replay_backend,
                temp.path(),
                "replay-event_log.jsonl",
                "replay-receipt.json",
            )
        })
        .unwrap();

        // Receipts carry no backend-specific data, so they must be identical.
        assert_eq!(
            std::fs::read_to_string(mock_receipt).unwrap(),
            std::fs::read_to_string(replay_receipt).unwrap()
        );
    }
}