Skip to main content

harn_vm/orchestration/
merge_captain_ladder.rs

1//! Persona eval timeout/budget ladders, starting with Merge Captain.
2//!
3//! The ladder is intentionally an eval artifact runner rather than host
4//! orchestration logic: every route/tier combination produces the same
5//! transcript, receipt, and summary contracts as `harn merge-captain run`,
6//! then an aggregate report marks the first configuration that completed
7//! correctly and the configurations that degraded or looped.
8
9use std::collections::{BTreeMap, BTreeSet};
10use std::fs;
11use std::path::{Path, PathBuf};
12
13use serde::{Deserialize, Serialize};
14
15use crate::value::{VmError, VmValue};
16
17use super::{
18    new_id, parse_json_value, MergeCaptainDriverBackend, MergeCaptainDriverMode,
19    MergeCaptainDriverOptions, MergeCaptainRunSummary, StateTransition,
20};
21
/// `_type` tag assigned to a manifest that does not declare one.
const MANIFEST_TYPE: &str = "persona_eval_ladder_manifest";
/// `_type` tag stamped on the aggregate ladder report.
const REPORT_TYPE: &str = "persona_eval_ladder_report";
/// Persona used when a manifest leaves `persona` blank; also the only persona the runner accepts.
const DEFAULT_PERSONA: &str = "merge_captain";
25
/// Declarative description of one persona eval ladder: the backend to drive, the model routes to
/// try, and the timeout/budget tiers to run for each route.
///
/// Every field may be omitted when deserializing (`#[serde(default)]`); blank values are filled
/// in by `normalize_persona_eval_ladder_manifest`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct PersonaEvalLadderManifest {
    /// Document type tag, stored under the `_type` key. Empty values are replaced with
    /// `MANIFEST_TYPE` during normalization.
    #[serde(rename = "_type")]
    pub type_name: String,
    /// Manifest schema version; `0` is normalized to `1`.
    pub version: u32,
    /// Ladder identifier. When blank it falls back to `name`, then to a generated id.
    pub id: String,
    /// Optional human-readable name; doubles as the fallback for a blank `id`.
    pub name: Option<String>,
    /// Optional free-form description.
    pub description: Option<String>,
    /// Persona under evaluation; blank values become `DEFAULT_PERSONA`.
    pub persona: String,
    /// Directory that relative manifest paths are resolved against. The file loader fills this
    /// with the manifest's parent directory when it is unset.
    pub base_dir: Option<String>,
    /// Directory receiving all ladder artifacts. When unset, a path under
    /// `.harn-runs/persona-eval-ladders/` derived from `id` is used.
    #[serde(alias = "artifact-root")]
    pub artifact_root: Option<String>,
    /// Requested severity; `warn`/`warning` and `info`/`informational` are recognized, anything
    /// else (including a missing value) is reported as `blocking`.
    pub severity: Option<String>,
    /// Which driver backend executes each tier.
    pub backend: PersonaEvalLadderBackendSpec,
    /// Model routes to evaluate. An empty list is replaced by a single route with id `default`.
    #[serde(alias = "model-routes")]
    pub model_routes: Vec<PersonaEvalModelRoute>,
    /// Timeout/budget tiers run for every route. The runner rejects an empty list.
    #[serde(alias = "timeout-tiers")]
    pub timeout_tiers: Vec<PersonaEvalTimeoutTier>,
    /// Arbitrary metadata; moved unchanged into the aggregate report.
    pub metadata: BTreeMap<String, serde_json::Value>,
}
47
/// Selects the Merge Captain driver backend used for every tier of a ladder.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct PersonaEvalLadderBackendSpec {
    /// Backend kind: `live`, `mock`, or `replay` (matched after trimming and ASCII-lowercasing).
    /// A blank kind is normalized to `replay`.
    pub kind: String,
    /// Required for `mock` (used as the playground directory) and `replay` (used as the
    /// fixture); relative values are resolved against the manifest base directory.
    pub path: Option<String>,
}
54
/// One model route to evaluate across every timeout tier.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct PersonaEvalModelRoute {
    /// Route identifier used in report ids and artifact directory names. Blank ids are
    /// normalized to `route_<n>` (1-based position).
    pub id: String,
    /// Route string handed to the driver; when absent the driver receives `id` instead.
    pub route: Option<String>,
    /// Provider label. NOTE(review): nothing in this module reads it; presumably informational.
    pub provider: Option<String>,
    /// Model label. NOTE(review): nothing in this module reads it; presumably informational.
    pub model: Option<String>,
    /// Profile label. NOTE(review): nothing in this module reads it; presumably informational.
    pub profile: Option<String>,
    /// Cost budget in USD applied when the tier does not set its own `max_cost_usd`.
    #[serde(alias = "max-cost-usd")]
    pub max_cost_usd: Option<f64>,
    /// Model-call budget applied when the tier does not set its own `max_model_calls`.
    #[serde(alias = "max-model-calls")]
    pub max_model_calls: Option<u64>,
    /// Arbitrary route metadata.
    pub metadata: BTreeMap<String, serde_json::Value>,
}
69
/// One timeout/budget tier. Budgets are checked against the finished run's summary; a run that
/// exceeds any of them is recorded with a degradation reason.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct PersonaEvalTimeoutTier {
    /// Tier identifier used in report ids and artifact directory names. Blank ids are
    /// normalized to `tier_<n>` (1-based position).
    pub id: String,
    /// Tier timeout in milliseconds. Echoed in the tier report and used as the latency budget
    /// when `max_latency_ms` is unset.
    #[serde(alias = "timeout-ms")]
    pub timeout_ms: Option<u64>,
    /// Latency budget in milliseconds; takes precedence over `timeout_ms`.
    #[serde(alias = "max-latency-ms")]
    pub max_latency_ms: Option<u64>,
    /// Cost budget in USD; takes precedence over the route's `max_cost_usd`.
    #[serde(alias = "max-cost-usd")]
    pub max_cost_usd: Option<f64>,
    /// Maximum number of tool calls allowed for the run.
    #[serde(alias = "max-tool-calls")]
    pub max_tool_calls: Option<u64>,
    /// Maximum number of model calls; takes precedence over the route's `max_model_calls`.
    #[serde(alias = "max-model-calls")]
    pub max_model_calls: Option<u64>,
    /// Sweep count passed to the driver (defaults to 1, never less than 1). A value above 1
    /// selects the driver's watch mode instead of a single pass.
    #[serde(alias = "max-sweeps")]
    pub max_sweeps: Option<u32>,
    /// Watch backoff in milliseconds passed to the driver; defaults to 0.
    #[serde(alias = "watch-backoff-ms")]
    pub watch_backoff_ms: Option<u64>,
    /// Arbitrary tier metadata.
    pub metadata: BTreeMap<String, serde_json::Value>,
}
90
/// Classification of a single route/tier run; serialized in snake_case.
#[derive(Clone, Copy, Debug, Eq, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum PersonaEvalTierOutcome {
    /// The run passed and stayed within every configured budget.
    Correct,
    /// The run failed or exceeded a budget, without loop findings.
    Degraded,
    /// An audit finding mentioned a loop or stuck state.
    Loop,
}
98
99impl PersonaEvalTierOutcome {
100    pub fn as_str(self) -> &'static str {
101        match self {
102            Self::Correct => "correct",
103            Self::Degraded => "degraded",
104            Self::Loop => "loop",
105        }
106    }
107}
108
/// Aggregate result of running every route/tier combination of a ladder.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct PersonaEvalLadderReport {
    /// Document type tag, stored under the `_type` key (`REPORT_TYPE` for generated reports).
    #[serde(rename = "_type")]
    pub type_name: String,
    /// Report schema version; generated reports use `1`.
    pub version: u32,
    /// Id of the manifest that produced this report.
    pub id: String,
    /// Persona that was evaluated.
    pub persona: String,
    /// Normalized severity: `blocking`, `warning`, or `informational`.
    pub severity: String,
    /// True exactly when `severity` is `blocking`.
    pub blocking: bool,
    /// True when at least one route/tier combination passed.
    pub pass: bool,
    /// Number of route/tier combinations that were run.
    pub total: usize,
    /// Number of combinations whose tier report passed.
    pub passed: usize,
    /// `total - passed`.
    pub failed: usize,
    /// Timeout tier id of the first passing combination, if any.
    pub first_correct_tier: Option<String>,
    /// Route id of the first passing combination, if any.
    pub first_correct_route: Option<String>,
    /// Index into `tiers` of the first passing combination, if any.
    pub first_correct_index: Option<usize>,
    /// Directory under which all tier artifacts were written.
    pub artifact_root: String,
    /// Per-combination reports, ordered route-major (all tiers of the first route first).
    pub tiers: Vec<PersonaEvalTierReport>,
    /// Metadata carried over from the manifest.
    pub metadata: BTreeMap<String, serde_json::Value>,
}
130
/// Result of running one model route at one timeout tier.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct PersonaEvalTierReport {
    /// Combined identifier in the form `<route id>::<tier id>`.
    pub id: String,
    /// Id of the model route that was run.
    pub route_id: String,
    /// Model route as reported by the driver's run summary.
    pub model_route: Option<String>,
    /// Id of the timeout tier that was run.
    pub timeout_tier: String,
    /// The tier's configured timeout in milliseconds, if any.
    pub timeout_ms: Option<u64>,
    /// Effective cost budget in USD (tier value, else route value).
    pub max_cost_usd: Option<f64>,
    /// Effective latency budget in milliseconds (tier `max_latency_ms`, else `timeout_ms`).
    pub max_latency_ms: Option<u64>,
    /// True when the run summary passed and no degradation reason was recorded.
    pub pass: bool,
    /// Outcome label: `correct`, `degraded`, or `loop`.
    pub outcome: String,
    /// Human-readable explanations of why the run did not pass cleanly.
    pub degradation_reasons: Vec<String>,
    /// Path of the event-log transcript, when the driver reported one.
    pub transcript_path: Option<String>,
    /// Path of the receipt reported by the driver.
    pub receipt_path: String,
    /// Path of the `summary.json` written for this run.
    pub summary_path: String,
    /// Event count copied from the run summary.
    pub event_count: u64,
    /// Cost in USD copied from the run summary.
    pub cost_usd: f64,
    /// Latency in milliseconds copied from the run summary.
    pub latency_ms: u64,
    /// Tool-call count copied from the run summary.
    pub tool_calls: u64,
    /// Model-call count copied from the run summary.
    pub model_calls: u64,
    /// Number of error-level oracle findings copied from the run summary.
    pub oracle_error_findings: usize,
    /// Number of warning-level oracle findings copied from the run summary.
    pub oracle_warn_findings: usize,
    /// Distinct steps and raw transitions observed during the run.
    pub state_machine_coverage: StateMachineCoverage,
}
156
/// Summary of the state-machine steps a run passed through.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct StateMachineCoverage {
    /// Number of distinct steps (length of `observed_steps`).
    pub observed: usize,
    /// Distinct step names in sorted order.
    pub observed_steps: Vec<String>,
    /// Copy of the run's transitions, in their original order.
    pub transitions: Vec<StateTransition>,
}
164
165pub fn load_persona_eval_ladder_manifest(
166    path: &Path,
167) -> Result<PersonaEvalLadderManifest, VmError> {
168    let content = fs::read_to_string(path).map_err(|error| {
169        VmError::Runtime(format!(
170            "failed to read persona eval ladder manifest {}: {error}",
171            path.display()
172        ))
173    })?;
174    let mut manifest: PersonaEvalLadderManifest =
175        if path.extension().and_then(|ext| ext.to_str()) == Some("json") {
176            serde_json::from_str(&content).map_err(|error| {
177                VmError::Runtime(format!(
178                    "failed to parse persona eval ladder JSON {}: {error}",
179                    path.display()
180                ))
181            })?
182        } else {
183            toml::from_str(&content).map_err(|error| {
184                VmError::Runtime(format!(
185                    "failed to parse persona eval ladder TOML {}: {error}",
186                    path.display()
187                ))
188            })?
189        };
190    normalize_persona_eval_ladder_manifest(&mut manifest);
191    if manifest.base_dir.is_none() {
192        manifest.base_dir = path.parent().map(|parent| parent.display().to_string());
193    }
194    Ok(manifest)
195}
196
197pub fn normalize_persona_eval_ladder_manifest_value(
198    value: &VmValue,
199) -> Result<PersonaEvalLadderManifest, VmError> {
200    let mut manifest: PersonaEvalLadderManifest = parse_json_value(value)?;
201    normalize_persona_eval_ladder_manifest(&mut manifest);
202    Ok(manifest)
203}
204
205pub fn normalize_persona_eval_ladder_manifest(manifest: &mut PersonaEvalLadderManifest) {
206    if manifest.type_name.is_empty() {
207        manifest.type_name = MANIFEST_TYPE.to_string();
208    }
209    if manifest.version == 0 {
210        manifest.version = 1;
211    }
212    if manifest.id.trim().is_empty() {
213        manifest.id = manifest
214            .name
215            .clone()
216            .filter(|name| !name.trim().is_empty())
217            .unwrap_or_else(|| new_id("persona_eval_ladder"));
218    }
219    if manifest.persona.trim().is_empty() {
220        manifest.persona = DEFAULT_PERSONA.to_string();
221    }
222    if manifest.backend.kind.trim().is_empty() {
223        manifest.backend.kind = "replay".to_string();
224    }
225    if manifest.model_routes.is_empty() {
226        manifest.model_routes.push(PersonaEvalModelRoute {
227            id: "default".to_string(),
228            ..Default::default()
229        });
230    }
231    for (index, route) in manifest.model_routes.iter_mut().enumerate() {
232        if route.id.trim().is_empty() {
233            route.id = format!("route_{}", index + 1);
234        }
235    }
236    for (index, tier) in manifest.timeout_tiers.iter_mut().enumerate() {
237        if tier.id.trim().is_empty() {
238            tier.id = format!("tier_{}", index + 1);
239        }
240    }
241}
242
243pub fn run_persona_eval_ladder(
244    manifest: &PersonaEvalLadderManifest,
245) -> Result<PersonaEvalLadderReport, VmError> {
246    let mut manifest = manifest.clone();
247    normalize_persona_eval_ladder_manifest(&mut manifest);
248    if manifest.persona != DEFAULT_PERSONA {
249        return Err(VmError::Runtime(format!(
250            "persona eval ladder only supports persona '{}', got '{}'",
251            DEFAULT_PERSONA, manifest.persona
252        )));
253    }
254    if manifest.timeout_tiers.is_empty() {
255        return Err(VmError::Runtime(format!(
256            "persona eval ladder '{}' must declare at least one timeout tier",
257            manifest.id
258        )));
259    }
260
261    let base_dir = manifest.base_dir.as_deref().map(Path::new);
262    let backend = resolve_ladder_backend(&manifest.backend, base_dir)?;
263    let artifact_root = resolve_artifact_root(&manifest, base_dir);
264    fs::create_dir_all(&artifact_root).map_err(|error| {
265        VmError::Runtime(format!(
266            "failed to create persona eval ladder artifact root {}: {error}",
267            artifact_root.display()
268        ))
269    })?;
270
271    let mut tiers = Vec::new();
272    for route in &manifest.model_routes {
273        for tier in &manifest.timeout_tiers {
274            let index = tiers.len();
275            tiers.push(run_ladder_tier(
276                &backend,
277                &artifact_root,
278                route,
279                tier,
280                index,
281            )?);
282        }
283    }
284
285    let first_correct_index = tiers.iter().position(|tier| tier.pass);
286    let (first_correct_tier, first_correct_route) = first_correct_index
287        .and_then(|index| tiers.get(index))
288        .map(|tier| (Some(tier.timeout_tier.clone()), Some(tier.route_id.clone())))
289        .unwrap_or((None, None));
290    let passed = tiers.iter().filter(|tier| tier.pass).count();
291    let total = tiers.len();
292    let severity = normalize_ladder_severity(manifest.severity.as_deref());
293    Ok(PersonaEvalLadderReport {
294        type_name: REPORT_TYPE.to_string(),
295        version: 1,
296        id: manifest.id,
297        persona: manifest.persona,
298        blocking: severity == "blocking",
299        severity,
300        pass: first_correct_index.is_some(),
301        total,
302        passed,
303        failed: total.saturating_sub(passed),
304        first_correct_tier,
305        first_correct_route,
306        first_correct_index,
307        artifact_root: artifact_root.display().to_string(),
308        tiers,
309        metadata: manifest.metadata,
310    })
311}
312
313fn resolve_ladder_backend(
314    spec: &PersonaEvalLadderBackendSpec,
315    base_dir: Option<&Path>,
316) -> Result<MergeCaptainDriverBackend, VmError> {
317    match spec.kind.trim().to_ascii_lowercase().as_str() {
318        "live" => Ok(MergeCaptainDriverBackend::Live),
319        "mock" => {
320            let path = spec.path.as_deref().ok_or_else(|| {
321                VmError::Runtime("mock ladder backend requires backend.path".to_string())
322            })?;
323            Ok(MergeCaptainDriverBackend::Mock {
324                playground_dir: resolve_manifest_path(base_dir, path),
325            })
326        }
327        "replay" => {
328            let path = spec.path.as_deref().ok_or_else(|| {
329                VmError::Runtime("replay ladder backend requires backend.path".to_string())
330            })?;
331            Ok(MergeCaptainDriverBackend::Replay {
332                fixture: resolve_manifest_path(base_dir, path),
333            })
334        }
335        other => Err(VmError::Runtime(format!(
336            "unsupported persona eval ladder backend '{}'",
337            other
338        ))),
339    }
340}
341
342fn resolve_artifact_root(manifest: &PersonaEvalLadderManifest, base_dir: Option<&Path>) -> PathBuf {
343    let root = manifest
344        .artifact_root
345        .clone()
346        .unwrap_or_else(|| format!(".harn-runs/persona-eval-ladders/{}", manifest.id));
347    resolve_manifest_path(base_dir, &root)
348}
349
/// Resolves a manifest-supplied path: absolute paths are returned unchanged, relative paths are
/// joined onto `base_dir` when one is available, and otherwise returned as given.
fn resolve_manifest_path(base_dir: Option<&Path>, path: &str) -> PathBuf {
    let candidate = Path::new(path);
    match base_dir {
        Some(root) if !candidate.is_absolute() => root.join(candidate),
        _ => candidate.to_path_buf(),
    }
}
360
361fn run_ladder_tier(
362    backend: &MergeCaptainDriverBackend,
363    artifact_root: &Path,
364    route: &PersonaEvalModelRoute,
365    tier: &PersonaEvalTimeoutTier,
366    index: usize,
367) -> Result<PersonaEvalTierReport, VmError> {
368    let tier_dir = artifact_root
369        .join(format!("{:02}-{}", index + 1, safe_path_segment(&route.id)))
370        .join(safe_path_segment(&tier.id));
371    fs::create_dir_all(&tier_dir).map_err(|error| {
372        VmError::Runtime(format!(
373            "failed to create persona eval ladder tier dir {}: {error}",
374            tier_dir.display()
375        ))
376    })?;
377
378    let transcript_path = tier_dir.join("event_log.jsonl");
379    let receipt_path = tier_dir.join("receipt.json");
380    let summary_path = tier_dir.join("summary.json");
381    let max_sweeps = tier.max_sweeps.unwrap_or(1).max(1);
382    let options = MergeCaptainDriverOptions {
383        backend: backend.clone(),
384        mode: if max_sweeps > 1 {
385            MergeCaptainDriverMode::Watch
386        } else {
387            MergeCaptainDriverMode::Once
388        },
389        model_route: Some(route.route.clone().unwrap_or_else(|| route.id.clone())),
390        timeout_tier: Some(tier.id.clone()),
391        transcript_out: Some(transcript_path.clone()),
392        receipt_out: Some(receipt_path.clone()),
393        run_root: tier_dir.join("runs"),
394        max_sweeps,
395        watch_backoff_ms: tier.watch_backoff_ms.unwrap_or(0),
396        stream_stdout: false,
397    };
398
399    let output = super::run_merge_captain_driver(options)?;
400    write_json_file(&summary_path, &output.summary)?;
401
402    let mut reasons = degradation_reasons(&output.summary, route, tier);
403    if !output.summary.pass {
404        reasons.push(format!(
405            "oracle reported {} error finding(s) and {} warning finding(s)",
406            output.summary.oracle_error_findings, output.summary.oracle_warn_findings
407        ));
408    }
409    let looped = output.audit_report.findings.iter().any(|finding| {
410        let message = finding.message.to_ascii_lowercase();
411        message.contains("loop") || message.contains("stuck")
412    });
413    let pass = output.summary.pass && reasons.is_empty();
414    let outcome = if looped {
415        PersonaEvalTierOutcome::Loop
416    } else if pass {
417        PersonaEvalTierOutcome::Correct
418    } else {
419        PersonaEvalTierOutcome::Degraded
420    };
421
422    Ok(PersonaEvalTierReport {
423        id: format!("{}::{}", route.id, tier.id),
424        route_id: route.id.clone(),
425        model_route: output.summary.model_route.clone(),
426        timeout_tier: tier.id.clone(),
427        timeout_ms: tier.timeout_ms,
428        max_cost_usd: tier.max_cost_usd.or(route.max_cost_usd),
429        max_latency_ms: tier.max_latency_ms.or(tier.timeout_ms),
430        pass,
431        outcome: outcome.as_str().to_string(),
432        degradation_reasons: reasons,
433        transcript_path: output
434            .transcript_path
435            .as_deref()
436            .map(|path| path.display().to_string()),
437        receipt_path: output.receipt_path.display().to_string(),
438        summary_path: summary_path.display().to_string(),
439        event_count: output.summary.event_count,
440        cost_usd: output.summary.cost_usd,
441        latency_ms: output.summary.latency_ms,
442        tool_calls: output.summary.tool_calls,
443        model_calls: output.summary.model_calls,
444        oracle_error_findings: output.summary.oracle_error_findings,
445        oracle_warn_findings: output.summary.oracle_warn_findings,
446        state_machine_coverage: state_machine_coverage(&output.summary.state_transitions),
447    })
448}
449
450fn degradation_reasons(
451    summary: &MergeCaptainRunSummary,
452    route: &PersonaEvalModelRoute,
453    tier: &PersonaEvalTimeoutTier,
454) -> Vec<String> {
455    let mut reasons = Vec::new();
456    if let Some(max_tool_calls) = tier.max_tool_calls {
457        if summary.tool_calls > max_tool_calls {
458            reasons.push(format!(
459                "tool calls {} exceeded tier budget {}",
460                summary.tool_calls, max_tool_calls
461            ));
462        }
463    }
464    if let Some(max_model_calls) = tier.max_model_calls.or(route.max_model_calls) {
465        if summary.model_calls > max_model_calls {
466            reasons.push(format!(
467                "model calls {} exceeded budget {}",
468                summary.model_calls, max_model_calls
469            ));
470        }
471    }
472    if let Some(max_cost_usd) = tier.max_cost_usd.or(route.max_cost_usd) {
473        if summary.cost_usd > max_cost_usd {
474            reasons.push(format!(
475                "cost ${:.6} exceeded budget ${:.6}",
476                summary.cost_usd, max_cost_usd
477            ));
478        }
479    }
480    if let Some(max_latency_ms) = tier.max_latency_ms.or(tier.timeout_ms) {
481        if summary.latency_ms > max_latency_ms {
482            reasons.push(format!(
483                "latency {}ms exceeded tier timeout {}ms",
484                summary.latency_ms, max_latency_ms
485            ));
486        }
487    }
488    reasons
489}
490
491fn state_machine_coverage(transitions: &[StateTransition]) -> StateMachineCoverage {
492    let observed_steps: Vec<String> = transitions
493        .iter()
494        .map(|transition| transition.step.clone())
495        .collect::<BTreeSet<_>>()
496        .into_iter()
497        .collect();
498    StateMachineCoverage {
499        observed: observed_steps.len(),
500        observed_steps,
501        transitions: transitions.to_vec(),
502    }
503}
504
/// Maps a requested severity onto one of the three canonical labels.
///
/// Matching ignores surrounding whitespace and ASCII case. `warn`/`warning` become `warning`,
/// `info`/`informational` become `informational`, and everything else, including a missing
/// value, becomes `blocking`.
fn normalize_ladder_severity(value: Option<&str>) -> String {
    let requested = value.map(|raw| raw.trim().to_ascii_lowercase());
    let label = match requested.as_deref() {
        Some("warn" | "warning") => "warning",
        Some("info" | "informational") => "informational",
        _ => "blocking",
    };
    label.to_string()
}
517
/// Reduces an arbitrary id to a single filesystem-safe path segment.
///
/// ASCII letters, digits, `-`, and `_` are kept; every other character is replaced by `_`
/// (one underscore per character). An empty input yields `unnamed`.
fn safe_path_segment(value: &str) -> String {
    let sanitized: String = value
        .chars()
        .map(|ch| match ch {
            '-' | '_' => ch,
            other if other.is_ascii_alphanumeric() => other,
            _ => '_',
        })
        .collect();
    if sanitized.is_empty() {
        String::from("unnamed")
    } else {
        sanitized
    }
}
533
534fn write_json_file<T: Serialize>(path: &Path, value: &T) -> Result<(), VmError> {
535    let mut bytes = serde_json::to_vec_pretty(value)
536        .map_err(|error| VmError::Runtime(format!("failed to serialize JSON artifact: {error}")))?;
537    bytes.push(b'\n');
538    fs::write(path, bytes).map_err(|error| {
539        VmError::Runtime(format!(
540            "failed to write artifact {}: {error}",
541            path.display()
542        ))
543    })
544}
545
#[cfg(test)]
mod tests {
    use super::*;

    /// Directory two levels above this crate's `CARGO_MANIFEST_DIR`; the replay fixture path
    /// used below is resolved against it.
    fn repo_root() -> PathBuf {
        PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .parent()
            .unwrap()
            .parent()
            .unwrap()
            .to_path_buf()
    }

    /// Replays one fixture under two tiers: the first has a tool-call budget of 1 and is
    /// expected to degrade, the second has looser budgets and is expected to be the first
    /// correct tier. Also checks that transcript and receipt artifacts exist on disk.
    #[test]
    fn ladder_marks_first_correct_tier_and_writes_artifacts() {
        // Artifacts go to a temp dir so the test leaves nothing behind in the repository.
        let temp = tempfile::tempdir().unwrap();
        let manifest = PersonaEvalLadderManifest {
            id: "merge-captain-ladder-test".to_string(),
            base_dir: Some(repo_root().display().to_string()),
            artifact_root: Some(temp.path().join("ladder").display().to_string()),
            backend: PersonaEvalLadderBackendSpec {
                kind: "replay".to_string(),
                path: Some(
                    "examples/personas/merge_captain/transcripts/green_pr.jsonl".to_string(),
                ),
            },
            model_routes: vec![PersonaEvalModelRoute {
                id: "gemma-value".to_string(),
                route: Some("local/gemma-value".to_string()),
                provider: Some("llama.cpp".to_string()),
                model: Some("gemma".to_string()),
                profile: Some("value".to_string()),
                ..Default::default()
            }],
            timeout_tiers: vec![
                PersonaEvalTimeoutTier {
                    id: "too-tight".to_string(),
                    max_tool_calls: Some(1),
                    ..Default::default()
                },
                PersonaEvalTimeoutTier {
                    id: "balanced".to_string(),
                    max_tool_calls: Some(4),
                    max_model_calls: Some(1),
                    ..Default::default()
                },
            ],
            ..Default::default()
        };

        let report = run_persona_eval_ladder(&manifest).unwrap();

        assert!(report.pass);
        assert_eq!(report.total, 2);
        assert_eq!(report.first_correct_tier.as_deref(), Some("balanced"));
        assert_eq!(report.first_correct_route.as_deref(), Some("gemma-value"));
        assert_eq!(report.tiers[0].outcome, "degraded");
        assert_eq!(report.tiers[1].outcome, "correct");
        assert!(Path::new(&report.tiers[0].transcript_path.as_ref().unwrap()).exists());
        assert!(Path::new(&report.tiers[1].receipt_path).exists());
        assert!(report.tiers[1].state_machine_coverage.observed > 0);
    }

    /// A manifest naming any persona other than the default must be rejected before anything
    /// is run.
    #[test]
    fn unsupported_persona_is_rejected() {
        let manifest = PersonaEvalLadderManifest {
            id: "other-persona".to_string(),
            persona: "ship_captain".to_string(),
            timeout_tiers: vec![PersonaEvalTimeoutTier {
                id: "smoke".to_string(),
                ..Default::default()
            }],
            ..Default::default()
        };

        let error = run_persona_eval_ladder(&manifest).unwrap_err();

        assert!(format!("{error}").contains("only supports persona"));
    }
}