Skip to main content

harn_vm/orchestration/
merge_captain_ladder.rs

1//! Persona eval timeout/budget ladders, starting with Merge Captain.
2//!
3//! The ladder is intentionally an eval artifact runner rather than host
4//! orchestration logic: every route/tier combination produces the same
5//! transcript, receipt, and summary contracts as `harn merge-captain run`,
6//! then an aggregate report marks the first configuration that completed
7//! correctly and the configurations that degraded or looped.
8
9use std::collections::{BTreeMap, BTreeSet};
10use std::fs;
11use std::path::{Path, PathBuf};
12
13use serde::{Deserialize, Serialize};
14
15use crate::value::{VmError, VmValue};
16
17use super::{
18    new_id, parse_json_value, MergeCaptainDriverBackend, MergeCaptainDriverMode,
19    MergeCaptainDriverOptions, MergeCaptainRunSummary, StateTransition,
20};
21
/// `_type` tag filled in by normalization when a manifest does not declare one.
const MANIFEST_TYPE: &str = "persona_eval_ladder_manifest";
/// `_type` tag stamped on every report produced by `run_persona_eval_ladder`.
const REPORT_TYPE: &str = "persona_eval_ladder_report";
/// Persona assumed when a manifest leaves `persona` blank; it is also the only
/// persona `run_persona_eval_ladder` accepts.
const DEFAULT_PERSONA: &str = "merge_captain";
25
/// Declarative description of one ladder run: which backend to drive, which
/// model routes to try, and which timeout/budget tiers to apply to each route.
///
/// Because of `#[serde(default)]`, any field missing from the input takes its
/// `Default` value; `normalize_persona_eval_ladder_manifest` then fills in the
/// blanks that need a meaningful value.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct PersonaEvalLadderManifest {
    /// Serialized as `_type`; normalization sets it when empty.
    #[serde(rename = "_type")]
    pub type_name: String,
    /// Manifest version; normalization turns `0` into `1`.
    pub version: u32,
    /// Ladder identifier; when blank, normalization falls back to `name` or a
    /// freshly generated id.
    pub id: String,
    pub name: Option<String>,
    pub description: Option<String>,
    /// Persona under evaluation; a blank value is normalized to the default persona.
    pub persona: String,
    /// Directory that relative manifest paths are resolved against. The file
    /// loader sets this to the manifest's parent directory when it is absent.
    pub base_dir: Option<String>,
    /// Directory that receives per-tier artifacts. When absent, the runner
    /// uses `.harn-runs/persona-eval-ladders/<id>`.
    #[serde(alias = "artifact-root")]
    pub artifact_root: Option<String>,
    /// Free-form severity; interpreted by `normalize_ladder_severity` when the
    /// report is built.
    pub severity: Option<String>,
    /// Backend the driver runs against.
    pub backend: PersonaEvalLadderBackendSpec,
    /// Routes to evaluate; normalization inserts a single `default` route when empty.
    #[serde(alias = "model-routes")]
    pub model_routes: Vec<PersonaEvalModelRoute>,
    /// Tiers applied to every route; the runner rejects an empty list.
    #[serde(alias = "timeout-tiers")]
    pub timeout_tiers: Vec<PersonaEvalTimeoutTier>,
    /// Opaque metadata copied onto the report unchanged.
    pub metadata: BTreeMap<String, serde_json::Value>,
}
47
/// Selects the driver backend for a ladder run.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct PersonaEvalLadderBackendSpec {
    /// Backend kind. `resolve_ladder_backend` accepts `live`, `mock`, and
    /// `replay` (trimmed, ASCII case-insensitive); normalization defaults a
    /// blank kind to `replay`.
    pub kind: String,
    /// Path required by the `mock` and `replay` backends; relative values are
    /// resolved against the manifest's base directory.
    pub path: Option<String>,
}
54
/// One model route to evaluate across every timeout tier.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct PersonaEvalModelRoute {
    /// Route identifier; normalization assigns `route_<n>` when blank.
    pub id: String,
    /// Route name handed to the driver; the runner falls back to `id` when absent.
    pub route: Option<String>,
    // NOTE(review): `provider`, `model`, and `profile` are not read by the
    // runner code in this file; presumably descriptive only — confirm.
    pub provider: Option<String>,
    pub model: Option<String>,
    pub profile: Option<String>,
    /// Cost budget used when the tier does not set its own `max_cost_usd`.
    #[serde(alias = "max-cost-usd")]
    pub max_cost_usd: Option<f64>,
    /// Model-call budget used when the tier does not set its own `max_model_calls`.
    #[serde(alias = "max-model-calls")]
    pub max_model_calls: Option<u64>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
69
/// One rung of the ladder: a set of optional budgets a run must stay within.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct PersonaEvalTimeoutTier {
    /// Tier identifier; normalization assigns `tier_<n>` when blank.
    pub id: String,
    /// Reported on the tier report and used as the latency budget when
    /// `max_latency_ms` is absent.
    #[serde(alias = "timeout-ms")]
    pub timeout_ms: Option<u64>,
    /// Latency budget; takes precedence over `timeout_ms`.
    #[serde(alias = "max-latency-ms")]
    pub max_latency_ms: Option<u64>,
    /// Cost budget; takes precedence over the route's `max_cost_usd`.
    #[serde(alias = "max-cost-usd")]
    pub max_cost_usd: Option<f64>,
    /// Tool-call budget (tier-only; routes have no equivalent).
    #[serde(alias = "max-tool-calls")]
    pub max_tool_calls: Option<u64>,
    /// Model-call budget; takes precedence over the route's `max_model_calls`.
    #[serde(alias = "max-model-calls")]
    pub max_model_calls: Option<u64>,
    /// Sweep count passed to the driver. Treated as at least 1; values above 1
    /// switch the driver into watch mode.
    #[serde(alias = "max-sweeps")]
    pub max_sweeps: Option<u32>,
    /// Watch backoff passed to the driver; defaults to 0 when absent.
    #[serde(alias = "watch-backoff-ms")]
    pub watch_backoff_ms: Option<u64>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
90
/// Classification of a single route/tier run. Serialized in `snake_case`.
#[derive(Clone, Copy, Debug, Eq, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum PersonaEvalTierOutcome {
    Correct,
    Degraded,
    Loop,
}

impl PersonaEvalTierOutcome {
    /// Returns the lowercase label stored in `PersonaEvalTierReport::outcome`.
    pub fn as_str(self) -> &'static str {
        match self {
            Self::Correct => "correct",
            Self::Degraded => "degraded",
            Self::Loop => "loop",
        }
    }
}
108
/// Aggregate result of a ladder run across every route/tier combination.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct PersonaEvalLadderReport {
    /// Serialized as `_type`.
    #[serde(rename = "_type")]
    pub type_name: String,
    pub version: u32,
    /// Copied from the normalized manifest.
    pub id: String,
    /// Copied from the normalized manifest.
    pub persona: String,
    /// Normalized severity label.
    pub severity: String,
    /// `true` exactly when `severity` is `"blocking"`.
    pub blocking: bool,
    /// `true` when at least one tier passed.
    pub pass: bool,
    /// Number of route/tier combinations that were run.
    pub total: usize,
    /// Number of tiers whose `pass` flag is set.
    pub passed: usize,
    /// `total - passed`.
    pub failed: usize,
    /// Timeout tier id of the first passing entry in `tiers`, if any.
    pub first_correct_tier: Option<String>,
    /// Route id of the first passing entry in `tiers`, if any.
    pub first_correct_route: Option<String>,
    /// Index into `tiers` of the first passing entry, if any.
    pub first_correct_index: Option<usize>,
    /// Resolved artifact root directory, rendered as a string.
    pub artifact_root: String,
    /// Per-combination reports, route-major (all tiers of the first route,
    /// then all tiers of the next).
    pub tiers: Vec<PersonaEvalTierReport>,
    /// Copied from the manifest unchanged.
    pub metadata: BTreeMap<String, serde_json::Value>,
}
130
/// Result of running one model route under one timeout tier.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct PersonaEvalTierReport {
    /// `<route id>::<tier id>`.
    pub id: String,
    pub route_id: String,
    /// Model route as reported by the driver's run summary.
    pub model_route: Option<String>,
    /// Id of the timeout tier that was applied.
    pub timeout_tier: String,
    pub timeout_ms: Option<u64>,
    /// Effective cost budget: the tier's value, else the route's.
    pub max_cost_usd: Option<f64>,
    /// Effective latency budget: the tier's `max_latency_ms`, else its `timeout_ms`.
    pub max_latency_ms: Option<u64>,
    pub pass: bool,
    /// One of the `PersonaEvalTierOutcome::as_str` labels.
    pub outcome: String,
    /// Human-readable explanations of why the run did not pass; empty on a pass.
    pub degradation_reasons: Vec<String>,
    pub transcript_path: Option<String>,
    pub receipt_path: String,
    /// Location of the `summary.json` written by the ladder for this run.
    pub summary_path: String,
    // The counters below are copied from the driver's run summary.
    pub event_count: u64,
    pub cost_usd: f64,
    pub latency_ms: u64,
    pub tool_calls: u64,
    pub model_calls: u64,
    pub oracle_error_findings: usize,
    pub oracle_warn_findings: usize,
    pub state_machine_coverage: StateMachineCoverage,
}
156
/// Summary of the state-machine steps seen during one run.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct StateMachineCoverage {
    /// Number of distinct steps, i.e. `observed_steps.len()`.
    pub observed: usize,
    /// Distinct step names in sorted order.
    pub observed_steps: Vec<String>,
    /// The run's transitions, copied verbatim.
    pub transitions: Vec<StateTransition>,
}
164
165pub fn load_persona_eval_ladder_manifest(
166    path: &Path,
167) -> Result<PersonaEvalLadderManifest, VmError> {
168    let content = fs::read_to_string(path).map_err(|error| {
169        VmError::Runtime(format!(
170            "failed to read persona eval ladder manifest {}: {error}",
171            path.display()
172        ))
173    })?;
174    let mut manifest: PersonaEvalLadderManifest =
175        if path.extension().and_then(|ext| ext.to_str()) == Some("json") {
176            serde_json::from_str(&content).map_err(|error| {
177                VmError::Runtime(format!(
178                    "failed to parse persona eval ladder JSON {}: {error}",
179                    path.display()
180                ))
181            })?
182        } else {
183            toml::from_str(&content).map_err(|error| {
184                VmError::Runtime(format!(
185                    "failed to parse persona eval ladder TOML {}: {error}",
186                    path.display()
187                ))
188            })?
189        };
190    normalize_persona_eval_ladder_manifest(&mut manifest);
191    if manifest.base_dir.is_none() {
192        manifest.base_dir = path.parent().map(|parent| parent.display().to_string());
193    }
194    Ok(manifest)
195}
196
197pub fn normalize_persona_eval_ladder_manifest_value(
198    value: &VmValue,
199) -> Result<PersonaEvalLadderManifest, VmError> {
200    let mut manifest: PersonaEvalLadderManifest = parse_json_value(value)?;
201    normalize_persona_eval_ladder_manifest(&mut manifest);
202    Ok(manifest)
203}
204
205pub fn normalize_persona_eval_ladder_manifest(manifest: &mut PersonaEvalLadderManifest) {
206    if manifest.type_name.is_empty() {
207        manifest.type_name = MANIFEST_TYPE.to_string();
208    }
209    if manifest.version == 0 {
210        manifest.version = 1;
211    }
212    if manifest.id.trim().is_empty() {
213        manifest.id = manifest
214            .name
215            .clone()
216            .filter(|name| !name.trim().is_empty())
217            .unwrap_or_else(|| new_id("persona_eval_ladder"));
218    }
219    if manifest.persona.trim().is_empty() {
220        manifest.persona = DEFAULT_PERSONA.to_string();
221    }
222    if manifest.backend.kind.trim().is_empty() {
223        manifest.backend.kind = "replay".to_string();
224    }
225    if manifest.model_routes.is_empty() {
226        manifest.model_routes.push(PersonaEvalModelRoute {
227            id: "default".to_string(),
228            ..Default::default()
229        });
230    }
231    for (index, route) in manifest.model_routes.iter_mut().enumerate() {
232        if route.id.trim().is_empty() {
233            route.id = format!("route_{}", index + 1);
234        }
235    }
236    for (index, tier) in manifest.timeout_tiers.iter_mut().enumerate() {
237        if tier.id.trim().is_empty() {
238            tier.id = format!("tier_{}", index + 1);
239        }
240    }
241}
242
243pub fn run_persona_eval_ladder(
244    manifest: &PersonaEvalLadderManifest,
245) -> Result<PersonaEvalLadderReport, VmError> {
246    let mut manifest = manifest.clone();
247    normalize_persona_eval_ladder_manifest(&mut manifest);
248    if manifest.persona != DEFAULT_PERSONA {
249        return Err(VmError::Runtime(format!(
250            "persona eval ladder only supports persona '{}', got '{}'",
251            DEFAULT_PERSONA, manifest.persona
252        )));
253    }
254    if manifest.timeout_tiers.is_empty() {
255        return Err(VmError::Runtime(format!(
256            "persona eval ladder '{}' must declare at least one timeout tier",
257            manifest.id
258        )));
259    }
260
261    let base_dir = manifest.base_dir.as_deref().map(Path::new);
262    let backend = resolve_ladder_backend(&manifest.backend, base_dir)?;
263    let artifact_root = resolve_artifact_root(&manifest, base_dir);
264    fs::create_dir_all(&artifact_root).map_err(|error| {
265        VmError::Runtime(format!(
266            "failed to create persona eval ladder artifact root {}: {error}",
267            artifact_root.display()
268        ))
269    })?;
270
271    let mut tiers = Vec::new();
272    for route in &manifest.model_routes {
273        for tier in &manifest.timeout_tiers {
274            let index = tiers.len();
275            tiers.push(run_ladder_tier(
276                &backend,
277                &artifact_root,
278                route,
279                tier,
280                index,
281            )?);
282        }
283    }
284
285    let first_correct_index = tiers.iter().position(|tier| tier.pass);
286    let (first_correct_tier, first_correct_route) = first_correct_index
287        .and_then(|index| tiers.get(index))
288        .map(|tier| (Some(tier.timeout_tier.clone()), Some(tier.route_id.clone())))
289        .unwrap_or((None, None));
290    let passed = tiers.iter().filter(|tier| tier.pass).count();
291    let total = tiers.len();
292    let severity = normalize_ladder_severity(manifest.severity.as_deref());
293    Ok(PersonaEvalLadderReport {
294        type_name: REPORT_TYPE.to_string(),
295        version: 1,
296        id: manifest.id,
297        persona: manifest.persona,
298        blocking: severity == "blocking",
299        severity,
300        pass: first_correct_index.is_some(),
301        total,
302        passed,
303        failed: total.saturating_sub(passed),
304        first_correct_tier,
305        first_correct_route,
306        first_correct_index,
307        artifact_root: artifact_root.display().to_string(),
308        tiers,
309        metadata: manifest.metadata,
310    })
311}
312
313fn resolve_ladder_backend(
314    spec: &PersonaEvalLadderBackendSpec,
315    base_dir: Option<&Path>,
316) -> Result<MergeCaptainDriverBackend, VmError> {
317    match spec.kind.trim().to_ascii_lowercase().as_str() {
318        "live" => Ok(MergeCaptainDriverBackend::Live),
319        "mock" => {
320            let path = spec.path.as_deref().ok_or_else(|| {
321                VmError::Runtime("mock ladder backend requires backend.path".to_string())
322            })?;
323            Ok(MergeCaptainDriverBackend::Mock {
324                playground_dir: resolve_manifest_path(base_dir, path),
325            })
326        }
327        "replay" => {
328            let path = spec.path.as_deref().ok_or_else(|| {
329                VmError::Runtime("replay ladder backend requires backend.path".to_string())
330            })?;
331            Ok(MergeCaptainDriverBackend::Replay {
332                fixture: resolve_manifest_path(base_dir, path),
333            })
334        }
335        other => Err(VmError::Runtime(format!(
336            "unsupported persona eval ladder backend '{other}'"
337        ))),
338    }
339}
340
341fn resolve_artifact_root(manifest: &PersonaEvalLadderManifest, base_dir: Option<&Path>) -> PathBuf {
342    let root = manifest
343        .artifact_root
344        .clone()
345        .unwrap_or_else(|| format!(".harn-runs/persona-eval-ladders/{}", manifest.id));
346    resolve_manifest_path(base_dir, &root)
347}
348
/// Resolves a path taken from a manifest.
///
/// Absolute paths are returned unchanged. Relative paths are joined onto
/// `base_dir` when one is given, and returned as-is otherwise.
fn resolve_manifest_path(base_dir: Option<&Path>, path: &str) -> PathBuf {
    let candidate = Path::new(path);
    match base_dir {
        Some(root) if !candidate.is_absolute() => root.join(candidate),
        _ => candidate.to_path_buf(),
    }
}
359
360fn run_ladder_tier(
361    backend: &MergeCaptainDriverBackend,
362    artifact_root: &Path,
363    route: &PersonaEvalModelRoute,
364    tier: &PersonaEvalTimeoutTier,
365    index: usize,
366) -> Result<PersonaEvalTierReport, VmError> {
367    let tier_dir = artifact_root
368        .join(format!("{:02}-{}", index + 1, safe_path_segment(&route.id)))
369        .join(safe_path_segment(&tier.id));
370    fs::create_dir_all(&tier_dir).map_err(|error| {
371        VmError::Runtime(format!(
372            "failed to create persona eval ladder tier dir {}: {error}",
373            tier_dir.display()
374        ))
375    })?;
376
377    let transcript_path = tier_dir.join("event_log.jsonl");
378    let receipt_path = tier_dir.join("receipt.json");
379    let summary_path = tier_dir.join("summary.json");
380    let max_sweeps = tier.max_sweeps.unwrap_or(1).max(1);
381    let options = MergeCaptainDriverOptions {
382        backend: backend.clone(),
383        mode: if max_sweeps > 1 {
384            MergeCaptainDriverMode::Watch
385        } else {
386            MergeCaptainDriverMode::Once
387        },
388        model_route: Some(route.route.clone().unwrap_or_else(|| route.id.clone())),
389        timeout_tier: Some(tier.id.clone()),
390        transcript_out: Some(transcript_path),
391        receipt_out: Some(receipt_path),
392        run_root: tier_dir.join("runs"),
393        max_sweeps,
394        watch_backoff_ms: tier.watch_backoff_ms.unwrap_or(0),
395        stream_stdout: false,
396    };
397
398    let output = super::run_merge_captain_driver(options)?;
399    write_json_file(&summary_path, &output.summary)?;
400
401    let mut reasons = degradation_reasons(&output.summary, route, tier);
402    if !output.summary.pass {
403        reasons.push(format!(
404            "oracle reported {} error finding(s) and {} warning finding(s)",
405            output.summary.oracle_error_findings, output.summary.oracle_warn_findings
406        ));
407    }
408    let looped = output.audit_report.findings.iter().any(|finding| {
409        let message = finding.message.to_ascii_lowercase();
410        message.contains("loop") || message.contains("stuck")
411    });
412    let pass = output.summary.pass && reasons.is_empty();
413    let outcome = if looped {
414        PersonaEvalTierOutcome::Loop
415    } else if pass {
416        PersonaEvalTierOutcome::Correct
417    } else {
418        PersonaEvalTierOutcome::Degraded
419    };
420
421    Ok(PersonaEvalTierReport {
422        id: format!("{}::{}", route.id, tier.id),
423        route_id: route.id.clone(),
424        model_route: output.summary.model_route.clone(),
425        timeout_tier: tier.id.clone(),
426        timeout_ms: tier.timeout_ms,
427        max_cost_usd: tier.max_cost_usd.or(route.max_cost_usd),
428        max_latency_ms: tier.max_latency_ms.or(tier.timeout_ms),
429        pass,
430        outcome: outcome.as_str().to_string(),
431        degradation_reasons: reasons,
432        transcript_path: output
433            .transcript_path
434            .as_deref()
435            .map(|path| path.display().to_string()),
436        receipt_path: output.receipt_path.display().to_string(),
437        summary_path: summary_path.display().to_string(),
438        event_count: output.summary.event_count,
439        cost_usd: output.summary.cost_usd,
440        latency_ms: output.summary.latency_ms,
441        tool_calls: output.summary.tool_calls,
442        model_calls: output.summary.model_calls,
443        oracle_error_findings: output.summary.oracle_error_findings,
444        oracle_warn_findings: output.summary.oracle_warn_findings,
445        state_machine_coverage: state_machine_coverage(&output.summary.state_transitions),
446    })
447}
448
449fn degradation_reasons(
450    summary: &MergeCaptainRunSummary,
451    route: &PersonaEvalModelRoute,
452    tier: &PersonaEvalTimeoutTier,
453) -> Vec<String> {
454    let mut reasons = Vec::new();
455    if let Some(max_tool_calls) = tier.max_tool_calls {
456        if summary.tool_calls > max_tool_calls {
457            reasons.push(format!(
458                "tool calls {} exceeded tier budget {}",
459                summary.tool_calls, max_tool_calls
460            ));
461        }
462    }
463    if let Some(max_model_calls) = tier.max_model_calls.or(route.max_model_calls) {
464        if summary.model_calls > max_model_calls {
465            reasons.push(format!(
466                "model calls {} exceeded budget {}",
467                summary.model_calls, max_model_calls
468            ));
469        }
470    }
471    if let Some(max_cost_usd) = tier.max_cost_usd.or(route.max_cost_usd) {
472        if summary.cost_usd > max_cost_usd {
473            reasons.push(format!(
474                "cost ${:.6} exceeded budget ${:.6}",
475                summary.cost_usd, max_cost_usd
476            ));
477        }
478    }
479    if let Some(max_latency_ms) = tier.max_latency_ms.or(tier.timeout_ms) {
480        if summary.latency_ms > max_latency_ms {
481            reasons.push(format!(
482                "latency {}ms exceeded tier timeout {}ms",
483                summary.latency_ms, max_latency_ms
484            ));
485        }
486    }
487    reasons
488}
489
490fn state_machine_coverage(transitions: &[StateTransition]) -> StateMachineCoverage {
491    let observed_steps: Vec<String> = transitions
492        .iter()
493        .map(|transition| transition.step.clone())
494        .collect::<BTreeSet<_>>()
495        .into_iter()
496        .collect();
497    StateMachineCoverage {
498        observed: observed_steps.len(),
499        observed_steps,
500        transitions: transitions.to_vec(),
501    }
502}
503
/// Maps a free-form severity onto one of the three canonical labels.
///
/// The input is trimmed and lowercased (ASCII). `warn`/`warning` become
/// `"warning"`, `info`/`informational` become `"informational"`, and
/// everything else — including a missing value — becomes `"blocking"`.
fn normalize_ladder_severity(value: Option<&str>) -> String {
    let requested = match value {
        Some(raw) => raw.trim().to_ascii_lowercase(),
        None => "blocking".to_string(),
    };
    let canonical = match requested.as_str() {
        "warn" | "warning" => "warning",
        "info" | "informational" => "informational",
        _ => "blocking",
    };
    canonical.to_string()
}
516
/// Turns an arbitrary id into a string that is safe as a single path component.
///
/// ASCII letters, digits, `-`, and `_` are kept; every other character
/// (including `/`, `.`, spaces, and non-ASCII) becomes `_`. An empty input
/// yields `"unnamed"`.
fn safe_path_segment(value: &str) -> String {
    if value.is_empty() {
        return "unnamed".to_string();
    }
    value
        .chars()
        .map(|ch| match ch {
            'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_' => ch,
            _ => '_',
        })
        .collect()
}
532
533fn write_json_file<T: Serialize>(path: &Path, value: &T) -> Result<(), VmError> {
534    let mut bytes = serde_json::to_vec_pretty(value)
535        .map_err(|error| VmError::Runtime(format!("failed to serialize JSON artifact: {error}")))?;
536    bytes.push(b'\n');
537    fs::write(path, bytes).map_err(|error| {
538        VmError::Runtime(format!(
539            "failed to write artifact {}: {error}",
540            path.display()
541        ))
542    })
543}
544
#[cfg(test)]
mod tests {
    use super::*;

    /// Directory two levels above this crate's `CARGO_MANIFEST_DIR`.
    // NOTE(review): presumably the workspace root that contains `examples/` — confirm.
    fn repo_root() -> PathBuf {
        PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .parent()
            .unwrap()
            .parent()
            .unwrap()
            .to_path_buf()
    }

    /// Replays a fixture through a two-tier ladder and checks that the first
    /// tier is reported as degraded, the second as correct, and that the
    /// transcript and receipt artifacts exist on disk.
    #[test]
    fn ladder_marks_first_correct_tier_and_writes_artifacts() {
        let temp = tempfile::tempdir().unwrap();
        let manifest = PersonaEvalLadderManifest {
            id: "merge-captain-ladder-test".to_string(),
            base_dir: Some(repo_root().display().to_string()),
            // Absolute path, so artifacts land in the temp dir rather than under `base_dir`.
            artifact_root: Some(temp.path().join("ladder").display().to_string()),
            backend: PersonaEvalLadderBackendSpec {
                kind: "replay".to_string(),
                path: Some(
                    "examples/personas/merge_captain/transcripts/green_pr.jsonl".to_string(),
                ),
            },
            model_routes: vec![PersonaEvalModelRoute {
                id: "gemma-value".to_string(),
                route: Some("local/gemma-value".to_string()),
                provider: Some("llama.cpp".to_string()),
                model: Some("gemma".to_string()),
                profile: Some("value".to_string()),
                ..Default::default()
            }],
            timeout_tiers: vec![
                // Expected to be reported as degraded (asserted below).
                PersonaEvalTimeoutTier {
                    id: "too-tight".to_string(),
                    max_tool_calls: Some(1),
                    ..Default::default()
                },
                // Expected to be the first correct tier (asserted below).
                PersonaEvalTimeoutTier {
                    id: "balanced".to_string(),
                    max_tool_calls: Some(4),
                    max_model_calls: Some(1),
                    ..Default::default()
                },
            ],
            ..Default::default()
        };

        let report = run_persona_eval_ladder(&manifest).unwrap();

        assert!(report.pass);
        assert_eq!(report.total, 2);
        assert_eq!(report.first_correct_tier.as_deref(), Some("balanced"));
        assert_eq!(report.first_correct_route.as_deref(), Some("gemma-value"));
        assert_eq!(report.tiers[0].outcome, "degraded");
        assert_eq!(report.tiers[1].outcome, "correct");
        assert!(Path::new(&report.tiers[0].transcript_path.as_ref().unwrap()).exists());
        assert!(Path::new(&report.tiers[1].receipt_path).exists());
        assert!(report.tiers[1].state_machine_coverage.observed > 0);
    }

    /// A manifest naming any persona other than the default is rejected
    /// before a backend is resolved or any artifact is written.
    #[test]
    fn unsupported_persona_is_rejected() {
        let manifest = PersonaEvalLadderManifest {
            id: "other-persona".to_string(),
            persona: "ship_captain".to_string(),
            timeout_tiers: vec![PersonaEvalTimeoutTier {
                id: "smoke".to_string(),
                ..Default::default()
            }],
            ..Default::default()
        };

        let error = run_persona_eval_ladder(&manifest).unwrap_err();

        assert!(format!("{error}").contains("only supports persona"));
    }
}