1use std::collections::{BTreeMap, BTreeSet};
10use std::fs;
11use std::path::{Path, PathBuf};
12
13use serde::{Deserialize, Serialize};
14
15use crate::value::{VmError, VmValue};
16
17use super::{
18 new_id, parse_json_value, MergeCaptainDriverBackend, MergeCaptainDriverMode,
19 MergeCaptainDriverOptions, MergeCaptainRunSummary, StateTransition,
20};
21
/// `_type` tag filled into manifests that do not declare one.
const MANIFEST_TYPE: &str = "persona_eval_ladder_manifest";
/// `_type` tag stamped on every ladder report.
const REPORT_TYPE: &str = "persona_eval_ladder_report";
/// Persona assumed when a manifest leaves `persona` blank; also the only persona the runner accepts.
const DEFAULT_PERSONA: &str = "merge_captain";
25
26#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
27#[serde(default)]
28pub struct PersonaEvalLadderManifest {
29 #[serde(rename = "_type")]
30 pub type_name: String,
31 pub version: u32,
32 pub id: String,
33 pub name: Option<String>,
34 pub description: Option<String>,
35 pub persona: String,
36 pub base_dir: Option<String>,
37 #[serde(alias = "artifact-root")]
38 pub artifact_root: Option<String>,
39 pub severity: Option<String>,
40 pub backend: PersonaEvalLadderBackendSpec,
41 #[serde(alias = "model-routes")]
42 pub model_routes: Vec<PersonaEvalModelRoute>,
43 #[serde(alias = "timeout-tiers")]
44 pub timeout_tiers: Vec<PersonaEvalTimeoutTier>,
45 pub metadata: BTreeMap<String, serde_json::Value>,
46}
47
48#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
49#[serde(default)]
50pub struct PersonaEvalLadderBackendSpec {
51 pub kind: String,
52 pub path: Option<String>,
53}
54
55#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
56#[serde(default)]
57pub struct PersonaEvalModelRoute {
58 pub id: String,
59 pub route: Option<String>,
60 pub provider: Option<String>,
61 pub model: Option<String>,
62 pub profile: Option<String>,
63 #[serde(alias = "max-cost-usd")]
64 pub max_cost_usd: Option<f64>,
65 #[serde(alias = "max-model-calls")]
66 pub max_model_calls: Option<u64>,
67 pub metadata: BTreeMap<String, serde_json::Value>,
68}
69
70#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
71#[serde(default)]
72pub struct PersonaEvalTimeoutTier {
73 pub id: String,
74 #[serde(alias = "timeout-ms")]
75 pub timeout_ms: Option<u64>,
76 #[serde(alias = "max-latency-ms")]
77 pub max_latency_ms: Option<u64>,
78 #[serde(alias = "max-cost-usd")]
79 pub max_cost_usd: Option<f64>,
80 #[serde(alias = "max-tool-calls")]
81 pub max_tool_calls: Option<u64>,
82 #[serde(alias = "max-model-calls")]
83 pub max_model_calls: Option<u64>,
84 #[serde(alias = "max-sweeps")]
85 pub max_sweeps: Option<u32>,
86 #[serde(alias = "watch-backoff-ms")]
87 pub watch_backoff_ms: Option<u64>,
88 pub metadata: BTreeMap<String, serde_json::Value>,
89}
90
91#[derive(Clone, Copy, Debug, Eq, PartialEq, Serialize, Deserialize)]
92#[serde(rename_all = "snake_case")]
93pub enum PersonaEvalTierOutcome {
94 Correct,
95 Degraded,
96 Loop,
97}
98
99impl PersonaEvalTierOutcome {
100 pub fn as_str(self) -> &'static str {
101 match self {
102 Self::Correct => "correct",
103 Self::Degraded => "degraded",
104 Self::Loop => "loop",
105 }
106 }
107}
108
109#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
110#[serde(default)]
111pub struct PersonaEvalLadderReport {
112 #[serde(rename = "_type")]
113 pub type_name: String,
114 pub version: u32,
115 pub id: String,
116 pub persona: String,
117 pub severity: String,
118 pub blocking: bool,
119 pub pass: bool,
120 pub total: usize,
121 pub passed: usize,
122 pub failed: usize,
123 pub first_correct_tier: Option<String>,
124 pub first_correct_route: Option<String>,
125 pub first_correct_index: Option<usize>,
126 pub artifact_root: String,
127 pub tiers: Vec<PersonaEvalTierReport>,
128 pub metadata: BTreeMap<String, serde_json::Value>,
129}
130
131#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
132#[serde(default)]
133pub struct PersonaEvalTierReport {
134 pub id: String,
135 pub route_id: String,
136 pub model_route: Option<String>,
137 pub timeout_tier: String,
138 pub timeout_ms: Option<u64>,
139 pub max_cost_usd: Option<f64>,
140 pub max_latency_ms: Option<u64>,
141 pub pass: bool,
142 pub outcome: String,
143 pub degradation_reasons: Vec<String>,
144 pub transcript_path: Option<String>,
145 pub receipt_path: String,
146 pub summary_path: String,
147 pub event_count: u64,
148 pub cost_usd: f64,
149 pub latency_ms: u64,
150 pub tool_calls: u64,
151 pub model_calls: u64,
152 pub oracle_error_findings: usize,
153 pub oracle_warn_findings: usize,
154 pub state_machine_coverage: StateMachineCoverage,
155}
156
157#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
158#[serde(default)]
159pub struct StateMachineCoverage {
160 pub observed: usize,
161 pub observed_steps: Vec<String>,
162 pub transitions: Vec<StateTransition>,
163}
164
165pub fn load_persona_eval_ladder_manifest(
166 path: &Path,
167) -> Result<PersonaEvalLadderManifest, VmError> {
168 let content = fs::read_to_string(path).map_err(|error| {
169 VmError::Runtime(format!(
170 "failed to read persona eval ladder manifest {}: {error}",
171 path.display()
172 ))
173 })?;
174 let mut manifest: PersonaEvalLadderManifest =
175 if path.extension().and_then(|ext| ext.to_str()) == Some("json") {
176 serde_json::from_str(&content).map_err(|error| {
177 VmError::Runtime(format!(
178 "failed to parse persona eval ladder JSON {}: {error}",
179 path.display()
180 ))
181 })?
182 } else {
183 toml::from_str(&content).map_err(|error| {
184 VmError::Runtime(format!(
185 "failed to parse persona eval ladder TOML {}: {error}",
186 path.display()
187 ))
188 })?
189 };
190 normalize_persona_eval_ladder_manifest(&mut manifest);
191 if manifest.base_dir.is_none() {
192 manifest.base_dir = path.parent().map(|parent| parent.display().to_string());
193 }
194 Ok(manifest)
195}
196
197pub fn normalize_persona_eval_ladder_manifest_value(
198 value: &VmValue,
199) -> Result<PersonaEvalLadderManifest, VmError> {
200 let mut manifest: PersonaEvalLadderManifest = parse_json_value(value)?;
201 normalize_persona_eval_ladder_manifest(&mut manifest);
202 Ok(manifest)
203}
204
205pub fn normalize_persona_eval_ladder_manifest(manifest: &mut PersonaEvalLadderManifest) {
206 if manifest.type_name.is_empty() {
207 manifest.type_name = MANIFEST_TYPE.to_string();
208 }
209 if manifest.version == 0 {
210 manifest.version = 1;
211 }
212 if manifest.id.trim().is_empty() {
213 manifest.id = manifest
214 .name
215 .clone()
216 .filter(|name| !name.trim().is_empty())
217 .unwrap_or_else(|| new_id("persona_eval_ladder"));
218 }
219 if manifest.persona.trim().is_empty() {
220 manifest.persona = DEFAULT_PERSONA.to_string();
221 }
222 if manifest.backend.kind.trim().is_empty() {
223 manifest.backend.kind = "replay".to_string();
224 }
225 if manifest.model_routes.is_empty() {
226 manifest.model_routes.push(PersonaEvalModelRoute {
227 id: "default".to_string(),
228 ..Default::default()
229 });
230 }
231 for (index, route) in manifest.model_routes.iter_mut().enumerate() {
232 if route.id.trim().is_empty() {
233 route.id = format!("route_{}", index + 1);
234 }
235 }
236 for (index, tier) in manifest.timeout_tiers.iter_mut().enumerate() {
237 if tier.id.trim().is_empty() {
238 tier.id = format!("tier_{}", index + 1);
239 }
240 }
241}
242
243pub fn run_persona_eval_ladder(
244 manifest: &PersonaEvalLadderManifest,
245) -> Result<PersonaEvalLadderReport, VmError> {
246 let mut manifest = manifest.clone();
247 normalize_persona_eval_ladder_manifest(&mut manifest);
248 if manifest.persona != DEFAULT_PERSONA {
249 return Err(VmError::Runtime(format!(
250 "persona eval ladder only supports persona '{}', got '{}'",
251 DEFAULT_PERSONA, manifest.persona
252 )));
253 }
254 if manifest.timeout_tiers.is_empty() {
255 return Err(VmError::Runtime(format!(
256 "persona eval ladder '{}' must declare at least one timeout tier",
257 manifest.id
258 )));
259 }
260
261 let base_dir = manifest.base_dir.as_deref().map(Path::new);
262 let backend = resolve_ladder_backend(&manifest.backend, base_dir)?;
263 let artifact_root = resolve_artifact_root(&manifest, base_dir);
264 fs::create_dir_all(&artifact_root).map_err(|error| {
265 VmError::Runtime(format!(
266 "failed to create persona eval ladder artifact root {}: {error}",
267 artifact_root.display()
268 ))
269 })?;
270
271 let mut tiers = Vec::new();
272 for route in &manifest.model_routes {
273 for tier in &manifest.timeout_tiers {
274 let index = tiers.len();
275 tiers.push(run_ladder_tier(
276 &backend,
277 &artifact_root,
278 route,
279 tier,
280 index,
281 )?);
282 }
283 }
284
285 let first_correct_index = tiers.iter().position(|tier| tier.pass);
286 let (first_correct_tier, first_correct_route) = first_correct_index
287 .and_then(|index| tiers.get(index))
288 .map(|tier| (Some(tier.timeout_tier.clone()), Some(tier.route_id.clone())))
289 .unwrap_or((None, None));
290 let passed = tiers.iter().filter(|tier| tier.pass).count();
291 let total = tiers.len();
292 let severity = normalize_ladder_severity(manifest.severity.as_deref());
293 Ok(PersonaEvalLadderReport {
294 type_name: REPORT_TYPE.to_string(),
295 version: 1,
296 id: manifest.id,
297 persona: manifest.persona,
298 blocking: severity == "blocking",
299 severity,
300 pass: first_correct_index.is_some(),
301 total,
302 passed,
303 failed: total.saturating_sub(passed),
304 first_correct_tier,
305 first_correct_route,
306 first_correct_index,
307 artifact_root: artifact_root.display().to_string(),
308 tiers,
309 metadata: manifest.metadata,
310 })
311}
312
313fn resolve_ladder_backend(
314 spec: &PersonaEvalLadderBackendSpec,
315 base_dir: Option<&Path>,
316) -> Result<MergeCaptainDriverBackend, VmError> {
317 match spec.kind.trim().to_ascii_lowercase().as_str() {
318 "live" => Ok(MergeCaptainDriverBackend::Live),
319 "mock" => {
320 let path = spec.path.as_deref().ok_or_else(|| {
321 VmError::Runtime("mock ladder backend requires backend.path".to_string())
322 })?;
323 Ok(MergeCaptainDriverBackend::Mock {
324 playground_dir: resolve_manifest_path(base_dir, path),
325 })
326 }
327 "replay" => {
328 let path = spec.path.as_deref().ok_or_else(|| {
329 VmError::Runtime("replay ladder backend requires backend.path".to_string())
330 })?;
331 Ok(MergeCaptainDriverBackend::Replay {
332 fixture: resolve_manifest_path(base_dir, path),
333 })
334 }
335 other => Err(VmError::Runtime(format!(
336 "unsupported persona eval ladder backend '{}'",
337 other
338 ))),
339 }
340}
341
342fn resolve_artifact_root(manifest: &PersonaEvalLadderManifest, base_dir: Option<&Path>) -> PathBuf {
343 let root = manifest
344 .artifact_root
345 .clone()
346 .unwrap_or_else(|| format!(".harn-runs/persona-eval-ladders/{}", manifest.id));
347 resolve_manifest_path(base_dir, &root)
348}
349
/// Resolves a path taken from a manifest.
///
/// Absolute paths are returned unchanged. Relative paths are joined onto `base_dir` when one
/// is given, and returned as-is otherwise.
fn resolve_manifest_path(base_dir: Option<&Path>, path: &str) -> PathBuf {
    let candidate = Path::new(path);
    match base_dir {
        Some(root) if !candidate.is_absolute() => root.join(candidate),
        _ => candidate.to_path_buf(),
    }
}
360
361fn run_ladder_tier(
362 backend: &MergeCaptainDriverBackend,
363 artifact_root: &Path,
364 route: &PersonaEvalModelRoute,
365 tier: &PersonaEvalTimeoutTier,
366 index: usize,
367) -> Result<PersonaEvalTierReport, VmError> {
368 let tier_dir = artifact_root
369 .join(format!("{:02}-{}", index + 1, safe_path_segment(&route.id)))
370 .join(safe_path_segment(&tier.id));
371 fs::create_dir_all(&tier_dir).map_err(|error| {
372 VmError::Runtime(format!(
373 "failed to create persona eval ladder tier dir {}: {error}",
374 tier_dir.display()
375 ))
376 })?;
377
378 let transcript_path = tier_dir.join("event_log.jsonl");
379 let receipt_path = tier_dir.join("receipt.json");
380 let summary_path = tier_dir.join("summary.json");
381 let max_sweeps = tier.max_sweeps.unwrap_or(1).max(1);
382 let options = MergeCaptainDriverOptions {
383 backend: backend.clone(),
384 mode: if max_sweeps > 1 {
385 MergeCaptainDriverMode::Watch
386 } else {
387 MergeCaptainDriverMode::Once
388 },
389 model_route: Some(route.route.clone().unwrap_or_else(|| route.id.clone())),
390 timeout_tier: Some(tier.id.clone()),
391 transcript_out: Some(transcript_path.clone()),
392 receipt_out: Some(receipt_path.clone()),
393 run_root: tier_dir.join("runs"),
394 max_sweeps,
395 watch_backoff_ms: tier.watch_backoff_ms.unwrap_or(0),
396 stream_stdout: false,
397 };
398
399 let output = super::run_merge_captain_driver(options)?;
400 write_json_file(&summary_path, &output.summary)?;
401
402 let mut reasons = degradation_reasons(&output.summary, route, tier);
403 if !output.summary.pass {
404 reasons.push(format!(
405 "oracle reported {} error finding(s) and {} warning finding(s)",
406 output.summary.oracle_error_findings, output.summary.oracle_warn_findings
407 ));
408 }
409 let looped = output.audit_report.findings.iter().any(|finding| {
410 let message = finding.message.to_ascii_lowercase();
411 message.contains("loop") || message.contains("stuck")
412 });
413 let pass = output.summary.pass && reasons.is_empty();
414 let outcome = if looped {
415 PersonaEvalTierOutcome::Loop
416 } else if pass {
417 PersonaEvalTierOutcome::Correct
418 } else {
419 PersonaEvalTierOutcome::Degraded
420 };
421
422 Ok(PersonaEvalTierReport {
423 id: format!("{}::{}", route.id, tier.id),
424 route_id: route.id.clone(),
425 model_route: output.summary.model_route.clone(),
426 timeout_tier: tier.id.clone(),
427 timeout_ms: tier.timeout_ms,
428 max_cost_usd: tier.max_cost_usd.or(route.max_cost_usd),
429 max_latency_ms: tier.max_latency_ms.or(tier.timeout_ms),
430 pass,
431 outcome: outcome.as_str().to_string(),
432 degradation_reasons: reasons,
433 transcript_path: output
434 .transcript_path
435 .as_deref()
436 .map(|path| path.display().to_string()),
437 receipt_path: output.receipt_path.display().to_string(),
438 summary_path: summary_path.display().to_string(),
439 event_count: output.summary.event_count,
440 cost_usd: output.summary.cost_usd,
441 latency_ms: output.summary.latency_ms,
442 tool_calls: output.summary.tool_calls,
443 model_calls: output.summary.model_calls,
444 oracle_error_findings: output.summary.oracle_error_findings,
445 oracle_warn_findings: output.summary.oracle_warn_findings,
446 state_machine_coverage: state_machine_coverage(&output.summary.state_transitions),
447 })
448}
449
450fn degradation_reasons(
451 summary: &MergeCaptainRunSummary,
452 route: &PersonaEvalModelRoute,
453 tier: &PersonaEvalTimeoutTier,
454) -> Vec<String> {
455 let mut reasons = Vec::new();
456 if let Some(max_tool_calls) = tier.max_tool_calls {
457 if summary.tool_calls > max_tool_calls {
458 reasons.push(format!(
459 "tool calls {} exceeded tier budget {}",
460 summary.tool_calls, max_tool_calls
461 ));
462 }
463 }
464 if let Some(max_model_calls) = tier.max_model_calls.or(route.max_model_calls) {
465 if summary.model_calls > max_model_calls {
466 reasons.push(format!(
467 "model calls {} exceeded budget {}",
468 summary.model_calls, max_model_calls
469 ));
470 }
471 }
472 if let Some(max_cost_usd) = tier.max_cost_usd.or(route.max_cost_usd) {
473 if summary.cost_usd > max_cost_usd {
474 reasons.push(format!(
475 "cost ${:.6} exceeded budget ${:.6}",
476 summary.cost_usd, max_cost_usd
477 ));
478 }
479 }
480 if let Some(max_latency_ms) = tier.max_latency_ms.or(tier.timeout_ms) {
481 if summary.latency_ms > max_latency_ms {
482 reasons.push(format!(
483 "latency {}ms exceeded tier timeout {}ms",
484 summary.latency_ms, max_latency_ms
485 ));
486 }
487 }
488 reasons
489}
490
491fn state_machine_coverage(transitions: &[StateTransition]) -> StateMachineCoverage {
492 let observed_steps: Vec<String> = transitions
493 .iter()
494 .map(|transition| transition.step.clone())
495 .collect::<BTreeSet<_>>()
496 .into_iter()
497 .collect();
498 StateMachineCoverage {
499 observed: observed_steps.len(),
500 observed_steps,
501 transitions: transitions.to_vec(),
502 }
503}
504
/// Maps a free-form severity onto one of `blocking`, `warning`, or `informational`.
///
/// The input is trimmed and compared case-insensitively. `warn`/`warning` become `warning`,
/// `info`/`informational` become `informational`, and everything else, including a missing
/// value, becomes `blocking`.
fn normalize_ladder_severity(value: Option<&str>) -> String {
    let lowered = value.unwrap_or("blocking").trim().to_ascii_lowercase();
    let canonical = if matches!(lowered.as_str(), "warn" | "warning") {
        "warning"
    } else if matches!(lowered.as_str(), "info" | "informational") {
        "informational"
    } else {
        "blocking"
    };
    String::from(canonical)
}
517
/// Turns an arbitrary id into a string that is safe to use as a single path component.
///
/// ASCII letters, digits, `-`, and `_` are kept; every other character is replaced with `_`.
/// An empty input yields `unnamed`.
fn safe_path_segment(value: &str) -> String {
    let sanitized: String = value
        .chars()
        .map(|ch| {
            if ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_') {
                ch
            } else {
                '_'
            }
        })
        .collect();
    if sanitized.is_empty() {
        String::from("unnamed")
    } else {
        sanitized
    }
}
533
534fn write_json_file<T: Serialize>(path: &Path, value: &T) -> Result<(), VmError> {
535 let mut bytes = serde_json::to_vec_pretty(value)
536 .map_err(|error| VmError::Runtime(format!("failed to serialize JSON artifact: {error}")))?;
537 bytes.push(b'\n');
538 fs::write(path, bytes).map_err(|error| {
539 VmError::Runtime(format!(
540 "failed to write artifact {}: {error}",
541 path.display()
542 ))
543 })
544}
545
#[cfg(test)]
mod tests {
    use super::*;

    /// Returns the directory two levels above this crate's manifest directory.
    fn repo_root() -> PathBuf {
        Path::new(env!("CARGO_MANIFEST_DIR"))
            .ancestors()
            .nth(2)
            .expect("crate directory should have a grandparent directory")
            .to_path_buf()
    }

    /// Builds a replay ladder over the green-PR transcript: one route and two tiers, the first
    /// of which has a tool-call budget of 1.
    fn green_pr_manifest(artifact_root: &Path) -> PersonaEvalLadderManifest {
        let route = PersonaEvalModelRoute {
            id: "gemma-value".to_string(),
            route: Some("local/gemma-value".to_string()),
            provider: Some("llama.cpp".to_string()),
            model: Some("gemma".to_string()),
            profile: Some("value".to_string()),
            ..Default::default()
        };
        let too_tight = PersonaEvalTimeoutTier {
            id: "too-tight".to_string(),
            max_tool_calls: Some(1),
            ..Default::default()
        };
        let balanced = PersonaEvalTimeoutTier {
            id: "balanced".to_string(),
            max_tool_calls: Some(4),
            max_model_calls: Some(1),
            ..Default::default()
        };
        PersonaEvalLadderManifest {
            id: "merge-captain-ladder-test".to_string(),
            base_dir: Some(repo_root().display().to_string()),
            artifact_root: Some(artifact_root.display().to_string()),
            backend: PersonaEvalLadderBackendSpec {
                kind: "replay".to_string(),
                path: Some(
                    "examples/personas/merge_captain/transcripts/green_pr.jsonl".to_string(),
                ),
            },
            model_routes: vec![route],
            timeout_tiers: vec![too_tight, balanced],
            ..Default::default()
        }
    }

    #[test]
    fn ladder_marks_first_correct_tier_and_writes_artifacts() {
        let temp = tempfile::tempdir().unwrap();
        let manifest = green_pr_manifest(&temp.path().join("ladder"));

        let report = run_persona_eval_ladder(&manifest).unwrap();

        assert!(report.pass);
        assert_eq!(report.total, 2);
        assert_eq!(report.first_correct_tier.as_deref(), Some("balanced"));
        assert_eq!(report.first_correct_route.as_deref(), Some("gemma-value"));

        let tight = &report.tiers[0];
        let balanced = &report.tiers[1];
        assert_eq!(tight.outcome, "degraded");
        assert_eq!(balanced.outcome, "correct");
        assert!(Path::new(tight.transcript_path.as_ref().unwrap()).exists());
        assert!(Path::new(&balanced.receipt_path).exists());
        assert!(balanced.state_machine_coverage.observed > 0);
    }

    #[test]
    fn unsupported_persona_is_rejected() {
        let smoke_tier = PersonaEvalTimeoutTier {
            id: "smoke".to_string(),
            ..Default::default()
        };
        let manifest = PersonaEvalLadderManifest {
            id: "other-persona".to_string(),
            persona: "ship_captain".to_string(),
            timeout_tiers: vec![smoke_tier],
            ..Default::default()
        };

        let error = run_persona_eval_ladder(&manifest).unwrap_err();

        assert!(format!("{error}").contains("only supports persona"));
    }
}