1use std::collections::{BTreeMap, BTreeSet};
10use std::fs;
11use std::path::{Path, PathBuf};
12
13use serde::{Deserialize, Serialize};
14
15use crate::value::{VmError, VmValue};
16
17use super::{
18 new_id, parse_json_value, MergeCaptainDriverBackend, MergeCaptainDriverMode,
19 MergeCaptainDriverOptions, MergeCaptainRunSummary, StateTransition,
20};
21
/// `_type` tag stamped onto manifests that do not declare one.
const MANIFEST_TYPE: &str = "persona_eval_ladder_manifest";
/// `_type` tag written into every ladder report.
const REPORT_TYPE: &str = "persona_eval_ladder_report";
/// Persona assumed when a manifest leaves `persona` blank; it is also the only
/// persona the ladder runner accepts.
const DEFAULT_PERSONA: &str = "merge_captain";
25
26#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
27#[serde(default)]
28pub struct PersonaEvalLadderManifest {
29 #[serde(rename = "_type")]
30 pub type_name: String,
31 pub version: u32,
32 pub id: String,
33 pub name: Option<String>,
34 pub description: Option<String>,
35 pub persona: String,
36 pub base_dir: Option<String>,
37 #[serde(alias = "artifact-root")]
38 pub artifact_root: Option<String>,
39 pub severity: Option<String>,
40 pub backend: PersonaEvalLadderBackendSpec,
41 #[serde(alias = "model-routes")]
42 pub model_routes: Vec<PersonaEvalModelRoute>,
43 #[serde(alias = "timeout-tiers")]
44 pub timeout_tiers: Vec<PersonaEvalTimeoutTier>,
45 pub metadata: BTreeMap<String, serde_json::Value>,
46}
47
48#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
49#[serde(default)]
50pub struct PersonaEvalLadderBackendSpec {
51 pub kind: String,
52 pub path: Option<String>,
53}
54
55#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
56#[serde(default)]
57pub struct PersonaEvalModelRoute {
58 pub id: String,
59 pub route: Option<String>,
60 pub provider: Option<String>,
61 pub model: Option<String>,
62 pub profile: Option<String>,
63 #[serde(alias = "max-cost-usd")]
64 pub max_cost_usd: Option<f64>,
65 #[serde(alias = "max-model-calls")]
66 pub max_model_calls: Option<u64>,
67 pub metadata: BTreeMap<String, serde_json::Value>,
68}
69
70#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
71#[serde(default)]
72pub struct PersonaEvalTimeoutTier {
73 pub id: String,
74 #[serde(alias = "timeout-ms")]
75 pub timeout_ms: Option<u64>,
76 #[serde(alias = "max-latency-ms")]
77 pub max_latency_ms: Option<u64>,
78 #[serde(alias = "max-cost-usd")]
79 pub max_cost_usd: Option<f64>,
80 #[serde(alias = "max-tool-calls")]
81 pub max_tool_calls: Option<u64>,
82 #[serde(alias = "max-model-calls")]
83 pub max_model_calls: Option<u64>,
84 #[serde(alias = "max-sweeps")]
85 pub max_sweeps: Option<u32>,
86 #[serde(alias = "watch-backoff-ms")]
87 pub watch_backoff_ms: Option<u64>,
88 pub metadata: BTreeMap<String, serde_json::Value>,
89}
90
91#[derive(Clone, Copy, Debug, Eq, PartialEq, Serialize, Deserialize)]
92#[serde(rename_all = "snake_case")]
93pub enum PersonaEvalTierOutcome {
94 Correct,
95 Degraded,
96 Loop,
97}
98
99impl PersonaEvalTierOutcome {
100 pub fn as_str(self) -> &'static str {
101 match self {
102 Self::Correct => "correct",
103 Self::Degraded => "degraded",
104 Self::Loop => "loop",
105 }
106 }
107}
108
109#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
110#[serde(default)]
111pub struct PersonaEvalLadderReport {
112 #[serde(rename = "_type")]
113 pub type_name: String,
114 pub version: u32,
115 pub id: String,
116 pub persona: String,
117 pub severity: String,
118 pub blocking: bool,
119 pub pass: bool,
120 pub total: usize,
121 pub passed: usize,
122 pub failed: usize,
123 pub first_correct_tier: Option<String>,
124 pub first_correct_route: Option<String>,
125 pub first_correct_index: Option<usize>,
126 pub artifact_root: String,
127 pub tiers: Vec<PersonaEvalTierReport>,
128 pub metadata: BTreeMap<String, serde_json::Value>,
129}
130
131#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
132#[serde(default)]
133pub struct PersonaEvalTierReport {
134 pub id: String,
135 pub route_id: String,
136 pub model_route: Option<String>,
137 pub timeout_tier: String,
138 pub timeout_ms: Option<u64>,
139 pub max_cost_usd: Option<f64>,
140 pub max_latency_ms: Option<u64>,
141 pub pass: bool,
142 pub outcome: String,
143 pub degradation_reasons: Vec<String>,
144 pub transcript_path: Option<String>,
145 pub receipt_path: String,
146 pub summary_path: String,
147 pub event_count: u64,
148 pub cost_usd: f64,
149 pub latency_ms: u64,
150 pub tool_calls: u64,
151 pub model_calls: u64,
152 pub oracle_error_findings: usize,
153 pub oracle_warn_findings: usize,
154 pub state_machine_coverage: StateMachineCoverage,
155}
156
157#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
158#[serde(default)]
159pub struct StateMachineCoverage {
160 pub observed: usize,
161 pub observed_steps: Vec<String>,
162 pub transitions: Vec<StateTransition>,
163}
164
165pub fn load_persona_eval_ladder_manifest(
166 path: &Path,
167) -> Result<PersonaEvalLadderManifest, VmError> {
168 let content = fs::read_to_string(path).map_err(|error| {
169 VmError::Runtime(format!(
170 "failed to read persona eval ladder manifest {}: {error}",
171 path.display()
172 ))
173 })?;
174 let mut manifest: PersonaEvalLadderManifest =
175 if path.extension().and_then(|ext| ext.to_str()) == Some("json") {
176 serde_json::from_str(&content).map_err(|error| {
177 VmError::Runtime(format!(
178 "failed to parse persona eval ladder JSON {}: {error}",
179 path.display()
180 ))
181 })?
182 } else {
183 toml::from_str(&content).map_err(|error| {
184 VmError::Runtime(format!(
185 "failed to parse persona eval ladder TOML {}: {error}",
186 path.display()
187 ))
188 })?
189 };
190 normalize_persona_eval_ladder_manifest(&mut manifest);
191 if manifest.base_dir.is_none() {
192 manifest.base_dir = path.parent().map(|parent| parent.display().to_string());
193 }
194 Ok(manifest)
195}
196
197pub fn normalize_persona_eval_ladder_manifest_value(
198 value: &VmValue,
199) -> Result<PersonaEvalLadderManifest, VmError> {
200 let mut manifest: PersonaEvalLadderManifest = parse_json_value(value)?;
201 normalize_persona_eval_ladder_manifest(&mut manifest);
202 Ok(manifest)
203}
204
205pub fn normalize_persona_eval_ladder_manifest(manifest: &mut PersonaEvalLadderManifest) {
206 if manifest.type_name.is_empty() {
207 manifest.type_name = MANIFEST_TYPE.to_string();
208 }
209 if manifest.version == 0 {
210 manifest.version = 1;
211 }
212 if manifest.id.trim().is_empty() {
213 manifest.id = manifest
214 .name
215 .clone()
216 .filter(|name| !name.trim().is_empty())
217 .unwrap_or_else(|| new_id("persona_eval_ladder"));
218 }
219 if manifest.persona.trim().is_empty() {
220 manifest.persona = DEFAULT_PERSONA.to_string();
221 }
222 if manifest.backend.kind.trim().is_empty() {
223 manifest.backend.kind = "replay".to_string();
224 }
225 if manifest.model_routes.is_empty() {
226 manifest.model_routes.push(PersonaEvalModelRoute {
227 id: "default".to_string(),
228 ..Default::default()
229 });
230 }
231 for (index, route) in manifest.model_routes.iter_mut().enumerate() {
232 if route.id.trim().is_empty() {
233 route.id = format!("route_{}", index + 1);
234 }
235 }
236 for (index, tier) in manifest.timeout_tiers.iter_mut().enumerate() {
237 if tier.id.trim().is_empty() {
238 tier.id = format!("tier_{}", index + 1);
239 }
240 }
241}
242
243pub fn run_persona_eval_ladder(
244 manifest: &PersonaEvalLadderManifest,
245) -> Result<PersonaEvalLadderReport, VmError> {
246 let mut manifest = manifest.clone();
247 normalize_persona_eval_ladder_manifest(&mut manifest);
248 if manifest.persona != DEFAULT_PERSONA {
249 return Err(VmError::Runtime(format!(
250 "persona eval ladder only supports persona '{}', got '{}'",
251 DEFAULT_PERSONA, manifest.persona
252 )));
253 }
254 if manifest.timeout_tiers.is_empty() {
255 return Err(VmError::Runtime(format!(
256 "persona eval ladder '{}' must declare at least one timeout tier",
257 manifest.id
258 )));
259 }
260
261 let base_dir = manifest.base_dir.as_deref().map(Path::new);
262 let backend = resolve_ladder_backend(&manifest.backend, base_dir)?;
263 let artifact_root = resolve_artifact_root(&manifest, base_dir);
264 fs::create_dir_all(&artifact_root).map_err(|error| {
265 VmError::Runtime(format!(
266 "failed to create persona eval ladder artifact root {}: {error}",
267 artifact_root.display()
268 ))
269 })?;
270
271 let mut tiers = Vec::new();
272 for route in &manifest.model_routes {
273 for tier in &manifest.timeout_tiers {
274 let index = tiers.len();
275 tiers.push(run_ladder_tier(
276 &backend,
277 &artifact_root,
278 route,
279 tier,
280 index,
281 )?);
282 }
283 }
284
285 let first_correct_index = tiers.iter().position(|tier| tier.pass);
286 let (first_correct_tier, first_correct_route) = first_correct_index
287 .and_then(|index| tiers.get(index))
288 .map(|tier| (Some(tier.timeout_tier.clone()), Some(tier.route_id.clone())))
289 .unwrap_or((None, None));
290 let passed = tiers.iter().filter(|tier| tier.pass).count();
291 let total = tiers.len();
292 let severity = normalize_ladder_severity(manifest.severity.as_deref());
293 Ok(PersonaEvalLadderReport {
294 type_name: REPORT_TYPE.to_string(),
295 version: 1,
296 id: manifest.id,
297 persona: manifest.persona,
298 blocking: severity == "blocking",
299 severity,
300 pass: first_correct_index.is_some(),
301 total,
302 passed,
303 failed: total.saturating_sub(passed),
304 first_correct_tier,
305 first_correct_route,
306 first_correct_index,
307 artifact_root: artifact_root.display().to_string(),
308 tiers,
309 metadata: manifest.metadata,
310 })
311}
312
313fn resolve_ladder_backend(
314 spec: &PersonaEvalLadderBackendSpec,
315 base_dir: Option<&Path>,
316) -> Result<MergeCaptainDriverBackend, VmError> {
317 match spec.kind.trim().to_ascii_lowercase().as_str() {
318 "live" => Ok(MergeCaptainDriverBackend::Live),
319 "mock" => {
320 let path = spec.path.as_deref().ok_or_else(|| {
321 VmError::Runtime("mock ladder backend requires backend.path".to_string())
322 })?;
323 Ok(MergeCaptainDriverBackend::Mock {
324 playground_dir: resolve_manifest_path(base_dir, path),
325 })
326 }
327 "replay" => {
328 let path = spec.path.as_deref().ok_or_else(|| {
329 VmError::Runtime("replay ladder backend requires backend.path".to_string())
330 })?;
331 Ok(MergeCaptainDriverBackend::Replay {
332 fixture: resolve_manifest_path(base_dir, path),
333 })
334 }
335 other => Err(VmError::Runtime(format!(
336 "unsupported persona eval ladder backend '{other}'"
337 ))),
338 }
339}
340
341fn resolve_artifact_root(manifest: &PersonaEvalLadderManifest, base_dir: Option<&Path>) -> PathBuf {
342 let root = manifest
343 .artifact_root
344 .clone()
345 .unwrap_or_else(|| format!(".harn-runs/persona-eval-ladders/{}", manifest.id));
346 resolve_manifest_path(base_dir, &root)
347}
348
/// Resolves a path taken from a manifest.
///
/// A relative `path` is joined onto `base_dir` when one is given. Absolute
/// paths, and any path when there is no base directory, are returned unchanged.
fn resolve_manifest_path(base_dir: Option<&Path>, path: &str) -> PathBuf {
    let candidate = Path::new(path);
    match base_dir {
        Some(root) if candidate.is_relative() => root.join(candidate),
        _ => candidate.to_path_buf(),
    }
}
359
360fn run_ladder_tier(
361 backend: &MergeCaptainDriverBackend,
362 artifact_root: &Path,
363 route: &PersonaEvalModelRoute,
364 tier: &PersonaEvalTimeoutTier,
365 index: usize,
366) -> Result<PersonaEvalTierReport, VmError> {
367 let tier_dir = artifact_root
368 .join(format!("{:02}-{}", index + 1, safe_path_segment(&route.id)))
369 .join(safe_path_segment(&tier.id));
370 fs::create_dir_all(&tier_dir).map_err(|error| {
371 VmError::Runtime(format!(
372 "failed to create persona eval ladder tier dir {}: {error}",
373 tier_dir.display()
374 ))
375 })?;
376
377 let transcript_path = tier_dir.join("event_log.jsonl");
378 let receipt_path = tier_dir.join("receipt.json");
379 let summary_path = tier_dir.join("summary.json");
380 let max_sweeps = tier.max_sweeps.unwrap_or(1).max(1);
381 let options = MergeCaptainDriverOptions {
382 backend: backend.clone(),
383 mode: if max_sweeps > 1 {
384 MergeCaptainDriverMode::Watch
385 } else {
386 MergeCaptainDriverMode::Once
387 },
388 model_route: Some(route.route.clone().unwrap_or_else(|| route.id.clone())),
389 timeout_tier: Some(tier.id.clone()),
390 transcript_out: Some(transcript_path),
391 receipt_out: Some(receipt_path),
392 run_root: tier_dir.join("runs"),
393 max_sweeps,
394 watch_backoff_ms: tier.watch_backoff_ms.unwrap_or(0),
395 stream_stdout: false,
396 };
397
398 let output = super::run_merge_captain_driver(options)?;
399 write_json_file(&summary_path, &output.summary)?;
400
401 let mut reasons = degradation_reasons(&output.summary, route, tier);
402 if !output.summary.pass {
403 reasons.push(format!(
404 "oracle reported {} error finding(s) and {} warning finding(s)",
405 output.summary.oracle_error_findings, output.summary.oracle_warn_findings
406 ));
407 }
408 let looped = output.audit_report.findings.iter().any(|finding| {
409 let message = finding.message.to_ascii_lowercase();
410 message.contains("loop") || message.contains("stuck")
411 });
412 let pass = output.summary.pass && reasons.is_empty();
413 let outcome = if looped {
414 PersonaEvalTierOutcome::Loop
415 } else if pass {
416 PersonaEvalTierOutcome::Correct
417 } else {
418 PersonaEvalTierOutcome::Degraded
419 };
420
421 Ok(PersonaEvalTierReport {
422 id: format!("{}::{}", route.id, tier.id),
423 route_id: route.id.clone(),
424 model_route: output.summary.model_route.clone(),
425 timeout_tier: tier.id.clone(),
426 timeout_ms: tier.timeout_ms,
427 max_cost_usd: tier.max_cost_usd.or(route.max_cost_usd),
428 max_latency_ms: tier.max_latency_ms.or(tier.timeout_ms),
429 pass,
430 outcome: outcome.as_str().to_string(),
431 degradation_reasons: reasons,
432 transcript_path: output
433 .transcript_path
434 .as_deref()
435 .map(|path| path.display().to_string()),
436 receipt_path: output.receipt_path.display().to_string(),
437 summary_path: summary_path.display().to_string(),
438 event_count: output.summary.event_count,
439 cost_usd: output.summary.cost_usd,
440 latency_ms: output.summary.latency_ms,
441 tool_calls: output.summary.tool_calls,
442 model_calls: output.summary.model_calls,
443 oracle_error_findings: output.summary.oracle_error_findings,
444 oracle_warn_findings: output.summary.oracle_warn_findings,
445 state_machine_coverage: state_machine_coverage(&output.summary.state_transitions),
446 })
447}
448
449fn degradation_reasons(
450 summary: &MergeCaptainRunSummary,
451 route: &PersonaEvalModelRoute,
452 tier: &PersonaEvalTimeoutTier,
453) -> Vec<String> {
454 let mut reasons = Vec::new();
455 if let Some(max_tool_calls) = tier.max_tool_calls {
456 if summary.tool_calls > max_tool_calls {
457 reasons.push(format!(
458 "tool calls {} exceeded tier budget {}",
459 summary.tool_calls, max_tool_calls
460 ));
461 }
462 }
463 if let Some(max_model_calls) = tier.max_model_calls.or(route.max_model_calls) {
464 if summary.model_calls > max_model_calls {
465 reasons.push(format!(
466 "model calls {} exceeded budget {}",
467 summary.model_calls, max_model_calls
468 ));
469 }
470 }
471 if let Some(max_cost_usd) = tier.max_cost_usd.or(route.max_cost_usd) {
472 if summary.cost_usd > max_cost_usd {
473 reasons.push(format!(
474 "cost ${:.6} exceeded budget ${:.6}",
475 summary.cost_usd, max_cost_usd
476 ));
477 }
478 }
479 if let Some(max_latency_ms) = tier.max_latency_ms.or(tier.timeout_ms) {
480 if summary.latency_ms > max_latency_ms {
481 reasons.push(format!(
482 "latency {}ms exceeded tier timeout {}ms",
483 summary.latency_ms, max_latency_ms
484 ));
485 }
486 }
487 reasons
488}
489
490fn state_machine_coverage(transitions: &[StateTransition]) -> StateMachineCoverage {
491 let observed_steps: Vec<String> = transitions
492 .iter()
493 .map(|transition| transition.step.clone())
494 .collect::<BTreeSet<_>>()
495 .into_iter()
496 .collect();
497 StateMachineCoverage {
498 observed: observed_steps.len(),
499 observed_steps,
500 transitions: transitions.to_vec(),
501 }
502}
503
/// Maps a free-form severity string onto one of the three canonical levels.
///
/// The input is trimmed and lowercased. `warn`/`warning` give `"warning"`,
/// `info`/`informational` give `"informational"`, and everything else,
/// including `None`, gives `"blocking"`.
fn normalize_ladder_severity(value: Option<&str>) -> String {
    let requested = match value {
        Some(raw) => raw.trim().to_ascii_lowercase(),
        None => String::from("blocking"),
    };

    let canonical = if matches!(requested.as_str(), "warn" | "warning") {
        "warning"
    } else if matches!(requested.as_str(), "info" | "informational") {
        "informational"
    } else {
        "blocking"
    };
    canonical.to_string()
}
516
/// Makes an identifier safe to use as a single path segment.
///
/// ASCII letters, ASCII digits, `-` and `_` are kept; every other character
/// (including `.`, path separators and non-ASCII text) becomes one `_`. An
/// empty input gives `"unnamed"`.
fn safe_path_segment(value: &str) -> String {
    let sanitized: String = value
        .chars()
        .map(|ch| {
            if ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_') {
                ch
            } else {
                '_'
            }
        })
        .collect();

    if sanitized.is_empty() {
        String::from("unnamed")
    } else {
        sanitized
    }
}
532
533fn write_json_file<T: Serialize>(path: &Path, value: &T) -> Result<(), VmError> {
534 let mut bytes = serde_json::to_vec_pretty(value)
535 .map_err(|error| VmError::Runtime(format!("failed to serialize JSON artifact: {error}")))?;
536 bytes.push(b'\n');
537 fs::write(path, bytes).map_err(|error| {
538 VmError::Runtime(format!(
539 "failed to write artifact {}: {error}",
540 path.display()
541 ))
542 })
543}
544
#[cfg(test)]
mod tests {
    use super::*;

    /// Repository root, taken as the grandparent of this crate's manifest
    /// directory.
    fn repo_root() -> PathBuf {
        Path::new(env!("CARGO_MANIFEST_DIR"))
            .ancestors()
            .nth(2)
            .unwrap()
            .to_path_buf()
    }

    /// Replays the green-PR transcript through two tiers and expects the
    /// ladder to report the second tier as the first correct one.
    #[test]
    fn ladder_marks_first_correct_tier_and_writes_artifacts() {
        let scratch = tempfile::tempdir().unwrap();
        let artifact_root = scratch.path().join("ladder").display().to_string();

        let route = PersonaEvalModelRoute {
            id: "gemma-value".to_string(),
            route: Some("local/gemma-value".to_string()),
            provider: Some("llama.cpp".to_string()),
            model: Some("gemma".to_string()),
            profile: Some("value".to_string()),
            ..Default::default()
        };
        let too_tight = PersonaEvalTimeoutTier {
            id: "too-tight".to_string(),
            max_tool_calls: Some(1),
            ..Default::default()
        };
        let balanced = PersonaEvalTimeoutTier {
            id: "balanced".to_string(),
            max_tool_calls: Some(4),
            max_model_calls: Some(1),
            ..Default::default()
        };
        let manifest = PersonaEvalLadderManifest {
            id: "merge-captain-ladder-test".to_string(),
            base_dir: Some(repo_root().display().to_string()),
            artifact_root: Some(artifact_root),
            backend: PersonaEvalLadderBackendSpec {
                kind: "replay".to_string(),
                path: Some(
                    "examples/personas/merge_captain/transcripts/green_pr.jsonl".to_string(),
                ),
            },
            model_routes: vec![route],
            timeout_tiers: vec![too_tight, balanced],
            ..Default::default()
        };

        let report = run_persona_eval_ladder(&manifest).unwrap();

        assert!(report.pass);
        assert_eq!(report.total, 2);
        assert_eq!(report.first_correct_tier.as_deref(), Some("balanced"));
        assert_eq!(report.first_correct_route.as_deref(), Some("gemma-value"));

        let (first, second) = (&report.tiers[0], &report.tiers[1]);
        assert_eq!(first.outcome, "degraded");
        assert_eq!(second.outcome, "correct");

        let transcript = first.transcript_path.as_deref().map(Path::new).unwrap();
        assert!(transcript.exists());
        assert!(Path::new(&second.receipt_path).exists());
        assert!(second.state_machine_coverage.observed > 0);
    }

    /// Any persona other than the default one must be refused before a run
    /// is attempted.
    #[test]
    fn unsupported_persona_is_rejected() {
        let smoke_tier = PersonaEvalTimeoutTier {
            id: "smoke".to_string(),
            ..Default::default()
        };
        let manifest = PersonaEvalLadderManifest {
            id: "other-persona".to_string(),
            persona: "ship_captain".to_string(),
            timeout_tiers: vec![smoke_tier],
            ..Default::default()
        };

        let error = run_persona_eval_ladder(&manifest).unwrap_err();

        assert!(error.to_string().contains("only supports persona"));
    }
}