1use std::collections::HashSet;
24use std::fs;
25use std::path::{Path, PathBuf};
26use std::process;
27use std::thread;
28
29use harn_vm::testbench::annotations::{
30 annotations_for_record, validate_against_tape, AnnotationKind, AnnotationTape,
31};
32use harn_vm::testbench::fidelity::{compare, FidelityMode, FidelityReport};
33use harn_vm::testbench::overlay_fs::{render_unified_diff, DiffEntry, DiffKind};
34use harn_vm::testbench::tape::EventTape;
35use harn_vm::testbench::{
36 ClockConfig, FilesystemConfig, LlmConfig, NetworkConfig, SubprocessConfig, TapeConfig,
37 Testbench,
38};
39
40use crate::cli::{
41 TestBenchCommand, TestBenchExportAnnotationsArgs, TestBenchFidelityArgs, TestBenchReplayArgs,
42 TestBenchRunArgs, TestBenchValidateAnnotationsArgs,
43};
44use crate::commands::run::{execute_run, CliLlmMockMode, RunOutcome, RunProfileOptions};
45use crate::CLI_RUNTIME_STACK_SIZE;
46
/// Fallback virtual start time in Unix epoch milliseconds
/// (`1_767_225_600_000` is 2026-01-01T00:00:00Z).
///
/// Used when neither the CLI (`start_at_ms`) nor a recorded tape header
/// supplies a starting timestamp.
const DEFAULT_TESTBENCH_START_MS: i64 = 1_767_225_600_000;
52
/// Where the "replay" side of a fidelity comparison comes from.
enum ReplaySource {
    /// Re-run the script and compare the freshly emitted tape against the
    /// recorded one (selected when `against` is supplied).
    ReRun,
    /// Load an already-emitted replay tape from the given path.
    Tape(String),
}
60
61pub(crate) async fn run(command: TestBenchCommand) {
62 let outcome = match command {
63 TestBenchCommand::Run(args) => run_args(args).await,
64 TestBenchCommand::Replay(args) => replay_args(args).await,
65 TestBenchCommand::Fidelity(args) => fidelity_args(args).await,
66 TestBenchCommand::ValidateAnnotations(args) => validate_annotations_args(args),
67 TestBenchCommand::ExportAnnotations(args) => export_annotations_args(args),
68 };
69 flush_outcome(outcome);
70}
71
72async fn run_args(args: TestBenchRunArgs) -> RunOutcome {
73 let bench = match build_testbench(&args) {
74 Ok(bench) => bench,
75 Err(message) => return error_outcome(message),
76 };
77 let llm_mode = match build_llm_mode(&args) {
78 Ok(mode) => mode,
79 Err(message) => return error_outcome(message),
80 };
81 match args.runtime.as_str() {
82 "paused-tokio" | "" => run_with_bench(args, bench, llm_mode).await,
83 "des" => run_with_des_runtime(args, bench, llm_mode).await,
84 other => error_outcome(format!(
85 "--runtime must be `paused-tokio` or `des`, got `{other}`"
86 )),
87 }
88}
89
90async fn run_with_bench(
93 args: TestBenchRunArgs,
94 bench: Testbench,
95 llm_mode: CliLlmMockMode,
96) -> RunOutcome {
97 let session = match bench.activate() {
98 Ok(session) => session,
99 Err(error) => return error_outcome(format!("activate testbench: {error}")),
100 };
101 let outcome = execute_run(
102 &args.file,
103 false,
104 HashSet::new(),
105 args.argv.clone(),
106 Vec::new(),
107 llm_mode,
108 None,
109 RunProfileOptions::default(),
110 )
111 .await;
112 finalize_session(outcome, session, &args)
113}
114
115async fn run_with_des_runtime(
130 args: TestBenchRunArgs,
131 bench: Testbench,
132 llm_mode: CliLlmMockMode,
133) -> RunOutcome {
134 let (tx, rx) = std::sync::mpsc::channel();
135 thread::Builder::new()
136 .name("harn-des".to_string())
137 .stack_size(CLI_RUNTIME_STACK_SIZE)
138 .spawn(move || {
139 let rt = tokio::runtime::Builder::new_current_thread()
140 .enable_all()
141 .build()
142 .unwrap_or_else(|e| panic!("failed to build DES runtime: {e}"));
143 let outcome = rt.block_on(async move {
144 harn_vm::reset_thread_local_state();
145 let session = match bench.activate() {
146 Ok(s) => s,
147 Err(e) => return error_outcome(format!("activate testbench: {e}")),
148 };
149 let outcome = execute_run(
150 &args.file,
151 false,
152 HashSet::new(),
153 args.argv.clone(),
154 Vec::new(),
155 llm_mode,
156 None,
157 RunProfileOptions::default(),
158 )
159 .await;
160 finalize_session(outcome, session, &args)
161 });
162 let _ = tx.send(outcome);
163 })
164 .expect("spawn DES thread");
165 tokio::task::spawn_blocking(move || {
166 rx.recv()
167 .unwrap_or_else(|_| error_outcome("DES runtime thread panicked".to_string()))
168 })
169 .await
170 .unwrap_or_else(|e| error_outcome(format!("DES runtime blocking task failed: {e:?}")))
171}
172
173fn build_llm_mode(args: &TestBenchRunArgs) -> Result<CliLlmMockMode, String> {
174 match (&args.llm_fixture, &args.llm_record) {
175 (Some(_), Some(_)) => Err("--llm-fixture and --llm-record are mutually exclusive".into()),
176 (Some(path), None) => Ok(CliLlmMockMode::Replay {
177 fixture_path: PathBuf::from(path),
178 }),
179 (None, Some(path)) => Ok(CliLlmMockMode::Record {
180 fixture_path: PathBuf::from(path),
181 }),
182 (None, None) => Ok(CliLlmMockMode::Off),
183 }
184}
185
186fn finalize_session(
187 outcome: RunOutcome,
188 session: harn_vm::testbench::TestbenchSession,
189 args: &TestBenchRunArgs,
190) -> RunOutcome {
191 let finalize = match session.finalize() {
192 Ok(f) => f,
193 Err(error) => return append_error(outcome, format!("finalize testbench: {error}")),
194 };
195 let mut outcome = outcome;
196 if matches!(args.network.as_str(), "deny") {
197 outcome
198 .stderr
199 .push_str("[testbench] network=deny applied for the duration of the run.\n");
200 }
201 if let Some(diff_path) = args.emit_diff.as_ref() {
202 if let Err(error) = persist_overlay_diff(&finalize.fs_diff, &PathBuf::from(diff_path)) {
203 outcome.stderr.push_str(&format!(
204 "warning: failed to write fs diff to {diff_path}: {error}\n"
205 ));
206 }
207 } else if !finalize.fs_diff.is_empty() {
208 outcome
209 .stderr
210 .push_str(&render_diff_summary(&finalize.fs_diff));
211 }
212 if let Some(record_path) = args.process_record.as_ref() {
213 outcome.stderr.push_str(&format!(
214 "[testbench] recorded {} subprocess invocation(s) to {record_path}.\n",
215 finalize.recorded_subprocesses.len()
216 ));
217 }
218 if let Some(toolchain_dir) = args.process_wasi.as_ref() {
219 outcome.stderr.push_str(&format!(
220 "[testbench] subprocess invocations resolved against WASI toolchain at {toolchain_dir}.\n"
221 ));
222 }
223 if let Some(tape) = finalize.tape.as_ref() {
224 outcome.stderr.push_str(&format!(
225 "[testbench] emitted unified tape with {} record(s) to {}.\n",
226 tape.records,
227 tape.path.display(),
228 ));
229 }
230 for leak in &finalize.clock_leaks {
231 outcome.stderr.push_str(&format!(
232 "[testbench] clock leak: {} (count={})\n",
233 leak.capability_id, leak.count,
234 ));
235 }
236 outcome
237}
238
239async fn replay_args(args: TestBenchReplayArgs) -> RunOutcome {
240 let annotations_loaded = match args.annotations.as_deref() {
243 None => None,
244 Some(path) => match AnnotationTape::load(Path::new(path)) {
245 Ok(tape) => Some((path.to_string(), tape)),
246 Err(error) => {
247 return error_outcome(format!("load annotations {path}: {error}"));
248 }
249 },
250 };
251
252 let tape_temp = if annotations_loaded.is_some() && args.emit_tape.is_none() {
257 match tempfile::tempdir() {
258 Ok(dir) => Some(dir),
259 Err(error) => return error_outcome(format!("tempdir for replay tape: {error}")),
260 }
261 } else {
262 None
263 };
264 let emit_tape_path = match (&args.emit_tape, tape_temp.as_ref()) {
265 (Some(path), _) => Some(path.clone()),
266 (None, Some(dir)) => Some(dir.path().join("run.tape").to_string_lossy().into_owned()),
267 (None, None) => None,
268 };
269
270 let derived = TestBenchRunArgs {
271 file: args.file.clone(),
272 start_at_ms: args.start_at_ms,
273 clock: "paused".to_string(),
274 llm_fixture: args.llm_fixture.clone(),
275 llm_record: None,
276 fs_overlay: args.fs_overlay.clone(),
277 process_replay: Some(args.process_tape.clone()),
278 process_record: None,
279 process_wasi: None,
280 network: "deny".to_string(),
281 allow_host: Vec::new(),
282 emit_diff: None,
283 emit_tape: emit_tape_path.clone(),
284 runtime: "paused-tokio".to_string(),
285 argv: args.argv.clone(),
286 };
287 let mut outcome = run_args(derived).await;
288
289 if let (Some((annotations_path, annotations)), Some(tape_path)) =
290 (annotations_loaded, emit_tape_path)
291 {
292 match EventTape::load(Path::new(&tape_path)) {
293 Ok(tape) => {
294 let report = validate_against_tape(&annotations, &tape);
295 outcome.stderr.push_str(&render_annotations_block(
296 &annotations_path,
297 &annotations,
298 &tape,
299 ));
300 if !report.is_ok() {
301 outcome.stderr.push_str(&format!(
302 "[testbench] annotations validation failed with {} problem(s); see `harn test-bench validate-annotations` for the structured report.\n",
303 report.problems.len()
304 ));
305 outcome.exit_code = outcome.exit_code.max(2);
306 }
307 }
308 Err(error) => {
309 outcome.stderr.push_str(&format!(
310 "warning: failed to load tape for annotation surfacing: {error}\n"
311 ));
312 }
313 }
314 }
315 outcome
316}
317
318fn render_annotations_block(
322 annotations_path: &str,
323 annotations: &AnnotationTape,
324 tape: &EventTape,
325) -> String {
326 let mut out = String::new();
327 out.push_str(&format!(
328 "[annotations] loaded {} annotation(s) from {annotations_path}\n",
329 annotations.annotations.len()
330 ));
331 let mut sorted_records: Vec<_> = tape.records.iter().collect();
332 sorted_records.sort_by_key(|record| record.seq);
333 for record in sorted_records {
334 let matches = annotations_for_record(annotations, record);
335 if matches.is_empty() {
336 continue;
337 }
338 out.push_str(&format!(
339 " event seq={} virtual_time_ms={} kind={}\n",
340 record.seq,
341 record.virtual_time_ms,
342 record.kind.label(),
343 ));
344 for annotation in matches {
345 let label = annotation.kind.as_str();
346 let evidence = annotation
347 .evidence
348 .as_deref()
349 .unwrap_or("(no evidence)")
350 .lines()
351 .next()
352 .unwrap_or("(no evidence)");
353 let id = if annotation.id.is_empty() {
354 "(no id)".to_string()
355 } else {
356 annotation.id.clone()
357 };
358 out.push_str(&format!(" [{label}] {id}: {evidence}\n"));
359 }
360 }
361 out
362}
363
364fn validate_annotations_args(args: TestBenchValidateAnnotationsArgs) -> RunOutcome {
365 let tape = match EventTape::load(Path::new(&args.tape)) {
366 Ok(tape) => tape,
367 Err(error) => return error_outcome(format!("load tape {}: {error}", args.tape)),
368 };
369 let annotations = match AnnotationTape::load(Path::new(&args.annotations)) {
370 Ok(tape) => tape,
371 Err(error) => {
372 return error_outcome(format!("load annotations {}: {error}", args.annotations));
373 }
374 };
375 let report = validate_against_tape(&annotations, &tape);
376 let json = match serde_json::to_string_pretty(&report) {
377 Ok(json) => json,
378 Err(error) => return error_outcome(format!("serialize validation report: {error}")),
379 };
380 let mut outcome = RunOutcome::default();
381 if let Some(path) = args.report.as_deref() {
382 if let Err(error) = persist_text(&json, Path::new(path)) {
383 return error_outcome(format!("write validation report: {error}"));
384 }
385 outcome.stderr.push_str(&format!(
386 "[testbench] annotations validation: checked={} problems={} ({})\n",
387 report.annotations_checked,
388 report.problems.len(),
389 path,
390 ));
391 } else {
392 outcome.stdout.push_str(&json);
393 outcome.stdout.push('\n');
394 }
395 if !report.is_ok() {
396 outcome.exit_code = 2;
397 }
398 outcome
399}
400
401fn export_annotations_args(args: TestBenchExportAnnotationsArgs) -> RunOutcome {
402 let annotations = match AnnotationTape::load(Path::new(&args.annotations)) {
403 Ok(tape) => tape,
404 Err(error) => {
405 return error_outcome(format!("load annotations {}: {error}", args.annotations));
406 }
407 };
408
409 let kinds: Vec<AnnotationKind> = if args.kind.is_empty() {
410 Vec::new()
411 } else {
412 let mut parsed = Vec::with_capacity(args.kind.len());
413 for raw in &args.kind {
414 match AnnotationKind::parse_cli(raw) {
415 Ok(kind) => parsed.push(kind),
416 Err(error) => return error_outcome(error),
417 }
418 }
419 parsed
420 };
421
422 let selected: Vec<_> = annotations
423 .annotations
424 .iter()
425 .filter(|annotation| kinds.is_empty() || kinds.contains(&annotation.kind))
426 .collect();
427
428 let body = match args.format.as_str() {
429 "jsonl" | "" => {
430 let mut out = String::new();
431 for annotation in &selected {
432 match serde_json::to_string(annotation) {
433 Ok(line) => {
434 out.push_str(&line);
435 out.push('\n');
436 }
437 Err(error) => {
438 return error_outcome(format!("serialize annotation: {error}"));
439 }
440 }
441 }
442 out
443 }
444 "friction" => {
445 let mut out = String::new();
446 for annotation in &selected {
447 if let Some(event) = harn_vm::testbench::annotations::annotation_to_friction_event(
448 annotation,
449 &annotations.header,
450 ) {
451 match serde_json::to_string(&event) {
452 Ok(line) => {
453 out.push_str(&line);
454 out.push('\n');
455 }
456 Err(error) => {
457 return error_outcome(format!("serialize friction event: {error}"));
458 }
459 }
460 }
461 }
462 out
463 }
464 other => {
465 return error_outcome(format!(
466 "--format must be `jsonl` or `friction`, got `{other}`"
467 ));
468 }
469 };
470
471 let mut outcome = RunOutcome::default();
472 if let Some(path) = args.output.as_deref() {
473 if let Err(error) = persist_text(&body, Path::new(path)) {
474 return error_outcome(format!("write export: {error}"));
475 }
476 outcome.stderr.push_str(&format!(
477 "[testbench] exported {} annotation(s) to {} (format={})\n",
478 selected.len(),
479 path,
480 args.format,
481 ));
482 } else {
483 outcome.stdout.push_str(&body);
484 }
485 outcome
486}
487
488async fn fidelity_args(args: TestBenchFidelityArgs) -> RunOutcome {
489 let mode = match FidelityMode::parse(&args.mode) {
490 Ok(mode) => mode,
491 Err(error) => return error_outcome(error),
492 };
493
494 let (recorded_path, replay_source) = match (&args.against, &args.replay) {
495 (Some(recorded), _) => (recorded.clone(), ReplaySource::ReRun),
496 (None, Some(replay)) => (args.primary.clone(), ReplaySource::Tape(replay.clone())),
497 (None, None) => {
498 return error_outcome(
499 "expected either two tape paths or `--against <tape> <script>`".to_string(),
500 )
501 }
502 };
503
504 let recorded = match EventTape::load(Path::new(&recorded_path)) {
505 Ok(tape) => tape,
506 Err(error) => return error_outcome(format!("load recorded tape: {error}")),
507 };
508
509 let (replay, mut prelude) = match replay_source {
510 ReplaySource::ReRun => {
511 let temp = match tempfile::tempdir() {
512 Ok(dir) => dir,
513 Err(error) => return error_outcome(format!("create temp tape dir: {error}")),
514 };
515 let replay_tape_path = temp.path().join("replay.tape");
516 let start_at = args
517 .start_at_ms
518 .or(recorded.header.started_at_unix_ms)
519 .unwrap_or(DEFAULT_TESTBENCH_START_MS);
520 let derived = TestBenchRunArgs {
521 file: args.primary.clone(),
522 start_at_ms: Some(start_at),
523 clock: "paused".to_string(),
524 llm_fixture: None,
525 llm_record: None,
526 fs_overlay: args.fs_overlay.clone(),
527 process_replay: None,
528 process_record: None,
529 process_wasi: None,
530 network: "deny".to_string(),
531 allow_host: Vec::new(),
532 emit_diff: None,
533 emit_tape: Some(replay_tape_path.to_string_lossy().into_owned()),
534 runtime: "paused-tokio".to_string(),
535 argv: args.argv.clone(),
536 };
537 let inner = run_args(derived).await;
538 match EventTape::load(&replay_tape_path) {
539 Ok(tape) => (tape, inner),
540 Err(error) => return append_error(inner, format!("load replay tape: {error}")),
541 }
542 }
543 ReplaySource::Tape(path) => match EventTape::load(Path::new(&path)) {
544 Ok(tape) => (tape, RunOutcome::default()),
545 Err(error) => return error_outcome(format!("load replay tape: {error}")),
546 },
547 };
548
549 let report = compare(&recorded, &replay, mode);
550 let json = match serde_json::to_string_pretty(&report) {
551 Ok(json) => json,
552 Err(error) => return append_error(prelude, format!("serialize fidelity report: {error}")),
553 };
554 if let Some(path) = args.report.as_ref() {
555 if let Err(error) = persist_fidelity_report(&json, Path::new(path)) {
556 return append_error(prelude, format!("write fidelity report: {error}"));
557 }
558 prelude.stderr.push_str(&format!(
559 "[testbench] fidelity report written to {path} (mode={:?}, score={:.4}, divergences={})\n",
560 report.mode,
561 report.score,
562 report.divergences.len(),
563 ));
564 } else {
565 prelude.stdout.push_str(&json);
566 prelude.stdout.push('\n');
567 }
568 if !report.divergences.is_empty() {
569 prelude.exit_code = prelude.exit_code.max(report_exit_code(&report));
570 }
571 prelude
572}
573
574fn report_exit_code(report: &FidelityReport) -> i32 {
575 if report.divergences.is_empty() {
578 0
579 } else {
580 2
581 }
582}
583
/// Writes the serialized fidelity report `json` to `path`.
///
/// Thin wrapper over `persist_text`; the returned error string is whatever
/// `persist_text` produces.
fn persist_fidelity_report(json: &str, path: &Path) -> Result<(), String> {
    persist_text(json, path)
}
587
/// Writes `body` to `path`, creating any missing parent directories first.
///
/// # Errors
///
/// Returns `mkdir <dir>: <cause>` when the parent directory cannot be created
/// and `write <path>: <cause>` when the file itself cannot be written.
fn persist_text(body: &str, path: &Path) -> Result<(), String> {
    // A bare file name has an empty parent; there is nothing to create then.
    let parent_dir = path.parent().filter(|dir| !dir.as_os_str().is_empty());
    if let Some(parent) = parent_dir {
        if let Err(error) = fs::create_dir_all(parent) {
            return Err(format!("mkdir {}: {error}", parent.display()));
        }
    }
    match fs::write(path, body) {
        Ok(()) => Ok(()),
        Err(error) => Err(format!("write {}: {error}", path.display())),
    }
}
597
598fn build_testbench(args: &TestBenchRunArgs) -> Result<Testbench, String> {
599 let clock = match args.clock.as_str() {
600 "paused" => ClockConfig::Paused {
601 starting_at_ms: args.start_at_ms.unwrap_or(DEFAULT_TESTBENCH_START_MS),
602 },
603 "real" => ClockConfig::Real,
604 other => return Err(format!("--clock must be `paused` or `real`, got `{other}`")),
605 };
606
607 let llm = if let Some(fixture) = &args.llm_fixture {
608 LlmConfig::Replay {
609 fixture: PathBuf::from(fixture),
610 }
611 } else if let Some(record) = &args.llm_record {
612 LlmConfig::Record {
613 fixture: PathBuf::from(record),
614 }
615 } else {
616 LlmConfig::Real
617 };
618
619 let filesystem = match &args.fs_overlay {
620 None => FilesystemConfig::Real,
621 Some(root) => FilesystemConfig::Overlay {
622 worktree: PathBuf::from(root),
623 },
624 };
625
626 let subprocess = if let Some(record) = &args.process_record {
627 SubprocessConfig::Record {
628 tape: PathBuf::from(record),
629 }
630 } else if let Some(replay) = &args.process_replay {
631 SubprocessConfig::Replay {
632 tape: PathBuf::from(replay),
633 }
634 } else if let Some(toolchain) = &args.process_wasi {
635 SubprocessConfig::WasiToolchain {
636 dir: PathBuf::from(toolchain),
637 }
638 } else {
639 SubprocessConfig::Real
640 };
641
642 let network = match args.network.as_str() {
643 "deny" => NetworkConfig::DenyByDefault {
644 allow: args.allow_host.clone(),
645 },
646 "real" => NetworkConfig::Real,
647 other => return Err(format!("--network must be `deny` or `real`, got `{other}`")),
648 };
649
650 let tape = match &args.emit_tape {
651 None => TapeConfig::Off,
652 Some(path) => TapeConfig::Emit {
653 path: PathBuf::from(path),
654 argv: args.argv.clone(),
655 script_path: Some(args.file.clone()),
656 },
657 };
658
659 Ok(Testbench {
660 clock,
661 llm,
662 filesystem,
663 subprocess,
664 network,
665 tape,
666 })
667}
668
669fn persist_overlay_diff(diff: &[DiffEntry], path: &PathBuf) -> Result<(), String> {
670 if let Some(parent) = path.parent() {
671 if !parent.as_os_str().is_empty() {
672 fs::create_dir_all(parent)
673 .map_err(|err| format!("mkdir {}: {err}", parent.display()))?;
674 }
675 }
676 let body = render_unified_diff(diff);
677 fs::write(path, body).map_err(|err| format!("write {}: {err}", path.display()))
678}
679
680fn render_diff_summary(diff: &[DiffEntry]) -> String {
681 let mut out = String::new();
682 out.push_str(&format!(
683 "[testbench] overlay fs diff: {} change(s)\n",
684 diff.len()
685 ));
686 for entry in diff {
687 let label = match &entry.kind {
688 DiffKind::Added { .. } => "added",
689 DiffKind::Modified { .. } => "modified",
690 DiffKind::Deleted => "deleted",
691 };
692 out.push_str(&format!(" {label} {}\n", entry.path.display()));
693 }
694 out
695}
696
697fn error_outcome(message: String) -> RunOutcome {
698 RunOutcome {
699 stdout: String::new(),
700 stderr: format!("error: {message}\n"),
701 exit_code: 1,
702 }
703}
704
705fn append_error(mut outcome: RunOutcome, message: String) -> RunOutcome {
706 outcome.stderr.push_str(&format!("error: {message}\n"));
707 outcome.exit_code = outcome.exit_code.max(1);
708 outcome
709}
710
711fn flush_outcome(outcome: RunOutcome) {
712 use std::io::Write;
713 let _ = std::io::stderr().write_all(outcome.stderr.as_bytes());
714 let _ = std::io::stdout().write_all(outcome.stdout.as_bytes());
715 if outcome.exit_code != 0 {
716 process::exit(outcome.exit_code);
717 }
718}