harn_vm/testbench/
fidelity.rs

1//! Replay fidelity oracle.
2//!
3//! Compares two [`EventTape`]s and emits a structured [`FidelityReport`]
4//! listing every diverging record. The oracle drives the
5//! `harn test-bench fidelity` CLI subcommand and is the artifact behind
6//! [harn-cloud#19][cloud19]'s "byte-for-byte vs. semantic" leaderboard
7//! metric.
8//!
9//! [cloud19]: https://github.com/burin-labs/harn-cloud/issues/19
10//!
11//! # Modes
12//!
13//! - **`ByteIdentical`** (strictest). Every record must match position,
14//!   kind, and content hash. The mode CI uses to gate "this PR did not
15//!   regress replay determinism".
16//!
17//! - **`Semantic`**. Ignores diffs that are non-meaningful by
18//!   construction: monotonic-only sequence numbers, virtual-time stamps
19//!   that drift only because of paused-clock rounding, and recorded
20//!   `monotonic_ms` deltas. Content hashes still gate every payload, so
21//!   anything user-visible must still match.
22//!
23//! - **`Outcome`** (loosest). Compares only the script's externally
24//!   observable result: the final FS write set (overlay diff), the exit
25//!   status of the last subprocess, and the count of LLM calls. Useful
26//!   for stochastic LLM runs where intermediate token streams legitimately
27//!   diverge between record and replay.
28
29use std::collections::BTreeMap;
30
31use serde::{Deserialize, Serialize};
32
33use super::tape::{EventTape, TapeRecord, TapeRecordKind};
34
35/// Strictness of the comparison. The CLI surface picks one.
36#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
37#[serde(rename_all = "snake_case")]
38pub enum FidelityMode {
39    ByteIdentical,
40    Semantic,
41    Outcome,
42}
43
44impl FidelityMode {
45    pub fn parse(label: &str) -> Result<Self, String> {
46        match label {
47            "byte" | "byte-identical" | "byte_identical" => Ok(Self::ByteIdentical),
48            "semantic" => Ok(Self::Semantic),
49            "outcome" => Ok(Self::Outcome),
50            other => Err(format!(
51                "unknown fidelity mode `{other}` — expected `byte-identical`, `semantic`, or `outcome`"
52            )),
53        }
54    }
55}
56
57/// Structured comparison result. Emitted as JSON by the CLI; consumed by
58/// CI gates and by the public leaderboard.
59#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
60pub struct FidelityReport {
61    pub mode: FidelityMode,
62    /// Number of records compared on the recorded side. Lets a consumer
63    /// distinguish "100% match over 0 records" from "100% over 1000".
64    pub recorded_records: usize,
65    /// Number of records compared on the replay side.
66    pub replay_records: usize,
67    /// Records where one side had an event the other did not, or where
68    /// the same-position record had different content. Stable order so
69    /// CI snapshots are deterministic.
70    pub divergences: Vec<Divergence>,
71    /// 0.0–1.0. Defined as `1 - (divergences / max(records, 1))` for
72    /// strict modes; in outcome mode it's `1.0` iff zero divergences.
73    pub score: f32,
74}
75
76impl FidelityReport {
77    pub fn is_byte_identical(&self) -> bool {
78        self.divergences.is_empty()
79    }
80}
81
82/// Single divergence between two tapes. Records are paired by sequence
83/// number for byte-identical / semantic modes; outcome mode emits one
84/// of these per outcome facet that disagreed.
85#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
86pub struct Divergence {
87    /// Logical sequence number of the diverging record (`None` when the
88    /// divergence is an outcome facet, e.g. exit code).
89    pub seq: Option<u64>,
90    /// Short tag used by tooling to group divergences (`clock_sleep`,
91    /// `process_stdout_hash`, `outcome_exit_code`, …).
92    pub category: String,
93    /// Human-readable explanation. Stable phrasing — CI snapshot tests
94    /// match on substrings.
95    pub message: String,
96}
97
98/// Compare two tapes under `mode`.
99pub fn compare(recorded: &EventTape, replay: &EventTape, mode: FidelityMode) -> FidelityReport {
100    let divergences = match mode {
101        FidelityMode::ByteIdentical => compare_record_by_record(recorded, replay, true),
102        FidelityMode::Semantic => compare_record_by_record(recorded, replay, false),
103        FidelityMode::Outcome => compare_outcome(recorded, replay),
104    };
105    let baseline = recorded.records.len().max(replay.records.len()).max(1);
106    let score = match mode {
107        FidelityMode::ByteIdentical | FidelityMode::Semantic => {
108            1.0 - (divergences.len() as f32 / baseline as f32).min(1.0)
109        }
110        FidelityMode::Outcome => {
111            if divergences.is_empty() {
112                1.0
113            } else {
114                0.0
115            }
116        }
117    };
118    FidelityReport {
119        mode,
120        recorded_records: recorded.records.len(),
121        replay_records: replay.records.len(),
122        divergences,
123        score,
124    }
125}
126
127fn compare_record_by_record(
128    recorded: &EventTape,
129    replay: &EventTape,
130    byte_strict: bool,
131) -> Vec<Divergence> {
132    let mut out = Vec::new();
133    let max = recorded.records.len().max(replay.records.len());
134    for idx in 0..max {
135        match (recorded.records.get(idx), replay.records.get(idx)) {
136            (Some(rec), Some(rep)) => compare_pair(rec, rep, byte_strict, &mut out),
137            (Some(rec), None) => out.push(Divergence {
138                seq: Some(rec.seq),
139                category: "missing_in_replay".to_string(),
140                message: format!(
141                    "replay tape ended at #{idx}; recorded had {} more record(s)",
142                    recorded.records.len() - idx
143                ),
144            }),
145            (None, Some(rep)) => out.push(Divergence {
146                seq: Some(rep.seq),
147                category: "missing_in_recorded".to_string(),
148                message: format!(
149                    "replay produced an extra record at #{idx} (kind={})",
150                    record_kind_tag(&rep.kind)
151                ),
152            }),
153            (None, None) => break,
154        }
155    }
156    out
157}
158
159fn compare_pair(
160    recorded: &TapeRecord,
161    replay: &TapeRecord,
162    byte_strict: bool,
163    out: &mut Vec<Divergence>,
164) {
165    if record_kind_tag(&recorded.kind) != record_kind_tag(&replay.kind) {
166        out.push(Divergence {
167            seq: Some(recorded.seq),
168            category: "kind_mismatch".to_string(),
169            message: format!(
170                "record kind diverged: recorded={} replay={}",
171                record_kind_tag(&recorded.kind),
172                record_kind_tag(&replay.kind),
173            ),
174        });
175        return;
176    }
177    if byte_strict && recorded.virtual_time_ms != replay.virtual_time_ms {
178        out.push(Divergence {
179            seq: Some(recorded.seq),
180            category: "virtual_time_drift".to_string(),
181            message: format!(
182                "virtual_time_ms diverged: recorded={} replay={}",
183                recorded.virtual_time_ms, replay.virtual_time_ms,
184            ),
185        });
186    }
187    if byte_strict && recorded.monotonic_ms != replay.monotonic_ms {
188        out.push(Divergence {
189            seq: Some(recorded.seq),
190            category: "monotonic_drift".to_string(),
191            message: format!(
192                "monotonic_ms diverged: recorded={} replay={}",
193                recorded.monotonic_ms, replay.monotonic_ms,
194            ),
195        });
196    }
197    compare_kind(&recorded.kind, &replay.kind, recorded.seq, byte_strict, out);
198}
199
200fn compare_kind(
201    recorded: &TapeRecordKind,
202    replay: &TapeRecordKind,
203    seq: u64,
204    byte_strict: bool,
205    out: &mut Vec<Divergence>,
206) {
207    use TapeRecordKind::*;
208    match (recorded, replay) {
209        (
210            ClockRead {
211                source: r_source,
212                value_ms: r_val,
213            },
214            ClockRead {
215                source: p_source,
216                value_ms: p_val,
217            },
218        ) => {
219            if r_source != p_source {
220                out.push(Divergence {
221                    seq: Some(seq),
222                    category: "clock_read_source".to_string(),
223                    message: format!(
224                        "clock_read source diverged: recorded={r_source:?} replay={p_source:?}"
225                    ),
226                });
227            }
228            if r_val != p_val {
229                out.push(Divergence {
230                    seq: Some(seq),
231                    category: "clock_read_value".to_string(),
232                    message: format!(
233                        "clock_read value_ms diverged: recorded={r_val} replay={p_val}"
234                    ),
235                });
236            }
237        }
238        (
239            ClockSleep {
240                duration_ms: recorded_dur,
241            },
242            ClockSleep {
243                duration_ms: replay_dur,
244            },
245        ) => {
246            if recorded_dur != replay_dur {
247                out.push(Divergence {
248                    seq: Some(seq),
249                    category: "clock_sleep_duration".to_string(),
250                    message: format!(
251                        "sleep duration diverged: recorded={recorded_dur}ms replay={replay_dur}ms"
252                    ),
253                });
254            }
255        }
256        (
257            LlmCall {
258                request_digest: recorded_req,
259                response: recorded_res,
260            },
261            LlmCall {
262                request_digest: replay_req,
263                response: replay_res,
264            },
265        ) => {
266            if recorded_req != replay_req {
267                out.push(Divergence {
268                    seq: Some(seq),
269                    category: "llm_request_digest".to_string(),
270                    message: format!(
271                        "LLM request digest diverged: recorded={recorded_req} replay={replay_req}"
272                    ),
273                });
274            }
275            if recorded_res.content_hash() != replay_res.content_hash() {
276                out.push(Divergence {
277                    seq: Some(seq),
278                    category: "llm_response_hash".to_string(),
279                    message: format!(
280                        "LLM response hash diverged: recorded={} replay={}",
281                        recorded_res.content_hash(),
282                        replay_res.content_hash(),
283                    ),
284                });
285            }
286        }
287        (
288            FileRead {
289                path: rp,
290                content_hash: rh,
291                len_bytes: rl,
292            },
293            FileRead {
294                path: pp,
295                content_hash: ph,
296                len_bytes: pl,
297            },
298        ) => compare_file(seq, "file_read", rp, rh, *rl, pp, ph, *pl, byte_strict, out),
299        (
300            FileWrite {
301                path: rp,
302                content_hash: rh,
303                len_bytes: rl,
304            },
305            FileWrite {
306                path: pp,
307                content_hash: ph,
308                len_bytes: pl,
309            },
310        ) => compare_file(
311            seq,
312            "file_write",
313            rp,
314            rh,
315            *rl,
316            pp,
317            ph,
318            *pl,
319            byte_strict,
320            out,
321        ),
322        (FileDelete { path: rp }, FileDelete { path: pp }) => {
323            if rp != pp {
324                out.push(Divergence {
325                    seq: Some(seq),
326                    category: "file_delete_path".to_string(),
327                    message: format!("file_delete path diverged: recorded={rp} replay={pp}"),
328                });
329            }
330        }
331        (
332            ProcessSpawn {
333                program: r_program,
334                args: r_args,
335                cwd: r_cwd,
336                exit_code: r_exit,
337                duration_ms: r_dur,
338                stdout_payload: r_stdout,
339                stderr_payload: r_stderr,
340            },
341            ProcessSpawn {
342                program: p_program,
343                args: p_args,
344                cwd: p_cwd,
345                exit_code: p_exit,
346                duration_ms: p_dur,
347                stdout_payload: p_stdout,
348                stderr_payload: p_stderr,
349            },
350        ) => {
351            if r_program != p_program {
352                out.push(Divergence {
353                    seq: Some(seq),
354                    category: "process_program".to_string(),
355                    message: format!(
356                        "subprocess program diverged: recorded={r_program} replay={p_program}"
357                    ),
358                });
359            }
360            if r_args != p_args {
361                out.push(Divergence {
362                    seq: Some(seq),
363                    category: "process_args".to_string(),
364                    message: format!(
365                        "subprocess args diverged: recorded={r_args:?} replay={p_args:?}"
366                    ),
367                });
368            }
369            if r_cwd != p_cwd {
370                out.push(Divergence {
371                    seq: Some(seq),
372                    category: "process_cwd".to_string(),
373                    message: format!(
374                        "subprocess cwd diverged: recorded={r_cwd:?} replay={p_cwd:?}"
375                    ),
376                });
377            }
378            if r_exit != p_exit {
379                out.push(Divergence {
380                    seq: Some(seq),
381                    category: "process_exit_code".to_string(),
382                    message: format!(
383                        "subprocess exit code diverged: recorded={r_exit} replay={p_exit}"
384                    ),
385                });
386            }
387            if byte_strict && r_dur != p_dur {
388                out.push(Divergence {
389                    seq: Some(seq),
390                    category: "process_duration".to_string(),
391                    message: format!(
392                        "subprocess duration diverged: recorded={r_dur}ms replay={p_dur}ms"
393                    ),
394                });
395            }
396            if r_stdout.content_hash() != p_stdout.content_hash() {
397                out.push(Divergence {
398                    seq: Some(seq),
399                    category: "process_stdout_hash".to_string(),
400                    message: format!(
401                        "subprocess stdout hash diverged: recorded={} replay={}",
402                        r_stdout.content_hash(),
403                        p_stdout.content_hash(),
404                    ),
405                });
406            }
407            if r_stderr.content_hash() != p_stderr.content_hash() {
408                out.push(Divergence {
409                    seq: Some(seq),
410                    category: "process_stderr_hash".to_string(),
411                    message: format!(
412                        "subprocess stderr hash diverged: recorded={} replay={}",
413                        r_stderr.content_hash(),
414                        p_stderr.content_hash(),
415                    ),
416                });
417            }
418        }
419        (Unknown, _) | (_, Unknown) => out.push(Divergence {
420            seq: Some(seq),
421            category: "unknown_kind".to_string(),
422            message: "encountered an unknown record kind — produced by a newer harn-vm version"
423                .to_string(),
424        }),
425        // Kind tag matched at the top of `compare_pair`; reaching here
426        // means a new variant was added without extending this match.
427        // Surface a structured divergence rather than panicking so older
428        // CI runners stay functional after a tape-format upgrade.
429        _ => out.push(Divergence {
430            seq: Some(seq),
431            category: "comparator_gap".to_string(),
432            message: format!(
433                "no comparator wired for record kind `{}`",
434                record_kind_tag(recorded)
435            ),
436        }),
437    }
438}
439
440#[allow(clippy::too_many_arguments)]
441fn compare_file(
442    seq: u64,
443    category: &str,
444    recorded_path: &str,
445    recorded_hash: &str,
446    recorded_len: u64,
447    replay_path: &str,
448    replay_hash: &str,
449    replay_len: u64,
450    byte_strict: bool,
451    out: &mut Vec<Divergence>,
452) {
453    if recorded_path != replay_path {
454        out.push(Divergence {
455            seq: Some(seq),
456            category: format!("{category}_path"),
457            message: format!(
458                "{category} path diverged: recorded={recorded_path} replay={replay_path}"
459            ),
460        });
461    }
462    if recorded_hash != replay_hash {
463        out.push(Divergence {
464            seq: Some(seq),
465            category: format!("{category}_hash"),
466            message: format!(
467                "{category} content hash diverged: recorded={recorded_hash} replay={replay_hash}"
468            ),
469        });
470    }
471    if byte_strict && recorded_len != replay_len {
472        out.push(Divergence {
473            seq: Some(seq),
474            category: format!("{category}_len"),
475            message: format!(
476                "{category} length diverged: recorded={recorded_len} replay={replay_len}"
477            ),
478        });
479    }
480}
481
482fn compare_outcome(recorded: &EventTape, replay: &EventTape) -> Vec<Divergence> {
483    let mut out = Vec::new();
484
485    let recorded_writes = collect_final_writes(recorded);
486    let replay_writes = collect_final_writes(replay);
487    if recorded_writes != replay_writes {
488        let recorded_paths: Vec<&String> = recorded_writes.keys().collect();
489        let replay_paths: Vec<&String> = replay_writes.keys().collect();
490        out.push(Divergence {
491            seq: None,
492            category: "outcome_fs_diff".to_string(),
493            message: format!(
494                "final FS write set diverged: recorded={recorded_paths:?} replay={replay_paths:?}"
495            ),
496        });
497    }
498
499    let recorded_exit = last_process_exit(recorded);
500    let replay_exit = last_process_exit(replay);
501    if recorded_exit != replay_exit {
502        out.push(Divergence {
503            seq: None,
504            category: "outcome_exit_code".to_string(),
505            message: format!(
506                "last subprocess exit code diverged: recorded={recorded_exit:?} replay={replay_exit:?}"
507            ),
508        });
509    }
510
511    let recorded_llm = count_llm_calls(recorded);
512    let replay_llm = count_llm_calls(replay);
513    if recorded_llm != replay_llm {
514        out.push(Divergence {
515            seq: None,
516            category: "outcome_llm_call_count".to_string(),
517            message: format!(
518                "LLM call count diverged: recorded={recorded_llm} replay={replay_llm}"
519            ),
520        });
521    }
522    out
523}
524
525fn collect_final_writes(tape: &EventTape) -> BTreeMap<String, Option<String>> {
526    let mut state: BTreeMap<String, Option<String>> = BTreeMap::new();
527    for record in &tape.records {
528        match &record.kind {
529            TapeRecordKind::FileWrite {
530                path, content_hash, ..
531            } => {
532                state.insert(path.clone(), Some(content_hash.clone()));
533            }
534            TapeRecordKind::FileDelete { path } => {
535                state.insert(path.clone(), None);
536            }
537            _ => {}
538        }
539    }
540    state
541}
542
543fn last_process_exit(tape: &EventTape) -> Option<i32> {
544    tape.records
545        .iter()
546        .rev()
547        .find_map(|record| match &record.kind {
548            TapeRecordKind::ProcessSpawn { exit_code, .. } => Some(*exit_code),
549            _ => None,
550        })
551}
552
553fn count_llm_calls(tape: &EventTape) -> usize {
554    tape.records
555        .iter()
556        .filter(|record| matches!(record.kind, TapeRecordKind::LlmCall { .. }))
557        .count()
558}
559
560fn record_kind_tag(kind: &TapeRecordKind) -> &'static str {
561    match kind {
562        TapeRecordKind::ClockRead { .. } => "clock_read",
563        TapeRecordKind::ClockSleep { .. } => "clock_sleep",
564        TapeRecordKind::LlmCall { .. } => "llm_call",
565        TapeRecordKind::FileRead { .. } => "file_read",
566        TapeRecordKind::FileWrite { .. } => "file_write",
567        TapeRecordKind::FileDelete { .. } => "file_delete",
568        TapeRecordKind::ProcessSpawn { .. } => "process_spawn",
569        TapeRecordKind::Unknown => "unknown",
570    }
571}
572
/// Unit tests for the fidelity oracle. Each test builds two small tapes by
/// hand and checks the report produced by `compare`.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::testbench::tape::{TapeHeader, TapePayload, TapeRecord};

    /// Tape with a current-version header and no records.
    fn empty_tape() -> EventTape {
        EventTape::new(TapeHeader::current(None, None, Vec::new()))
    }

    /// Record with zeroed timing fields, so only `seq` and `kind` vary.
    fn record(seq: u64, kind: TapeRecordKind) -> TapeRecord {
        TapeRecord {
            seq,
            virtual_time_ms: 0,
            monotonic_ms: 0,
            kind,
        }
    }

    #[test]
    fn byte_identical_matches_when_records_align() {
        let mut a = empty_tape();
        let mut b = empty_tape();
        a.records
            .push(record(0, TapeRecordKind::ClockSleep { duration_ms: 5 }));
        b.records
            .push(record(0, TapeRecordKind::ClockSleep { duration_ms: 5 }));
        let report = compare(&a, &b, FidelityMode::ByteIdentical);
        assert!(report.is_byte_identical(), "{report:?}");
        assert_eq!(report.score, 1.0);
    }

    #[test]
    fn byte_identical_flags_a_drifted_clock_sleep() {
        // Acceptance criterion: deliberately introduce a sleep-duration
        // divergence and confirm the oracle flags it.
        let mut a = empty_tape();
        let mut b = empty_tape();
        a.records
            .push(record(0, TapeRecordKind::ClockSleep { duration_ms: 5 }));
        b.records
            .push(record(0, TapeRecordKind::ClockSleep { duration_ms: 7 }));
        let report = compare(&a, &b, FidelityMode::ByteIdentical);
        assert_eq!(report.divergences.len(), 1);
        assert_eq!(report.divergences[0].category, "clock_sleep_duration");
    }

    #[test]
    fn byte_identical_reports_missing_and_extra_records() {
        let mut long = empty_tape();
        let mut short = empty_tape();
        long.records
            .push(record(0, TapeRecordKind::ClockSleep { duration_ms: 5 }));
        long.records
            .push(record(1, TapeRecordKind::ClockSleep { duration_ms: 5 }));
        short
            .records
            .push(record(0, TapeRecordKind::ClockSleep { duration_ms: 5 }));

        // Replay ended early.
        let report = compare(&long, &short, FidelityMode::ByteIdentical);
        assert_eq!(report.recorded_records, 2);
        assert_eq!(report.replay_records, 1);
        assert_eq!(report.divergences.len(), 1, "{report:?}");
        assert_eq!(report.divergences[0].seq, Some(1));
        assert_eq!(report.divergences[0].category, "missing_in_replay");
        assert!(report.divergences[0]
            .message
            .contains("recorded had 1 more record(s)"));
        assert_eq!(report.score, 0.5);

        // Replay ran long.
        let report = compare(&short, &long, FidelityMode::ByteIdentical);
        assert_eq!(report.divergences.len(), 1, "{report:?}");
        assert_eq!(report.divergences[0].category, "missing_in_recorded");
        assert!(report.divergences[0].message.contains("kind=clock_sleep"));
    }

    #[test]
    fn kind_mismatch_is_reported_once_per_pair() {
        let mut a = empty_tape();
        let mut b = empty_tape();
        a.records
            .push(record(0, TapeRecordKind::ClockSleep { duration_ms: 5 }));
        b.records.push(record(
            0,
            TapeRecordKind::FileDelete {
                path: "/tmp/a".to_string(),
            },
        ));
        let report = compare(&a, &b, FidelityMode::Semantic);
        assert_eq!(report.divergences.len(), 1, "{report:?}");
        assert_eq!(report.divergences[0].category, "kind_mismatch");
    }

    #[test]
    fn strict_score_never_drops_below_zero() {
        // One pair that diverges on path, hash, and length yields three
        // divergences over a single record; the score must clamp at 0.
        let mut a = empty_tape();
        let mut b = empty_tape();
        a.records.push(record(
            0,
            TapeRecordKind::FileWrite {
                path: "/tmp/a".to_string(),
                content_hash: "h1".to_string(),
                len_bytes: 1,
            },
        ));
        b.records.push(record(
            0,
            TapeRecordKind::FileWrite {
                path: "/tmp/b".to_string(),
                content_hash: "h2".to_string(),
                len_bytes: 2,
            },
        ));
        let report = compare(&a, &b, FidelityMode::ByteIdentical);
        assert_eq!(report.divergences.len(), 3, "{report:?}");
        assert_eq!(report.score, 0.0);
    }

    #[test]
    fn semantic_mode_ignores_pure_timing_drift() {
        let mut a = empty_tape();
        let mut b = empty_tape();
        let make = |seq: u64, vt: i64| TapeRecord {
            seq,
            virtual_time_ms: vt,
            monotonic_ms: vt,
            kind: TapeRecordKind::FileWrite {
                path: "/tmp/out.txt".to_string(),
                content_hash: "abc".to_string(),
                len_bytes: 3,
            },
        };
        a.records.push(make(0, 0));
        b.records.push(make(0, 1)); // virtual time drifted by 1ms
        let strict = compare(&a, &b, FidelityMode::ByteIdentical);
        assert!(!strict.is_byte_identical());
        let semantic = compare(&a, &b, FidelityMode::Semantic);
        assert!(
            semantic.is_byte_identical(),
            "semantic should not flag pure timing drift, got {semantic:?}"
        );
    }

    #[test]
    fn outcome_mode_only_compares_final_writes_and_exit() {
        let mut a = empty_tape();
        let mut b = empty_tape();
        // Both end with the same final FS state but take different paths.
        a.records.push(record(
            0,
            TapeRecordKind::FileWrite {
                path: "/tmp/a".to_string(),
                content_hash: "h1".to_string(),
                len_bytes: 1,
            },
        ));
        a.records
            .push(record(1, TapeRecordKind::ClockSleep { duration_ms: 1000 }));
        b.records
            .push(record(0, TapeRecordKind::ClockSleep { duration_ms: 50 }));
        b.records.push(record(
            1,
            TapeRecordKind::FileWrite {
                path: "/tmp/a".to_string(),
                content_hash: "h1".to_string(),
                len_bytes: 1,
            },
        ));
        let report = compare(&a, &b, FidelityMode::Outcome);
        assert!(
            report.divergences.is_empty(),
            "outcome mode should ignore intermediate diffs, got {report:?}"
        );
        assert_eq!(report.score, 1.0);
    }

    #[test]
    fn outcome_mode_flags_exit_code_drift() {
        let mut a = empty_tape();
        let mut b = empty_tape();
        let payload = TapePayload::Inline {
            content_hash: "ehash".to_string(),
            text: String::new(),
        };
        a.records.push(record(
            0,
            TapeRecordKind::ProcessSpawn {
                program: "git".to_string(),
                args: Vec::new(),
                cwd: None,
                exit_code: 0,
                duration_ms: 1,
                stdout_payload: payload.clone(),
                stderr_payload: payload.clone(),
            },
        ));
        b.records.push(record(
            0,
            TapeRecordKind::ProcessSpawn {
                program: "git".to_string(),
                args: Vec::new(),
                cwd: None,
                exit_code: 1,
                duration_ms: 1,
                stdout_payload: payload.clone(),
                stderr_payload: payload,
            },
        ));
        let report = compare(&a, &b, FidelityMode::Outcome);
        assert_eq!(report.divergences.len(), 1);
        assert_eq!(report.divergences[0].category, "outcome_exit_code");
        assert_eq!(report.score, 0.0);
    }

    #[test]
    fn parse_mode_accepts_aliases() {
        assert_eq!(
            FidelityMode::parse("byte").unwrap(),
            FidelityMode::ByteIdentical
        );
        assert_eq!(
            FidelityMode::parse("byte-identical").unwrap(),
            FidelityMode::ByteIdentical
        );
        assert_eq!(
            FidelityMode::parse("byte_identical").unwrap(),
            FidelityMode::ByteIdentical
        );
        assert_eq!(
            FidelityMode::parse("semantic").unwrap(),
            FidelityMode::Semantic
        );
        assert_eq!(
            FidelityMode::parse("outcome").unwrap(),
            FidelityMode::Outcome
        );
        assert!(FidelityMode::parse("nope").is_err());
    }
}
harn_vm/testbench/fidelity.rs

harn_vm/testbench/
fidelity.rs