harn_vm/orchestration/
replay_bench.rs

1use std::collections::{BTreeMap, BTreeSet};
2use std::fmt;
3
4use serde::{Deserialize, Serialize};
5use serde_json::{json, Value as JsonValue};
6use sha2::{Digest, Sha256};
7
8use super::{
9    canonicalize_run, run_replay_oracle_trace, ReplayAllowlistRule, ReplayDivergence,
10    ReplayExpectation, ReplayOracleError, ReplayOracleReport, ReplayOracleTrace, ReplayTraceRun,
11    ReplayTraceRunCounts, REPLAY_TRACE_SCHEMA_VERSION,
12};
13
/// Schema version string stamped on every [`ReplayBenchmarkReport`] and fixture receipt.
pub const REPLAY_BENCHMARK_REPORT_SCHEMA_VERSION: &str = "harn.replay_benchmark.report.v1";
/// Ingest kind recorded in the report's cloud-ingest block and in each fixture receipt.
pub const REPLAY_BENCHMARK_CLOUD_INGEST_KIND: &str = "harn_cloud.replay_determinism.leaderboard.v1";
/// Identifier reported by [`OpenCodeJsonlAdapter`] and embedded in its error messages.
pub const OPENCODE_JSONL_ADAPTER_ID: &str = "opencode-jsonl";
/// Input schema version reported by [`OpenCodeJsonlAdapter`].
pub const OPENCODE_JSONL_ADAPTER_SCHEMA_VERSION: &str =
    "harn.replay_benchmark.adapter.opencode_jsonl.v1";
19
/// Trace sections that `category_scores` compares between the two canonicalized runs.
///
/// NOTE(review): `lifecycle_receipts` is counted by `counts_by_section` and
/// `trace_material_count` but is not listed here, so it never receives a
/// category score — confirm that omission is intentional.
const REPLAY_TRACE_SECTIONS: [&str; 11] = [
    "event_log_entries",
    "trigger_firings",
    "llm_interactions",
    "protocol_interactions",
    "approval_interactions",
    "effect_receipts",
    "persona_runtime_states",
    "agent_transcript_deltas",
    "final_artifacts",
    "policy_decisions",
    // CH-07 (#1878): channel emit/match audit receipts.
    "channel_receipts",
];
34
/// Sections whose drift counts are summed into `tool_call_drift_count`.
const TOOL_DRIFT_SECTIONS: [&str; 3] = [
    "llm_interactions",
    "protocol_interactions",
    "effect_receipts",
];
40
/// Sections scored together as `permission_decision_preservation_score`.
const PERMISSION_SECTIONS: [&str; 2] = ["approval_interactions", "policy_decisions"];
42
/// Top-level benchmark report assembled by [`build_replay_benchmark_report`].
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct ReplayBenchmarkReport {
    /// Set to [`REPLAY_BENCHMARK_REPORT_SCHEMA_VERSION`] by the builder.
    pub schema_version: String,
    /// Ingest metadata (kind, leaderboard key, schema versions, artifact contract).
    pub cloud_ingest: ReplayBenchmarkCloudIngest,
    /// Suite name, fixture count and source paths.
    pub suite: ReplayBenchmarkSuiteIdentity,
    /// Aggregates computed over `fixtures`.
    pub summary: ReplayBenchmarkSummary,
    /// Per-fixture reports, in the order supplied to the builder.
    pub fixtures: Vec<ReplayBenchmarkFixtureReport>,
}
51
/// Ingest metadata embedded in a [`ReplayBenchmarkReport`].
///
/// Every field is filled with a fixed value by [`build_replay_benchmark_report`].
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
pub struct ReplayBenchmarkCloudIngest {
    /// Set to [`REPLAY_BENCHMARK_CLOUD_INGEST_KIND`].
    pub kind: String,
    /// Set to `"replay-determinism"` by the builder.
    pub leaderboard_key: String,
    /// Set to [`REPLAY_BENCHMARK_REPORT_SCHEMA_VERSION`].
    pub report_schema_version: String,
    /// Set to `REPLAY_TRACE_SCHEMA_VERSION`.
    pub replay_trace_schema_version: String,
    /// Free-text statement of which report fields are stable ingest inputs.
    pub artifact_contract: String,
}
60
/// Identifies the benchmark suite a report was built from.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
pub struct ReplayBenchmarkSuiteIdentity {
    /// Suite name supplied by the caller.
    pub name: String,
    /// Number of fixture reports included in the report.
    pub fixture_count: usize,
    /// Source paths supplied by the caller, stored as given.
    pub source_paths: Vec<String>,
}
67
/// Suite-wide aggregates produced by `summarize_replay_benchmark`.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct ReplayBenchmarkSummary {
    /// Fixtures whose `passed` flag is set.
    pub passed: usize,
    /// Fixture count minus `passed`.
    pub failed: usize,
    /// Fixtures whose `deterministic` flag is set.
    pub deterministic_fixtures: usize,
    /// Fixture count minus `deterministic_fixtures`.
    pub drifted_fixtures: usize,
    /// Mean of the per-fixture `replay_fidelity_score`; `0.0` when there are no fixtures.
    pub mean_replay_fidelity_score: f64,
    /// Mean of the per-fixture `permission_decision_preservation_score`; `0.0` when there are
    /// no fixtures.
    pub mean_permission_decision_preservation_score: f64,
    /// Sum of the per-fixture `tool_call_drift_count`.
    pub tool_call_drift_count: usize,
    /// Sum of the per-fixture `transcript_drift_count`.
    pub transcript_drift_count: usize,
    /// Sum of the per-fixture `runtime_cost.observed_interactions`.
    pub observed_interactions: usize,
    /// Sum of the per-fixture `runtime_cost.llm_input_tokens`.
    pub llm_input_tokens: u64,
    /// Sum of the per-fixture `runtime_cost.llm_output_tokens`.
    pub llm_output_tokens: u64,
}
82
/// Benchmark result for a single replay trace fixture.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct ReplayBenchmarkFixtureReport {
    /// Caller-supplied path label; adapted pairs use `adapter:<adapter id>:<name>`.
    pub path: String,
    /// Fixture name as reported by the replay oracle.
    pub name: String,
    /// Description copied from the trace, if any.
    pub description: Option<String>,
    /// Expectation reported by the replay oracle.
    pub expectation: ReplayExpectation,
    /// Pass/fail verdict reported by the replay oracle.
    pub passed: bool,
    /// `true` when the oracle reported no divergence.
    pub deterministic: bool,
    /// Section counts for the first run, as reported by the oracle.
    pub first_run_counts: ReplayTraceRunCounts,
    /// Section counts for the second run, as reported by the oracle.
    pub second_run_counts: ReplayTraceRunCounts,
    /// Scores and counters derived for this fixture.
    pub metrics: ReplayBenchmarkMetrics,
    /// The divergence reported by the oracle, if any.
    pub first_divergence: Option<ReplayDivergence>,
    /// Hash receipt binding the canonical runs and the metrics together.
    pub receipt: ReplayBenchmarkFixtureReceipt,
}
97
/// Per-fixture scores and counters computed by `replay_metrics`.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct ReplayBenchmarkMetrics {
    /// `1.0` when the oracle reported no divergence, otherwise `0.0`.
    pub determinism_score: f64,
    /// Fraction of compared categories that matched; `0.0` when no category was compared.
    pub replay_fidelity_score: f64,
    /// Fraction of compared permission sections that matched; `1.0` when none was compared.
    pub permission_decision_preservation_score: f64,
    /// Sum of drift counts over the tool-drift sections.
    pub tool_call_drift_count: usize,
    /// Drift count of the `agent_transcript_deltas` section.
    pub transcript_drift_count: usize,
    /// Interaction, token and cost totals across both runs.
    pub runtime_cost: ReplayRuntimeCostMetrics,
    /// Proxy figures derived from the first divergence and the drifted categories.
    pub debugging_time_to_root_cause_proxy: ReplayDebuggingProxyMetrics,
    /// Per-section comparison results keyed by section name.
    pub category_scores: BTreeMap<String, ReplayCategoryMetric>,
}
109
/// Comparison result for one trace section, produced by `category_scores`.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
pub struct ReplayCategoryMetric {
    /// `true` when either run has a non-zero count for the section.
    pub compared: bool,
    /// `true` when `drift_count` is zero (this includes sections that were not compared).
    pub matched: bool,
    /// Number of differing entries found by `drift_count`; zero when not compared.
    pub drift_count: usize,
    /// Section count in the first run.
    pub first_run_count: usize,
    /// Section count in the second run.
    pub second_run_count: usize,
}
118
/// Interaction, token and cost totals; `runtime_cost_metrics` fills each field with the
/// sum over both runs of a trace.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
pub struct ReplayRuntimeCostMetrics {
    /// Sum of every section count across both runs (see `trace_material_count`).
    pub observed_interactions: usize,
    pub event_log_entries: usize,
    pub trigger_firings: usize,
    pub llm_interactions: usize,
    pub protocol_interactions: usize,
    pub approval_interactions: usize,
    pub effect_receipts: usize,
    pub persona_runtime_states: usize,
    pub agent_transcript_deltas: usize,
    pub final_artifacts: usize,
    pub policy_decisions: usize,
    /// CH-07 (#1878): channel emit/match audit receipts.
    #[serde(default)]
    pub channel_receipts: usize,
    /// Defaults to zero when the field is absent from deserialized input.
    #[serde(default)]
    pub lifecycle_receipts: usize,
    /// Sum of `input_tokens` found on LLM interactions (top level or under `usage`).
    pub llm_input_tokens: u64,
    /// Sum of `output_tokens` found on LLM interactions (top level or under `usage`).
    pub llm_output_tokens: u64,
    /// Sum of `cost_usd` values; `None` (and omitted when serialized) if none was reported.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub observed_cost_usd: Option<f64>,
}
142
/// Proxy figures for debugging effort, produced by `debugging_proxy_metrics`.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
pub struct ReplayDebuggingProxyMetrics {
    /// Name of the proxy formula; set to `"first_divergence_depth_plus_drift_surfaces"`.
    pub proxy_kind: String,
    /// Path of the first divergence, if the oracle reported one.
    pub first_divergence_path: Option<String>,
    /// Segment depth of `first_divergence_path`; zero when there is no divergence.
    pub first_divergence_depth: usize,
    /// Number of categories that were compared and did not match.
    pub drift_surface_count: usize,
    /// Zero when no category drifted, otherwise `1 + depth + drift_surface_count`.
    pub estimated_triage_steps: usize,
}
151
/// Hash receipt for one fixture, produced by `fixture_receipt`.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
pub struct ReplayBenchmarkFixtureReceipt {
    /// Set to [`REPLAY_BENCHMARK_CLOUD_INGEST_KIND`].
    pub ingest_kind: String,
    /// Set to [`REPLAY_BENCHMARK_REPORT_SCHEMA_VERSION`].
    pub report_schema_version: String,
    /// Set to `REPLAY_TRACE_SCHEMA_VERSION`.
    pub replay_trace_schema_version: String,
    /// `sha256:`-prefixed hash of the canonicalized first run.
    pub canonical_first_sha256: String,
    /// `sha256:`-prefixed hash of the canonicalized second run.
    pub canonical_second_sha256: String,
    /// `sha256:`-prefixed hash over the receipt material (identity, both run hashes, metrics).
    pub benchmark_receipt_sha256: String,
}
161
/// Errors surfaced by the replay benchmark functions in this module.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ReplayBenchmarkError {
    /// A failure converted from the replay oracle's own error type.
    Oracle(ReplayOracleError),
    /// An external trace could not be adapted; carries the message.
    Adapter(String),
    /// JSON serialization failed while hashing; carries the serializer's message.
    Serialization(String),
}
168
169impl fmt::Display for ReplayBenchmarkError {
170    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
171        match self {
172            Self::Oracle(error) => error.fmt(f),
173            Self::Adapter(message) | Self::Serialization(message) => message.fmt(f),
174        }
175    }
176}
177
// Marker impl using the trait's default methods, so no `source()` is exposed.
impl std::error::Error for ReplayBenchmarkError {}
179
180impl From<ReplayOracleError> for ReplayBenchmarkError {
181    fn from(error: ReplayOracleError) -> Self {
182        Self::Oracle(error)
183    }
184}
185
/// Converts an external trace format into a [`ReplayTraceRun`] so it can be benchmarked.
pub trait ReplayTraceAdapter {
    /// Stable identifier of the adapter; used in fixture paths and descriptions.
    fn adapter_id(&self) -> &'static str;
    /// Schema version of the input format the adapter accepts.
    fn input_schema_version(&self) -> &'static str;
    /// Adapts one raw `input` document into a run labelled with `run_id`.
    ///
    /// # Errors
    /// Returns a [`ReplayBenchmarkError`] when the input cannot be adapted.
    fn adapt_run(&self, input: &str, run_id: &str) -> Result<ReplayTraceRun, ReplayBenchmarkError>;
}
191
/// Stateless [`ReplayTraceAdapter`] for the `opencode-jsonl` input format
/// (one JSON object per line).
#[derive(Clone, Copy, Debug, Default)]
pub struct OpenCodeJsonlAdapter;
194
195impl ReplayTraceAdapter for OpenCodeJsonlAdapter {
196    fn adapter_id(&self) -> &'static str {
197        OPENCODE_JSONL_ADAPTER_ID
198    }
199
200    fn input_schema_version(&self) -> &'static str {
201        OPENCODE_JSONL_ADAPTER_SCHEMA_VERSION
202    }
203
204    fn adapt_run(&self, input: &str, run_id: &str) -> Result<ReplayTraceRun, ReplayBenchmarkError> {
205        adapt_opencode_jsonl(input, run_id)
206    }
207}
208
209pub fn benchmark_replay_trace(
210    path: impl Into<String>,
211    trace: &ReplayOracleTrace,
212) -> Result<ReplayBenchmarkFixtureReport, ReplayBenchmarkError> {
213    let path = path.into();
214    let oracle = run_replay_oracle_trace(trace)?;
215    benchmark_replay_trace_from_oracle(path, trace, oracle)
216}
217
218pub fn benchmark_adapted_replay_pair(
219    adapter: &dyn ReplayTraceAdapter,
220    name: impl Into<String>,
221    first_input: &str,
222    second_input: &str,
223) -> Result<ReplayBenchmarkFixtureReport, ReplayBenchmarkError> {
224    let name = name.into();
225    let trace = ReplayOracleTrace {
226        schema_version: REPLAY_TRACE_SCHEMA_VERSION.to_string(),
227        name: name.clone(),
228        description: Some(format!(
229            "External replay trace pair adapted with {} ({})",
230            adapter.adapter_id(),
231            adapter.input_schema_version()
232        )),
233        expect: ReplayExpectation::Match,
234        allowlist: vec![ReplayAllowlistRule {
235            path: "/run_id".to_string(),
236            reason: "external trace runs are imported as separate executions".to_string(),
237            replacement: None,
238        }],
239        first_run: adapter.adapt_run(first_input, "adapted_first_run")?,
240        second_run: adapter.adapt_run(second_input, "adapted_second_run")?,
241        protocol_fixture_refs: Vec::new(),
242    };
243    benchmark_replay_trace(format!("adapter:{}:{name}", adapter.adapter_id()), &trace)
244}
245
246pub fn build_replay_benchmark_report(
247    suite_name: impl Into<String>,
248    source_paths: Vec<String>,
249    fixtures: Vec<ReplayBenchmarkFixtureReport>,
250) -> ReplayBenchmarkReport {
251    let suite_name = suite_name.into();
252    let summary = summarize_replay_benchmark(&fixtures);
253    ReplayBenchmarkReport {
254        schema_version: REPLAY_BENCHMARK_REPORT_SCHEMA_VERSION.to_string(),
255        cloud_ingest: ReplayBenchmarkCloudIngest {
256            kind: REPLAY_BENCHMARK_CLOUD_INGEST_KIND.to_string(),
257            leaderboard_key: "replay-determinism".to_string(),
258            report_schema_version: REPLAY_BENCHMARK_REPORT_SCHEMA_VERSION.to_string(),
259            replay_trace_schema_version: REPLAY_TRACE_SCHEMA_VERSION.to_string(),
260            artifact_contract:
261                "fixtures[].receipt + fixtures[].metrics are stable Cloud leaderboard inputs"
262                    .to_string(),
263        },
264        suite: ReplayBenchmarkSuiteIdentity {
265            name: suite_name,
266            fixture_count: fixtures.len(),
267            source_paths,
268        },
269        summary,
270        fixtures,
271    }
272}
273
274fn benchmark_replay_trace_from_oracle(
275    path: String,
276    trace: &ReplayOracleTrace,
277    oracle: ReplayOracleReport,
278) -> Result<ReplayBenchmarkFixtureReport, ReplayBenchmarkError> {
279    let first = canonicalize_run(&trace.first_run, &trace.allowlist)?;
280    let second = canonicalize_run(&trace.second_run, &trace.allowlist)?;
281    let category_scores = category_scores(&first, &second, &oracle);
282    let metrics = replay_metrics(trace, &oracle, category_scores)?;
283    let canonical_first_sha256 = sha256_json(&first)?;
284    let canonical_second_sha256 = sha256_json(&second)?;
285    let receipt = fixture_receipt(
286        &trace.name,
287        &path,
288        &metrics,
289        &canonical_first_sha256,
290        &canonical_second_sha256,
291    )?;
292
293    Ok(ReplayBenchmarkFixtureReport {
294        path,
295        name: oracle.name,
296        description: trace.description.clone(),
297        expectation: oracle.expectation,
298        passed: oracle.passed,
299        deterministic: oracle.divergence.is_none(),
300        first_run_counts: oracle.first_run_counts,
301        second_run_counts: oracle.second_run_counts,
302        metrics,
303        first_divergence: oracle.divergence,
304        receipt,
305    })
306}
307
308fn summarize_replay_benchmark(fixtures: &[ReplayBenchmarkFixtureReport]) -> ReplayBenchmarkSummary {
309    let fixture_count = fixtures.len();
310    let passed = fixtures.iter().filter(|fixture| fixture.passed).count();
311    let deterministic_fixtures = fixtures
312        .iter()
313        .filter(|fixture| fixture.deterministic)
314        .count();
315    let runtime = fixtures
316        .iter()
317        .fold(ReplayRuntimeCostMetrics::default(), |mut acc, fixture| {
318            let runtime = &fixture.metrics.runtime_cost;
319            acc.observed_interactions += runtime.observed_interactions;
320            acc.event_log_entries += runtime.event_log_entries;
321            acc.trigger_firings += runtime.trigger_firings;
322            acc.llm_interactions += runtime.llm_interactions;
323            acc.protocol_interactions += runtime.protocol_interactions;
324            acc.approval_interactions += runtime.approval_interactions;
325            acc.effect_receipts += runtime.effect_receipts;
326            acc.persona_runtime_states += runtime.persona_runtime_states;
327            acc.agent_transcript_deltas += runtime.agent_transcript_deltas;
328            acc.final_artifacts += runtime.final_artifacts;
329            acc.policy_decisions += runtime.policy_decisions;
330            acc.channel_receipts += runtime.channel_receipts;
331            acc.llm_input_tokens += runtime.llm_input_tokens;
332            acc.llm_output_tokens += runtime.llm_output_tokens;
333            acc.observed_cost_usd =
334                sum_optional_cost(acc.observed_cost_usd, runtime.observed_cost_usd);
335            acc
336        });
337    ReplayBenchmarkSummary {
338        passed,
339        failed: fixture_count.saturating_sub(passed),
340        deterministic_fixtures,
341        drifted_fixtures: fixture_count.saturating_sub(deterministic_fixtures),
342        mean_replay_fidelity_score: average_metric(fixtures, |fixture| {
343            fixture.metrics.replay_fidelity_score
344        }),
345        mean_permission_decision_preservation_score: average_metric(fixtures, |fixture| {
346            fixture.metrics.permission_decision_preservation_score
347        }),
348        tool_call_drift_count: fixtures
349            .iter()
350            .map(|fixture| fixture.metrics.tool_call_drift_count)
351            .sum(),
352        transcript_drift_count: fixtures
353            .iter()
354            .map(|fixture| fixture.metrics.transcript_drift_count)
355            .sum(),
356        observed_interactions: runtime.observed_interactions,
357        llm_input_tokens: runtime.llm_input_tokens,
358        llm_output_tokens: runtime.llm_output_tokens,
359    }
360}
361
362fn replay_metrics(
363    trace: &ReplayOracleTrace,
364    oracle: &ReplayOracleReport,
365    category_scores: BTreeMap<String, ReplayCategoryMetric>,
366) -> Result<ReplayBenchmarkMetrics, ReplayBenchmarkError> {
367    let compared_categories = category_scores
368        .values()
369        .filter(|metric| metric.compared)
370        .count();
371    let matched_categories = category_scores
372        .values()
373        .filter(|metric| metric.compared && metric.matched)
374        .count();
375    let replay_fidelity_score = if compared_categories == 0 {
376        0.0
377    } else {
378        matched_categories as f64 / compared_categories as f64
379    };
380    let permission_decision_preservation_score =
381        section_score(&category_scores, &PERMISSION_SECTIONS);
382    let tool_call_drift_count = section_drift_count(&category_scores, &TOOL_DRIFT_SECTIONS);
383    let transcript_drift_count =
384        section_drift_count(&category_scores, &["agent_transcript_deltas"]);
385    let runtime_cost = runtime_cost_metrics(&trace.first_run, &trace.second_run);
386    let debugging_time_to_root_cause_proxy =
387        debugging_proxy_metrics(oracle.divergence.as_ref(), &category_scores);
388
389    Ok(ReplayBenchmarkMetrics {
390        determinism_score: if oracle.divergence.is_none() {
391            1.0
392        } else {
393            0.0
394        },
395        replay_fidelity_score,
396        permission_decision_preservation_score,
397        tool_call_drift_count,
398        transcript_drift_count,
399        runtime_cost,
400        debugging_time_to_root_cause_proxy,
401        category_scores,
402    })
403}
404
405fn category_scores(
406    first: &JsonValue,
407    second: &JsonValue,
408    oracle: &ReplayOracleReport,
409) -> BTreeMap<String, ReplayCategoryMetric> {
410    let first_counts = counts_by_section(&oracle.first_run_counts);
411    let second_counts = counts_by_section(&oracle.second_run_counts);
412    REPLAY_TRACE_SECTIONS
413        .iter()
414        .map(|section| {
415            let first_value = first.get(*section).unwrap_or(&JsonValue::Null);
416            let second_value = second.get(*section).unwrap_or(&JsonValue::Null);
417            let first_run_count = first_counts.get(*section).copied().unwrap_or_default();
418            let second_run_count = second_counts.get(*section).copied().unwrap_or_default();
419            let compared = first_run_count > 0 || second_run_count > 0;
420            let drift_count = if compared {
421                drift_count(first_value, second_value)
422            } else {
423                0
424            };
425            (
426                (*section).to_string(),
427                ReplayCategoryMetric {
428                    compared,
429                    matched: drift_count == 0,
430                    drift_count,
431                    first_run_count,
432                    second_run_count,
433                },
434            )
435        })
436        .collect()
437}
438
439fn counts_by_section(counts: &ReplayTraceRunCounts) -> BTreeMap<&'static str, usize> {
440    BTreeMap::from([
441        ("event_log_entries", counts.event_log_entries),
442        ("trigger_firings", counts.trigger_firings),
443        ("llm_interactions", counts.llm_interactions),
444        ("protocol_interactions", counts.protocol_interactions),
445        ("approval_interactions", counts.approval_interactions),
446        ("effect_receipts", counts.effect_receipts),
447        ("persona_runtime_states", counts.persona_runtime_states),
448        ("agent_transcript_deltas", counts.agent_transcript_deltas),
449        ("final_artifacts", counts.final_artifacts),
450        ("policy_decisions", counts.policy_decisions),
451        // CH-07 (#1878).
452        ("channel_receipts", counts.channel_receipts),
453        ("lifecycle_receipts", counts.lifecycle_receipts),
454    ])
455}
456
457fn drift_count(first: &JsonValue, second: &JsonValue) -> usize {
458    if first == second {
459        return 0;
460    }
461    match (first, second) {
462        (JsonValue::Array(first_items), JsonValue::Array(second_items)) => {
463            let shared = first_items.len().min(second_items.len());
464            let item_drifts = (0..shared)
465                .filter(|index| first_items[*index] != second_items[*index])
466                .count();
467            item_drifts + first_items.len().abs_diff(second_items.len())
468        }
469        (JsonValue::Object(first_map), JsonValue::Object(second_map)) => {
470            let keys = first_map
471                .keys()
472                .chain(second_map.keys())
473                .collect::<BTreeSet<_>>();
474            keys.into_iter()
475                .filter(|key| first_map.get(*key) != second_map.get(*key))
476                .count()
477        }
478        _ => 1,
479    }
480}
481
482fn section_score(
483    category_scores: &BTreeMap<String, ReplayCategoryMetric>,
484    sections: &[&str],
485) -> f64 {
486    let compared = sections
487        .iter()
488        .filter_map(|section| category_scores.get(*section))
489        .filter(|metric| metric.compared)
490        .collect::<Vec<_>>();
491    if compared.is_empty() {
492        return 1.0;
493    }
494    compared.iter().filter(|metric| metric.matched).count() as f64 / compared.len() as f64
495}
496
497fn section_drift_count(
498    category_scores: &BTreeMap<String, ReplayCategoryMetric>,
499    sections: &[&str],
500) -> usize {
501    sections
502        .iter()
503        .filter_map(|section| category_scores.get(*section))
504        .map(|metric| metric.drift_count)
505        .sum()
506}
507
508fn runtime_cost_metrics(
509    first_run: &ReplayTraceRun,
510    second_run: &ReplayTraceRun,
511) -> ReplayRuntimeCostMetrics {
512    let first = first_run.counts();
513    let second = second_run.counts();
514    let observed_cost_usd =
515        sum_optional_cost(cost_usd_for_run(first_run), cost_usd_for_run(second_run));
516    ReplayRuntimeCostMetrics {
517        observed_interactions: trace_material_count(&first) + trace_material_count(&second),
518        event_log_entries: first.event_log_entries + second.event_log_entries,
519        trigger_firings: first.trigger_firings + second.trigger_firings,
520        llm_interactions: first.llm_interactions + second.llm_interactions,
521        protocol_interactions: first.protocol_interactions + second.protocol_interactions,
522        approval_interactions: first.approval_interactions + second.approval_interactions,
523        effect_receipts: first.effect_receipts + second.effect_receipts,
524        persona_runtime_states: first.persona_runtime_states + second.persona_runtime_states,
525        agent_transcript_deltas: first.agent_transcript_deltas + second.agent_transcript_deltas,
526        final_artifacts: first.final_artifacts + second.final_artifacts,
527        policy_decisions: first.policy_decisions + second.policy_decisions,
528        channel_receipts: first.channel_receipts + second.channel_receipts,
529        lifecycle_receipts: first.lifecycle_receipts + second.lifecycle_receipts,
530        llm_input_tokens: token_total(first_run, "input_tokens")
531            + token_total(second_run, "input_tokens"),
532        llm_output_tokens: token_total(first_run, "output_tokens")
533            + token_total(second_run, "output_tokens"),
534        observed_cost_usd,
535    }
536}
537
538fn trace_material_count(counts: &ReplayTraceRunCounts) -> usize {
539    counts.event_log_entries
540        + counts.trigger_firings
541        + counts.llm_interactions
542        + counts.protocol_interactions
543        + counts.approval_interactions
544        + counts.effect_receipts
545        + counts.persona_runtime_states
546        + counts.agent_transcript_deltas
547        + counts.final_artifacts
548        + counts.policy_decisions
549        + counts.channel_receipts
550        + counts.lifecycle_receipts
551}
552
553fn token_total(run: &ReplayTraceRun, token_key: &str) -> u64 {
554    run.llm_interactions
555        .iter()
556        .filter_map(|interaction| {
557            interaction
558                .get(token_key)
559                .and_then(JsonValue::as_u64)
560                .or_else(|| {
561                    interaction
562                        .get("usage")
563                        .and_then(|usage| usage.get(token_key))
564                        .and_then(JsonValue::as_u64)
565                })
566        })
567        .sum()
568}
569
570fn cost_usd_for_run(run: &ReplayTraceRun) -> Option<f64> {
571    let mut seen = false;
572    let mut total = 0.0;
573    for interaction in &run.llm_interactions {
574        if let Some(cost) = interaction
575            .get("cost_usd")
576            .and_then(JsonValue::as_f64)
577            .or_else(|| {
578                interaction
579                    .get("usage")
580                    .and_then(|usage| usage.get("cost_usd"))
581                    .and_then(JsonValue::as_f64)
582            })
583        {
584            seen = true;
585            total += cost;
586        }
587    }
588    seen.then_some(total)
589}
590
/// Adds two optional costs, treating `None` as "no data" rather than zero.
///
/// Returns `None` only when both sides are `None`; a single present value is
/// returned unchanged.
fn sum_optional_cost(left: Option<f64>, right: Option<f64>) -> Option<f64> {
    match left {
        None => right,
        Some(lhs) => Some(right.map_or(lhs, |rhs| lhs + rhs)),
    }
}
598
599fn debugging_proxy_metrics(
600    divergence: Option<&ReplayDivergence>,
601    category_scores: &BTreeMap<String, ReplayCategoryMetric>,
602) -> ReplayDebuggingProxyMetrics {
603    let first_divergence_path = divergence.map(|divergence| divergence.path.clone());
604    let first_divergence_depth = first_divergence_path
605        .as_deref()
606        .map(json_path_depth)
607        .unwrap_or_default();
608    let drift_surface_count = category_scores
609        .values()
610        .filter(|metric| metric.compared && !metric.matched)
611        .count();
612    ReplayDebuggingProxyMetrics {
613        proxy_kind: "first_divergence_depth_plus_drift_surfaces".to_string(),
614        first_divergence_path,
615        first_divergence_depth,
616        drift_surface_count,
617        estimated_triage_steps: if drift_surface_count == 0 {
618            0
619        } else {
620            1 + first_divergence_depth + drift_surface_count
621        },
622    }
623}
624
/// Counts the non-empty segments of a path after trimming surrounding whitespace.
///
/// Paths starting with `/` are split on `/`; everything else is split on `.`
/// and a `$` root segment is not counted. A bare `$` therefore has depth zero.
fn json_path_depth(path: &str) -> usize {
    let trimmed = path.trim();
    let (body, separator) = match trimmed.strip_prefix('/') {
        Some(pointer_body) => (pointer_body, '/'),
        None => (trimmed, '.'),
    };
    body.split(separator)
        .filter(|segment| !segment.is_empty())
        // Only dot-separated paths treat `$` as the root marker.
        .filter(|segment| separator == '/' || *segment != "$")
        .count()
}
640
641fn fixture_receipt(
642    name: &str,
643    path: &str,
644    metrics: &ReplayBenchmarkMetrics,
645    canonical_first_sha256: &str,
646    canonical_second_sha256: &str,
647) -> Result<ReplayBenchmarkFixtureReceipt, ReplayBenchmarkError> {
648    let receipt_material = json!({
649        "ingest_kind": REPLAY_BENCHMARK_CLOUD_INGEST_KIND,
650        "report_schema_version": REPLAY_BENCHMARK_REPORT_SCHEMA_VERSION,
651        "replay_trace_schema_version": REPLAY_TRACE_SCHEMA_VERSION,
652        "name": name,
653        "path": path,
654        "canonical_first_sha256": canonical_first_sha256,
655        "canonical_second_sha256": canonical_second_sha256,
656        "metrics": metrics,
657    });
658    Ok(ReplayBenchmarkFixtureReceipt {
659        ingest_kind: REPLAY_BENCHMARK_CLOUD_INGEST_KIND.to_string(),
660        report_schema_version: REPLAY_BENCHMARK_REPORT_SCHEMA_VERSION.to_string(),
661        replay_trace_schema_version: REPLAY_TRACE_SCHEMA_VERSION.to_string(),
662        canonical_first_sha256: canonical_first_sha256.to_string(),
663        canonical_second_sha256: canonical_second_sha256.to_string(),
664        benchmark_receipt_sha256: sha256_json(&receipt_material)?,
665    })
666}
667
668fn sha256_json(value: &JsonValue) -> Result<String, ReplayBenchmarkError> {
669    let bytes = serde_json::to_vec(value)
670        .map_err(|error| ReplayBenchmarkError::Serialization(error.to_string()))?;
671    Ok(format!("sha256:{}", hex::encode(Sha256::digest(bytes))))
672}
673
674fn sha256_value(value: &JsonValue) -> Result<String, ReplayBenchmarkError> {
675    sha256_json(value)
676}
677
678fn sha256_text(text: &str) -> String {
679    format!("sha256:{}", hex::encode(Sha256::digest(text.as_bytes())))
680}
681
682fn average_metric(
683    fixtures: &[ReplayBenchmarkFixtureReport],
684    metric: impl Fn(&ReplayBenchmarkFixtureReport) -> f64,
685) -> f64 {
686    if fixtures.is_empty() {
687        0.0
688    } else {
689        fixtures.iter().map(metric).sum::<f64>() / fixtures.len() as f64
690    }
691}
692
693fn adapt_opencode_jsonl(input: &str, run_id: &str) -> Result<ReplayTraceRun, ReplayBenchmarkError> {
694    let mut run = ReplayTraceRun {
695        run_id: run_id.to_string(),
696        ..ReplayTraceRun::default()
697    };
698    for (index, raw_line) in input.lines().enumerate() {
699        let line_no = index + 1;
700        let line = raw_line.trim();
701        if line.is_empty() {
702            continue;
703        }
704        let value: JsonValue = serde_json::from_str(line).map_err(|error| {
705            ReplayBenchmarkError::Adapter(format!(
706                "invalid {OPENCODE_JSONL_ADAPTER_ID} JSONL line {line_no}: {error}"
707            ))
708        })?;
709        let object = value.as_object().ok_or_else(|| {
710            ReplayBenchmarkError::Adapter(format!(
711                "{OPENCODE_JSONL_ADAPTER_ID} JSONL line {line_no} must be an object"
712            ))
713        })?;
714        let event_type = object
715            .get("type")
716            .or_else(|| object.get("event"))
717            .and_then(JsonValue::as_str)
718            .unwrap_or("event");
719        match event_type {
720            "message" | "session.message" => {
721                run.agent_transcript_deltas
722                    .push(adapt_opencode_message(object, line_no));
723            }
724            "tool_call" | "tool" | "session.tool_call" => {
725                let (protocol, receipt) = adapt_opencode_tool_call(object, line_no)?;
726                run.protocol_interactions.push(protocol);
727                run.effect_receipts.push(receipt);
728            }
729            "permission" | "permission_decision" | "session.permission" => {
730                let (approval, policy) = adapt_opencode_permission(object, line_no);
731                run.approval_interactions.push(approval);
732                run.policy_decisions.push(policy);
733            }
734            "llm" | "model" | "session.llm" => {
735                run.llm_interactions
736                    .push(adapt_opencode_llm(object, line_no));
737            }
738            _ => run
739                .event_log_entries
740                .push(adapt_opencode_event(object, event_type, line_no)),
741        }
742    }
743    if trace_material_count(&run.counts()) == 0 {
744        return Err(ReplayBenchmarkError::Adapter(format!(
745            "{OPENCODE_JSONL_ADAPTER_ID} input contained no adaptable events"
746        )));
747    }
748    Ok(run)
749}
750
751fn adapt_opencode_message(
752    object: &serde_json::Map<String, JsonValue>,
753    line_no: usize,
754) -> JsonValue {
755    let content = object.get("content").cloned().unwrap_or(JsonValue::Null);
756    json!({
757        "delta_id": object_string(object, "id").unwrap_or_else(|| format!("message-{line_no}")),
758        "agent": object_string(object, "agent").unwrap_or_else(|| "opencode".to_string()),
759        "role": object_string(object, "role").unwrap_or_else(|| "assistant".to_string()),
760        "content_sha256": sha256_text(&content.to_string()),
761    })
762}
763
764fn adapt_opencode_tool_call(
765    object: &serde_json::Map<String, JsonValue>,
766    line_no: usize,
767) -> Result<(JsonValue, JsonValue), ReplayBenchmarkError> {
768    let tool = object_string(object, "tool")
769        .or_else(|| object_string(object, "name"))
770        .unwrap_or_else(|| "unknown_tool".to_string());
771    let arguments = object
772        .get("arguments")
773        .or_else(|| object.get("args"))
774        .cloned()
775        .unwrap_or_else(|| json!({}));
776    let result = object
777        .get("result")
778        .or_else(|| object.get("output"))
779        .cloned()
780        .unwrap_or(JsonValue::Null);
781    let status = object_string(object, "status").unwrap_or_else(|| "completed".to_string());
782    let arguments_sha256 = sha256_value(&arguments)?;
783    let result_sha256 = sha256_value(&result)?;
784    Ok((
785        json!({
786            "protocol": "opencode",
787            "boundary": "tool_call",
788            "tool": tool,
789            "call_id": object_string(object, "id").unwrap_or_else(|| format!("tool-{line_no}")),
790            "arguments_sha256": arguments_sha256,
791            "status": status,
792            "result_sha256": result_sha256,
793        }),
794        json!({
795            "receipt_id": object_string(object, "receipt_id").unwrap_or_else(|| format!("tool-receipt-{line_no}")),
796            "kind": "tool_call",
797            "tool": tool,
798            "status": status,
799            "arguments_sha256": arguments_sha256,
800            "result_sha256": result_sha256,
801        }),
802    ))
803}
804
805fn adapt_opencode_permission(
806    object: &serde_json::Map<String, JsonValue>,
807    line_no: usize,
808) -> (JsonValue, JsonValue) {
809    let action = object_string(object, "action").unwrap_or_else(|| "unknown".to_string());
810    let decision = object_string(object, "decision")
811        .or_else(|| object_string(object, "response"))
812        .unwrap_or_else(|| "unknown".to_string());
813    (
814        json!({
815            "request_id": object_string(object, "id").unwrap_or_else(|| format!("permission-{line_no}")),
816            "principal": object_string(object, "principal").unwrap_or_else(|| "agent".to_string()),
817            "action": action,
818            "response": decision,
819            "reviewer": object.get("reviewer").cloned().unwrap_or(JsonValue::Null),
820        }),
821        json!({
822            "decision_id": object_string(object, "decision_id").unwrap_or_else(|| format!("policy-{line_no}")),
823            "capability": object_string(object, "capability").unwrap_or(action),
824            "decision": decision,
825            "approval_required": true,
826        }),
827    )
828}
829
830fn adapt_opencode_llm(object: &serde_json::Map<String, JsonValue>, line_no: usize) -> JsonValue {
831    let input_tokens = object
832        .get("input_tokens")
833        .and_then(JsonValue::as_u64)
834        .or_else(|| {
835            object
836                .get("usage")
837                .and_then(|usage| usage.get("input_tokens"))
838                .and_then(JsonValue::as_u64)
839        })
840        .unwrap_or_default();
841    let output_tokens = object
842        .get("output_tokens")
843        .and_then(JsonValue::as_u64)
844        .or_else(|| {
845            object
846                .get("usage")
847                .and_then(|usage| usage.get("output_tokens"))
848                .and_then(JsonValue::as_u64)
849        })
850        .unwrap_or_default();
851    let messages_sha256 = object
852        .get("messages")
853        .map(|value| sha256_text(&value.to_string()))
854        .unwrap_or_else(|| sha256_text(""));
855    let response_sha256 = object
856        .get("response")
857        .map(|value| sha256_text(&value.to_string()))
858        .unwrap_or_else(|| sha256_text(""));
859    json!({
860        "request_id": object_string(object, "id").unwrap_or_else(|| format!("llm-{line_no}")),
861        "provider": object_string(object, "provider").unwrap_or_else(|| "opencode".to_string()),
862        "model": object_string(object, "model").unwrap_or_else(|| "unknown".to_string()),
863        "messages_sha256": messages_sha256,
864        "response_sha256": response_sha256,
865        "usage": {
866            "input_tokens": input_tokens,
867            "output_tokens": output_tokens,
868        },
869    })
870}
871
872fn adapt_opencode_event(
873    object: &serde_json::Map<String, JsonValue>,
874    event_type: &str,
875    line_no: usize,
876) -> JsonValue {
877    json!({
878        "event_id": line_no,
879        "topic": object_string(object, "topic").unwrap_or_else(|| "opencode.session".to_string()),
880        "kind": event_type,
881        "payload": object.get("payload").cloned().unwrap_or_else(|| JsonValue::Object(object.clone())),
882    })
883}
884
885fn object_string(object: &serde_json::Map<String, JsonValue>, key: &str) -> Option<String> {
886    object
887        .get(key)
888        .and_then(JsonValue::as_str)
889        .map(str::to_string)
890}
891
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds one side of the golden fixture: a single `read_file` tool call
    /// with the given status plus an `fs.read` allow decision.
    fn tool_run(run_id: &str, status: &str) -> ReplayTraceRun {
        ReplayTraceRun {
            run_id: run_id.to_string(),
            protocol_interactions: vec![json!({
                "protocol": "mcp",
                "boundary": "tools/call",
                "tool": "read_file",
                "status": status,
            })],
            policy_decisions: vec![json!({
                "capability": "fs.read",
                "decision": "allow",
            })],
            ..ReplayTraceRun::default()
        }
    }

    /// Builds a two-run trace whose runs differ only in `run_id` (allowlisted)
    /// and in the tool-call status supplied for each side.
    fn trace_pair(status: (&str, &str)) -> ReplayOracleTrace {
        let (first_status, second_status) = status;
        let run_id_rule = ReplayAllowlistRule {
            path: "/run_id".to_string(),
            reason: "run ids are allocated per execution".to_string(),
            replacement: None,
        };

        ReplayOracleTrace {
            schema_version: REPLAY_TRACE_SCHEMA_VERSION.to_string(),
            name: "simple_tool_run".to_string(),
            description: Some("golden replay benchmark fixture".to_string()),
            expect: ReplayExpectation::Match,
            allowlist: vec![run_id_rule],
            first_run: tool_run("first", first_status),
            second_run: tool_run("second", second_status),
            protocol_fixture_refs: Vec::new(),
        }
    }

    #[test]
    fn replay_benchmark_reports_stable_golden_metrics_for_matching_trace() {
        let trace = trace_pair(("ok", "ok"));
        let fixture = benchmark_replay_trace("benchmarks/replay/simple.json", &trace)
            .expect("benchmark fixture");
        let metrics = &fixture.metrics;

        assert!(fixture.passed);
        assert!(fixture.deterministic);
        assert_eq!(metrics.determinism_score, 1.0);
        assert_eq!(metrics.replay_fidelity_score, 1.0);
        assert_eq!(metrics.permission_decision_preservation_score, 1.0);
        assert_eq!(metrics.tool_call_drift_count, 0);

        let receipt_digest = &fixture.receipt.benchmark_receipt_sha256;
        assert!(receipt_digest.starts_with("sha256:"));
    }

    #[test]
    fn replay_benchmark_reports_reduced_fidelity_for_meaningful_drift() {
        let trace = trace_pair(("ok", "error"));
        let fixture = benchmark_replay_trace("benchmarks/replay/drift.json", &trace)
            .expect("benchmark fixture");
        let metrics = &fixture.metrics;

        assert!(!fixture.passed);
        assert!(!fixture.deterministic);
        assert_eq!(metrics.determinism_score, 0.0);
        assert_eq!(metrics.replay_fidelity_score, 0.5);
        assert_eq!(metrics.tool_call_drift_count, 1);

        // The only difference between the two runs is the tool-call status.
        let root_cause = &metrics.debugging_time_to_root_cause_proxy;
        assert_eq!(
            root_cause.first_divergence_path.as_deref(),
            Some("/protocol_interactions/0/status")
        );
        assert_eq!(root_cause.first_divergence_depth, 3);
        assert_eq!(root_cause.estimated_triage_steps, 5);
    }

    #[test]
    fn replay_benchmark_summary_is_stable_across_repeated_runs() {
        let trace = trace_pair(("ok", "ok"));
        let first = benchmark_replay_trace("fixture.json", &trace).expect("first benchmark");
        let second = benchmark_replay_trace("fixture.json", &trace).expect("second benchmark");

        // Compare the serialized reports so any unstable field shows up.
        let first_json = serde_json::to_string(&first).expect("serialize first");
        let second_json = serde_json::to_string(&second).expect("serialize second");
        assert_eq!(first_json, second_json);
    }

    #[test]
    fn opencode_jsonl_adapter_maps_messages_tools_permissions_and_llm_usage() {
        // One record of each supported type, one JSON object per line.
        let session = concat!(
            "{\"type\":\"message\",\"id\":\"m1\",\"role\":\"assistant\",\"content\":\"done\"}\n",
            "{\"type\":\"tool_call\",\"id\":\"t1\",\"tool\":\"write_file\",\"arguments\":{\"path\":\"notes.md\"},\"result\":{\"ok\":true}}\n",
            "{\"type\":\"permission\",\"id\":\"p1\",\"action\":\"write_file\",\"decision\":\"approved\"}\n",
            "{\"type\":\"llm\",\"id\":\"l1\",\"model\":\"qwen\",\"usage\":{\"input_tokens\":7,\"output_tokens\":3}}\n"
        );

        let adapted = OpenCodeJsonlAdapter
            .adapt_run(session, "opencode-run")
            .expect("adapt opencode jsonl");

        assert_eq!(adapted.run_id, "opencode-run");
        assert_eq!(adapted.agent_transcript_deltas.len(), 1);
        assert_eq!(adapted.protocol_interactions.len(), 1);
        assert_eq!(adapted.effect_receipts.len(), 1);
        assert_eq!(adapted.approval_interactions.len(), 1);
        assert_eq!(adapted.policy_decisions.len(), 1);
        assert_eq!(adapted.llm_interactions.len(), 1);
        assert_eq!(token_total(&adapted, "input_tokens"), 7);
        assert_eq!(token_total(&adapted, "output_tokens"), 3);
    }

    #[test]
    fn adapted_trace_pair_can_be_benchmarked() {
        // Both sides replay the same single tool call, so no drift is expected.
        let session = "{\"type\":\"tool_call\",\"tool\":\"read_file\",\"result\":{\"ok\":true}}\n";

        let fixture = benchmark_adapted_replay_pair(
            &OpenCodeJsonlAdapter,
            "external-tool-run",
            session,
            session,
        )
        .expect("benchmark adapted pair");

        assert!(fixture.passed);
        assert_eq!(fixture.name, "external-tool-run");
        assert_eq!(fixture.metrics.tool_call_drift_count, 0);
    }
}
harn_vm/orchestration/replay_bench.rs

harn_vm/orchestration/
replay_bench.rs