harn_vm/orchestration/
replay_bench.rs

1use std::collections::{BTreeMap, BTreeSet};
2use std::fmt;
3
4use serde::{Deserialize, Serialize};
5use serde_json::{json, Value as JsonValue};
6use sha2::{Digest, Sha256};
7
8use super::{
9    canonicalize_run, run_replay_oracle_trace, ReplayAllowlistRule, ReplayDivergence,
10    ReplayExpectation, ReplayOracleError, ReplayOracleReport, ReplayOracleTrace, ReplayTraceRun,
11    ReplayTraceRunCounts, REPLAY_TRACE_SCHEMA_VERSION,
12};
13
14pub const REPLAY_BENCHMARK_REPORT_SCHEMA_VERSION: &str = "harn.replay_benchmark.report.v1";
15pub const REPLAY_BENCHMARK_CLOUD_INGEST_KIND: &str = "harn_cloud.replay_determinism.leaderboard.v1";
16pub const OPENCODE_JSONL_ADAPTER_ID: &str = "opencode-jsonl";
17pub const OPENCODE_JSONL_ADAPTER_SCHEMA_VERSION: &str =
18    "harn.replay_benchmark.adapter.opencode_jsonl.v1";
19
20const REPLAY_TRACE_SECTIONS: [&str; 11] = [
21    "event_log_entries",
22    "trigger_firings",
23    "llm_interactions",
24    "protocol_interactions",
25    "approval_interactions",
26    "effect_receipts",
27    "persona_runtime_states",
28    "agent_transcript_deltas",
29    "final_artifacts",
30    "policy_decisions",
31    // CH-07 (#1878): channel emit/match audit receipts.
32    "channel_receipts",
33];
34
35const TOOL_DRIFT_SECTIONS: [&str; 3] = [
36    "llm_interactions",
37    "protocol_interactions",
38    "effect_receipts",
39];
40
41const PERMISSION_SECTIONS: [&str; 2] = ["approval_interactions", "policy_decisions"];
42
43#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
44pub struct ReplayBenchmarkReport {
45    pub schema_version: String,
46    pub cloud_ingest: ReplayBenchmarkCloudIngest,
47    pub suite: ReplayBenchmarkSuiteIdentity,
48    pub summary: ReplayBenchmarkSummary,
49    pub fixtures: Vec<ReplayBenchmarkFixtureReport>,
50}
51
52#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
53pub struct ReplayBenchmarkCloudIngest {
54    pub kind: String,
55    pub leaderboard_key: String,
56    pub report_schema_version: String,
57    pub replay_trace_schema_version: String,
58    pub artifact_contract: String,
59}
60
61#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
62pub struct ReplayBenchmarkSuiteIdentity {
63    pub name: String,
64    pub fixture_count: usize,
65    pub source_paths: Vec<String>,
66}
67
68#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
69pub struct ReplayBenchmarkSummary {
70    pub passed: usize,
71    pub failed: usize,
72    pub deterministic_fixtures: usize,
73    pub drifted_fixtures: usize,
74    pub mean_replay_fidelity_score: f64,
75    pub mean_permission_decision_preservation_score: f64,
76    pub tool_call_drift_count: usize,
77    pub transcript_drift_count: usize,
78    pub observed_interactions: usize,
79    pub llm_input_tokens: u64,
80    pub llm_output_tokens: u64,
81}
82
83#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
84pub struct ReplayBenchmarkFixtureReport {
85    pub path: String,
86    pub name: String,
87    pub description: Option<String>,
88    pub expectation: ReplayExpectation,
89    pub passed: bool,
90    pub deterministic: bool,
91    pub first_run_counts: ReplayTraceRunCounts,
92    pub second_run_counts: ReplayTraceRunCounts,
93    pub metrics: ReplayBenchmarkMetrics,
94    pub first_divergence: Option<ReplayDivergence>,
95    pub receipt: ReplayBenchmarkFixtureReceipt,
96}
97
98#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
99pub struct ReplayBenchmarkMetrics {
100    pub determinism_score: f64,
101    pub replay_fidelity_score: f64,
102    pub permission_decision_preservation_score: f64,
103    pub tool_call_drift_count: usize,
104    pub transcript_drift_count: usize,
105    pub runtime_cost: ReplayRuntimeCostMetrics,
106    pub debugging_time_to_root_cause_proxy: ReplayDebuggingProxyMetrics,
107    pub category_scores: BTreeMap<String, ReplayCategoryMetric>,
108}
109
110#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
111pub struct ReplayCategoryMetric {
112    pub compared: bool,
113    pub matched: bool,
114    pub drift_count: usize,
115    pub first_run_count: usize,
116    pub second_run_count: usize,
117}
118
119#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
120pub struct ReplayRuntimeCostMetrics {
121    pub observed_interactions: usize,
122    pub event_log_entries: usize,
123    pub trigger_firings: usize,
124    pub llm_interactions: usize,
125    pub protocol_interactions: usize,
126    pub approval_interactions: usize,
127    pub effect_receipts: usize,
128    pub persona_runtime_states: usize,
129    pub agent_transcript_deltas: usize,
130    pub final_artifacts: usize,
131    pub policy_decisions: usize,
132    /// CH-07 (#1878): channel emit/match audit receipts.
133    #[serde(default)]
134    pub channel_receipts: usize,
135    #[serde(default)]
136    pub lifecycle_receipts: usize,
137    pub llm_input_tokens: u64,
138    pub llm_output_tokens: u64,
139    #[serde(skip_serializing_if = "Option::is_none")]
140    pub observed_cost_usd: Option<f64>,
141}
142
143#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
144pub struct ReplayDebuggingProxyMetrics {
145    pub proxy_kind: String,
146    pub first_divergence_path: Option<String>,
147    pub first_divergence_depth: usize,
148    pub drift_surface_count: usize,
149    pub estimated_triage_steps: usize,
150}
151
152#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
153pub struct ReplayBenchmarkFixtureReceipt {
154    pub ingest_kind: String,
155    pub report_schema_version: String,
156    pub replay_trace_schema_version: String,
157    pub canonical_first_sha256: String,
158    pub canonical_second_sha256: String,
159    pub benchmark_receipt_sha256: String,
160}
161
162#[derive(Debug, Clone, PartialEq, Eq)]
163pub enum ReplayBenchmarkError {
164    Oracle(ReplayOracleError),
165    Adapter(String),
166    Serialization(String),
167}
168
169impl fmt::Display for ReplayBenchmarkError {
170    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
171        match self {
172            Self::Oracle(error) => error.fmt(f),
173            Self::Adapter(message) | Self::Serialization(message) => message.fmt(f),
174        }
175    }
176}
177
178impl std::error::Error for ReplayBenchmarkError {}
179
180impl From<ReplayOracleError> for ReplayBenchmarkError {
181    fn from(error: ReplayOracleError) -> Self {
182        Self::Oracle(error)
183    }
184}
185
186pub trait ReplayTraceAdapter {
187    fn adapter_id(&self) -> &'static str;
188    fn input_schema_version(&self) -> &'static str;
189    fn adapt_run(&self, input: &str, run_id: &str) -> Result<ReplayTraceRun, ReplayBenchmarkError>;
190}
191
192#[derive(Clone, Copy, Debug, Default)]
193pub struct OpenCodeJsonlAdapter;
194
195impl ReplayTraceAdapter for OpenCodeJsonlAdapter {
196    fn adapter_id(&self) -> &'static str {
197        OPENCODE_JSONL_ADAPTER_ID
198    }
199
200    fn input_schema_version(&self) -> &'static str {
201        OPENCODE_JSONL_ADAPTER_SCHEMA_VERSION
202    }
203
204    fn adapt_run(&self, input: &str, run_id: &str) -> Result<ReplayTraceRun, ReplayBenchmarkError> {
205        adapt_opencode_jsonl(input, run_id)
206    }
207}
208
209pub fn benchmark_replay_trace(
210    path: impl Into<String>,
211    trace: &ReplayOracleTrace,
212) -> Result<ReplayBenchmarkFixtureReport, ReplayBenchmarkError> {
213    let path = path.into();
214    let oracle = run_replay_oracle_trace(trace)?;
215    benchmark_replay_trace_from_oracle(path, trace, oracle)
216}
217
218pub fn benchmark_adapted_replay_pair(
219    adapter: &dyn ReplayTraceAdapter,
220    name: impl Into<String>,
221    first_input: &str,
222    second_input: &str,
223) -> Result<ReplayBenchmarkFixtureReport, ReplayBenchmarkError> {
224    let name = name.into();
225    let trace = ReplayOracleTrace {
226        schema_version: REPLAY_TRACE_SCHEMA_VERSION.to_string(),
227        name: name.clone(),
228        description: Some(format!(
229            "External replay trace pair adapted with {} ({})",
230            adapter.adapter_id(),
231            adapter.input_schema_version()
232        )),
233        expect: ReplayExpectation::Match,
234        allowlist: vec![ReplayAllowlistRule {
235            path: "/run_id".to_string(),
236            reason: "external trace runs are imported as separate executions".to_string(),
237            replacement: None,
238        }],
239        first_run: adapter.adapt_run(first_input, "adapted_first_run")?,
240        second_run: adapter.adapt_run(second_input, "adapted_second_run")?,
241        protocol_fixture_refs: Vec::new(),
242    };
243    benchmark_replay_trace(format!("adapter:{}:{name}", adapter.adapter_id()), &trace)
244}
245
246pub fn build_replay_benchmark_report(
247    suite_name: impl Into<String>,
248    source_paths: Vec<String>,
249    fixtures: Vec<ReplayBenchmarkFixtureReport>,
250) -> ReplayBenchmarkReport {
251    let suite_name = suite_name.into();
252    let summary = summarize_replay_benchmark(&fixtures);
253    ReplayBenchmarkReport {
254        schema_version: REPLAY_BENCHMARK_REPORT_SCHEMA_VERSION.to_string(),
255        cloud_ingest: ReplayBenchmarkCloudIngest {
256            kind: REPLAY_BENCHMARK_CLOUD_INGEST_KIND.to_string(),
257            leaderboard_key: "replay-determinism".to_string(),
258            report_schema_version: REPLAY_BENCHMARK_REPORT_SCHEMA_VERSION.to_string(),
259            replay_trace_schema_version: REPLAY_TRACE_SCHEMA_VERSION.to_string(),
260            artifact_contract:
261                "fixtures[].receipt + fixtures[].metrics are stable Cloud leaderboard inputs"
262                    .to_string(),
263        },
264        suite: ReplayBenchmarkSuiteIdentity {
265            name: suite_name,
266            fixture_count: fixtures.len(),
267            source_paths,
268        },
269        summary,
270        fixtures,
271    }
272}
273
274fn benchmark_replay_trace_from_oracle(
275    path: String,
276    trace: &ReplayOracleTrace,
277    oracle: ReplayOracleReport,
278) -> Result<ReplayBenchmarkFixtureReport, ReplayBenchmarkError> {
279    let first = canonicalize_run(&trace.first_run, &trace.allowlist)?;
280    let second = canonicalize_run(&trace.second_run, &trace.allowlist)?;
281    let category_scores = category_scores(&first, &second, &oracle);
282    let metrics = replay_metrics(trace, &oracle, category_scores)?;
283    let canonical_first_sha256 = sha256_json(&first)?;
284    let canonical_second_sha256 = sha256_json(&second)?;
285    let receipt = fixture_receipt(
286        &trace.name,
287        &path,
288        &metrics,
289        &canonical_first_sha256,
290        &canonical_second_sha256,
291    )?;
292
293    Ok(ReplayBenchmarkFixtureReport {
294        path,
295        name: oracle.name,
296        description: trace.description.clone(),
297        expectation: oracle.expectation,
298        passed: oracle.passed,
299        deterministic: oracle.divergence.is_none(),
300        first_run_counts: oracle.first_run_counts,
301        second_run_counts: oracle.second_run_counts,
302        metrics,
303        first_divergence: oracle.divergence,
304        receipt,
305    })
306}
307
308fn summarize_replay_benchmark(fixtures: &[ReplayBenchmarkFixtureReport]) -> ReplayBenchmarkSummary {
309    let fixture_count = fixtures.len();
310    let passed = fixtures.iter().filter(|fixture| fixture.passed).count();
311    let deterministic_fixtures = fixtures
312        .iter()
313        .filter(|fixture| fixture.deterministic)
314        .count();
315    let runtime = fixtures
316        .iter()
317        .fold(ReplayRuntimeCostMetrics::default(), |mut acc, fixture| {
318            let runtime = &fixture.metrics.runtime_cost;
319            acc.observed_interactions += runtime.observed_interactions;
320            acc.event_log_entries += runtime.event_log_entries;
321            acc.trigger_firings += runtime.trigger_firings;
322            acc.llm_interactions += runtime.llm_interactions;
323            acc.protocol_interactions += runtime.protocol_interactions;
324            acc.approval_interactions += runtime.approval_interactions;
325            acc.effect_receipts += runtime.effect_receipts;
326            acc.persona_runtime_states += runtime.persona_runtime_states;
327            acc.agent_transcript_deltas += runtime.agent_transcript_deltas;
328            acc.final_artifacts += runtime.final_artifacts;
329            acc.policy_decisions += runtime.policy_decisions;
330            acc.channel_receipts += runtime.channel_receipts;
331            acc.llm_input_tokens += runtime.llm_input_tokens;
332            acc.llm_output_tokens += runtime.llm_output_tokens;
333            acc.observed_cost_usd =
334                sum_optional_cost(acc.observed_cost_usd, runtime.observed_cost_usd);
335            acc
336        });
337    ReplayBenchmarkSummary {
338        passed,
339        failed: fixture_count.saturating_sub(passed),
340        deterministic_fixtures,
341        drifted_fixtures: fixture_count.saturating_sub(deterministic_fixtures),
342        mean_replay_fidelity_score: average_metric(fixtures, |fixture| {
343            fixture.metrics.replay_fidelity_score
344        }),
345        mean_permission_decision_preservation_score: average_metric(fixtures, |fixture| {
346            fixture.metrics.permission_decision_preservation_score
347        }),
348        tool_call_drift_count: fixtures
349            .iter()
350            .map(|fixture| fixture.metrics.tool_call_drift_count)
351            .sum(),
352        transcript_drift_count: fixtures
353            .iter()
354            .map(|fixture| fixture.metrics.transcript_drift_count)
355            .sum(),
356        observed_interactions: runtime.observed_interactions,
357        llm_input_tokens: runtime.llm_input_tokens,
358        llm_output_tokens: runtime.llm_output_tokens,
359    }
360}
361
362fn replay_metrics(
363    trace: &ReplayOracleTrace,
364    oracle: &ReplayOracleReport,
365    category_scores: BTreeMap<String, ReplayCategoryMetric>,
366) -> Result<ReplayBenchmarkMetrics, ReplayBenchmarkError> {
367    let compared_categories = category_scores
368        .values()
369        .filter(|metric| metric.compared)
370        .count();
371    let matched_categories = category_scores
372        .values()
373        .filter(|metric| metric.compared && metric.matched)
374        .count();
375    let replay_fidelity_score = if compared_categories == 0 {
376        0.0
377    } else {
378        matched_categories as f64 / compared_categories as f64
379    };
380    let permission_decision_preservation_score =
381        section_score(&category_scores, &PERMISSION_SECTIONS);
382    let tool_call_drift_count = section_drift_count(&category_scores, &TOOL_DRIFT_SECTIONS);
383    let transcript_drift_count =
384        section_drift_count(&category_scores, &["agent_transcript_deltas"]);
385    let runtime_cost = runtime_cost_metrics(&trace.first_run, &trace.second_run);
386    let debugging_time_to_root_cause_proxy =
387        debugging_proxy_metrics(oracle.divergence.as_ref(), &category_scores);
388
389    Ok(ReplayBenchmarkMetrics {
390        determinism_score: if oracle.divergence.is_none() {
391            1.0
392        } else {
393            0.0
394        },
395        replay_fidelity_score,
396        permission_decision_preservation_score,
397        tool_call_drift_count,
398        transcript_drift_count,
399        runtime_cost,
400        debugging_time_to_root_cause_proxy,
401        category_scores,
402    })
403}
404
405fn category_scores(
406    first: &JsonValue,
407    second: &JsonValue,
408    oracle: &ReplayOracleReport,
409) -> BTreeMap<String, ReplayCategoryMetric> {
410    let first_counts = counts_by_section(&oracle.first_run_counts);
411    let second_counts = counts_by_section(&oracle.second_run_counts);
412    REPLAY_TRACE_SECTIONS
413        .iter()
414        .map(|section| {
415            let first_value = first.get(*section).unwrap_or(&JsonValue::Null);
416            let second_value = second.get(*section).unwrap_or(&JsonValue::Null);
417            let first_run_count = first_counts.get(*section).copied().unwrap_or_default();
418            let second_run_count = second_counts.get(*section).copied().unwrap_or_default();
419            let compared = first_run_count > 0 || second_run_count > 0;
420            let drift_count = if compared {
421                drift_count(first_value, second_value)
422            } else {
423                0
424            };
425            (
426                (*section).to_string(),
427                ReplayCategoryMetric {
428                    compared,
429                    matched: drift_count == 0,
430                    drift_count,
431                    first_run_count,
432                    second_run_count,
433                },
434            )
435        })
436        .collect()
437}
438
439fn counts_by_section(counts: &ReplayTraceRunCounts) -> BTreeMap<&'static str, usize> {
440    BTreeMap::from([
441        ("event_log_entries", counts.event_log_entries),
442        ("trigger_firings", counts.trigger_firings),
443        ("llm_interactions", counts.llm_interactions),
444        ("protocol_interactions", counts.protocol_interactions),
445        ("approval_interactions", counts.approval_interactions),
446        ("effect_receipts", counts.effect_receipts),
447        ("persona_runtime_states", counts.persona_runtime_states),
448        ("agent_transcript_deltas", counts.agent_transcript_deltas),
449        ("final_artifacts", counts.final_artifacts),
450        ("policy_decisions", counts.policy_decisions),
451        // CH-07 (#1878).
452        ("channel_receipts", counts.channel_receipts),
453        ("lifecycle_receipts", counts.lifecycle_receipts),
454    ])
455}
456
457fn drift_count(first: &JsonValue, second: &JsonValue) -> usize {
458    if first == second {
459        return 0;
460    }
461    match (first, second) {
462        (JsonValue::Array(first_items), JsonValue::Array(second_items)) => {
463            let shared = first_items.len().min(second_items.len());
464            let item_drifts = (0..shared)
465                .filter(|index| first_items[*index] != second_items[*index])
466                .count();
467            item_drifts + first_items.len().abs_diff(second_items.len())
468        }
469        (JsonValue::Object(first_map), JsonValue::Object(second_map)) => {
470            let keys = first_map
471                .keys()
472                .chain(second_map.keys())
473                .collect::<BTreeSet<_>>();
474            keys.into_iter()
475                .filter(|key| first_map.get(*key) != second_map.get(*key))
476                .count()
477        }
478        _ => 1,
479    }
480}
481
482fn section_score(
483    category_scores: &BTreeMap<String, ReplayCategoryMetric>,
484    sections: &[&str],
485) -> f64 {
486    let compared = sections
487        .iter()
488        .filter_map(|section| category_scores.get(*section))
489        .filter(|metric| metric.compared)
490        .collect::<Vec<_>>();
491    if compared.is_empty() {
492        return 1.0;
493    }
494    compared.iter().filter(|metric| metric.matched).count() as f64 / compared.len() as f64
495}
496
497fn section_drift_count(
498    category_scores: &BTreeMap<String, ReplayCategoryMetric>,
499    sections: &[&str],
500) -> usize {
501    sections
502        .iter()
503        .filter_map(|section| category_scores.get(*section))
504        .map(|metric| metric.drift_count)
505        .sum()
506}
507
508fn runtime_cost_metrics(
509    first_run: &ReplayTraceRun,
510    second_run: &ReplayTraceRun,
511) -> ReplayRuntimeCostMetrics {
512    let first = first_run.counts();
513    let second = second_run.counts();
514    let observed_cost_usd =
515        sum_optional_cost(cost_usd_for_run(first_run), cost_usd_for_run(second_run));
516    ReplayRuntimeCostMetrics {
517        observed_interactions: trace_material_count(&first) + trace_material_count(&second),
518        event_log_entries: first.event_log_entries + second.event_log_entries,
519        trigger_firings: first.trigger_firings + second.trigger_firings,
520        llm_interactions: first.llm_interactions + second.llm_interactions,
521        protocol_interactions: first.protocol_interactions + second.protocol_interactions,
522        approval_interactions: first.approval_interactions + second.approval_interactions,
523        effect_receipts: first.effect_receipts + second.effect_receipts,
524        persona_runtime_states: first.persona_runtime_states + second.persona_runtime_states,
525        agent_transcript_deltas: first.agent_transcript_deltas + second.agent_transcript_deltas,
526        final_artifacts: first.final_artifacts + second.final_artifacts,
527        policy_decisions: first.policy_decisions + second.policy_decisions,
528        channel_receipts: first.channel_receipts + second.channel_receipts,
529        lifecycle_receipts: first.lifecycle_receipts + second.lifecycle_receipts,
530        llm_input_tokens: token_total(first_run, "input_tokens")
531            + token_total(second_run, "input_tokens"),
532        llm_output_tokens: token_total(first_run, "output_tokens")
533            + token_total(second_run, "output_tokens"),
534        observed_cost_usd,
535    }
536}
537
538fn trace_material_count(counts: &ReplayTraceRunCounts) -> usize {
539    counts.event_log_entries
540        + counts.trigger_firings
541        + counts.llm_interactions
542        + counts.protocol_interactions
543        + counts.approval_interactions
544        + counts.effect_receipts
545        + counts.persona_runtime_states
546        + counts.agent_transcript_deltas
547        + counts.final_artifacts
548        + counts.policy_decisions
549        + counts.channel_receipts
550        + counts.lifecycle_receipts
551}
552
553fn token_total(run: &ReplayTraceRun, token_key: &str) -> u64 {
554    run.llm_interactions
555        .iter()
556        .filter_map(|interaction| {
557            interaction
558                .get(token_key)
559                .and_then(JsonValue::as_u64)
560                .or_else(|| {
561                    interaction
562                        .get("usage")
563                        .and_then(|usage| usage.get(token_key))
564                        .and_then(JsonValue::as_u64)
565                })
566        })
567        .sum()
568}
569
570fn cost_usd_for_run(run: &ReplayTraceRun) -> Option<f64> {
571    let mut seen = false;
572    let mut total = 0.0;
573    for interaction in &run.llm_interactions {
574        if let Some(cost) = interaction
575            .get("cost_usd")
576            .and_then(JsonValue::as_f64)
577            .or_else(|| {
578                interaction
579                    .get("usage")
580                    .and_then(|usage| usage.get("cost_usd"))
581                    .and_then(JsonValue::as_f64)
582            })
583        {
584            seen = true;
585            total += cost;
586        }
587    }
588    seen.then_some(total)
589}
590
591fn sum_optional_cost(left: Option<f64>, right: Option<f64>) -> Option<f64> {
592    match (left, right) {
593        (Some(left), Some(right)) => Some(left + right),
594        (Some(value), None) | (None, Some(value)) => Some(value),
595        (None, None) => None,
596    }
597}
598
599fn debugging_proxy_metrics(
600    divergence: Option<&ReplayDivergence>,
601    category_scores: &BTreeMap<String, ReplayCategoryMetric>,
602) -> ReplayDebuggingProxyMetrics {
603    let first_divergence_path = divergence.map(|divergence| divergence.path.clone());
604    let first_divergence_depth = first_divergence_path
605        .as_deref()
606        .map(json_path_depth)
607        .unwrap_or_default();
608    let drift_surface_count = category_scores
609        .values()
610        .filter(|metric| metric.compared && !metric.matched)
611        .count();
612    ReplayDebuggingProxyMetrics {
613        proxy_kind: "first_divergence_depth_plus_drift_surfaces".to_string(),
614        first_divergence_path,
615        first_divergence_depth,
616        drift_surface_count,
617        estimated_triage_steps: if drift_surface_count == 0 {
618            0
619        } else {
620            1 + first_divergence_depth + drift_surface_count
621        },
622    }
623}
624
625fn json_path_depth(path: &str) -> usize {
626    let path = path.trim();
627    if path == "$" {
628        return 0;
629    }
630    if let Some(pointer_path) = path.strip_prefix('/') {
631        return pointer_path
632            .split('/')
633            .filter(|segment| !segment.is_empty())
634            .count();
635    }
636    path.split('.')
637        .filter(|segment| !segment.is_empty() && *segment != "$")
638        .count()
639}
640
641fn fixture_receipt(
642    name: &str,
643    path: &str,
644    metrics: &ReplayBenchmarkMetrics,
645    canonical_first_sha256: &str,
646    canonical_second_sha256: &str,
647) -> Result<ReplayBenchmarkFixtureReceipt, ReplayBenchmarkError> {
648    let receipt_material = json!({
649        "ingest_kind": REPLAY_BENCHMARK_CLOUD_INGEST_KIND,
650        "report_schema_version": REPLAY_BENCHMARK_REPORT_SCHEMA_VERSION,
651        "replay_trace_schema_version": REPLAY_TRACE_SCHEMA_VERSION,
652        "name": name,
653        "path": path,
654        "canonical_first_sha256": canonical_first_sha256,
655        "canonical_second_sha256": canonical_second_sha256,
656        "metrics": metrics,
657    });
658    Ok(ReplayBenchmarkFixtureReceipt {
659        ingest_kind: REPLAY_BENCHMARK_CLOUD_INGEST_KIND.to_string(),
660        report_schema_version: REPLAY_BENCHMARK_REPORT_SCHEMA_VERSION.to_string(),
661        replay_trace_schema_version: REPLAY_TRACE_SCHEMA_VERSION.to_string(),
662        canonical_first_sha256: canonical_first_sha256.to_string(),
663        canonical_second_sha256: canonical_second_sha256.to_string(),
664        benchmark_receipt_sha256: sha256_json(&receipt_material)?,
665    })
666}
667
668fn sha256_json(value: &JsonValue) -> Result<String, ReplayBenchmarkError> {
669    let bytes = serde_json::to_vec(value)
670        .map_err(|error| ReplayBenchmarkError::Serialization(error.to_string()))?;
671    Ok(format!("sha256:{}", hex::encode(Sha256::digest(bytes))))
672}
673
674fn sha256_value(value: &JsonValue) -> Result<String, ReplayBenchmarkError> {
675    sha256_json(value)
676}
677
678fn sha256_text(text: &str) -> String {
679    format!("sha256:{}", hex::encode(Sha256::digest(text.as_bytes())))
680}
681
682fn average_metric(
683    fixtures: &[ReplayBenchmarkFixtureReport],
684    metric: impl Fn(&ReplayBenchmarkFixtureReport) -> f64,
685) -> f64 {
686    if fixtures.is_empty() {
687        0.0
688    } else {
689        fixtures.iter().map(metric).sum::<f64>() / fixtures.len() as f64
690    }
691}
692
693fn adapt_opencode_jsonl(input: &str, run_id: &str) -> Result<ReplayTraceRun, ReplayBenchmarkError> {
694    let mut run = ReplayTraceRun {
695        run_id: run_id.to_string(),
696        ..ReplayTraceRun::default()
697    };
698    for (index, raw_line) in input.lines().enumerate() {
699        let line_no = index + 1;
700        let line = raw_line.trim();
701        if line.is_empty() {
702            continue;
703        }
704        let value: JsonValue = serde_json::from_str(line).map_err(|error| {
705            ReplayBenchmarkError::Adapter(format!(
706                "invalid {} JSONL line {line_no}: {error}",
707                OPENCODE_JSONL_ADAPTER_ID
708            ))
709        })?;
710        let object = value.as_object().ok_or_else(|| {
711            ReplayBenchmarkError::Adapter(format!(
712                "{} JSONL line {line_no} must be an object",
713                OPENCODE_JSONL_ADAPTER_ID
714            ))
715        })?;
716        let event_type = object
717            .get("type")
718            .or_else(|| object.get("event"))
719            .and_then(JsonValue::as_str)
720            .unwrap_or("event");
721        match event_type {
722            "message" | "session.message" => {
723                run.agent_transcript_deltas
724                    .push(adapt_opencode_message(object, line_no));
725            }
726            "tool_call" | "tool" | "session.tool_call" => {
727                let (protocol, receipt) = adapt_opencode_tool_call(object, line_no)?;
728                run.protocol_interactions.push(protocol);
729                run.effect_receipts.push(receipt);
730            }
731            "permission" | "permission_decision" | "session.permission" => {
732                let (approval, policy) = adapt_opencode_permission(object, line_no);
733                run.approval_interactions.push(approval);
734                run.policy_decisions.push(policy);
735            }
736            "llm" | "model" | "session.llm" => {
737                run.llm_interactions
738                    .push(adapt_opencode_llm(object, line_no));
739            }
740            _ => run
741                .event_log_entries
742                .push(adapt_opencode_event(object, event_type, line_no)),
743        }
744    }
745    if trace_material_count(&run.counts()) == 0 {
746        return Err(ReplayBenchmarkError::Adapter(format!(
747            "{} input contained no adaptable events",
748            OPENCODE_JSONL_ADAPTER_ID
749        )));
750    }
751    Ok(run)
752}
753
754fn adapt_opencode_message(
755    object: &serde_json::Map<String, JsonValue>,
756    line_no: usize,
757) -> JsonValue {
758    let content = object.get("content").cloned().unwrap_or(JsonValue::Null);
759    json!({
760        "delta_id": object_string(object, "id").unwrap_or_else(|| format!("message-{line_no}")),
761        "agent": object_string(object, "agent").unwrap_or_else(|| "opencode".to_string()),
762        "role": object_string(object, "role").unwrap_or_else(|| "assistant".to_string()),
763        "content_sha256": sha256_text(&content.to_string()),
764    })
765}
766
767fn adapt_opencode_tool_call(
768    object: &serde_json::Map<String, JsonValue>,
769    line_no: usize,
770) -> Result<(JsonValue, JsonValue), ReplayBenchmarkError> {
771    let tool = object_string(object, "tool")
772        .or_else(|| object_string(object, "name"))
773        .unwrap_or_else(|| "unknown_tool".to_string());
774    let arguments = object
775        .get("arguments")
776        .or_else(|| object.get("args"))
777        .cloned()
778        .unwrap_or_else(|| json!({}));
779    let result = object
780        .get("result")
781        .or_else(|| object.get("output"))
782        .cloned()
783        .unwrap_or(JsonValue::Null);
784    let status = object_string(object, "status").unwrap_or_else(|| "completed".to_string());
785    let arguments_sha256 = sha256_value(&arguments)?;
786    let result_sha256 = sha256_value(&result)?;
787    Ok((
788        json!({
789            "protocol": "opencode",
790            "boundary": "tool_call",
791            "tool": tool,
792            "call_id": object_string(object, "id").unwrap_or_else(|| format!("tool-{line_no}")),
793            "arguments_sha256": arguments_sha256,
794            "status": status,
795            "result_sha256": result_sha256,
796        }),
797        json!({
798            "receipt_id": object_string(object, "receipt_id").unwrap_or_else(|| format!("tool-receipt-{line_no}")),
799            "kind": "tool_call",
800            "tool": tool,
801            "status": status,
802            "arguments_sha256": arguments_sha256,
803            "result_sha256": result_sha256,
804        }),
805    ))
806}
807
808fn adapt_opencode_permission(
809    object: &serde_json::Map<String, JsonValue>,
810    line_no: usize,
811) -> (JsonValue, JsonValue) {
812    let action = object_string(object, "action").unwrap_or_else(|| "unknown".to_string());
813    let decision = object_string(object, "decision")
814        .or_else(|| object_string(object, "response"))
815        .unwrap_or_else(|| "unknown".to_string());
816    (
817        json!({
818            "request_id": object_string(object, "id").unwrap_or_else(|| format!("permission-{line_no}")),
819            "principal": object_string(object, "principal").unwrap_or_else(|| "agent".to_string()),
820            "action": action,
821            "response": decision,
822            "reviewer": object.get("reviewer").cloned().unwrap_or(JsonValue::Null),
823        }),
824        json!({
825            "decision_id": object_string(object, "decision_id").unwrap_or_else(|| format!("policy-{line_no}")),
826            "capability": object_string(object, "capability").unwrap_or(action),
827            "decision": decision,
828            "approval_required": true,
829        }),
830    )
831}
832
833fn adapt_opencode_llm(object: &serde_json::Map<String, JsonValue>, line_no: usize) -> JsonValue {
834    let input_tokens = object
835        .get("input_tokens")
836        .and_then(JsonValue::as_u64)
837        .or_else(|| {
838            object
839                .get("usage")
840                .and_then(|usage| usage.get("input_tokens"))
841                .and_then(JsonValue::as_u64)
842        })
843        .unwrap_or_default();
844    let output_tokens = object
845        .get("output_tokens")
846        .and_then(JsonValue::as_u64)
847        .or_else(|| {
848            object
849                .get("usage")
850                .and_then(|usage| usage.get("output_tokens"))
851                .and_then(JsonValue::as_u64)
852        })
853        .unwrap_or_default();
854    let messages_sha256 = object
855        .get("messages")
856        .map(|value| sha256_text(&value.to_string()))
857        .unwrap_or_else(|| sha256_text(""));
858    let response_sha256 = object
859        .get("response")
860        .map(|value| sha256_text(&value.to_string()))
861        .unwrap_or_else(|| sha256_text(""));
862    json!({
863        "request_id": object_string(object, "id").unwrap_or_else(|| format!("llm-{line_no}")),
864        "provider": object_string(object, "provider").unwrap_or_else(|| "opencode".to_string()),
865        "model": object_string(object, "model").unwrap_or_else(|| "unknown".to_string()),
866        "messages_sha256": messages_sha256,
867        "response_sha256": response_sha256,
868        "usage": {
869            "input_tokens": input_tokens,
870            "output_tokens": output_tokens,
871        },
872    })
873}
874
875fn adapt_opencode_event(
876    object: &serde_json::Map<String, JsonValue>,
877    event_type: &str,
878    line_no: usize,
879) -> JsonValue {
880    json!({
881        "event_id": line_no,
882        "topic": object_string(object, "topic").unwrap_or_else(|| "opencode.session".to_string()),
883        "kind": event_type,
884        "payload": object.get("payload").cloned().unwrap_or_else(|| JsonValue::Object(object.clone())),
885    })
886}
887
888fn object_string(object: &serde_json::Map<String, JsonValue>, key: &str) -> Option<String> {
889    object
890        .get(key)
891        .and_then(JsonValue::as_str)
892        .map(str::to_string)
893}
894
895#[cfg(test)]
896mod tests {
897    use super::*;
898
899    fn trace_pair(status: (&str, &str)) -> ReplayOracleTrace {
900        ReplayOracleTrace {
901            schema_version: REPLAY_TRACE_SCHEMA_VERSION.to_string(),
902            name: "simple_tool_run".to_string(),
903            description: Some("golden replay benchmark fixture".to_string()),
904            expect: ReplayExpectation::Match,
905            allowlist: vec![ReplayAllowlistRule {
906                path: "/run_id".to_string(),
907                reason: "run ids are allocated per execution".to_string(),
908                replacement: None,
909            }],
910            first_run: ReplayTraceRun {
911                run_id: "first".to_string(),
912                protocol_interactions: vec![json!({
913                    "protocol": "mcp",
914                    "boundary": "tools/call",
915                    "tool": "read_file",
916                    "status": status.0,
917                })],
918                policy_decisions: vec![json!({
919                    "capability": "fs.read",
920                    "decision": "allow",
921                })],
922                ..ReplayTraceRun::default()
923            },
924            second_run: ReplayTraceRun {
925                run_id: "second".to_string(),
926                protocol_interactions: vec![json!({
927                    "protocol": "mcp",
928                    "boundary": "tools/call",
929                    "tool": "read_file",
930                    "status": status.1,
931                })],
932                policy_decisions: vec![json!({
933                    "capability": "fs.read",
934                    "decision": "allow",
935                })],
936                ..ReplayTraceRun::default()
937            },
938            protocol_fixture_refs: Vec::new(),
939        }
940    }
941
942    #[test]
943    fn replay_benchmark_reports_stable_golden_metrics_for_matching_trace() {
944        let fixture =
945            benchmark_replay_trace("benchmarks/replay/simple.json", &trace_pair(("ok", "ok")))
946                .expect("benchmark fixture");
947
948        assert!(fixture.passed);
949        assert!(fixture.deterministic);
950        assert_eq!(fixture.metrics.determinism_score, 1.0);
951        assert_eq!(fixture.metrics.replay_fidelity_score, 1.0);
952        assert_eq!(fixture.metrics.permission_decision_preservation_score, 1.0);
953        assert_eq!(fixture.metrics.tool_call_drift_count, 0);
954        assert!(fixture
955            .receipt
956            .benchmark_receipt_sha256
957            .starts_with("sha256:"));
958    }
959
960    #[test]
961    fn replay_benchmark_reports_reduced_fidelity_for_meaningful_drift() {
962        let fixture =
963            benchmark_replay_trace("benchmarks/replay/drift.json", &trace_pair(("ok", "error")))
964                .expect("benchmark fixture");
965
966        assert!(!fixture.passed);
967        assert!(!fixture.deterministic);
968        assert_eq!(fixture.metrics.determinism_score, 0.0);
969        assert_eq!(fixture.metrics.replay_fidelity_score, 0.5);
970        assert_eq!(fixture.metrics.tool_call_drift_count, 1);
971        assert_eq!(
972            fixture
973                .metrics
974                .debugging_time_to_root_cause_proxy
975                .first_divergence_path
976                .as_deref(),
977            Some("/protocol_interactions/0/status")
978        );
979        assert_eq!(
980            fixture
981                .metrics
982                .debugging_time_to_root_cause_proxy
983                .first_divergence_depth,
984            3
985        );
986        assert_eq!(
987            fixture
988                .metrics
989                .debugging_time_to_root_cause_proxy
990                .estimated_triage_steps,
991            5
992        );
993    }
994
995    #[test]
996    fn replay_benchmark_summary_is_stable_across_repeated_runs() {
997        let first = benchmark_replay_trace("fixture.json", &trace_pair(("ok", "ok")))
998            .expect("first benchmark");
999        let second = benchmark_replay_trace("fixture.json", &trace_pair(("ok", "ok")))
1000            .expect("second benchmark");
1001
1002        let first_json = serde_json::to_string(&first).expect("serialize first");
1003        let second_json = serde_json::to_string(&second).expect("serialize second");
1004        assert_eq!(first_json, second_json);
1005    }
1006
1007    #[test]
1008    fn opencode_jsonl_adapter_maps_messages_tools_permissions_and_llm_usage() {
1009        let input = concat!(
1010            "{\"type\":\"message\",\"id\":\"m1\",\"role\":\"assistant\",\"content\":\"done\"}\n",
1011            "{\"type\":\"tool_call\",\"id\":\"t1\",\"tool\":\"write_file\",\"arguments\":{\"path\":\"notes.md\"},\"result\":{\"ok\":true}}\n",
1012            "{\"type\":\"permission\",\"id\":\"p1\",\"action\":\"write_file\",\"decision\":\"approved\"}\n",
1013            "{\"type\":\"llm\",\"id\":\"l1\",\"model\":\"qwen\",\"usage\":{\"input_tokens\":7,\"output_tokens\":3}}\n"
1014        );
1015
1016        let run = OpenCodeJsonlAdapter
1017            .adapt_run(input, "opencode-run")
1018            .expect("adapt opencode jsonl");
1019
1020        assert_eq!(run.run_id, "opencode-run");
1021        assert_eq!(run.agent_transcript_deltas.len(), 1);
1022        assert_eq!(run.protocol_interactions.len(), 1);
1023        assert_eq!(run.effect_receipts.len(), 1);
1024        assert_eq!(run.approval_interactions.len(), 1);
1025        assert_eq!(run.policy_decisions.len(), 1);
1026        assert_eq!(run.llm_interactions.len(), 1);
1027        assert_eq!(token_total(&run, "input_tokens"), 7);
1028        assert_eq!(token_total(&run, "output_tokens"), 3);
1029    }
1030
1031    #[test]
1032    fn adapted_trace_pair_can_be_benchmarked() {
1033        let first = "{\"type\":\"tool_call\",\"tool\":\"read_file\",\"result\":{\"ok\":true}}\n";
1034        let second = "{\"type\":\"tool_call\",\"tool\":\"read_file\",\"result\":{\"ok\":true}}\n";
1035
1036        let fixture = benchmark_adapted_replay_pair(
1037            &OpenCodeJsonlAdapter,
1038            "external-tool-run",
1039            first,
1040            second,
1041        )
1042        .expect("benchmark adapted pair");
1043
1044        assert!(fixture.passed);
1045        assert_eq!(fixture.name, "external-tool-run");
1046        assert_eq!(fixture.metrics.tool_call_drift_count, 0);
1047    }
1048}
harn_vm/orchestration/replay_bench.rs

harn_vm/orchestration/
replay_bench.rs