harn_vm/orchestration/
replay_bench.rs

1use std::collections::{BTreeMap, BTreeSet};
2use std::fmt;
3
4use serde::{Deserialize, Serialize};
5use serde_json::{json, Value as JsonValue};
6use sha2::{Digest, Sha256};
7
8use super::{
9    canonicalize_run, run_replay_oracle_trace, ReplayAllowlistRule, ReplayDivergence,
10    ReplayExpectation, ReplayOracleError, ReplayOracleReport, ReplayOracleTrace, ReplayTraceRun,
11    ReplayTraceRunCounts, REPLAY_TRACE_SCHEMA_VERSION,
12};
13
14pub const REPLAY_BENCHMARK_REPORT_SCHEMA_VERSION: &str = "harn.replay_benchmark.report.v1";
15pub const REPLAY_BENCHMARK_CLOUD_INGEST_KIND: &str = "harn_cloud.replay_determinism.leaderboard.v1";
16pub const OPENCODE_JSONL_ADAPTER_ID: &str = "opencode-jsonl";
17pub const OPENCODE_JSONL_ADAPTER_SCHEMA_VERSION: &str =
18    "harn.replay_benchmark.adapter.opencode_jsonl.v1";
19
20const REPLAY_TRACE_SECTIONS: [&str; 10] = [
21    "event_log_entries",
22    "trigger_firings",
23    "llm_interactions",
24    "protocol_interactions",
25    "approval_interactions",
26    "effect_receipts",
27    "persona_runtime_states",
28    "agent_transcript_deltas",
29    "final_artifacts",
30    "policy_decisions",
31];
32
33const TOOL_DRIFT_SECTIONS: [&str; 3] = [
34    "llm_interactions",
35    "protocol_interactions",
36    "effect_receipts",
37];
38
39const PERMISSION_SECTIONS: [&str; 2] = ["approval_interactions", "policy_decisions"];
40
41#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
42pub struct ReplayBenchmarkReport {
43    pub schema_version: String,
44    pub cloud_ingest: ReplayBenchmarkCloudIngest,
45    pub suite: ReplayBenchmarkSuiteIdentity,
46    pub summary: ReplayBenchmarkSummary,
47    pub fixtures: Vec<ReplayBenchmarkFixtureReport>,
48}
49
50#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
51pub struct ReplayBenchmarkCloudIngest {
52    pub kind: String,
53    pub leaderboard_key: String,
54    pub report_schema_version: String,
55    pub replay_trace_schema_version: String,
56    pub artifact_contract: String,
57}
58
59#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
60pub struct ReplayBenchmarkSuiteIdentity {
61    pub name: String,
62    pub fixture_count: usize,
63    pub source_paths: Vec<String>,
64}
65
66#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
67pub struct ReplayBenchmarkSummary {
68    pub passed: usize,
69    pub failed: usize,
70    pub deterministic_fixtures: usize,
71    pub drifted_fixtures: usize,
72    pub mean_replay_fidelity_score: f64,
73    pub mean_permission_decision_preservation_score: f64,
74    pub tool_call_drift_count: usize,
75    pub transcript_drift_count: usize,
76    pub observed_interactions: usize,
77    pub llm_input_tokens: u64,
78    pub llm_output_tokens: u64,
79}
80
81#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
82pub struct ReplayBenchmarkFixtureReport {
83    pub path: String,
84    pub name: String,
85    pub description: Option<String>,
86    pub expectation: ReplayExpectation,
87    pub passed: bool,
88    pub deterministic: bool,
89    pub first_run_counts: ReplayTraceRunCounts,
90    pub second_run_counts: ReplayTraceRunCounts,
91    pub metrics: ReplayBenchmarkMetrics,
92    pub first_divergence: Option<ReplayDivergence>,
93    pub receipt: ReplayBenchmarkFixtureReceipt,
94}
95
96#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
97pub struct ReplayBenchmarkMetrics {
98    pub determinism_score: f64,
99    pub replay_fidelity_score: f64,
100    pub permission_decision_preservation_score: f64,
101    pub tool_call_drift_count: usize,
102    pub transcript_drift_count: usize,
103    pub runtime_cost: ReplayRuntimeCostMetrics,
104    pub debugging_time_to_root_cause_proxy: ReplayDebuggingProxyMetrics,
105    pub category_scores: BTreeMap<String, ReplayCategoryMetric>,
106}
107
108#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
109pub struct ReplayCategoryMetric {
110    pub compared: bool,
111    pub matched: bool,
112    pub drift_count: usize,
113    pub first_run_count: usize,
114    pub second_run_count: usize,
115}
116
117#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
118pub struct ReplayRuntimeCostMetrics {
119    pub observed_interactions: usize,
120    pub event_log_entries: usize,
121    pub trigger_firings: usize,
122    pub llm_interactions: usize,
123    pub protocol_interactions: usize,
124    pub approval_interactions: usize,
125    pub effect_receipts: usize,
126    pub persona_runtime_states: usize,
127    pub agent_transcript_deltas: usize,
128    pub final_artifacts: usize,
129    pub policy_decisions: usize,
130    pub llm_input_tokens: u64,
131    pub llm_output_tokens: u64,
132    #[serde(skip_serializing_if = "Option::is_none")]
133    pub observed_cost_usd: Option<f64>,
134}
135
136#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
137pub struct ReplayDebuggingProxyMetrics {
138    pub proxy_kind: String,
139    pub first_divergence_path: Option<String>,
140    pub first_divergence_depth: usize,
141    pub drift_surface_count: usize,
142    pub estimated_triage_steps: usize,
143}
144
145#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
146pub struct ReplayBenchmarkFixtureReceipt {
147    pub ingest_kind: String,
148    pub report_schema_version: String,
149    pub replay_trace_schema_version: String,
150    pub canonical_first_sha256: String,
151    pub canonical_second_sha256: String,
152    pub benchmark_receipt_sha256: String,
153}
154
155#[derive(Debug, Clone, PartialEq, Eq)]
156pub enum ReplayBenchmarkError {
157    Oracle(ReplayOracleError),
158    Adapter(String),
159    Serialization(String),
160}
161
162impl fmt::Display for ReplayBenchmarkError {
163    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
164        match self {
165            Self::Oracle(error) => error.fmt(f),
166            Self::Adapter(message) | Self::Serialization(message) => message.fmt(f),
167        }
168    }
169}
170
171impl std::error::Error for ReplayBenchmarkError {}
172
173impl From<ReplayOracleError> for ReplayBenchmarkError {
174    fn from(error: ReplayOracleError) -> Self {
175        Self::Oracle(error)
176    }
177}
178
179pub trait ReplayTraceAdapter {
180    fn adapter_id(&self) -> &'static str;
181    fn input_schema_version(&self) -> &'static str;
182    fn adapt_run(&self, input: &str, run_id: &str) -> Result<ReplayTraceRun, ReplayBenchmarkError>;
183}
184
185#[derive(Clone, Copy, Debug, Default)]
186pub struct OpenCodeJsonlAdapter;
187
188impl ReplayTraceAdapter for OpenCodeJsonlAdapter {
189    fn adapter_id(&self) -> &'static str {
190        OPENCODE_JSONL_ADAPTER_ID
191    }
192
193    fn input_schema_version(&self) -> &'static str {
194        OPENCODE_JSONL_ADAPTER_SCHEMA_VERSION
195    }
196
197    fn adapt_run(&self, input: &str, run_id: &str) -> Result<ReplayTraceRun, ReplayBenchmarkError> {
198        adapt_opencode_jsonl(input, run_id)
199    }
200}
201
202pub fn benchmark_replay_trace(
203    path: impl Into<String>,
204    trace: &ReplayOracleTrace,
205) -> Result<ReplayBenchmarkFixtureReport, ReplayBenchmarkError> {
206    let path = path.into();
207    let oracle = run_replay_oracle_trace(trace)?;
208    benchmark_replay_trace_from_oracle(path, trace, oracle)
209}
210
211pub fn benchmark_adapted_replay_pair(
212    adapter: &dyn ReplayTraceAdapter,
213    name: impl Into<String>,
214    first_input: &str,
215    second_input: &str,
216) -> Result<ReplayBenchmarkFixtureReport, ReplayBenchmarkError> {
217    let name = name.into();
218    let trace = ReplayOracleTrace {
219        schema_version: REPLAY_TRACE_SCHEMA_VERSION.to_string(),
220        name: name.clone(),
221        description: Some(format!(
222            "External replay trace pair adapted with {} ({})",
223            adapter.adapter_id(),
224            adapter.input_schema_version()
225        )),
226        expect: ReplayExpectation::Match,
227        allowlist: vec![ReplayAllowlistRule {
228            path: "/run_id".to_string(),
229            reason: "external trace runs are imported as separate executions".to_string(),
230            replacement: None,
231        }],
232        first_run: adapter.adapt_run(first_input, "adapted_first_run")?,
233        second_run: adapter.adapt_run(second_input, "adapted_second_run")?,
234        protocol_fixture_refs: Vec::new(),
235    };
236    benchmark_replay_trace(format!("adapter:{}:{name}", adapter.adapter_id()), &trace)
237}
238
239pub fn build_replay_benchmark_report(
240    suite_name: impl Into<String>,
241    source_paths: Vec<String>,
242    fixtures: Vec<ReplayBenchmarkFixtureReport>,
243) -> ReplayBenchmarkReport {
244    let suite_name = suite_name.into();
245    let summary = summarize_replay_benchmark(&fixtures);
246    ReplayBenchmarkReport {
247        schema_version: REPLAY_BENCHMARK_REPORT_SCHEMA_VERSION.to_string(),
248        cloud_ingest: ReplayBenchmarkCloudIngest {
249            kind: REPLAY_BENCHMARK_CLOUD_INGEST_KIND.to_string(),
250            leaderboard_key: "replay-determinism".to_string(),
251            report_schema_version: REPLAY_BENCHMARK_REPORT_SCHEMA_VERSION.to_string(),
252            replay_trace_schema_version: REPLAY_TRACE_SCHEMA_VERSION.to_string(),
253            artifact_contract:
254                "fixtures[].receipt + fixtures[].metrics are stable Cloud leaderboard inputs"
255                    .to_string(),
256        },
257        suite: ReplayBenchmarkSuiteIdentity {
258            name: suite_name,
259            fixture_count: fixtures.len(),
260            source_paths,
261        },
262        summary,
263        fixtures,
264    }
265}
266
267fn benchmark_replay_trace_from_oracle(
268    path: String,
269    trace: &ReplayOracleTrace,
270    oracle: ReplayOracleReport,
271) -> Result<ReplayBenchmarkFixtureReport, ReplayBenchmarkError> {
272    let first = canonicalize_run(&trace.first_run, &trace.allowlist)?;
273    let second = canonicalize_run(&trace.second_run, &trace.allowlist)?;
274    let category_scores = category_scores(&first, &second, &oracle);
275    let metrics = replay_metrics(trace, &oracle, category_scores)?;
276    let canonical_first_sha256 = sha256_json(&first)?;
277    let canonical_second_sha256 = sha256_json(&second)?;
278    let receipt = fixture_receipt(
279        &trace.name,
280        &path,
281        &metrics,
282        &canonical_first_sha256,
283        &canonical_second_sha256,
284    )?;
285
286    Ok(ReplayBenchmarkFixtureReport {
287        path,
288        name: oracle.name,
289        description: trace.description.clone(),
290        expectation: oracle.expectation,
291        passed: oracle.passed,
292        deterministic: oracle.divergence.is_none(),
293        first_run_counts: oracle.first_run_counts,
294        second_run_counts: oracle.second_run_counts,
295        metrics,
296        first_divergence: oracle.divergence,
297        receipt,
298    })
299}
300
301fn summarize_replay_benchmark(fixtures: &[ReplayBenchmarkFixtureReport]) -> ReplayBenchmarkSummary {
302    let fixture_count = fixtures.len();
303    let passed = fixtures.iter().filter(|fixture| fixture.passed).count();
304    let deterministic_fixtures = fixtures
305        .iter()
306        .filter(|fixture| fixture.deterministic)
307        .count();
308    let runtime = fixtures
309        .iter()
310        .fold(ReplayRuntimeCostMetrics::default(), |mut acc, fixture| {
311            let runtime = &fixture.metrics.runtime_cost;
312            acc.observed_interactions += runtime.observed_interactions;
313            acc.event_log_entries += runtime.event_log_entries;
314            acc.trigger_firings += runtime.trigger_firings;
315            acc.llm_interactions += runtime.llm_interactions;
316            acc.protocol_interactions += runtime.protocol_interactions;
317            acc.approval_interactions += runtime.approval_interactions;
318            acc.effect_receipts += runtime.effect_receipts;
319            acc.persona_runtime_states += runtime.persona_runtime_states;
320            acc.agent_transcript_deltas += runtime.agent_transcript_deltas;
321            acc.final_artifacts += runtime.final_artifacts;
322            acc.policy_decisions += runtime.policy_decisions;
323            acc.llm_input_tokens += runtime.llm_input_tokens;
324            acc.llm_output_tokens += runtime.llm_output_tokens;
325            acc.observed_cost_usd =
326                sum_optional_cost(acc.observed_cost_usd, runtime.observed_cost_usd);
327            acc
328        });
329    ReplayBenchmarkSummary {
330        passed,
331        failed: fixture_count.saturating_sub(passed),
332        deterministic_fixtures,
333        drifted_fixtures: fixture_count.saturating_sub(deterministic_fixtures),
334        mean_replay_fidelity_score: average_metric(fixtures, |fixture| {
335            fixture.metrics.replay_fidelity_score
336        }),
337        mean_permission_decision_preservation_score: average_metric(fixtures, |fixture| {
338            fixture.metrics.permission_decision_preservation_score
339        }),
340        tool_call_drift_count: fixtures
341            .iter()
342            .map(|fixture| fixture.metrics.tool_call_drift_count)
343            .sum(),
344        transcript_drift_count: fixtures
345            .iter()
346            .map(|fixture| fixture.metrics.transcript_drift_count)
347            .sum(),
348        observed_interactions: runtime.observed_interactions,
349        llm_input_tokens: runtime.llm_input_tokens,
350        llm_output_tokens: runtime.llm_output_tokens,
351    }
352}
353
354fn replay_metrics(
355    trace: &ReplayOracleTrace,
356    oracle: &ReplayOracleReport,
357    category_scores: BTreeMap<String, ReplayCategoryMetric>,
358) -> Result<ReplayBenchmarkMetrics, ReplayBenchmarkError> {
359    let compared_categories = category_scores
360        .values()
361        .filter(|metric| metric.compared)
362        .count();
363    let matched_categories = category_scores
364        .values()
365        .filter(|metric| metric.compared && metric.matched)
366        .count();
367    let replay_fidelity_score = if compared_categories == 0 {
368        0.0
369    } else {
370        matched_categories as f64 / compared_categories as f64
371    };
372    let permission_decision_preservation_score =
373        section_score(&category_scores, &PERMISSION_SECTIONS);
374    let tool_call_drift_count = section_drift_count(&category_scores, &TOOL_DRIFT_SECTIONS);
375    let transcript_drift_count =
376        section_drift_count(&category_scores, &["agent_transcript_deltas"]);
377    let runtime_cost = runtime_cost_metrics(&trace.first_run, &trace.second_run);
378    let debugging_time_to_root_cause_proxy =
379        debugging_proxy_metrics(oracle.divergence.as_ref(), &category_scores);
380
381    Ok(ReplayBenchmarkMetrics {
382        determinism_score: if oracle.divergence.is_none() {
383            1.0
384        } else {
385            0.0
386        },
387        replay_fidelity_score,
388        permission_decision_preservation_score,
389        tool_call_drift_count,
390        transcript_drift_count,
391        runtime_cost,
392        debugging_time_to_root_cause_proxy,
393        category_scores,
394    })
395}
396
397fn category_scores(
398    first: &JsonValue,
399    second: &JsonValue,
400    oracle: &ReplayOracleReport,
401) -> BTreeMap<String, ReplayCategoryMetric> {
402    let first_counts = counts_by_section(&oracle.first_run_counts);
403    let second_counts = counts_by_section(&oracle.second_run_counts);
404    REPLAY_TRACE_SECTIONS
405        .iter()
406        .map(|section| {
407            let first_value = first.get(*section).unwrap_or(&JsonValue::Null);
408            let second_value = second.get(*section).unwrap_or(&JsonValue::Null);
409            let first_run_count = first_counts.get(*section).copied().unwrap_or_default();
410            let second_run_count = second_counts.get(*section).copied().unwrap_or_default();
411            let compared = first_run_count > 0 || second_run_count > 0;
412            let drift_count = if compared {
413                drift_count(first_value, second_value)
414            } else {
415                0
416            };
417            (
418                (*section).to_string(),
419                ReplayCategoryMetric {
420                    compared,
421                    matched: drift_count == 0,
422                    drift_count,
423                    first_run_count,
424                    second_run_count,
425                },
426            )
427        })
428        .collect()
429}
430
431fn counts_by_section(counts: &ReplayTraceRunCounts) -> BTreeMap<&'static str, usize> {
432    BTreeMap::from([
433        ("event_log_entries", counts.event_log_entries),
434        ("trigger_firings", counts.trigger_firings),
435        ("llm_interactions", counts.llm_interactions),
436        ("protocol_interactions", counts.protocol_interactions),
437        ("approval_interactions", counts.approval_interactions),
438        ("effect_receipts", counts.effect_receipts),
439        ("persona_runtime_states", counts.persona_runtime_states),
440        ("agent_transcript_deltas", counts.agent_transcript_deltas),
441        ("final_artifacts", counts.final_artifacts),
442        ("policy_decisions", counts.policy_decisions),
443    ])
444}
445
446fn drift_count(first: &JsonValue, second: &JsonValue) -> usize {
447    if first == second {
448        return 0;
449    }
450    match (first, second) {
451        (JsonValue::Array(first_items), JsonValue::Array(second_items)) => {
452            let shared = first_items.len().min(second_items.len());
453            let item_drifts = (0..shared)
454                .filter(|index| first_items[*index] != second_items[*index])
455                .count();
456            item_drifts + first_items.len().abs_diff(second_items.len())
457        }
458        (JsonValue::Object(first_map), JsonValue::Object(second_map)) => {
459            let keys = first_map
460                .keys()
461                .chain(second_map.keys())
462                .collect::<BTreeSet<_>>();
463            keys.into_iter()
464                .filter(|key| first_map.get(*key) != second_map.get(*key))
465                .count()
466        }
467        _ => 1,
468    }
469}
470
471fn section_score(
472    category_scores: &BTreeMap<String, ReplayCategoryMetric>,
473    sections: &[&str],
474) -> f64 {
475    let compared = sections
476        .iter()
477        .filter_map(|section| category_scores.get(*section))
478        .filter(|metric| metric.compared)
479        .collect::<Vec<_>>();
480    if compared.is_empty() {
481        return 1.0;
482    }
483    compared.iter().filter(|metric| metric.matched).count() as f64 / compared.len() as f64
484}
485
486fn section_drift_count(
487    category_scores: &BTreeMap<String, ReplayCategoryMetric>,
488    sections: &[&str],
489) -> usize {
490    sections
491        .iter()
492        .filter_map(|section| category_scores.get(*section))
493        .map(|metric| metric.drift_count)
494        .sum()
495}
496
497fn runtime_cost_metrics(
498    first_run: &ReplayTraceRun,
499    second_run: &ReplayTraceRun,
500) -> ReplayRuntimeCostMetrics {
501    let first = first_run.counts();
502    let second = second_run.counts();
503    let observed_cost_usd =
504        sum_optional_cost(cost_usd_for_run(first_run), cost_usd_for_run(second_run));
505    ReplayRuntimeCostMetrics {
506        observed_interactions: trace_material_count(&first) + trace_material_count(&second),
507        event_log_entries: first.event_log_entries + second.event_log_entries,
508        trigger_firings: first.trigger_firings + second.trigger_firings,
509        llm_interactions: first.llm_interactions + second.llm_interactions,
510        protocol_interactions: first.protocol_interactions + second.protocol_interactions,
511        approval_interactions: first.approval_interactions + second.approval_interactions,
512        effect_receipts: first.effect_receipts + second.effect_receipts,
513        persona_runtime_states: first.persona_runtime_states + second.persona_runtime_states,
514        agent_transcript_deltas: first.agent_transcript_deltas + second.agent_transcript_deltas,
515        final_artifacts: first.final_artifacts + second.final_artifacts,
516        policy_decisions: first.policy_decisions + second.policy_decisions,
517        llm_input_tokens: token_total(first_run, "input_tokens")
518            + token_total(second_run, "input_tokens"),
519        llm_output_tokens: token_total(first_run, "output_tokens")
520            + token_total(second_run, "output_tokens"),
521        observed_cost_usd,
522    }
523}
524
525fn trace_material_count(counts: &ReplayTraceRunCounts) -> usize {
526    counts.event_log_entries
527        + counts.trigger_firings
528        + counts.llm_interactions
529        + counts.protocol_interactions
530        + counts.approval_interactions
531        + counts.effect_receipts
532        + counts.persona_runtime_states
533        + counts.agent_transcript_deltas
534        + counts.final_artifacts
535        + counts.policy_decisions
536}
537
538fn token_total(run: &ReplayTraceRun, token_key: &str) -> u64 {
539    run.llm_interactions
540        .iter()
541        .filter_map(|interaction| {
542            interaction
543                .get(token_key)
544                .and_then(JsonValue::as_u64)
545                .or_else(|| {
546                    interaction
547                        .get("usage")
548                        .and_then(|usage| usage.get(token_key))
549                        .and_then(JsonValue::as_u64)
550                })
551        })
552        .sum()
553}
554
555fn cost_usd_for_run(run: &ReplayTraceRun) -> Option<f64> {
556    let mut seen = false;
557    let mut total = 0.0;
558    for interaction in &run.llm_interactions {
559        if let Some(cost) = interaction
560            .get("cost_usd")
561            .and_then(JsonValue::as_f64)
562            .or_else(|| {
563                interaction
564                    .get("usage")
565                    .and_then(|usage| usage.get("cost_usd"))
566                    .and_then(JsonValue::as_f64)
567            })
568        {
569            seen = true;
570            total += cost;
571        }
572    }
573    seen.then_some(total)
574}
575
576fn sum_optional_cost(left: Option<f64>, right: Option<f64>) -> Option<f64> {
577    match (left, right) {
578        (Some(left), Some(right)) => Some(left + right),
579        (Some(value), None) | (None, Some(value)) => Some(value),
580        (None, None) => None,
581    }
582}
583
584fn debugging_proxy_metrics(
585    divergence: Option<&ReplayDivergence>,
586    category_scores: &BTreeMap<String, ReplayCategoryMetric>,
587) -> ReplayDebuggingProxyMetrics {
588    let first_divergence_path = divergence.map(|divergence| divergence.path.clone());
589    let first_divergence_depth = first_divergence_path
590        .as_deref()
591        .map(json_path_depth)
592        .unwrap_or_default();
593    let drift_surface_count = category_scores
594        .values()
595        .filter(|metric| metric.compared && !metric.matched)
596        .count();
597    ReplayDebuggingProxyMetrics {
598        proxy_kind: "first_divergence_depth_plus_drift_surfaces".to_string(),
599        first_divergence_path,
600        first_divergence_depth,
601        drift_surface_count,
602        estimated_triage_steps: if drift_surface_count == 0 {
603            0
604        } else {
605            1 + first_divergence_depth + drift_surface_count
606        },
607    }
608}
609
610fn json_path_depth(path: &str) -> usize {
611    let path = path.trim();
612    if path == "$" {
613        return 0;
614    }
615    if let Some(pointer_path) = path.strip_prefix('/') {
616        return pointer_path
617            .split('/')
618            .filter(|segment| !segment.is_empty())
619            .count();
620    }
621    path.split('.')
622        .filter(|segment| !segment.is_empty() && *segment != "$")
623        .count()
624}
625
626fn fixture_receipt(
627    name: &str,
628    path: &str,
629    metrics: &ReplayBenchmarkMetrics,
630    canonical_first_sha256: &str,
631    canonical_second_sha256: &str,
632) -> Result<ReplayBenchmarkFixtureReceipt, ReplayBenchmarkError> {
633    let receipt_material = json!({
634        "ingest_kind": REPLAY_BENCHMARK_CLOUD_INGEST_KIND,
635        "report_schema_version": REPLAY_BENCHMARK_REPORT_SCHEMA_VERSION,
636        "replay_trace_schema_version": REPLAY_TRACE_SCHEMA_VERSION,
637        "name": name,
638        "path": path,
639        "canonical_first_sha256": canonical_first_sha256,
640        "canonical_second_sha256": canonical_second_sha256,
641        "metrics": metrics,
642    });
643    Ok(ReplayBenchmarkFixtureReceipt {
644        ingest_kind: REPLAY_BENCHMARK_CLOUD_INGEST_KIND.to_string(),
645        report_schema_version: REPLAY_BENCHMARK_REPORT_SCHEMA_VERSION.to_string(),
646        replay_trace_schema_version: REPLAY_TRACE_SCHEMA_VERSION.to_string(),
647        canonical_first_sha256: canonical_first_sha256.to_string(),
648        canonical_second_sha256: canonical_second_sha256.to_string(),
649        benchmark_receipt_sha256: sha256_json(&receipt_material)?,
650    })
651}
652
653fn sha256_json(value: &JsonValue) -> Result<String, ReplayBenchmarkError> {
654    let bytes = serde_json::to_vec(value)
655        .map_err(|error| ReplayBenchmarkError::Serialization(error.to_string()))?;
656    Ok(format!("sha256:{}", hex::encode(Sha256::digest(bytes))))
657}
658
659fn sha256_value(value: &JsonValue) -> Result<String, ReplayBenchmarkError> {
660    sha256_json(value)
661}
662
663fn sha256_text(text: &str) -> String {
664    format!("sha256:{}", hex::encode(Sha256::digest(text.as_bytes())))
665}
666
667fn average_metric(
668    fixtures: &[ReplayBenchmarkFixtureReport],
669    metric: impl Fn(&ReplayBenchmarkFixtureReport) -> f64,
670) -> f64 {
671    if fixtures.is_empty() {
672        0.0
673    } else {
674        fixtures.iter().map(metric).sum::<f64>() / fixtures.len() as f64
675    }
676}
677
678fn adapt_opencode_jsonl(input: &str, run_id: &str) -> Result<ReplayTraceRun, ReplayBenchmarkError> {
679    let mut run = ReplayTraceRun {
680        run_id: run_id.to_string(),
681        ..ReplayTraceRun::default()
682    };
683    for (index, raw_line) in input.lines().enumerate() {
684        let line_no = index + 1;
685        let line = raw_line.trim();
686        if line.is_empty() {
687            continue;
688        }
689        let value: JsonValue = serde_json::from_str(line).map_err(|error| {
690            ReplayBenchmarkError::Adapter(format!(
691                "invalid {} JSONL line {line_no}: {error}",
692                OPENCODE_JSONL_ADAPTER_ID
693            ))
694        })?;
695        let object = value.as_object().ok_or_else(|| {
696            ReplayBenchmarkError::Adapter(format!(
697                "{} JSONL line {line_no} must be an object",
698                OPENCODE_JSONL_ADAPTER_ID
699            ))
700        })?;
701        let event_type = object
702            .get("type")
703            .or_else(|| object.get("event"))
704            .and_then(JsonValue::as_str)
705            .unwrap_or("event");
706        match event_type {
707            "message" | "session.message" => {
708                run.agent_transcript_deltas
709                    .push(adapt_opencode_message(object, line_no));
710            }
711            "tool_call" | "tool" | "session.tool_call" => {
712                let (protocol, receipt) = adapt_opencode_tool_call(object, line_no)?;
713                run.protocol_interactions.push(protocol);
714                run.effect_receipts.push(receipt);
715            }
716            "permission" | "permission_decision" | "session.permission" => {
717                let (approval, policy) = adapt_opencode_permission(object, line_no);
718                run.approval_interactions.push(approval);
719                run.policy_decisions.push(policy);
720            }
721            "llm" | "model" | "session.llm" => {
722                run.llm_interactions
723                    .push(adapt_opencode_llm(object, line_no));
724            }
725            _ => run
726                .event_log_entries
727                .push(adapt_opencode_event(object, event_type, line_no)),
728        }
729    }
730    if trace_material_count(&run.counts()) == 0 {
731        return Err(ReplayBenchmarkError::Adapter(format!(
732            "{} input contained no adaptable events",
733            OPENCODE_JSONL_ADAPTER_ID
734        )));
735    }
736    Ok(run)
737}
738
739fn adapt_opencode_message(
740    object: &serde_json::Map<String, JsonValue>,
741    line_no: usize,
742) -> JsonValue {
743    let content = object.get("content").cloned().unwrap_or(JsonValue::Null);
744    json!({
745        "delta_id": object_string(object, "id").unwrap_or_else(|| format!("message-{line_no}")),
746        "agent": object_string(object, "agent").unwrap_or_else(|| "opencode".to_string()),
747        "role": object_string(object, "role").unwrap_or_else(|| "assistant".to_string()),
748        "content_sha256": sha256_text(&content.to_string()),
749    })
750}
751
752fn adapt_opencode_tool_call(
753    object: &serde_json::Map<String, JsonValue>,
754    line_no: usize,
755) -> Result<(JsonValue, JsonValue), ReplayBenchmarkError> {
756    let tool = object_string(object, "tool")
757        .or_else(|| object_string(object, "name"))
758        .unwrap_or_else(|| "unknown_tool".to_string());
759    let arguments = object
760        .get("arguments")
761        .or_else(|| object.get("args"))
762        .cloned()
763        .unwrap_or_else(|| json!({}));
764    let result = object
765        .get("result")
766        .or_else(|| object.get("output"))
767        .cloned()
768        .unwrap_or(JsonValue::Null);
769    let status = object_string(object, "status").unwrap_or_else(|| "completed".to_string());
770    let arguments_sha256 = sha256_value(&arguments)?;
771    let result_sha256 = sha256_value(&result)?;
772    Ok((
773        json!({
774            "protocol": "opencode",
775            "boundary": "tool_call",
776            "tool": tool,
777            "call_id": object_string(object, "id").unwrap_or_else(|| format!("tool-{line_no}")),
778            "arguments_sha256": arguments_sha256,
779            "status": status,
780            "result_sha256": result_sha256,
781        }),
782        json!({
783            "receipt_id": object_string(object, "receipt_id").unwrap_or_else(|| format!("tool-receipt-{line_no}")),
784            "kind": "tool_call",
785            "tool": tool,
786            "status": status,
787            "arguments_sha256": arguments_sha256,
788            "result_sha256": result_sha256,
789        }),
790    ))
791}
792
793fn adapt_opencode_permission(
794    object: &serde_json::Map<String, JsonValue>,
795    line_no: usize,
796) -> (JsonValue, JsonValue) {
797    let action = object_string(object, "action").unwrap_or_else(|| "unknown".to_string());
798    let decision = object_string(object, "decision")
799        .or_else(|| object_string(object, "response"))
800        .unwrap_or_else(|| "unknown".to_string());
801    (
802        json!({
803            "request_id": object_string(object, "id").unwrap_or_else(|| format!("permission-{line_no}")),
804            "principal": object_string(object, "principal").unwrap_or_else(|| "agent".to_string()),
805            "action": action,
806            "response": decision,
807            "reviewer": object.get("reviewer").cloned().unwrap_or(JsonValue::Null),
808        }),
809        json!({
810            "decision_id": object_string(object, "decision_id").unwrap_or_else(|| format!("policy-{line_no}")),
811            "capability": object_string(object, "capability").unwrap_or(action),
812            "decision": decision,
813            "approval_required": true,
814        }),
815    )
816}
817
818fn adapt_opencode_llm(object: &serde_json::Map<String, JsonValue>, line_no: usize) -> JsonValue {
819    let input_tokens = object
820        .get("input_tokens")
821        .and_then(JsonValue::as_u64)
822        .or_else(|| {
823            object
824                .get("usage")
825                .and_then(|usage| usage.get("input_tokens"))
826                .and_then(JsonValue::as_u64)
827        })
828        .unwrap_or_default();
829    let output_tokens = object
830        .get("output_tokens")
831        .and_then(JsonValue::as_u64)
832        .or_else(|| {
833            object
834                .get("usage")
835                .and_then(|usage| usage.get("output_tokens"))
836                .and_then(JsonValue::as_u64)
837        })
838        .unwrap_or_default();
839    let messages_sha256 = object
840        .get("messages")
841        .map(|value| sha256_text(&value.to_string()))
842        .unwrap_or_else(|| sha256_text(""));
843    let response_sha256 = object
844        .get("response")
845        .map(|value| sha256_text(&value.to_string()))
846        .unwrap_or_else(|| sha256_text(""));
847    json!({
848        "request_id": object_string(object, "id").unwrap_or_else(|| format!("llm-{line_no}")),
849        "provider": object_string(object, "provider").unwrap_or_else(|| "opencode".to_string()),
850        "model": object_string(object, "model").unwrap_or_else(|| "unknown".to_string()),
851        "messages_sha256": messages_sha256,
852        "response_sha256": response_sha256,
853        "usage": {
854            "input_tokens": input_tokens,
855            "output_tokens": output_tokens,
856        },
857    })
858}
859
860fn adapt_opencode_event(
861    object: &serde_json::Map<String, JsonValue>,
862    event_type: &str,
863    line_no: usize,
864) -> JsonValue {
865    json!({
866        "event_id": line_no,
867        "topic": object_string(object, "topic").unwrap_or_else(|| "opencode.session".to_string()),
868        "kind": event_type,
869        "payload": object.get("payload").cloned().unwrap_or_else(|| JsonValue::Object(object.clone())),
870    })
871}
872
873fn object_string(object: &serde_json::Map<String, JsonValue>, key: &str) -> Option<String> {
874    object
875        .get(key)
876        .and_then(JsonValue::as_str)
877        .map(str::to_string)
878}
879
880#[cfg(test)]
881mod tests {
882    use super::*;
883
884    fn trace_pair(status: (&str, &str)) -> ReplayOracleTrace {
885        ReplayOracleTrace {
886            schema_version: REPLAY_TRACE_SCHEMA_VERSION.to_string(),
887            name: "simple_tool_run".to_string(),
888            description: Some("golden replay benchmark fixture".to_string()),
889            expect: ReplayExpectation::Match,
890            allowlist: vec![ReplayAllowlistRule {
891                path: "/run_id".to_string(),
892                reason: "run ids are allocated per execution".to_string(),
893                replacement: None,
894            }],
895            first_run: ReplayTraceRun {
896                run_id: "first".to_string(),
897                protocol_interactions: vec![json!({
898                    "protocol": "mcp",
899                    "boundary": "tools/call",
900                    "tool": "read_file",
901                    "status": status.0,
902                })],
903                policy_decisions: vec![json!({
904                    "capability": "fs.read",
905                    "decision": "allow",
906                })],
907                ..ReplayTraceRun::default()
908            },
909            second_run: ReplayTraceRun {
910                run_id: "second".to_string(),
911                protocol_interactions: vec![json!({
912                    "protocol": "mcp",
913                    "boundary": "tools/call",
914                    "tool": "read_file",
915                    "status": status.1,
916                })],
917                policy_decisions: vec![json!({
918                    "capability": "fs.read",
919                    "decision": "allow",
920                })],
921                ..ReplayTraceRun::default()
922            },
923            protocol_fixture_refs: Vec::new(),
924        }
925    }
926
927    #[test]
928    fn replay_benchmark_reports_stable_golden_metrics_for_matching_trace() {
929        let fixture =
930            benchmark_replay_trace("benchmarks/replay/simple.json", &trace_pair(("ok", "ok")))
931                .expect("benchmark fixture");
932
933        assert!(fixture.passed);
934        assert!(fixture.deterministic);
935        assert_eq!(fixture.metrics.determinism_score, 1.0);
936        assert_eq!(fixture.metrics.replay_fidelity_score, 1.0);
937        assert_eq!(fixture.metrics.permission_decision_preservation_score, 1.0);
938        assert_eq!(fixture.metrics.tool_call_drift_count, 0);
939        assert!(fixture
940            .receipt
941            .benchmark_receipt_sha256
942            .starts_with("sha256:"));
943    }
944
945    #[test]
946    fn replay_benchmark_reports_reduced_fidelity_for_meaningful_drift() {
947        let fixture =
948            benchmark_replay_trace("benchmarks/replay/drift.json", &trace_pair(("ok", "error")))
949                .expect("benchmark fixture");
950
951        assert!(!fixture.passed);
952        assert!(!fixture.deterministic);
953        assert_eq!(fixture.metrics.determinism_score, 0.0);
954        assert_eq!(fixture.metrics.replay_fidelity_score, 0.5);
955        assert_eq!(fixture.metrics.tool_call_drift_count, 1);
956        assert_eq!(
957            fixture
958                .metrics
959                .debugging_time_to_root_cause_proxy
960                .first_divergence_path
961                .as_deref(),
962            Some("/protocol_interactions/0/status")
963        );
964        assert_eq!(
965            fixture
966                .metrics
967                .debugging_time_to_root_cause_proxy
968                .first_divergence_depth,
969            3
970        );
971        assert_eq!(
972            fixture
973                .metrics
974                .debugging_time_to_root_cause_proxy
975                .estimated_triage_steps,
976            5
977        );
978    }
979
980    #[test]
981    fn replay_benchmark_summary_is_stable_across_repeated_runs() {
982        let first = benchmark_replay_trace("fixture.json", &trace_pair(("ok", "ok")))
983            .expect("first benchmark");
984        let second = benchmark_replay_trace("fixture.json", &trace_pair(("ok", "ok")))
985            .expect("second benchmark");
986
987        let first_json = serde_json::to_string(&first).expect("serialize first");
988        let second_json = serde_json::to_string(&second).expect("serialize second");
989        assert_eq!(first_json, second_json);
990    }
991
992    #[test]
993    fn opencode_jsonl_adapter_maps_messages_tools_permissions_and_llm_usage() {
994        let input = concat!(
995            "{\"type\":\"message\",\"id\":\"m1\",\"role\":\"assistant\",\"content\":\"done\"}\n",
996            "{\"type\":\"tool_call\",\"id\":\"t1\",\"tool\":\"write_file\",\"arguments\":{\"path\":\"notes.md\"},\"result\":{\"ok\":true}}\n",
997            "{\"type\":\"permission\",\"id\":\"p1\",\"action\":\"write_file\",\"decision\":\"approved\"}\n",
998            "{\"type\":\"llm\",\"id\":\"l1\",\"model\":\"qwen\",\"usage\":{\"input_tokens\":7,\"output_tokens\":3}}\n"
999        );
1000
1001        let run = OpenCodeJsonlAdapter
1002            .adapt_run(input, "opencode-run")
1003            .expect("adapt opencode jsonl");
1004
1005        assert_eq!(run.run_id, "opencode-run");
1006        assert_eq!(run.agent_transcript_deltas.len(), 1);
1007        assert_eq!(run.protocol_interactions.len(), 1);
1008        assert_eq!(run.effect_receipts.len(), 1);
1009        assert_eq!(run.approval_interactions.len(), 1);
1010        assert_eq!(run.policy_decisions.len(), 1);
1011        assert_eq!(run.llm_interactions.len(), 1);
1012        assert_eq!(token_total(&run, "input_tokens"), 7);
1013        assert_eq!(token_total(&run, "output_tokens"), 3);
1014    }
1015
1016    #[test]
1017    fn adapted_trace_pair_can_be_benchmarked() {
1018        let first = "{\"type\":\"tool_call\",\"tool\":\"read_file\",\"result\":{\"ok\":true}}\n";
1019        let second = "{\"type\":\"tool_call\",\"tool\":\"read_file\",\"result\":{\"ok\":true}}\n";
1020
1021        let fixture = benchmark_adapted_replay_pair(
1022            &OpenCodeJsonlAdapter,
1023            "external-tool-run",
1024            first,
1025            second,
1026        )
1027        .expect("benchmark adapted pair");
1028
1029        assert!(fixture.passed);
1030        assert_eq!(fixture.name, "external-tool-run");
1031        assert_eq!(fixture.metrics.tool_call_drift_count, 0);
1032    }
1033}
harn_vm/orchestration/replay_bench.rs

harn_vm/orchestration/
replay_bench.rs