1use std::collections::{BTreeMap, BTreeSet};
2use std::fmt;
3
4use serde::{Deserialize, Serialize};
5use serde_json::{json, Value as JsonValue};
6use sha2::{Digest, Sha256};
7
8use super::{
9 canonicalize_run, run_replay_oracle_trace, ReplayAllowlistRule, ReplayDivergence,
10 ReplayExpectation, ReplayOracleError, ReplayOracleReport, ReplayOracleTrace, ReplayTraceRun,
11 ReplayTraceRunCounts, REPLAY_TRACE_SCHEMA_VERSION,
12};
13
/// Schema version string stamped on every replay benchmark report and fixture receipt.
pub const REPLAY_BENCHMARK_REPORT_SCHEMA_VERSION: &str = "harn.replay_benchmark.report.v1";
/// Ingest kind recorded in the report's cloud-ingest header and in each fixture receipt.
pub const REPLAY_BENCHMARK_CLOUD_INGEST_KIND: &str = "harn_cloud.replay_determinism.leaderboard.v1";
/// Identifier reported by [`OpenCodeJsonlAdapter`] and used in its error messages.
pub const OPENCODE_JSONL_ADAPTER_ID: &str = "opencode-jsonl";
/// Input schema version reported by [`OpenCodeJsonlAdapter`].
pub const OPENCODE_JSONL_ADAPTER_SCHEMA_VERSION: &str =
    "harn.replay_benchmark.adapter.opencode_jsonl.v1";
19
/// Names of the replay-trace sections that receive an individual category score.
const REPLAY_TRACE_SECTIONS: [&str; 10] = [
    "event_log_entries",
    "trigger_firings",
    "llm_interactions",
    "protocol_interactions",
    "approval_interactions",
    "effect_receipts",
    "persona_runtime_states",
    "agent_transcript_deltas",
    "final_artifacts",
    "policy_decisions",
];

/// Sections whose drift counts are summed into `tool_call_drift_count`.
const TOOL_DRIFT_SECTIONS: [&str; 3] = [
    "llm_interactions",
    "protocol_interactions",
    "effect_receipts",
];

/// Sections scored together as `permission_decision_preservation_score`.
const PERMISSION_SECTIONS: [&str; 2] = ["approval_interactions", "policy_decisions"];
40
/// Top-level benchmark report assembled by `build_replay_benchmark_report`.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct ReplayBenchmarkReport {
    /// Always `REPLAY_BENCHMARK_REPORT_SCHEMA_VERSION` when built by this module.
    pub schema_version: String,
    /// Header describing how the report is meant to be ingested.
    pub cloud_ingest: ReplayBenchmarkCloudIngest,
    /// Suite name, fixture count and source paths.
    pub suite: ReplayBenchmarkSuiteIdentity,
    /// Aggregates computed over `fixtures`.
    pub summary: ReplayBenchmarkSummary,
    /// One entry per benchmarked fixture.
    pub fixtures: Vec<ReplayBenchmarkFixtureReport>,
}
49
/// Ingest header embedded in a [`ReplayBenchmarkReport`].
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct ReplayBenchmarkCloudIngest {
    /// Ingest kind; `REPLAY_BENCHMARK_CLOUD_INGEST_KIND` when built by this module.
    pub kind: String,
    /// Leaderboard key; this module writes `"replay-determinism"`.
    pub leaderboard_key: String,
    /// Schema version of the enclosing report.
    pub report_schema_version: String,
    /// Schema version of the replay traces that were benchmarked.
    pub replay_trace_schema_version: String,
    /// Free-text statement of which report fields are treated as stable inputs.
    pub artifact_contract: String,
}
58
/// Identifies the suite a report was produced from.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct ReplayBenchmarkSuiteIdentity {
    /// Caller-supplied suite name.
    pub name: String,
    /// Number of fixture reports included in the report.
    pub fixture_count: usize,
    /// Caller-supplied list of source paths.
    pub source_paths: Vec<String>,
}
65
/// Aggregate figures computed across all fixtures of a report.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct ReplayBenchmarkSummary {
    /// Fixtures whose `passed` flag is set.
    pub passed: usize,
    /// Fixture count minus `passed`.
    pub failed: usize,
    /// Fixtures whose `deterministic` flag is set.
    pub deterministic_fixtures: usize,
    /// Fixture count minus `deterministic_fixtures`.
    pub drifted_fixtures: usize,
    /// Arithmetic mean of per-fixture `replay_fidelity_score` (0.0 for an empty suite).
    pub mean_replay_fidelity_score: f64,
    /// Arithmetic mean of per-fixture `permission_decision_preservation_score`
    /// (0.0 for an empty suite).
    pub mean_permission_decision_preservation_score: f64,
    /// Sum of per-fixture `tool_call_drift_count`.
    pub tool_call_drift_count: usize,
    /// Sum of per-fixture `transcript_drift_count`.
    pub transcript_drift_count: usize,
    /// Sum of per-fixture `runtime_cost.observed_interactions`.
    pub observed_interactions: usize,
    /// Sum of per-fixture `runtime_cost.llm_input_tokens`.
    pub llm_input_tokens: u64,
    /// Sum of per-fixture `runtime_cost.llm_output_tokens`.
    pub llm_output_tokens: u64,
}
80
/// Benchmark result for a single replay trace (one pair of runs).
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct ReplayBenchmarkFixtureReport {
    /// Caller-supplied path or synthetic identifier for the fixture.
    pub path: String,
    /// Fixture name as reported by the oracle.
    pub name: String,
    /// Description copied from the trace, if any.
    pub description: Option<String>,
    /// Expectation reported by the oracle.
    pub expectation: ReplayExpectation,
    /// Pass/fail verdict reported by the oracle.
    pub passed: bool,
    /// `true` when the oracle reported no divergence.
    pub deterministic: bool,
    /// Per-section entry counts of the first run, as reported by the oracle.
    pub first_run_counts: ReplayTraceRunCounts,
    /// Per-section entry counts of the second run, as reported by the oracle.
    pub second_run_counts: ReplayTraceRunCounts,
    /// Scores and cost figures derived for this fixture.
    pub metrics: ReplayBenchmarkMetrics,
    /// First divergence reported by the oracle, if any.
    pub first_divergence: Option<ReplayDivergence>,
    /// Hash receipt covering the canonical runs and the metrics.
    pub receipt: ReplayBenchmarkFixtureReceipt,
}
95
/// Scores derived for one fixture from the oracle report and the canonicalized runs.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct ReplayBenchmarkMetrics {
    /// 1.0 when the oracle reported no divergence, otherwise 0.0.
    pub determinism_score: f64,
    /// Matched compared sections divided by compared sections (0.0 when none were compared).
    pub replay_fidelity_score: f64,
    /// Share of compared permission sections that matched (1.0 when none were compared).
    pub permission_decision_preservation_score: f64,
    /// Summed drift count over the tool-drift sections.
    pub tool_call_drift_count: usize,
    /// Drift count of the `agent_transcript_deltas` section.
    pub transcript_drift_count: usize,
    /// Entry, token and cost totals across both runs.
    pub runtime_cost: ReplayRuntimeCostMetrics,
    /// Heuristic triage-effort figures derived from the first divergence.
    pub debugging_time_to_root_cause_proxy: ReplayDebuggingProxyMetrics,
    /// Per-section comparison results keyed by section name.
    pub category_scores: BTreeMap<String, ReplayCategoryMetric>,
}
107
/// Comparison result for one trace section.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct ReplayCategoryMetric {
    /// `true` when at least one of the two runs has entries in this section.
    pub compared: bool,
    /// `true` when `drift_count` is zero (also the case for sections that were not compared).
    pub matched: bool,
    /// Number of differing items found between the two canonical section values.
    pub drift_count: usize,
    /// Entry count of the section in the first run.
    pub first_run_count: usize,
    /// Entry count of the section in the second run.
    pub second_run_count: usize,
}
116
/// Entry, token and cost totals; for a fixture each figure is the sum over both runs.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
pub struct ReplayRuntimeCostMetrics {
    /// Sum of all ten per-section entry counts.
    pub observed_interactions: usize,
    pub event_log_entries: usize,
    pub trigger_firings: usize,
    pub llm_interactions: usize,
    pub protocol_interactions: usize,
    pub approval_interactions: usize,
    pub effect_receipts: usize,
    pub persona_runtime_states: usize,
    pub agent_transcript_deltas: usize,
    pub final_artifacts: usize,
    pub policy_decisions: usize,
    /// Input tokens read from LLM interactions (`input_tokens` or `usage.input_tokens`).
    pub llm_input_tokens: u64,
    /// Output tokens read from LLM interactions (`output_tokens` or `usage.output_tokens`).
    pub llm_output_tokens: u64,
    /// Summed `cost_usd` values; `None` (and omitted when serialized) if no interaction had one.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub observed_cost_usd: Option<f64>,
}
135
/// Heuristic figures standing in for "time to root cause" when a replay drifts.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct ReplayDebuggingProxyMetrics {
    /// Name of the heuristic used to derive the figures.
    pub proxy_kind: String,
    /// Path of the first divergence reported by the oracle, if any.
    pub first_divergence_path: Option<String>,
    /// Number of segments in `first_divergence_path` (0 when there is none).
    pub first_divergence_depth: usize,
    /// Number of compared sections that did not match.
    pub drift_surface_count: usize,
    /// Combined step estimate derived from depth and drift surfaces.
    pub estimated_triage_steps: usize,
}
144
/// Hash receipt for one fixture; every digest is a `sha256:`-prefixed hex string.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct ReplayBenchmarkFixtureReceipt {
    /// Ingest kind the receipt was produced for.
    pub ingest_kind: String,
    /// Schema version of the benchmark report.
    pub report_schema_version: String,
    /// Schema version of the replay trace.
    pub replay_trace_schema_version: String,
    /// Digest of the canonicalized first run.
    pub canonical_first_sha256: String,
    /// Digest of the canonicalized second run.
    pub canonical_second_sha256: String,
    /// Digest over the identifiers, both canonical digests, and the metrics.
    pub benchmark_receipt_sha256: String,
}
154
/// Errors produced while benchmarking or adapting replay traces.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ReplayBenchmarkError {
    /// Failure propagated from the replay oracle.
    Oracle(ReplayOracleError),
    /// An adapter rejected its input; the string is the full message.
    Adapter(String),
    /// JSON serialization failed while hashing; the string is the serializer's message.
    Serialization(String),
}
161
162impl fmt::Display for ReplayBenchmarkError {
163 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
164 match self {
165 Self::Oracle(error) => error.fmt(f),
166 Self::Adapter(message) | Self::Serialization(message) => message.fmt(f),
167 }
168 }
169}
170
// Marker impl: relies on the `Debug` and `Display` impls above and provides no `source()`.
impl std::error::Error for ReplayBenchmarkError {}
172
173impl From<ReplayOracleError> for ReplayBenchmarkError {
174 fn from(error: ReplayOracleError) -> Self {
175 Self::Oracle(error)
176 }
177}
178
/// Converts an external trace format into a [`ReplayTraceRun`] that can be benchmarked.
pub trait ReplayTraceAdapter {
    /// Stable identifier of the adapter.
    fn adapter_id(&self) -> &'static str;
    /// Schema version of the input format the adapter understands.
    fn input_schema_version(&self) -> &'static str;
    /// Parses `input` into a run labelled `run_id`.
    ///
    /// # Errors
    /// Returns a [`ReplayBenchmarkError`] when the input cannot be adapted.
    fn adapt_run(&self, input: &str, run_id: &str) -> Result<ReplayTraceRun, ReplayBenchmarkError>;
}
184
/// Stateless adapter for line-delimited JSON input (one JSON object per line).
#[derive(Clone, Copy, Debug, Default)]
pub struct OpenCodeJsonlAdapter;
187
impl ReplayTraceAdapter for OpenCodeJsonlAdapter {
    /// Returns `OPENCODE_JSONL_ADAPTER_ID`.
    fn adapter_id(&self) -> &'static str {
        OPENCODE_JSONL_ADAPTER_ID
    }

    /// Returns `OPENCODE_JSONL_ADAPTER_SCHEMA_VERSION`.
    fn input_schema_version(&self) -> &'static str {
        OPENCODE_JSONL_ADAPTER_SCHEMA_VERSION
    }

    /// Delegates to `adapt_opencode_jsonl`.
    fn adapt_run(&self, input: &str, run_id: &str) -> Result<ReplayTraceRun, ReplayBenchmarkError> {
        adapt_opencode_jsonl(input, run_id)
    }
}
201
202pub fn benchmark_replay_trace(
203 path: impl Into<String>,
204 trace: &ReplayOracleTrace,
205) -> Result<ReplayBenchmarkFixtureReport, ReplayBenchmarkError> {
206 let path = path.into();
207 let oracle = run_replay_oracle_trace(trace)?;
208 benchmark_replay_trace_from_oracle(path, trace, oracle)
209}
210
211pub fn benchmark_adapted_replay_pair(
212 adapter: &dyn ReplayTraceAdapter,
213 name: impl Into<String>,
214 first_input: &str,
215 second_input: &str,
216) -> Result<ReplayBenchmarkFixtureReport, ReplayBenchmarkError> {
217 let name = name.into();
218 let trace = ReplayOracleTrace {
219 schema_version: REPLAY_TRACE_SCHEMA_VERSION.to_string(),
220 name: name.clone(),
221 description: Some(format!(
222 "External replay trace pair adapted with {} ({})",
223 adapter.adapter_id(),
224 adapter.input_schema_version()
225 )),
226 expect: ReplayExpectation::Match,
227 allowlist: vec![ReplayAllowlistRule {
228 path: "/run_id".to_string(),
229 reason: "external trace runs are imported as separate executions".to_string(),
230 replacement: None,
231 }],
232 first_run: adapter.adapt_run(first_input, "adapted_first_run")?,
233 second_run: adapter.adapt_run(second_input, "adapted_second_run")?,
234 protocol_fixture_refs: Vec::new(),
235 };
236 benchmark_replay_trace(format!("adapter:{}:{name}", adapter.adapter_id()), &trace)
237}
238
239pub fn build_replay_benchmark_report(
240 suite_name: impl Into<String>,
241 source_paths: Vec<String>,
242 fixtures: Vec<ReplayBenchmarkFixtureReport>,
243) -> ReplayBenchmarkReport {
244 let suite_name = suite_name.into();
245 let summary = summarize_replay_benchmark(&fixtures);
246 ReplayBenchmarkReport {
247 schema_version: REPLAY_BENCHMARK_REPORT_SCHEMA_VERSION.to_string(),
248 cloud_ingest: ReplayBenchmarkCloudIngest {
249 kind: REPLAY_BENCHMARK_CLOUD_INGEST_KIND.to_string(),
250 leaderboard_key: "replay-determinism".to_string(),
251 report_schema_version: REPLAY_BENCHMARK_REPORT_SCHEMA_VERSION.to_string(),
252 replay_trace_schema_version: REPLAY_TRACE_SCHEMA_VERSION.to_string(),
253 artifact_contract:
254 "fixtures[].receipt + fixtures[].metrics are stable Cloud leaderboard inputs"
255 .to_string(),
256 },
257 suite: ReplayBenchmarkSuiteIdentity {
258 name: suite_name,
259 fixture_count: fixtures.len(),
260 source_paths,
261 },
262 summary,
263 fixtures,
264 }
265}
266
267fn benchmark_replay_trace_from_oracle(
268 path: String,
269 trace: &ReplayOracleTrace,
270 oracle: ReplayOracleReport,
271) -> Result<ReplayBenchmarkFixtureReport, ReplayBenchmarkError> {
272 let first = canonicalize_run(&trace.first_run, &trace.allowlist)?;
273 let second = canonicalize_run(&trace.second_run, &trace.allowlist)?;
274 let category_scores = category_scores(&first, &second, &oracle);
275 let metrics = replay_metrics(trace, &oracle, category_scores)?;
276 let canonical_first_sha256 = sha256_json(&first)?;
277 let canonical_second_sha256 = sha256_json(&second)?;
278 let receipt = fixture_receipt(
279 &trace.name,
280 &path,
281 &metrics,
282 &canonical_first_sha256,
283 &canonical_second_sha256,
284 )?;
285
286 Ok(ReplayBenchmarkFixtureReport {
287 path,
288 name: oracle.name,
289 description: trace.description.clone(),
290 expectation: oracle.expectation,
291 passed: oracle.passed,
292 deterministic: oracle.divergence.is_none(),
293 first_run_counts: oracle.first_run_counts,
294 second_run_counts: oracle.second_run_counts,
295 metrics,
296 first_divergence: oracle.divergence,
297 receipt,
298 })
299}
300
301fn summarize_replay_benchmark(fixtures: &[ReplayBenchmarkFixtureReport]) -> ReplayBenchmarkSummary {
302 let fixture_count = fixtures.len();
303 let passed = fixtures.iter().filter(|fixture| fixture.passed).count();
304 let deterministic_fixtures = fixtures
305 .iter()
306 .filter(|fixture| fixture.deterministic)
307 .count();
308 let runtime = fixtures
309 .iter()
310 .fold(ReplayRuntimeCostMetrics::default(), |mut acc, fixture| {
311 let runtime = &fixture.metrics.runtime_cost;
312 acc.observed_interactions += runtime.observed_interactions;
313 acc.event_log_entries += runtime.event_log_entries;
314 acc.trigger_firings += runtime.trigger_firings;
315 acc.llm_interactions += runtime.llm_interactions;
316 acc.protocol_interactions += runtime.protocol_interactions;
317 acc.approval_interactions += runtime.approval_interactions;
318 acc.effect_receipts += runtime.effect_receipts;
319 acc.persona_runtime_states += runtime.persona_runtime_states;
320 acc.agent_transcript_deltas += runtime.agent_transcript_deltas;
321 acc.final_artifacts += runtime.final_artifacts;
322 acc.policy_decisions += runtime.policy_decisions;
323 acc.llm_input_tokens += runtime.llm_input_tokens;
324 acc.llm_output_tokens += runtime.llm_output_tokens;
325 acc.observed_cost_usd =
326 sum_optional_cost(acc.observed_cost_usd, runtime.observed_cost_usd);
327 acc
328 });
329 ReplayBenchmarkSummary {
330 passed,
331 failed: fixture_count.saturating_sub(passed),
332 deterministic_fixtures,
333 drifted_fixtures: fixture_count.saturating_sub(deterministic_fixtures),
334 mean_replay_fidelity_score: average_metric(fixtures, |fixture| {
335 fixture.metrics.replay_fidelity_score
336 }),
337 mean_permission_decision_preservation_score: average_metric(fixtures, |fixture| {
338 fixture.metrics.permission_decision_preservation_score
339 }),
340 tool_call_drift_count: fixtures
341 .iter()
342 .map(|fixture| fixture.metrics.tool_call_drift_count)
343 .sum(),
344 transcript_drift_count: fixtures
345 .iter()
346 .map(|fixture| fixture.metrics.transcript_drift_count)
347 .sum(),
348 observed_interactions: runtime.observed_interactions,
349 llm_input_tokens: runtime.llm_input_tokens,
350 llm_output_tokens: runtime.llm_output_tokens,
351 }
352}
353
354fn replay_metrics(
355 trace: &ReplayOracleTrace,
356 oracle: &ReplayOracleReport,
357 category_scores: BTreeMap<String, ReplayCategoryMetric>,
358) -> Result<ReplayBenchmarkMetrics, ReplayBenchmarkError> {
359 let compared_categories = category_scores
360 .values()
361 .filter(|metric| metric.compared)
362 .count();
363 let matched_categories = category_scores
364 .values()
365 .filter(|metric| metric.compared && metric.matched)
366 .count();
367 let replay_fidelity_score = if compared_categories == 0 {
368 0.0
369 } else {
370 matched_categories as f64 / compared_categories as f64
371 };
372 let permission_decision_preservation_score =
373 section_score(&category_scores, &PERMISSION_SECTIONS);
374 let tool_call_drift_count = section_drift_count(&category_scores, &TOOL_DRIFT_SECTIONS);
375 let transcript_drift_count =
376 section_drift_count(&category_scores, &["agent_transcript_deltas"]);
377 let runtime_cost = runtime_cost_metrics(&trace.first_run, &trace.second_run);
378 let debugging_time_to_root_cause_proxy =
379 debugging_proxy_metrics(oracle.divergence.as_ref(), &category_scores);
380
381 Ok(ReplayBenchmarkMetrics {
382 determinism_score: if oracle.divergence.is_none() {
383 1.0
384 } else {
385 0.0
386 },
387 replay_fidelity_score,
388 permission_decision_preservation_score,
389 tool_call_drift_count,
390 transcript_drift_count,
391 runtime_cost,
392 debugging_time_to_root_cause_proxy,
393 category_scores,
394 })
395}
396
397fn category_scores(
398 first: &JsonValue,
399 second: &JsonValue,
400 oracle: &ReplayOracleReport,
401) -> BTreeMap<String, ReplayCategoryMetric> {
402 let first_counts = counts_by_section(&oracle.first_run_counts);
403 let second_counts = counts_by_section(&oracle.second_run_counts);
404 REPLAY_TRACE_SECTIONS
405 .iter()
406 .map(|section| {
407 let first_value = first.get(*section).unwrap_or(&JsonValue::Null);
408 let second_value = second.get(*section).unwrap_or(&JsonValue::Null);
409 let first_run_count = first_counts.get(*section).copied().unwrap_or_default();
410 let second_run_count = second_counts.get(*section).copied().unwrap_or_default();
411 let compared = first_run_count > 0 || second_run_count > 0;
412 let drift_count = if compared {
413 drift_count(first_value, second_value)
414 } else {
415 0
416 };
417 (
418 (*section).to_string(),
419 ReplayCategoryMetric {
420 compared,
421 matched: drift_count == 0,
422 drift_count,
423 first_run_count,
424 second_run_count,
425 },
426 )
427 })
428 .collect()
429}
430
431fn counts_by_section(counts: &ReplayTraceRunCounts) -> BTreeMap<&'static str, usize> {
432 BTreeMap::from([
433 ("event_log_entries", counts.event_log_entries),
434 ("trigger_firings", counts.trigger_firings),
435 ("llm_interactions", counts.llm_interactions),
436 ("protocol_interactions", counts.protocol_interactions),
437 ("approval_interactions", counts.approval_interactions),
438 ("effect_receipts", counts.effect_receipts),
439 ("persona_runtime_states", counts.persona_runtime_states),
440 ("agent_transcript_deltas", counts.agent_transcript_deltas),
441 ("final_artifacts", counts.final_artifacts),
442 ("policy_decisions", counts.policy_decisions),
443 ])
444}
445
446fn drift_count(first: &JsonValue, second: &JsonValue) -> usize {
447 if first == second {
448 return 0;
449 }
450 match (first, second) {
451 (JsonValue::Array(first_items), JsonValue::Array(second_items)) => {
452 let shared = first_items.len().min(second_items.len());
453 let item_drifts = (0..shared)
454 .filter(|index| first_items[*index] != second_items[*index])
455 .count();
456 item_drifts + first_items.len().abs_diff(second_items.len())
457 }
458 (JsonValue::Object(first_map), JsonValue::Object(second_map)) => {
459 let keys = first_map
460 .keys()
461 .chain(second_map.keys())
462 .collect::<BTreeSet<_>>();
463 keys.into_iter()
464 .filter(|key| first_map.get(*key) != second_map.get(*key))
465 .count()
466 }
467 _ => 1,
468 }
469}
470
471fn section_score(
472 category_scores: &BTreeMap<String, ReplayCategoryMetric>,
473 sections: &[&str],
474) -> f64 {
475 let compared = sections
476 .iter()
477 .filter_map(|section| category_scores.get(*section))
478 .filter(|metric| metric.compared)
479 .collect::<Vec<_>>();
480 if compared.is_empty() {
481 return 1.0;
482 }
483 compared.iter().filter(|metric| metric.matched).count() as f64 / compared.len() as f64
484}
485
486fn section_drift_count(
487 category_scores: &BTreeMap<String, ReplayCategoryMetric>,
488 sections: &[&str],
489) -> usize {
490 sections
491 .iter()
492 .filter_map(|section| category_scores.get(*section))
493 .map(|metric| metric.drift_count)
494 .sum()
495}
496
497fn runtime_cost_metrics(
498 first_run: &ReplayTraceRun,
499 second_run: &ReplayTraceRun,
500) -> ReplayRuntimeCostMetrics {
501 let first = first_run.counts();
502 let second = second_run.counts();
503 let observed_cost_usd =
504 sum_optional_cost(cost_usd_for_run(first_run), cost_usd_for_run(second_run));
505 ReplayRuntimeCostMetrics {
506 observed_interactions: trace_material_count(&first) + trace_material_count(&second),
507 event_log_entries: first.event_log_entries + second.event_log_entries,
508 trigger_firings: first.trigger_firings + second.trigger_firings,
509 llm_interactions: first.llm_interactions + second.llm_interactions,
510 protocol_interactions: first.protocol_interactions + second.protocol_interactions,
511 approval_interactions: first.approval_interactions + second.approval_interactions,
512 effect_receipts: first.effect_receipts + second.effect_receipts,
513 persona_runtime_states: first.persona_runtime_states + second.persona_runtime_states,
514 agent_transcript_deltas: first.agent_transcript_deltas + second.agent_transcript_deltas,
515 final_artifacts: first.final_artifacts + second.final_artifacts,
516 policy_decisions: first.policy_decisions + second.policy_decisions,
517 llm_input_tokens: token_total(first_run, "input_tokens")
518 + token_total(second_run, "input_tokens"),
519 llm_output_tokens: token_total(first_run, "output_tokens")
520 + token_total(second_run, "output_tokens"),
521 observed_cost_usd,
522 }
523}
524
525fn trace_material_count(counts: &ReplayTraceRunCounts) -> usize {
526 counts.event_log_entries
527 + counts.trigger_firings
528 + counts.llm_interactions
529 + counts.protocol_interactions
530 + counts.approval_interactions
531 + counts.effect_receipts
532 + counts.persona_runtime_states
533 + counts.agent_transcript_deltas
534 + counts.final_artifacts
535 + counts.policy_decisions
536}
537
538fn token_total(run: &ReplayTraceRun, token_key: &str) -> u64 {
539 run.llm_interactions
540 .iter()
541 .filter_map(|interaction| {
542 interaction
543 .get(token_key)
544 .and_then(JsonValue::as_u64)
545 .or_else(|| {
546 interaction
547 .get("usage")
548 .and_then(|usage| usage.get(token_key))
549 .and_then(JsonValue::as_u64)
550 })
551 })
552 .sum()
553}
554
555fn cost_usd_for_run(run: &ReplayTraceRun) -> Option<f64> {
556 let mut seen = false;
557 let mut total = 0.0;
558 for interaction in &run.llm_interactions {
559 if let Some(cost) = interaction
560 .get("cost_usd")
561 .and_then(JsonValue::as_f64)
562 .or_else(|| {
563 interaction
564 .get("usage")
565 .and_then(|usage| usage.get("cost_usd"))
566 .and_then(JsonValue::as_f64)
567 })
568 {
569 seen = true;
570 total += cost;
571 }
572 }
573 seen.then_some(total)
574}
575
/// Adds two optional costs, treating `None` as "no data" rather than zero:
/// the result is `None` only when both inputs are `None`.
fn sum_optional_cost(left: Option<f64>, right: Option<f64>) -> Option<f64> {
    match (left, right) {
        (Some(a), Some(b)) => Some(a + b),
        // At most one side carries a value here; pass it (or `None`) through.
        (None, other) | (other, None) => other,
    }
}
583
584fn debugging_proxy_metrics(
585 divergence: Option<&ReplayDivergence>,
586 category_scores: &BTreeMap<String, ReplayCategoryMetric>,
587) -> ReplayDebuggingProxyMetrics {
588 let first_divergence_path = divergence.map(|divergence| divergence.path.clone());
589 let first_divergence_depth = first_divergence_path
590 .as_deref()
591 .map(json_path_depth)
592 .unwrap_or_default();
593 let drift_surface_count = category_scores
594 .values()
595 .filter(|metric| metric.compared && !metric.matched)
596 .count();
597 ReplayDebuggingProxyMetrics {
598 proxy_kind: "first_divergence_depth_plus_drift_surfaces".to_string(),
599 first_divergence_path,
600 first_divergence_depth,
601 drift_surface_count,
602 estimated_triage_steps: if drift_surface_count == 0 {
603 0
604 } else {
605 1 + first_divergence_depth + drift_surface_count
606 },
607 }
608}
609
/// Counts the non-empty segments of a path.
///
/// Surrounding whitespace is ignored. A path starting with `/` is split on `/`;
/// any other path is split on `.` and `$` segments are skipped, so a bare `$`
/// has depth 0.
fn json_path_depth(path: &str) -> usize {
    let trimmed = path.trim();
    match trimmed.strip_prefix('/') {
        Some(pointer) => pointer
            .split('/')
            .filter(|token| !token.is_empty())
            .count(),
        None => trimmed
            .split('.')
            .filter(|segment| !matches!(*segment, "" | "$"))
            .count(),
    }
}
625
626fn fixture_receipt(
627 name: &str,
628 path: &str,
629 metrics: &ReplayBenchmarkMetrics,
630 canonical_first_sha256: &str,
631 canonical_second_sha256: &str,
632) -> Result<ReplayBenchmarkFixtureReceipt, ReplayBenchmarkError> {
633 let receipt_material = json!({
634 "ingest_kind": REPLAY_BENCHMARK_CLOUD_INGEST_KIND,
635 "report_schema_version": REPLAY_BENCHMARK_REPORT_SCHEMA_VERSION,
636 "replay_trace_schema_version": REPLAY_TRACE_SCHEMA_VERSION,
637 "name": name,
638 "path": path,
639 "canonical_first_sha256": canonical_first_sha256,
640 "canonical_second_sha256": canonical_second_sha256,
641 "metrics": metrics,
642 });
643 Ok(ReplayBenchmarkFixtureReceipt {
644 ingest_kind: REPLAY_BENCHMARK_CLOUD_INGEST_KIND.to_string(),
645 report_schema_version: REPLAY_BENCHMARK_REPORT_SCHEMA_VERSION.to_string(),
646 replay_trace_schema_version: REPLAY_TRACE_SCHEMA_VERSION.to_string(),
647 canonical_first_sha256: canonical_first_sha256.to_string(),
648 canonical_second_sha256: canonical_second_sha256.to_string(),
649 benchmark_receipt_sha256: sha256_json(&receipt_material)?,
650 })
651}
652
653fn sha256_json(value: &JsonValue) -> Result<String, ReplayBenchmarkError> {
654 let bytes = serde_json::to_vec(value)
655 .map_err(|error| ReplayBenchmarkError::Serialization(error.to_string()))?;
656 Ok(format!("sha256:{}", hex::encode(Sha256::digest(bytes))))
657}
658
/// Alias for `sha256_json`, used by the adapter code when hashing tool arguments and results.
fn sha256_value(value: &JsonValue) -> Result<String, ReplayBenchmarkError> {
    sha256_json(value)
}
662
663fn sha256_text(text: &str) -> String {
664 format!("sha256:{}", hex::encode(Sha256::digest(text.as_bytes())))
665}
666
667fn average_metric(
668 fixtures: &[ReplayBenchmarkFixtureReport],
669 metric: impl Fn(&ReplayBenchmarkFixtureReport) -> f64,
670) -> f64 {
671 if fixtures.is_empty() {
672 0.0
673 } else {
674 fixtures.iter().map(metric).sum::<f64>() / fixtures.len() as f64
675 }
676}
677
678fn adapt_opencode_jsonl(input: &str, run_id: &str) -> Result<ReplayTraceRun, ReplayBenchmarkError> {
679 let mut run = ReplayTraceRun {
680 run_id: run_id.to_string(),
681 ..ReplayTraceRun::default()
682 };
683 for (index, raw_line) in input.lines().enumerate() {
684 let line_no = index + 1;
685 let line = raw_line.trim();
686 if line.is_empty() {
687 continue;
688 }
689 let value: JsonValue = serde_json::from_str(line).map_err(|error| {
690 ReplayBenchmarkError::Adapter(format!(
691 "invalid {} JSONL line {line_no}: {error}",
692 OPENCODE_JSONL_ADAPTER_ID
693 ))
694 })?;
695 let object = value.as_object().ok_or_else(|| {
696 ReplayBenchmarkError::Adapter(format!(
697 "{} JSONL line {line_no} must be an object",
698 OPENCODE_JSONL_ADAPTER_ID
699 ))
700 })?;
701 let event_type = object
702 .get("type")
703 .or_else(|| object.get("event"))
704 .and_then(JsonValue::as_str)
705 .unwrap_or("event");
706 match event_type {
707 "message" | "session.message" => {
708 run.agent_transcript_deltas
709 .push(adapt_opencode_message(object, line_no));
710 }
711 "tool_call" | "tool" | "session.tool_call" => {
712 let (protocol, receipt) = adapt_opencode_tool_call(object, line_no)?;
713 run.protocol_interactions.push(protocol);
714 run.effect_receipts.push(receipt);
715 }
716 "permission" | "permission_decision" | "session.permission" => {
717 let (approval, policy) = adapt_opencode_permission(object, line_no);
718 run.approval_interactions.push(approval);
719 run.policy_decisions.push(policy);
720 }
721 "llm" | "model" | "session.llm" => {
722 run.llm_interactions
723 .push(adapt_opencode_llm(object, line_no));
724 }
725 _ => run
726 .event_log_entries
727 .push(adapt_opencode_event(object, event_type, line_no)),
728 }
729 }
730 if trace_material_count(&run.counts()) == 0 {
731 return Err(ReplayBenchmarkError::Adapter(format!(
732 "{} input contained no adaptable events",
733 OPENCODE_JSONL_ADAPTER_ID
734 )));
735 }
736 Ok(run)
737}
738
739fn adapt_opencode_message(
740 object: &serde_json::Map<String, JsonValue>,
741 line_no: usize,
742) -> JsonValue {
743 let content = object.get("content").cloned().unwrap_or(JsonValue::Null);
744 json!({
745 "delta_id": object_string(object, "id").unwrap_or_else(|| format!("message-{line_no}")),
746 "agent": object_string(object, "agent").unwrap_or_else(|| "opencode".to_string()),
747 "role": object_string(object, "role").unwrap_or_else(|| "assistant".to_string()),
748 "content_sha256": sha256_text(&content.to_string()),
749 })
750}
751
752fn adapt_opencode_tool_call(
753 object: &serde_json::Map<String, JsonValue>,
754 line_no: usize,
755) -> Result<(JsonValue, JsonValue), ReplayBenchmarkError> {
756 let tool = object_string(object, "tool")
757 .or_else(|| object_string(object, "name"))
758 .unwrap_or_else(|| "unknown_tool".to_string());
759 let arguments = object
760 .get("arguments")
761 .or_else(|| object.get("args"))
762 .cloned()
763 .unwrap_or_else(|| json!({}));
764 let result = object
765 .get("result")
766 .or_else(|| object.get("output"))
767 .cloned()
768 .unwrap_or(JsonValue::Null);
769 let status = object_string(object, "status").unwrap_or_else(|| "completed".to_string());
770 let arguments_sha256 = sha256_value(&arguments)?;
771 let result_sha256 = sha256_value(&result)?;
772 Ok((
773 json!({
774 "protocol": "opencode",
775 "boundary": "tool_call",
776 "tool": tool,
777 "call_id": object_string(object, "id").unwrap_or_else(|| format!("tool-{line_no}")),
778 "arguments_sha256": arguments_sha256,
779 "status": status,
780 "result_sha256": result_sha256,
781 }),
782 json!({
783 "receipt_id": object_string(object, "receipt_id").unwrap_or_else(|| format!("tool-receipt-{line_no}")),
784 "kind": "tool_call",
785 "tool": tool,
786 "status": status,
787 "arguments_sha256": arguments_sha256,
788 "result_sha256": result_sha256,
789 }),
790 ))
791}
792
793fn adapt_opencode_permission(
794 object: &serde_json::Map<String, JsonValue>,
795 line_no: usize,
796) -> (JsonValue, JsonValue) {
797 let action = object_string(object, "action").unwrap_or_else(|| "unknown".to_string());
798 let decision = object_string(object, "decision")
799 .or_else(|| object_string(object, "response"))
800 .unwrap_or_else(|| "unknown".to_string());
801 (
802 json!({
803 "request_id": object_string(object, "id").unwrap_or_else(|| format!("permission-{line_no}")),
804 "principal": object_string(object, "principal").unwrap_or_else(|| "agent".to_string()),
805 "action": action,
806 "response": decision,
807 "reviewer": object.get("reviewer").cloned().unwrap_or(JsonValue::Null),
808 }),
809 json!({
810 "decision_id": object_string(object, "decision_id").unwrap_or_else(|| format!("policy-{line_no}")),
811 "capability": object_string(object, "capability").unwrap_or(action),
812 "decision": decision,
813 "approval_required": true,
814 }),
815 )
816}
817
818fn adapt_opencode_llm(object: &serde_json::Map<String, JsonValue>, line_no: usize) -> JsonValue {
819 let input_tokens = object
820 .get("input_tokens")
821 .and_then(JsonValue::as_u64)
822 .or_else(|| {
823 object
824 .get("usage")
825 .and_then(|usage| usage.get("input_tokens"))
826 .and_then(JsonValue::as_u64)
827 })
828 .unwrap_or_default();
829 let output_tokens = object
830 .get("output_tokens")
831 .and_then(JsonValue::as_u64)
832 .or_else(|| {
833 object
834 .get("usage")
835 .and_then(|usage| usage.get("output_tokens"))
836 .and_then(JsonValue::as_u64)
837 })
838 .unwrap_or_default();
839 let messages_sha256 = object
840 .get("messages")
841 .map(|value| sha256_text(&value.to_string()))
842 .unwrap_or_else(|| sha256_text(""));
843 let response_sha256 = object
844 .get("response")
845 .map(|value| sha256_text(&value.to_string()))
846 .unwrap_or_else(|| sha256_text(""));
847 json!({
848 "request_id": object_string(object, "id").unwrap_or_else(|| format!("llm-{line_no}")),
849 "provider": object_string(object, "provider").unwrap_or_else(|| "opencode".to_string()),
850 "model": object_string(object, "model").unwrap_or_else(|| "unknown".to_string()),
851 "messages_sha256": messages_sha256,
852 "response_sha256": response_sha256,
853 "usage": {
854 "input_tokens": input_tokens,
855 "output_tokens": output_tokens,
856 },
857 })
858}
859
860fn adapt_opencode_event(
861 object: &serde_json::Map<String, JsonValue>,
862 event_type: &str,
863 line_no: usize,
864) -> JsonValue {
865 json!({
866 "event_id": line_no,
867 "topic": object_string(object, "topic").unwrap_or_else(|| "opencode.session".to_string()),
868 "kind": event_type,
869 "payload": object.get("payload").cloned().unwrap_or_else(|| JsonValue::Object(object.clone())),
870 })
871}
872
873fn object_string(object: &serde_json::Map<String, JsonValue>, key: &str) -> Option<String> {
874 object
875 .get(key)
876 .and_then(JsonValue::as_str)
877 .map(str::to_string)
878}
879
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a trace with one protocol interaction and one policy decision per run.
    /// The runs differ only in `run_id` (allowlisted) and in the interaction's
    /// `status`, taken from `status.0` for the first run and `status.1` for the second.
    fn trace_pair(status: (&str, &str)) -> ReplayOracleTrace {
        ReplayOracleTrace {
            schema_version: REPLAY_TRACE_SCHEMA_VERSION.to_string(),
            name: "simple_tool_run".to_string(),
            description: Some("golden replay benchmark fixture".to_string()),
            expect: ReplayExpectation::Match,
            allowlist: vec![ReplayAllowlistRule {
                path: "/run_id".to_string(),
                reason: "run ids are allocated per execution".to_string(),
                replacement: None,
            }],
            first_run: ReplayTraceRun {
                run_id: "first".to_string(),
                protocol_interactions: vec![json!({
                    "protocol": "mcp",
                    "boundary": "tools/call",
                    "tool": "read_file",
                    "status": status.0,
                })],
                policy_decisions: vec![json!({
                    "capability": "fs.read",
                    "decision": "allow",
                })],
                ..ReplayTraceRun::default()
            },
            second_run: ReplayTraceRun {
                run_id: "second".to_string(),
                protocol_interactions: vec![json!({
                    "protocol": "mcp",
                    "boundary": "tools/call",
                    "tool": "read_file",
                    "status": status.1,
                })],
                policy_decisions: vec![json!({
                    "capability": "fs.read",
                    "decision": "allow",
                })],
                ..ReplayTraceRun::default()
            },
            protocol_fixture_refs: Vec::new(),
        }
    }

    // Identical runs: every score is perfect and the receipt digest is prefixed.
    #[test]
    fn replay_benchmark_reports_stable_golden_metrics_for_matching_trace() {
        let fixture =
            benchmark_replay_trace("benchmarks/replay/simple.json", &trace_pair(("ok", "ok")))
                .expect("benchmark fixture");

        assert!(fixture.passed);
        assert!(fixture.deterministic);
        assert_eq!(fixture.metrics.determinism_score, 1.0);
        assert_eq!(fixture.metrics.replay_fidelity_score, 1.0);
        assert_eq!(fixture.metrics.permission_decision_preservation_score, 1.0);
        assert_eq!(fixture.metrics.tool_call_drift_count, 0);
        assert!(fixture
            .receipt
            .benchmark_receipt_sha256
            .starts_with("sha256:"));
    }

    // One of the two compared sections drifts: fidelity is 0.5 and the triage
    // estimate is 1 + depth 3 + 1 drifted section = 5.
    #[test]
    fn replay_benchmark_reports_reduced_fidelity_for_meaningful_drift() {
        let fixture =
            benchmark_replay_trace("benchmarks/replay/drift.json", &trace_pair(("ok", "error")))
                .expect("benchmark fixture");

        assert!(!fixture.passed);
        assert!(!fixture.deterministic);
        assert_eq!(fixture.metrics.determinism_score, 0.0);
        assert_eq!(fixture.metrics.replay_fidelity_score, 0.5);
        assert_eq!(fixture.metrics.tool_call_drift_count, 1);
        assert_eq!(
            fixture
                .metrics
                .debugging_time_to_root_cause_proxy
                .first_divergence_path
                .as_deref(),
            Some("/protocol_interactions/0/status")
        );
        assert_eq!(
            fixture
                .metrics
                .debugging_time_to_root_cause_proxy
                .first_divergence_depth,
            3
        );
        assert_eq!(
            fixture
                .metrics
                .debugging_time_to_root_cause_proxy
                .estimated_triage_steps,
            5
        );
    }

    // Benchmarking the same trace twice must serialize to identical JSON.
    #[test]
    fn replay_benchmark_summary_is_stable_across_repeated_runs() {
        let first = benchmark_replay_trace("fixture.json", &trace_pair(("ok", "ok")))
            .expect("first benchmark");
        let second = benchmark_replay_trace("fixture.json", &trace_pair(("ok", "ok")))
            .expect("second benchmark");

        let first_json = serde_json::to_string(&first).expect("serialize first");
        let second_json = serde_json::to_string(&second).expect("serialize second");
        assert_eq!(first_json, second_json);
    }

    // Each supported line kind lands in its section(s); token usage is read from `usage`.
    #[test]
    fn opencode_jsonl_adapter_maps_messages_tools_permissions_and_llm_usage() {
        let input = concat!(
            "{\"type\":\"message\",\"id\":\"m1\",\"role\":\"assistant\",\"content\":\"done\"}\n",
            "{\"type\":\"tool_call\",\"id\":\"t1\",\"tool\":\"write_file\",\"arguments\":{\"path\":\"notes.md\"},\"result\":{\"ok\":true}}\n",
            "{\"type\":\"permission\",\"id\":\"p1\",\"action\":\"write_file\",\"decision\":\"approved\"}\n",
            "{\"type\":\"llm\",\"id\":\"l1\",\"model\":\"qwen\",\"usage\":{\"input_tokens\":7,\"output_tokens\":3}}\n"
        );

        let run = OpenCodeJsonlAdapter
            .adapt_run(input, "opencode-run")
            .expect("adapt opencode jsonl");

        assert_eq!(run.run_id, "opencode-run");
        assert_eq!(run.agent_transcript_deltas.len(), 1);
        assert_eq!(run.protocol_interactions.len(), 1);
        assert_eq!(run.effect_receipts.len(), 1);
        assert_eq!(run.approval_interactions.len(), 1);
        assert_eq!(run.policy_decisions.len(), 1);
        assert_eq!(run.llm_interactions.len(), 1);
        assert_eq!(token_total(&run, "input_tokens"), 7);
        assert_eq!(token_total(&run, "output_tokens"), 3);
    }

    // Two identical external inputs benchmark as a passing, drift-free fixture.
    #[test]
    fn adapted_trace_pair_can_be_benchmarked() {
        let first = "{\"type\":\"tool_call\",\"tool\":\"read_file\",\"result\":{\"ok\":true}}\n";
        let second = "{\"type\":\"tool_call\",\"tool\":\"read_file\",\"result\":{\"ok\":true}}\n";

        let fixture = benchmark_adapted_replay_pair(
            &OpenCodeJsonlAdapter,
            "external-tool-run",
            first,
            second,
        )
        .expect("benchmark adapted pair");

        assert!(fixture.passed);
        assert_eq!(fixture.name, "external-tool-run");
        assert_eq!(fixture.metrics.tool_call_drift_count, 0);
    }
}