1use std::collections::{BTreeMap, BTreeSet};
2use std::fmt;
3
4use serde::{Deserialize, Serialize};
5use serde_json::{json, Value as JsonValue};
6use sha2::{Digest, Sha256};
7
8use super::{
9 canonicalize_run, run_replay_oracle_trace, ReplayAllowlistRule, ReplayDivergence,
10 ReplayExpectation, ReplayOracleError, ReplayOracleReport, ReplayOracleTrace, ReplayTraceRun,
11 ReplayTraceRunCounts, REPLAY_TRACE_SCHEMA_VERSION,
12};
13
/// Schema version string stamped on every `ReplayBenchmarkReport` and fixture receipt.
pub const REPLAY_BENCHMARK_REPORT_SCHEMA_VERSION: &str = "harn.replay_benchmark.report.v1";
/// Ingest kind written into the report's `cloud_ingest` block and into each fixture receipt.
pub const REPLAY_BENCHMARK_CLOUD_INGEST_KIND: &str = "harn_cloud.replay_determinism.leaderboard.v1";
/// Identifier returned by `OpenCodeJsonlAdapter::adapter_id` and used in adapter error messages.
pub const OPENCODE_JSONL_ADAPTER_ID: &str = "opencode-jsonl";
/// Input schema version returned by `OpenCodeJsonlAdapter::input_schema_version`.
pub const OPENCODE_JSONL_ADAPTER_SCHEMA_VERSION: &str =
    "harn.replay_benchmark.adapter.opencode_jsonl.v1";
19
/// Trace sections that `category_scores` evaluates, one category metric per entry.
///
/// NOTE(review): `counts_by_section` and `trace_material_count` also handle
/// `lifecycle_receipts`, which is not listed here, so that section never receives a
/// category score. Confirm the exclusion is intentional.
const REPLAY_TRACE_SECTIONS: [&str; 11] = [
    "event_log_entries",
    "trigger_firings",
    "llm_interactions",
    "protocol_interactions",
    "approval_interactions",
    "effect_receipts",
    "persona_runtime_states",
    "agent_transcript_deltas",
    "final_artifacts",
    "policy_decisions",
    "channel_receipts",
];
34
/// Sections whose drift counts are summed into `tool_call_drift_count` by `replay_metrics`.
const TOOL_DRIFT_SECTIONS: [&str; 3] = [
    "llm_interactions",
    "protocol_interactions",
    "effect_receipts",
];
40
/// Sections scored together as `permission_decision_preservation_score` by `replay_metrics`.
const PERMISSION_SECTIONS: [&str; 2] = ["approval_interactions", "policy_decisions"];
42
/// Top-level benchmark report assembled by `build_replay_benchmark_report`.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct ReplayBenchmarkReport {
    /// Set to `REPLAY_BENCHMARK_REPORT_SCHEMA_VERSION` by the builder.
    pub schema_version: String,
    /// Ingest metadata (kind, leaderboard key, schema versions, artifact contract).
    pub cloud_ingest: ReplayBenchmarkCloudIngest,
    /// Suite name, fixture count, and the source paths supplied by the caller.
    pub suite: ReplayBenchmarkSuiteIdentity,
    /// Aggregates computed over `fixtures`.
    pub summary: ReplayBenchmarkSummary,
    /// Per-fixture reports, in the order supplied by the caller.
    pub fixtures: Vec<ReplayBenchmarkFixtureReport>,
}
51
/// Ingest metadata embedded in every report.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct ReplayBenchmarkCloudIngest {
    /// Set to `REPLAY_BENCHMARK_CLOUD_INGEST_KIND` by the builder.
    pub kind: String,
    /// Leaderboard identifier; the builder writes `"replay-determinism"`.
    pub leaderboard_key: String,
    /// Set to `REPLAY_BENCHMARK_REPORT_SCHEMA_VERSION` by the builder.
    pub report_schema_version: String,
    /// Set to `REPLAY_TRACE_SCHEMA_VERSION` by the builder.
    pub replay_trace_schema_version: String,
    /// Free-text statement of which report fields are stable ingest inputs.
    pub artifact_contract: String,
}
60
/// Identifies the benchmark suite a report was built from.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct ReplayBenchmarkSuiteIdentity {
    /// Suite name supplied by the caller.
    pub name: String,
    /// Number of fixture reports included in the report.
    pub fixture_count: usize,
    /// Source paths supplied by the caller, stored unchanged.
    pub source_paths: Vec<String>,
}
67
/// Suite-level aggregates produced by `summarize_replay_benchmark`.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct ReplayBenchmarkSummary {
    /// Number of fixtures whose `passed` flag is set.
    pub passed: usize,
    /// Fixture count minus `passed`.
    pub failed: usize,
    /// Number of fixtures whose `deterministic` flag is set.
    pub deterministic_fixtures: usize,
    /// Fixture count minus `deterministic_fixtures`.
    pub drifted_fixtures: usize,
    /// Arithmetic mean of per-fixture `replay_fidelity_score` (0.0 when there are no fixtures).
    pub mean_replay_fidelity_score: f64,
    /// Arithmetic mean of per-fixture `permission_decision_preservation_score`
    /// (0.0 when there are no fixtures).
    pub mean_permission_decision_preservation_score: f64,
    /// Sum of per-fixture `tool_call_drift_count`.
    pub tool_call_drift_count: usize,
    /// Sum of per-fixture `transcript_drift_count`.
    pub transcript_drift_count: usize,
    /// Sum of per-fixture `runtime_cost.observed_interactions`.
    pub observed_interactions: usize,
    /// Sum of per-fixture `runtime_cost.llm_input_tokens`.
    pub llm_input_tokens: u64,
    /// Sum of per-fixture `runtime_cost.llm_output_tokens`.
    pub llm_output_tokens: u64,
}
82
/// Benchmark result for a single replay trace fixture.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct ReplayBenchmarkFixtureReport {
    /// Caller-supplied path or identifier for the fixture.
    pub path: String,
    /// Fixture name as reported by the replay oracle.
    pub name: String,
    /// Description copied from the trace, if any.
    pub description: Option<String>,
    /// Expectation reported by the replay oracle.
    pub expectation: ReplayExpectation,
    /// Pass/fail verdict reported by the replay oracle.
    pub passed: bool,
    /// True when the oracle reported no divergence between the two runs.
    pub deterministic: bool,
    /// Per-section counts of the first run, as reported by the oracle.
    pub first_run_counts: ReplayTraceRunCounts,
    /// Per-section counts of the second run, as reported by the oracle.
    pub second_run_counts: ReplayTraceRunCounts,
    /// Scores and cost metrics derived for this fixture.
    pub metrics: ReplayBenchmarkMetrics,
    /// The divergence reported by the oracle, if any.
    pub first_divergence: Option<ReplayDivergence>,
    /// Hash receipt binding the canonical runs and metrics together.
    pub receipt: ReplayBenchmarkFixtureReceipt,
}
97
/// Per-fixture metrics computed by `replay_metrics`.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct ReplayBenchmarkMetrics {
    /// 1.0 when the oracle found no divergence, otherwise 0.0.
    pub determinism_score: f64,
    /// Fraction of compared categories that matched (0.0 when no category was compared).
    pub replay_fidelity_score: f64,
    /// Fraction of compared permission sections (`approval_interactions`,
    /// `policy_decisions`) that matched (1.0 when neither was compared).
    pub permission_decision_preservation_score: f64,
    /// Summed drift count over `llm_interactions`, `protocol_interactions`, `effect_receipts`.
    pub tool_call_drift_count: usize,
    /// Drift count of the `agent_transcript_deltas` section.
    pub transcript_drift_count: usize,
    /// Combined counts, token totals, and optional cost across both runs.
    pub runtime_cost: ReplayRuntimeCostMetrics,
    /// Heuristic triage-effort proxy derived from the first divergence and drifted categories.
    pub debugging_time_to_root_cause_proxy: ReplayDebuggingProxyMetrics,
    /// Category metric keyed by trace section name.
    pub category_scores: BTreeMap<String, ReplayCategoryMetric>,
}
109
/// Comparison outcome for one trace section, as produced by `category_scores`.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct ReplayCategoryMetric {
    /// True when at least one of the two runs has entries in this section.
    pub compared: bool,
    /// True when `drift_count` is zero (also true for sections that were not compared).
    pub matched: bool,
    /// Number of drifting items found by `drift_count`; 0 when not compared.
    pub drift_count: usize,
    /// Entry count for this section in the first run.
    pub first_run_count: usize,
    /// Entry count for this section in the second run.
    pub second_run_count: usize,
}
118
/// Section counts, token totals, and optional cost summed over both runs of a fixture
/// (see `runtime_cost_metrics`).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
pub struct ReplayRuntimeCostMetrics {
    /// Total trace material across both runs (sum of every section count).
    pub observed_interactions: usize,
    pub event_log_entries: usize,
    pub trigger_firings: usize,
    pub llm_interactions: usize,
    pub protocol_interactions: usize,
    pub approval_interactions: usize,
    pub effect_receipts: usize,
    pub persona_runtime_states: usize,
    pub agent_transcript_deltas: usize,
    pub final_artifacts: usize,
    pub policy_decisions: usize,
    // `serde(default)`: a missing field deserializes as 0.
    #[serde(default)]
    pub channel_receipts: usize,
    // `serde(default)`: a missing field deserializes as 0.
    #[serde(default)]
    pub lifecycle_receipts: usize,
    /// Sum of `input_tokens` found on the LLM interactions of both runs.
    pub llm_input_tokens: u64,
    /// Sum of `output_tokens` found on the LLM interactions of both runs.
    pub llm_output_tokens: u64,
    /// Summed `cost_usd`, present only when at least one interaction reported a cost;
    /// omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub observed_cost_usd: Option<f64>,
}
142
/// Heuristic proxy for debugging effort, produced by `debugging_proxy_metrics`.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct ReplayDebuggingProxyMetrics {
    /// Name of the heuristic; written as `"first_divergence_depth_plus_drift_surfaces"`.
    pub proxy_kind: String,
    /// Path of the divergence reported by the oracle, if any.
    pub first_divergence_path: Option<String>,
    /// Segment count of `first_divergence_path` (0 when there is no divergence).
    pub first_divergence_depth: usize,
    /// Number of categories that were compared and did not match.
    pub drift_surface_count: usize,
    /// 0 when no category drifted, otherwise `1 + first_divergence_depth + drift_surface_count`.
    pub estimated_triage_steps: usize,
}
151
/// Hash receipt for one fixture, produced by `fixture_receipt`.
///
/// All hashes are `"sha256:"`-prefixed hex digests of serialized JSON.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct ReplayBenchmarkFixtureReceipt {
    /// Set to `REPLAY_BENCHMARK_CLOUD_INGEST_KIND`.
    pub ingest_kind: String,
    /// Set to `REPLAY_BENCHMARK_REPORT_SCHEMA_VERSION`.
    pub report_schema_version: String,
    /// Set to `REPLAY_TRACE_SCHEMA_VERSION`.
    pub replay_trace_schema_version: String,
    /// Digest of the canonicalized first run.
    pub canonical_first_sha256: String,
    /// Digest of the canonicalized second run.
    pub canonical_second_sha256: String,
    /// Digest over the ingest kind, schema versions, fixture name and path, both canonical
    /// digests, and the fixture metrics.
    pub benchmark_receipt_sha256: String,
}
161
/// Errors surfaced by the replay benchmark entry points and adapters.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ReplayBenchmarkError {
    /// A failure propagated from the replay oracle.
    Oracle(ReplayOracleError),
    /// An external trace could not be adapted; carries a human-readable message.
    Adapter(String),
    /// JSON serialization failed while hashing; carries the serializer's message.
    Serialization(String),
}
168
169impl fmt::Display for ReplayBenchmarkError {
170 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
171 match self {
172 Self::Oracle(error) => error.fmt(f),
173 Self::Adapter(message) | Self::Serialization(message) => message.fmt(f),
174 }
175 }
176}
177
// Marker impl: relies on the `Debug` and `Display` impls; no `source()` override is provided.
impl std::error::Error for ReplayBenchmarkError {}
179
180impl From<ReplayOracleError> for ReplayBenchmarkError {
181 fn from(error: ReplayOracleError) -> Self {
182 Self::Oracle(error)
183 }
184}
185
/// Converts an external trace format into a `ReplayTraceRun` so it can be benchmarked.
pub trait ReplayTraceAdapter {
    /// Stable identifier of the adapter; used in fixture paths and descriptions.
    fn adapter_id(&self) -> &'static str;
    /// Schema version of the input format the adapter understands.
    fn input_schema_version(&self) -> &'static str;
    /// Parses `input` into a run labelled with `run_id`.
    ///
    /// # Errors
    /// Returns a `ReplayBenchmarkError` when the input cannot be adapted.
    fn adapt_run(&self, input: &str, run_id: &str) -> Result<ReplayTraceRun, ReplayBenchmarkError>;
}
191
/// Stateless adapter for line-delimited JSON event logs (see `adapt_opencode_jsonl`).
#[derive(Clone, Copy, Debug, Default)]
pub struct OpenCodeJsonlAdapter;
194
impl ReplayTraceAdapter for OpenCodeJsonlAdapter {
    /// Returns `OPENCODE_JSONL_ADAPTER_ID`.
    fn adapter_id(&self) -> &'static str {
        OPENCODE_JSONL_ADAPTER_ID
    }

    /// Returns `OPENCODE_JSONL_ADAPTER_SCHEMA_VERSION`.
    fn input_schema_version(&self) -> &'static str {
        OPENCODE_JSONL_ADAPTER_SCHEMA_VERSION
    }

    /// Delegates to `adapt_opencode_jsonl`, which parses one JSON object per line.
    fn adapt_run(&self, input: &str, run_id: &str) -> Result<ReplayTraceRun, ReplayBenchmarkError> {
        adapt_opencode_jsonl(input, run_id)
    }
}
208
209pub fn benchmark_replay_trace(
210 path: impl Into<String>,
211 trace: &ReplayOracleTrace,
212) -> Result<ReplayBenchmarkFixtureReport, ReplayBenchmarkError> {
213 let path = path.into();
214 let oracle = run_replay_oracle_trace(trace)?;
215 benchmark_replay_trace_from_oracle(path, trace, oracle)
216}
217
218pub fn benchmark_adapted_replay_pair(
219 adapter: &dyn ReplayTraceAdapter,
220 name: impl Into<String>,
221 first_input: &str,
222 second_input: &str,
223) -> Result<ReplayBenchmarkFixtureReport, ReplayBenchmarkError> {
224 let name = name.into();
225 let trace = ReplayOracleTrace {
226 schema_version: REPLAY_TRACE_SCHEMA_VERSION.to_string(),
227 name: name.clone(),
228 description: Some(format!(
229 "External replay trace pair adapted with {} ({})",
230 adapter.adapter_id(),
231 adapter.input_schema_version()
232 )),
233 expect: ReplayExpectation::Match,
234 allowlist: vec![ReplayAllowlistRule {
235 path: "/run_id".to_string(),
236 reason: "external trace runs are imported as separate executions".to_string(),
237 replacement: None,
238 }],
239 first_run: adapter.adapt_run(first_input, "adapted_first_run")?,
240 second_run: adapter.adapt_run(second_input, "adapted_second_run")?,
241 protocol_fixture_refs: Vec::new(),
242 };
243 benchmark_replay_trace(format!("adapter:{}:{name}", adapter.adapter_id()), &trace)
244}
245
246pub fn build_replay_benchmark_report(
247 suite_name: impl Into<String>,
248 source_paths: Vec<String>,
249 fixtures: Vec<ReplayBenchmarkFixtureReport>,
250) -> ReplayBenchmarkReport {
251 let suite_name = suite_name.into();
252 let summary = summarize_replay_benchmark(&fixtures);
253 ReplayBenchmarkReport {
254 schema_version: REPLAY_BENCHMARK_REPORT_SCHEMA_VERSION.to_string(),
255 cloud_ingest: ReplayBenchmarkCloudIngest {
256 kind: REPLAY_BENCHMARK_CLOUD_INGEST_KIND.to_string(),
257 leaderboard_key: "replay-determinism".to_string(),
258 report_schema_version: REPLAY_BENCHMARK_REPORT_SCHEMA_VERSION.to_string(),
259 replay_trace_schema_version: REPLAY_TRACE_SCHEMA_VERSION.to_string(),
260 artifact_contract:
261 "fixtures[].receipt + fixtures[].metrics are stable Cloud leaderboard inputs"
262 .to_string(),
263 },
264 suite: ReplayBenchmarkSuiteIdentity {
265 name: suite_name,
266 fixture_count: fixtures.len(),
267 source_paths,
268 },
269 summary,
270 fixtures,
271 }
272}
273
274fn benchmark_replay_trace_from_oracle(
275 path: String,
276 trace: &ReplayOracleTrace,
277 oracle: ReplayOracleReport,
278) -> Result<ReplayBenchmarkFixtureReport, ReplayBenchmarkError> {
279 let first = canonicalize_run(&trace.first_run, &trace.allowlist)?;
280 let second = canonicalize_run(&trace.second_run, &trace.allowlist)?;
281 let category_scores = category_scores(&first, &second, &oracle);
282 let metrics = replay_metrics(trace, &oracle, category_scores)?;
283 let canonical_first_sha256 = sha256_json(&first)?;
284 let canonical_second_sha256 = sha256_json(&second)?;
285 let receipt = fixture_receipt(
286 &trace.name,
287 &path,
288 &metrics,
289 &canonical_first_sha256,
290 &canonical_second_sha256,
291 )?;
292
293 Ok(ReplayBenchmarkFixtureReport {
294 path,
295 name: oracle.name,
296 description: trace.description.clone(),
297 expectation: oracle.expectation,
298 passed: oracle.passed,
299 deterministic: oracle.divergence.is_none(),
300 first_run_counts: oracle.first_run_counts,
301 second_run_counts: oracle.second_run_counts,
302 metrics,
303 first_divergence: oracle.divergence,
304 receipt,
305 })
306}
307
308fn summarize_replay_benchmark(fixtures: &[ReplayBenchmarkFixtureReport]) -> ReplayBenchmarkSummary {
309 let fixture_count = fixtures.len();
310 let passed = fixtures.iter().filter(|fixture| fixture.passed).count();
311 let deterministic_fixtures = fixtures
312 .iter()
313 .filter(|fixture| fixture.deterministic)
314 .count();
315 let runtime = fixtures
316 .iter()
317 .fold(ReplayRuntimeCostMetrics::default(), |mut acc, fixture| {
318 let runtime = &fixture.metrics.runtime_cost;
319 acc.observed_interactions += runtime.observed_interactions;
320 acc.event_log_entries += runtime.event_log_entries;
321 acc.trigger_firings += runtime.trigger_firings;
322 acc.llm_interactions += runtime.llm_interactions;
323 acc.protocol_interactions += runtime.protocol_interactions;
324 acc.approval_interactions += runtime.approval_interactions;
325 acc.effect_receipts += runtime.effect_receipts;
326 acc.persona_runtime_states += runtime.persona_runtime_states;
327 acc.agent_transcript_deltas += runtime.agent_transcript_deltas;
328 acc.final_artifacts += runtime.final_artifacts;
329 acc.policy_decisions += runtime.policy_decisions;
330 acc.channel_receipts += runtime.channel_receipts;
331 acc.llm_input_tokens += runtime.llm_input_tokens;
332 acc.llm_output_tokens += runtime.llm_output_tokens;
333 acc.observed_cost_usd =
334 sum_optional_cost(acc.observed_cost_usd, runtime.observed_cost_usd);
335 acc
336 });
337 ReplayBenchmarkSummary {
338 passed,
339 failed: fixture_count.saturating_sub(passed),
340 deterministic_fixtures,
341 drifted_fixtures: fixture_count.saturating_sub(deterministic_fixtures),
342 mean_replay_fidelity_score: average_metric(fixtures, |fixture| {
343 fixture.metrics.replay_fidelity_score
344 }),
345 mean_permission_decision_preservation_score: average_metric(fixtures, |fixture| {
346 fixture.metrics.permission_decision_preservation_score
347 }),
348 tool_call_drift_count: fixtures
349 .iter()
350 .map(|fixture| fixture.metrics.tool_call_drift_count)
351 .sum(),
352 transcript_drift_count: fixtures
353 .iter()
354 .map(|fixture| fixture.metrics.transcript_drift_count)
355 .sum(),
356 observed_interactions: runtime.observed_interactions,
357 llm_input_tokens: runtime.llm_input_tokens,
358 llm_output_tokens: runtime.llm_output_tokens,
359 }
360}
361
362fn replay_metrics(
363 trace: &ReplayOracleTrace,
364 oracle: &ReplayOracleReport,
365 category_scores: BTreeMap<String, ReplayCategoryMetric>,
366) -> Result<ReplayBenchmarkMetrics, ReplayBenchmarkError> {
367 let compared_categories = category_scores
368 .values()
369 .filter(|metric| metric.compared)
370 .count();
371 let matched_categories = category_scores
372 .values()
373 .filter(|metric| metric.compared && metric.matched)
374 .count();
375 let replay_fidelity_score = if compared_categories == 0 {
376 0.0
377 } else {
378 matched_categories as f64 / compared_categories as f64
379 };
380 let permission_decision_preservation_score =
381 section_score(&category_scores, &PERMISSION_SECTIONS);
382 let tool_call_drift_count = section_drift_count(&category_scores, &TOOL_DRIFT_SECTIONS);
383 let transcript_drift_count =
384 section_drift_count(&category_scores, &["agent_transcript_deltas"]);
385 let runtime_cost = runtime_cost_metrics(&trace.first_run, &trace.second_run);
386 let debugging_time_to_root_cause_proxy =
387 debugging_proxy_metrics(oracle.divergence.as_ref(), &category_scores);
388
389 Ok(ReplayBenchmarkMetrics {
390 determinism_score: if oracle.divergence.is_none() {
391 1.0
392 } else {
393 0.0
394 },
395 replay_fidelity_score,
396 permission_decision_preservation_score,
397 tool_call_drift_count,
398 transcript_drift_count,
399 runtime_cost,
400 debugging_time_to_root_cause_proxy,
401 category_scores,
402 })
403}
404
405fn category_scores(
406 first: &JsonValue,
407 second: &JsonValue,
408 oracle: &ReplayOracleReport,
409) -> BTreeMap<String, ReplayCategoryMetric> {
410 let first_counts = counts_by_section(&oracle.first_run_counts);
411 let second_counts = counts_by_section(&oracle.second_run_counts);
412 REPLAY_TRACE_SECTIONS
413 .iter()
414 .map(|section| {
415 let first_value = first.get(*section).unwrap_or(&JsonValue::Null);
416 let second_value = second.get(*section).unwrap_or(&JsonValue::Null);
417 let first_run_count = first_counts.get(*section).copied().unwrap_or_default();
418 let second_run_count = second_counts.get(*section).copied().unwrap_or_default();
419 let compared = first_run_count > 0 || second_run_count > 0;
420 let drift_count = if compared {
421 drift_count(first_value, second_value)
422 } else {
423 0
424 };
425 (
426 (*section).to_string(),
427 ReplayCategoryMetric {
428 compared,
429 matched: drift_count == 0,
430 drift_count,
431 first_run_count,
432 second_run_count,
433 },
434 )
435 })
436 .collect()
437}
438
439fn counts_by_section(counts: &ReplayTraceRunCounts) -> BTreeMap<&'static str, usize> {
440 BTreeMap::from([
441 ("event_log_entries", counts.event_log_entries),
442 ("trigger_firings", counts.trigger_firings),
443 ("llm_interactions", counts.llm_interactions),
444 ("protocol_interactions", counts.protocol_interactions),
445 ("approval_interactions", counts.approval_interactions),
446 ("effect_receipts", counts.effect_receipts),
447 ("persona_runtime_states", counts.persona_runtime_states),
448 ("agent_transcript_deltas", counts.agent_transcript_deltas),
449 ("final_artifacts", counts.final_artifacts),
450 ("policy_decisions", counts.policy_decisions),
451 ("channel_receipts", counts.channel_receipts),
453 ("lifecycle_receipts", counts.lifecycle_receipts),
454 ])
455}
456
457fn drift_count(first: &JsonValue, second: &JsonValue) -> usize {
458 if first == second {
459 return 0;
460 }
461 match (first, second) {
462 (JsonValue::Array(first_items), JsonValue::Array(second_items)) => {
463 let shared = first_items.len().min(second_items.len());
464 let item_drifts = (0..shared)
465 .filter(|index| first_items[*index] != second_items[*index])
466 .count();
467 item_drifts + first_items.len().abs_diff(second_items.len())
468 }
469 (JsonValue::Object(first_map), JsonValue::Object(second_map)) => {
470 let keys = first_map
471 .keys()
472 .chain(second_map.keys())
473 .collect::<BTreeSet<_>>();
474 keys.into_iter()
475 .filter(|key| first_map.get(*key) != second_map.get(*key))
476 .count()
477 }
478 _ => 1,
479 }
480}
481
482fn section_score(
483 category_scores: &BTreeMap<String, ReplayCategoryMetric>,
484 sections: &[&str],
485) -> f64 {
486 let compared = sections
487 .iter()
488 .filter_map(|section| category_scores.get(*section))
489 .filter(|metric| metric.compared)
490 .collect::<Vec<_>>();
491 if compared.is_empty() {
492 return 1.0;
493 }
494 compared.iter().filter(|metric| metric.matched).count() as f64 / compared.len() as f64
495}
496
497fn section_drift_count(
498 category_scores: &BTreeMap<String, ReplayCategoryMetric>,
499 sections: &[&str],
500) -> usize {
501 sections
502 .iter()
503 .filter_map(|section| category_scores.get(*section))
504 .map(|metric| metric.drift_count)
505 .sum()
506}
507
508fn runtime_cost_metrics(
509 first_run: &ReplayTraceRun,
510 second_run: &ReplayTraceRun,
511) -> ReplayRuntimeCostMetrics {
512 let first = first_run.counts();
513 let second = second_run.counts();
514 let observed_cost_usd =
515 sum_optional_cost(cost_usd_for_run(first_run), cost_usd_for_run(second_run));
516 ReplayRuntimeCostMetrics {
517 observed_interactions: trace_material_count(&first) + trace_material_count(&second),
518 event_log_entries: first.event_log_entries + second.event_log_entries,
519 trigger_firings: first.trigger_firings + second.trigger_firings,
520 llm_interactions: first.llm_interactions + second.llm_interactions,
521 protocol_interactions: first.protocol_interactions + second.protocol_interactions,
522 approval_interactions: first.approval_interactions + second.approval_interactions,
523 effect_receipts: first.effect_receipts + second.effect_receipts,
524 persona_runtime_states: first.persona_runtime_states + second.persona_runtime_states,
525 agent_transcript_deltas: first.agent_transcript_deltas + second.agent_transcript_deltas,
526 final_artifacts: first.final_artifacts + second.final_artifacts,
527 policy_decisions: first.policy_decisions + second.policy_decisions,
528 channel_receipts: first.channel_receipts + second.channel_receipts,
529 lifecycle_receipts: first.lifecycle_receipts + second.lifecycle_receipts,
530 llm_input_tokens: token_total(first_run, "input_tokens")
531 + token_total(second_run, "input_tokens"),
532 llm_output_tokens: token_total(first_run, "output_tokens")
533 + token_total(second_run, "output_tokens"),
534 observed_cost_usd,
535 }
536}
537
538fn trace_material_count(counts: &ReplayTraceRunCounts) -> usize {
539 counts.event_log_entries
540 + counts.trigger_firings
541 + counts.llm_interactions
542 + counts.protocol_interactions
543 + counts.approval_interactions
544 + counts.effect_receipts
545 + counts.persona_runtime_states
546 + counts.agent_transcript_deltas
547 + counts.final_artifacts
548 + counts.policy_decisions
549 + counts.channel_receipts
550 + counts.lifecycle_receipts
551}
552
553fn token_total(run: &ReplayTraceRun, token_key: &str) -> u64 {
554 run.llm_interactions
555 .iter()
556 .filter_map(|interaction| {
557 interaction
558 .get(token_key)
559 .and_then(JsonValue::as_u64)
560 .or_else(|| {
561 interaction
562 .get("usage")
563 .and_then(|usage| usage.get(token_key))
564 .and_then(JsonValue::as_u64)
565 })
566 })
567 .sum()
568}
569
570fn cost_usd_for_run(run: &ReplayTraceRun) -> Option<f64> {
571 let mut seen = false;
572 let mut total = 0.0;
573 for interaction in &run.llm_interactions {
574 if let Some(cost) = interaction
575 .get("cost_usd")
576 .and_then(JsonValue::as_f64)
577 .or_else(|| {
578 interaction
579 .get("usage")
580 .and_then(|usage| usage.get("cost_usd"))
581 .and_then(JsonValue::as_f64)
582 })
583 {
584 seen = true;
585 total += cost;
586 }
587 }
588 seen.then_some(total)
589}
590
/// Adds two optional costs: the sum when both are present, the present one when only one
/// is, and `None` when neither is.
fn sum_optional_cost(left: Option<f64>, right: Option<f64>) -> Option<f64> {
    left.zip(right)
        .map(|(lhs, rhs)| lhs + rhs)
        .or(left)
        .or(right)
}
598
599fn debugging_proxy_metrics(
600 divergence: Option<&ReplayDivergence>,
601 category_scores: &BTreeMap<String, ReplayCategoryMetric>,
602) -> ReplayDebuggingProxyMetrics {
603 let first_divergence_path = divergence.map(|divergence| divergence.path.clone());
604 let first_divergence_depth = first_divergence_path
605 .as_deref()
606 .map(json_path_depth)
607 .unwrap_or_default();
608 let drift_surface_count = category_scores
609 .values()
610 .filter(|metric| metric.compared && !metric.matched)
611 .count();
612 ReplayDebuggingProxyMetrics {
613 proxy_kind: "first_divergence_depth_plus_drift_surfaces".to_string(),
614 first_divergence_path,
615 first_divergence_depth,
616 drift_surface_count,
617 estimated_triage_steps: if drift_surface_count == 0 {
618 0
619 } else {
620 1 + first_divergence_depth + drift_surface_count
621 },
622 }
623}
624
/// Counts the non-empty segments of a path after trimming surrounding whitespace.
///
/// A path starting with `/` is split on `/`. Anything else is split on `.`, and `$`
/// segments are ignored, so a bare `$` has depth 0.
fn json_path_depth(path: &str) -> usize {
    let trimmed = path.trim();
    match trimmed.strip_prefix('/') {
        Some(pointer) => pointer
            .split('/')
            .filter(|segment| !segment.is_empty())
            .count(),
        None => trimmed
            .split('.')
            .filter(|segment| !matches!(*segment, "" | "$"))
            .count(),
    }
}
640
641fn fixture_receipt(
642 name: &str,
643 path: &str,
644 metrics: &ReplayBenchmarkMetrics,
645 canonical_first_sha256: &str,
646 canonical_second_sha256: &str,
647) -> Result<ReplayBenchmarkFixtureReceipt, ReplayBenchmarkError> {
648 let receipt_material = json!({
649 "ingest_kind": REPLAY_BENCHMARK_CLOUD_INGEST_KIND,
650 "report_schema_version": REPLAY_BENCHMARK_REPORT_SCHEMA_VERSION,
651 "replay_trace_schema_version": REPLAY_TRACE_SCHEMA_VERSION,
652 "name": name,
653 "path": path,
654 "canonical_first_sha256": canonical_first_sha256,
655 "canonical_second_sha256": canonical_second_sha256,
656 "metrics": metrics,
657 });
658 Ok(ReplayBenchmarkFixtureReceipt {
659 ingest_kind: REPLAY_BENCHMARK_CLOUD_INGEST_KIND.to_string(),
660 report_schema_version: REPLAY_BENCHMARK_REPORT_SCHEMA_VERSION.to_string(),
661 replay_trace_schema_version: REPLAY_TRACE_SCHEMA_VERSION.to_string(),
662 canonical_first_sha256: canonical_first_sha256.to_string(),
663 canonical_second_sha256: canonical_second_sha256.to_string(),
664 benchmark_receipt_sha256: sha256_json(&receipt_material)?,
665 })
666}
667
668fn sha256_json(value: &JsonValue) -> Result<String, ReplayBenchmarkError> {
669 let bytes = serde_json::to_vec(value)
670 .map_err(|error| ReplayBenchmarkError::Serialization(error.to_string()))?;
671 Ok(format!("sha256:{}", hex::encode(Sha256::digest(bytes))))
672}
673
/// Alias for `sha256_json`; used by the adapter when hashing tool arguments and results.
fn sha256_value(value: &JsonValue) -> Result<String, ReplayBenchmarkError> {
    sha256_json(value)
}
677
678fn sha256_text(text: &str) -> String {
679 format!("sha256:{}", hex::encode(Sha256::digest(text.as_bytes())))
680}
681
682fn average_metric(
683 fixtures: &[ReplayBenchmarkFixtureReport],
684 metric: impl Fn(&ReplayBenchmarkFixtureReport) -> f64,
685) -> f64 {
686 if fixtures.is_empty() {
687 0.0
688 } else {
689 fixtures.iter().map(metric).sum::<f64>() / fixtures.len() as f64
690 }
691}
692
693fn adapt_opencode_jsonl(input: &str, run_id: &str) -> Result<ReplayTraceRun, ReplayBenchmarkError> {
694 let mut run = ReplayTraceRun {
695 run_id: run_id.to_string(),
696 ..ReplayTraceRun::default()
697 };
698 for (index, raw_line) in input.lines().enumerate() {
699 let line_no = index + 1;
700 let line = raw_line.trim();
701 if line.is_empty() {
702 continue;
703 }
704 let value: JsonValue = serde_json::from_str(line).map_err(|error| {
705 ReplayBenchmarkError::Adapter(format!(
706 "invalid {} JSONL line {line_no}: {error}",
707 OPENCODE_JSONL_ADAPTER_ID
708 ))
709 })?;
710 let object = value.as_object().ok_or_else(|| {
711 ReplayBenchmarkError::Adapter(format!(
712 "{} JSONL line {line_no} must be an object",
713 OPENCODE_JSONL_ADAPTER_ID
714 ))
715 })?;
716 let event_type = object
717 .get("type")
718 .or_else(|| object.get("event"))
719 .and_then(JsonValue::as_str)
720 .unwrap_or("event");
721 match event_type {
722 "message" | "session.message" => {
723 run.agent_transcript_deltas
724 .push(adapt_opencode_message(object, line_no));
725 }
726 "tool_call" | "tool" | "session.tool_call" => {
727 let (protocol, receipt) = adapt_opencode_tool_call(object, line_no)?;
728 run.protocol_interactions.push(protocol);
729 run.effect_receipts.push(receipt);
730 }
731 "permission" | "permission_decision" | "session.permission" => {
732 let (approval, policy) = adapt_opencode_permission(object, line_no);
733 run.approval_interactions.push(approval);
734 run.policy_decisions.push(policy);
735 }
736 "llm" | "model" | "session.llm" => {
737 run.llm_interactions
738 .push(adapt_opencode_llm(object, line_no));
739 }
740 _ => run
741 .event_log_entries
742 .push(adapt_opencode_event(object, event_type, line_no)),
743 }
744 }
745 if trace_material_count(&run.counts()) == 0 {
746 return Err(ReplayBenchmarkError::Adapter(format!(
747 "{} input contained no adaptable events",
748 OPENCODE_JSONL_ADAPTER_ID
749 )));
750 }
751 Ok(run)
752}
753
754fn adapt_opencode_message(
755 object: &serde_json::Map<String, JsonValue>,
756 line_no: usize,
757) -> JsonValue {
758 let content = object.get("content").cloned().unwrap_or(JsonValue::Null);
759 json!({
760 "delta_id": object_string(object, "id").unwrap_or_else(|| format!("message-{line_no}")),
761 "agent": object_string(object, "agent").unwrap_or_else(|| "opencode".to_string()),
762 "role": object_string(object, "role").unwrap_or_else(|| "assistant".to_string()),
763 "content_sha256": sha256_text(&content.to_string()),
764 })
765}
766
767fn adapt_opencode_tool_call(
768 object: &serde_json::Map<String, JsonValue>,
769 line_no: usize,
770) -> Result<(JsonValue, JsonValue), ReplayBenchmarkError> {
771 let tool = object_string(object, "tool")
772 .or_else(|| object_string(object, "name"))
773 .unwrap_or_else(|| "unknown_tool".to_string());
774 let arguments = object
775 .get("arguments")
776 .or_else(|| object.get("args"))
777 .cloned()
778 .unwrap_or_else(|| json!({}));
779 let result = object
780 .get("result")
781 .or_else(|| object.get("output"))
782 .cloned()
783 .unwrap_or(JsonValue::Null);
784 let status = object_string(object, "status").unwrap_or_else(|| "completed".to_string());
785 let arguments_sha256 = sha256_value(&arguments)?;
786 let result_sha256 = sha256_value(&result)?;
787 Ok((
788 json!({
789 "protocol": "opencode",
790 "boundary": "tool_call",
791 "tool": tool,
792 "call_id": object_string(object, "id").unwrap_or_else(|| format!("tool-{line_no}")),
793 "arguments_sha256": arguments_sha256,
794 "status": status,
795 "result_sha256": result_sha256,
796 }),
797 json!({
798 "receipt_id": object_string(object, "receipt_id").unwrap_or_else(|| format!("tool-receipt-{line_no}")),
799 "kind": "tool_call",
800 "tool": tool,
801 "status": status,
802 "arguments_sha256": arguments_sha256,
803 "result_sha256": result_sha256,
804 }),
805 ))
806}
807
808fn adapt_opencode_permission(
809 object: &serde_json::Map<String, JsonValue>,
810 line_no: usize,
811) -> (JsonValue, JsonValue) {
812 let action = object_string(object, "action").unwrap_or_else(|| "unknown".to_string());
813 let decision = object_string(object, "decision")
814 .or_else(|| object_string(object, "response"))
815 .unwrap_or_else(|| "unknown".to_string());
816 (
817 json!({
818 "request_id": object_string(object, "id").unwrap_or_else(|| format!("permission-{line_no}")),
819 "principal": object_string(object, "principal").unwrap_or_else(|| "agent".to_string()),
820 "action": action,
821 "response": decision,
822 "reviewer": object.get("reviewer").cloned().unwrap_or(JsonValue::Null),
823 }),
824 json!({
825 "decision_id": object_string(object, "decision_id").unwrap_or_else(|| format!("policy-{line_no}")),
826 "capability": object_string(object, "capability").unwrap_or(action),
827 "decision": decision,
828 "approval_required": true,
829 }),
830 )
831}
832
833fn adapt_opencode_llm(object: &serde_json::Map<String, JsonValue>, line_no: usize) -> JsonValue {
834 let input_tokens = object
835 .get("input_tokens")
836 .and_then(JsonValue::as_u64)
837 .or_else(|| {
838 object
839 .get("usage")
840 .and_then(|usage| usage.get("input_tokens"))
841 .and_then(JsonValue::as_u64)
842 })
843 .unwrap_or_default();
844 let output_tokens = object
845 .get("output_tokens")
846 .and_then(JsonValue::as_u64)
847 .or_else(|| {
848 object
849 .get("usage")
850 .and_then(|usage| usage.get("output_tokens"))
851 .and_then(JsonValue::as_u64)
852 })
853 .unwrap_or_default();
854 let messages_sha256 = object
855 .get("messages")
856 .map(|value| sha256_text(&value.to_string()))
857 .unwrap_or_else(|| sha256_text(""));
858 let response_sha256 = object
859 .get("response")
860 .map(|value| sha256_text(&value.to_string()))
861 .unwrap_or_else(|| sha256_text(""));
862 json!({
863 "request_id": object_string(object, "id").unwrap_or_else(|| format!("llm-{line_no}")),
864 "provider": object_string(object, "provider").unwrap_or_else(|| "opencode".to_string()),
865 "model": object_string(object, "model").unwrap_or_else(|| "unknown".to_string()),
866 "messages_sha256": messages_sha256,
867 "response_sha256": response_sha256,
868 "usage": {
869 "input_tokens": input_tokens,
870 "output_tokens": output_tokens,
871 },
872 })
873}
874
875fn adapt_opencode_event(
876 object: &serde_json::Map<String, JsonValue>,
877 event_type: &str,
878 line_no: usize,
879) -> JsonValue {
880 json!({
881 "event_id": line_no,
882 "topic": object_string(object, "topic").unwrap_or_else(|| "opencode.session".to_string()),
883 "kind": event_type,
884 "payload": object.get("payload").cloned().unwrap_or_else(|| JsonValue::Object(object.clone())),
885 })
886}
887
888fn object_string(object: &serde_json::Map<String, JsonValue>, key: &str) -> Option<String> {
889 object
890 .get(key)
891 .and_then(JsonValue::as_str)
892 .map(str::to_string)
893}
894
// Unit tests for the benchmark metrics and the OpenCode JSONL adapter.
#[cfg(test)]
mod tests {
    use super::*;

    // Builds a two-run trace with one protocol interaction and one policy decision per run.
    // `status.0` / `status.1` set the protocol interaction status of the first / second run,
    // so passing different values makes the protocol section drift.
    fn trace_pair(status: (&str, &str)) -> ReplayOracleTrace {
        ReplayOracleTrace {
            schema_version: REPLAY_TRACE_SCHEMA_VERSION.to_string(),
            name: "simple_tool_run".to_string(),
            description: Some("golden replay benchmark fixture".to_string()),
            expect: ReplayExpectation::Match,
            allowlist: vec![ReplayAllowlistRule {
                path: "/run_id".to_string(),
                reason: "run ids are allocated per execution".to_string(),
                replacement: None,
            }],
            first_run: ReplayTraceRun {
                run_id: "first".to_string(),
                protocol_interactions: vec![json!({
                    "protocol": "mcp",
                    "boundary": "tools/call",
                    "tool": "read_file",
                    "status": status.0,
                })],
                policy_decisions: vec![json!({
                    "capability": "fs.read",
                    "decision": "allow",
                })],
                ..ReplayTraceRun::default()
            },
            second_run: ReplayTraceRun {
                run_id: "second".to_string(),
                protocol_interactions: vec![json!({
                    "protocol": "mcp",
                    "boundary": "tools/call",
                    "tool": "read_file",
                    "status": status.1,
                })],
                policy_decisions: vec![json!({
                    "capability": "fs.read",
                    "decision": "allow",
                })],
                ..ReplayTraceRun::default()
            },
            protocol_fixture_refs: Vec::new(),
        }
    }

    // Identical runs must pass with perfect scores and a prefixed receipt digest.
    #[test]
    fn replay_benchmark_reports_stable_golden_metrics_for_matching_trace() {
        let fixture =
            benchmark_replay_trace("benchmarks/replay/simple.json", &trace_pair(("ok", "ok")))
                .expect("benchmark fixture");

        assert!(fixture.passed);
        assert!(fixture.deterministic);
        assert_eq!(fixture.metrics.determinism_score, 1.0);
        assert_eq!(fixture.metrics.replay_fidelity_score, 1.0);
        assert_eq!(fixture.metrics.permission_decision_preservation_score, 1.0);
        assert_eq!(fixture.metrics.tool_call_drift_count, 0);
        assert!(fixture
            .receipt
            .benchmark_receipt_sha256
            .starts_with("sha256:"));
    }

    // A status change drifts one of the two compared categories (fidelity 0.5) and yields
    // triage steps of 1 + depth 3 + 1 drifted surface = 5.
    #[test]
    fn replay_benchmark_reports_reduced_fidelity_for_meaningful_drift() {
        let fixture =
            benchmark_replay_trace("benchmarks/replay/drift.json", &trace_pair(("ok", "error")))
                .expect("benchmark fixture");

        assert!(!fixture.passed);
        assert!(!fixture.deterministic);
        assert_eq!(fixture.metrics.determinism_score, 0.0);
        assert_eq!(fixture.metrics.replay_fidelity_score, 0.5);
        assert_eq!(fixture.metrics.tool_call_drift_count, 1);
        assert_eq!(
            fixture
                .metrics
                .debugging_time_to_root_cause_proxy
                .first_divergence_path
                .as_deref(),
            Some("/protocol_interactions/0/status")
        );
        assert_eq!(
            fixture
                .metrics
                .debugging_time_to_root_cause_proxy
                .first_divergence_depth,
            3
        );
        assert_eq!(
            fixture
                .metrics
                .debugging_time_to_root_cause_proxy
                .estimated_triage_steps,
            5
        );
    }

    // Benchmarking the same trace twice must serialize to byte-identical JSON.
    #[test]
    fn replay_benchmark_summary_is_stable_across_repeated_runs() {
        let first = benchmark_replay_trace("fixture.json", &trace_pair(("ok", "ok")))
            .expect("first benchmark");
        let second = benchmark_replay_trace("fixture.json", &trace_pair(("ok", "ok")))
            .expect("second benchmark");

        let first_json = serde_json::to_string(&first).expect("serialize first");
        let second_json = serde_json::to_string(&second).expect("serialize second");
        assert_eq!(first_json, second_json);
    }

    // One event of each recognised kind lands in its expected section(s), and token usage
    // nested under `usage` is picked up.
    #[test]
    fn opencode_jsonl_adapter_maps_messages_tools_permissions_and_llm_usage() {
        let input = concat!(
            "{\"type\":\"message\",\"id\":\"m1\",\"role\":\"assistant\",\"content\":\"done\"}\n",
            "{\"type\":\"tool_call\",\"id\":\"t1\",\"tool\":\"write_file\",\"arguments\":{\"path\":\"notes.md\"},\"result\":{\"ok\":true}}\n",
            "{\"type\":\"permission\",\"id\":\"p1\",\"action\":\"write_file\",\"decision\":\"approved\"}\n",
            "{\"type\":\"llm\",\"id\":\"l1\",\"model\":\"qwen\",\"usage\":{\"input_tokens\":7,\"output_tokens\":3}}\n"
        );

        let run = OpenCodeJsonlAdapter
            .adapt_run(input, "opencode-run")
            .expect("adapt opencode jsonl");

        assert_eq!(run.run_id, "opencode-run");
        assert_eq!(run.agent_transcript_deltas.len(), 1);
        assert_eq!(run.protocol_interactions.len(), 1);
        assert_eq!(run.effect_receipts.len(), 1);
        assert_eq!(run.approval_interactions.len(), 1);
        assert_eq!(run.policy_decisions.len(), 1);
        assert_eq!(run.llm_interactions.len(), 1);
        assert_eq!(token_total(&run, "input_tokens"), 7);
        assert_eq!(token_total(&run, "output_tokens"), 3);
    }

    // Two identical adapted inputs must benchmark as a passing, drift-free fixture.
    #[test]
    fn adapted_trace_pair_can_be_benchmarked() {
        let first = "{\"type\":\"tool_call\",\"tool\":\"read_file\",\"result\":{\"ok\":true}}\n";
        let second = "{\"type\":\"tool_call\",\"tool\":\"read_file\",\"result\":{\"ok\":true}}\n";

        let fixture = benchmark_adapted_replay_pair(
            &OpenCodeJsonlAdapter,
            "external-tool-run",
            first,
            second,
        )
        .expect("benchmark adapted pair");

        assert!(fixture.passed);
        assert_eq!(fixture.name, "external-tool-run");
        assert_eq!(fixture.metrics.tool_call_drift_count, 0);
    }
}