1use std::collections::{BTreeMap, BTreeSet};
2use std::fmt;
3
4use serde::{Deserialize, Serialize};
5use serde_json::{json, Value as JsonValue};
6use sha2::{Digest, Sha256};
7
8use super::{
9 canonicalize_run, run_replay_oracle_trace, ReplayAllowlistRule, ReplayDivergence,
10 ReplayExpectation, ReplayOracleError, ReplayOracleReport, ReplayOracleTrace, ReplayTraceRun,
11 ReplayTraceRunCounts, REPLAY_TRACE_SCHEMA_VERSION,
12};
13
/// Schema version string written into every [`ReplayBenchmarkReport`] and fixture receipt.
pub const REPLAY_BENCHMARK_REPORT_SCHEMA_VERSION: &str = "harn.replay_benchmark.report.v1";
/// Ingest kind recorded in the cloud-ingest envelope and in each fixture receipt.
pub const REPLAY_BENCHMARK_CLOUD_INGEST_KIND: &str = "harn_cloud.replay_determinism.leaderboard.v1";
/// Adapter id returned by [`OpenCodeJsonlAdapter`]; also prefixes its error messages.
pub const OPENCODE_JSONL_ADAPTER_ID: &str = "opencode-jsonl";
/// Input schema version returned by [`OpenCodeJsonlAdapter`].
pub const OPENCODE_JSONL_ADAPTER_SCHEMA_VERSION: &str =
    "harn.replay_benchmark.adapter.opencode_jsonl.v1";
19
/// Section keys looked up on both canonicalized runs when building the
/// per-category scores (one `ReplayCategoryMetric` per entry).
const REPLAY_TRACE_SECTIONS: [&str; 11] = [
    "event_log_entries",
    "trigger_firings",
    "llm_interactions",
    "protocol_interactions",
    "approval_interactions",
    "effect_receipts",
    "persona_runtime_states",
    "agent_transcript_deltas",
    "final_artifacts",
    "policy_decisions",
    "channel_receipts",
];

/// Sections whose drift counts are summed into `tool_call_drift_count`.
const TOOL_DRIFT_SECTIONS: [&str; 3] = [
    "llm_interactions",
    "protocol_interactions",
    "effect_receipts",
];

/// Sections that feed `permission_decision_preservation_score`.
const PERMISSION_SECTIONS: [&str; 2] = ["approval_interactions", "policy_decisions"];
42
/// Suite-level benchmark report, as assembled by `build_replay_benchmark_report`.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct ReplayBenchmarkReport {
    /// Set to `REPLAY_BENCHMARK_REPORT_SCHEMA_VERSION` by the builder.
    pub schema_version: String,
    /// Envelope describing how the report is keyed for ingestion.
    pub cloud_ingest: ReplayBenchmarkCloudIngest,
    /// Name, fixture count and source paths of the suite.
    pub suite: ReplayBenchmarkSuiteIdentity,
    /// Aggregates computed over `fixtures`.
    pub summary: ReplayBenchmarkSummary,
    /// One entry per benchmarked fixture.
    pub fixtures: Vec<ReplayBenchmarkFixtureReport>,
}
51
/// Ingest envelope attached to a [`ReplayBenchmarkReport`].
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
pub struct ReplayBenchmarkCloudIngest {
    /// Ingest kind; the builder writes `REPLAY_BENCHMARK_CLOUD_INGEST_KIND`.
    pub kind: String,
    /// Leaderboard key; the builder writes `"replay-determinism"`.
    pub leaderboard_key: String,
    /// Schema version of the enclosing report.
    pub report_schema_version: String,
    /// Schema version of the replay traces that were benchmarked.
    pub replay_trace_schema_version: String,
    /// Free-text statement of which report fields are the ingest inputs.
    pub artifact_contract: String,
}
60
/// Identifies the suite a report was produced from.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
pub struct ReplayBenchmarkSuiteIdentity {
    /// Suite name supplied by the caller.
    pub name: String,
    /// Number of fixture reports in the suite (`fixtures.len()` at build time).
    pub fixture_count: usize,
    /// Source paths supplied by the caller, stored verbatim.
    pub source_paths: Vec<String>,
}
67
/// Aggregated results across every fixture in a report.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct ReplayBenchmarkSummary {
    /// Fixtures whose `passed` flag is true.
    pub passed: usize,
    /// Fixture count minus `passed`.
    pub failed: usize,
    /// Fixtures whose `deterministic` flag is true.
    pub deterministic_fixtures: usize,
    /// Fixture count minus `deterministic_fixtures`.
    pub drifted_fixtures: usize,
    /// Arithmetic mean of the fixtures' `replay_fidelity_score` (0.0 for an empty suite).
    pub mean_replay_fidelity_score: f64,
    /// Arithmetic mean of the fixtures' `permission_decision_preservation_score`
    /// (0.0 for an empty suite).
    pub mean_permission_decision_preservation_score: f64,
    /// Sum of the fixtures' `tool_call_drift_count`.
    pub tool_call_drift_count: usize,
    /// Sum of the fixtures' `transcript_drift_count`.
    pub transcript_drift_count: usize,
    /// Sum of the fixtures' `runtime_cost.observed_interactions`.
    pub observed_interactions: usize,
    /// Sum of the fixtures' `runtime_cost.llm_input_tokens`.
    pub llm_input_tokens: u64,
    /// Sum of the fixtures' `runtime_cost.llm_output_tokens`.
    pub llm_output_tokens: u64,
}
82
/// Benchmark result for a single replay trace fixture.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct ReplayBenchmarkFixtureReport {
    /// Caller-supplied path (or `adapter:<id>:<name>` for adapted pairs).
    pub path: String,
    /// Fixture name as reported by the oracle.
    pub name: String,
    /// Description copied from the trace, if any.
    pub description: Option<String>,
    /// Expectation reported by the oracle.
    pub expectation: ReplayExpectation,
    /// Pass/fail verdict reported by the oracle.
    pub passed: bool,
    /// True when the oracle reported no divergence.
    pub deterministic: bool,
    /// Per-section counts of the first run, from the oracle report.
    pub first_run_counts: ReplayTraceRunCounts,
    /// Per-section counts of the second run, from the oracle report.
    pub second_run_counts: ReplayTraceRunCounts,
    /// Derived benchmark metrics.
    pub metrics: ReplayBenchmarkMetrics,
    /// Divergence reported by the oracle, if any.
    pub first_divergence: Option<ReplayDivergence>,
    /// Hash receipt covering the canonical runs and the metrics.
    pub receipt: ReplayBenchmarkFixtureReceipt,
}
97
/// Metrics derived for one fixture by `replay_metrics`.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct ReplayBenchmarkMetrics {
    /// 1.0 when the oracle found no divergence, otherwise 0.0.
    pub determinism_score: f64,
    /// Matched compared categories divided by compared categories (0.0 when none compared).
    pub replay_fidelity_score: f64,
    /// Same ratio restricted to the permission sections (1.0 when none compared).
    pub permission_decision_preservation_score: f64,
    /// Sum of drift counts over the tool-drift sections.
    pub tool_call_drift_count: usize,
    /// Drift count of the `agent_transcript_deltas` section.
    pub transcript_drift_count: usize,
    /// Interaction, token and cost totals across both runs.
    pub runtime_cost: ReplayRuntimeCostMetrics,
    /// Heuristic triage-effort proxy derived from the first divergence.
    pub debugging_time_to_root_cause_proxy: ReplayDebuggingProxyMetrics,
    /// Per-section comparison results keyed by section name.
    pub category_scores: BTreeMap<String, ReplayCategoryMetric>,
}
109
/// Comparison result for a single trace section.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
pub struct ReplayCategoryMetric {
    /// True when either run has at least one entry in this section.
    pub compared: bool,
    /// True when `drift_count` is zero (also true for sections that were not compared).
    pub matched: bool,
    /// Number of differing entries found between the canonicalized sections.
    pub drift_count: usize,
    /// Entry count of this section in the first run.
    pub first_run_count: usize,
    /// Entry count of this section in the second run.
    pub second_run_count: usize,
}
118
/// Interaction, token and cost totals.
///
/// `runtime_cost_metrics` fills each counter with the sum over a fixture's
/// first and second run.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
pub struct ReplayRuntimeCostMetrics {
    /// Total of all per-section counts across both runs.
    pub observed_interactions: usize,
    pub event_log_entries: usize,
    pub trigger_firings: usize,
    pub llm_interactions: usize,
    pub protocol_interactions: usize,
    pub approval_interactions: usize,
    pub effect_receipts: usize,
    pub persona_runtime_states: usize,
    pub agent_transcript_deltas: usize,
    pub final_artifacts: usize,
    pub policy_decisions: usize,
    // `serde(default)`: deserializes as 0 when the field is absent from the input.
    #[serde(default)]
    pub channel_receipts: usize,
    // `serde(default)`: deserializes as 0 when the field is absent from the input.
    #[serde(default)]
    pub lifecycle_receipts: usize,
    /// Sum of `input_tokens` found on LLM interactions (top level or under `usage`).
    pub llm_input_tokens: u64,
    /// Sum of `output_tokens` found on LLM interactions (top level or under `usage`).
    pub llm_output_tokens: u64,
    /// Sum of `cost_usd` values found on LLM interactions; omitted from the
    /// serialized form when no interaction carried a cost.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub observed_cost_usd: Option<f64>,
}
142
/// Heuristic proxy for how much triage a drifted fixture needs.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
pub struct ReplayDebuggingProxyMetrics {
    /// Name of the heuristic; currently always
    /// `"first_divergence_depth_plus_drift_surfaces"`.
    pub proxy_kind: String,
    /// Path of the first divergence, if the oracle reported one.
    pub first_divergence_path: Option<String>,
    /// Number of segments in `first_divergence_path` (0 when there is none).
    pub first_divergence_depth: usize,
    /// Number of compared sections that did not match.
    pub drift_surface_count: usize,
    /// 0 when no section drifted, otherwise `1 + depth + drift_surface_count`.
    pub estimated_triage_steps: usize,
}
151
/// Hash receipt for one fixture, produced by `fixture_receipt`.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
pub struct ReplayBenchmarkFixtureReceipt {
    pub ingest_kind: String,
    pub report_schema_version: String,
    pub replay_trace_schema_version: String,
    /// `sha256:`-prefixed hex digest of the canonicalized first run.
    pub canonical_first_sha256: String,
    /// `sha256:`-prefixed hex digest of the canonicalized second run.
    pub canonical_second_sha256: String,
    /// `sha256:`-prefixed hex digest over the schema identifiers, fixture name
    /// and path, both canonical digests, and the serialized metrics.
    pub benchmark_receipt_sha256: String,
}
161
/// Errors surfaced by the replay benchmark functions in this module.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ReplayBenchmarkError {
    /// Failure propagated from the replay oracle.
    Oracle(ReplayOracleError),
    /// An adapter rejected its input; the payload is the human-readable message.
    Adapter(String),
    /// JSON serialization failed while hashing; the payload is the serde error text.
    Serialization(String),
}
168
169impl fmt::Display for ReplayBenchmarkError {
170 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
171 match self {
172 Self::Oracle(error) => error.fmt(f),
173 Self::Adapter(message) | Self::Serialization(message) => message.fmt(f),
174 }
175 }
176}
177
// Empty impl: uses the derived `Debug` and the `Display` impl in this file;
// `source()` is not overridden.
impl std::error::Error for ReplayBenchmarkError {}
179
180impl From<ReplayOracleError> for ReplayBenchmarkError {
181 fn from(error: ReplayOracleError) -> Self {
182 Self::Oracle(error)
183 }
184}
185
/// Converts an external trace format into a [`ReplayTraceRun`] so it can be
/// benchmarked by `benchmark_adapted_replay_pair`.
pub trait ReplayTraceAdapter {
    /// Identifier of the adapter; embedded in generated descriptions and fixture paths.
    fn adapter_id(&self) -> &'static str;
    /// Schema version of the input format; embedded in generated descriptions.
    fn input_schema_version(&self) -> &'static str;
    /// Parses `input` into a run labelled `run_id`.
    ///
    /// # Errors
    /// Returns a [`ReplayBenchmarkError`] when the input cannot be adapted.
    fn adapt_run(&self, input: &str, run_id: &str) -> Result<ReplayTraceRun, ReplayBenchmarkError>;
}
191
/// Stateless [`ReplayTraceAdapter`] for line-delimited JSON input; the parsing
/// itself lives in `adapt_opencode_jsonl`.
#[derive(Clone, Copy, Debug, Default)]
pub struct OpenCodeJsonlAdapter;
194
impl ReplayTraceAdapter for OpenCodeJsonlAdapter {
    /// Returns `OPENCODE_JSONL_ADAPTER_ID`.
    fn adapter_id(&self) -> &'static str {
        OPENCODE_JSONL_ADAPTER_ID
    }

    /// Returns `OPENCODE_JSONL_ADAPTER_SCHEMA_VERSION`.
    fn input_schema_version(&self) -> &'static str {
        OPENCODE_JSONL_ADAPTER_SCHEMA_VERSION
    }

    /// Delegates to `adapt_opencode_jsonl`, which does the line-by-line parsing.
    fn adapt_run(&self, input: &str, run_id: &str) -> Result<ReplayTraceRun, ReplayBenchmarkError> {
        adapt_opencode_jsonl(input, run_id)
    }
}
208
209pub fn benchmark_replay_trace(
210 path: impl Into<String>,
211 trace: &ReplayOracleTrace,
212) -> Result<ReplayBenchmarkFixtureReport, ReplayBenchmarkError> {
213 let path = path.into();
214 let oracle = run_replay_oracle_trace(trace)?;
215 benchmark_replay_trace_from_oracle(path, trace, oracle)
216}
217
218pub fn benchmark_adapted_replay_pair(
219 adapter: &dyn ReplayTraceAdapter,
220 name: impl Into<String>,
221 first_input: &str,
222 second_input: &str,
223) -> Result<ReplayBenchmarkFixtureReport, ReplayBenchmarkError> {
224 let name = name.into();
225 let trace = ReplayOracleTrace {
226 schema_version: REPLAY_TRACE_SCHEMA_VERSION.to_string(),
227 name: name.clone(),
228 description: Some(format!(
229 "External replay trace pair adapted with {} ({})",
230 adapter.adapter_id(),
231 adapter.input_schema_version()
232 )),
233 expect: ReplayExpectation::Match,
234 allowlist: vec![ReplayAllowlistRule {
235 path: "/run_id".to_string(),
236 reason: "external trace runs are imported as separate executions".to_string(),
237 replacement: None,
238 }],
239 first_run: adapter.adapt_run(first_input, "adapted_first_run")?,
240 second_run: adapter.adapt_run(second_input, "adapted_second_run")?,
241 protocol_fixture_refs: Vec::new(),
242 };
243 benchmark_replay_trace(format!("adapter:{}:{name}", adapter.adapter_id()), &trace)
244}
245
246pub fn build_replay_benchmark_report(
247 suite_name: impl Into<String>,
248 source_paths: Vec<String>,
249 fixtures: Vec<ReplayBenchmarkFixtureReport>,
250) -> ReplayBenchmarkReport {
251 let suite_name = suite_name.into();
252 let summary = summarize_replay_benchmark(&fixtures);
253 ReplayBenchmarkReport {
254 schema_version: REPLAY_BENCHMARK_REPORT_SCHEMA_VERSION.to_string(),
255 cloud_ingest: ReplayBenchmarkCloudIngest {
256 kind: REPLAY_BENCHMARK_CLOUD_INGEST_KIND.to_string(),
257 leaderboard_key: "replay-determinism".to_string(),
258 report_schema_version: REPLAY_BENCHMARK_REPORT_SCHEMA_VERSION.to_string(),
259 replay_trace_schema_version: REPLAY_TRACE_SCHEMA_VERSION.to_string(),
260 artifact_contract:
261 "fixtures[].receipt + fixtures[].metrics are stable Cloud leaderboard inputs"
262 .to_string(),
263 },
264 suite: ReplayBenchmarkSuiteIdentity {
265 name: suite_name,
266 fixture_count: fixtures.len(),
267 source_paths,
268 },
269 summary,
270 fixtures,
271 }
272}
273
274fn benchmark_replay_trace_from_oracle(
275 path: String,
276 trace: &ReplayOracleTrace,
277 oracle: ReplayOracleReport,
278) -> Result<ReplayBenchmarkFixtureReport, ReplayBenchmarkError> {
279 let first = canonicalize_run(&trace.first_run, &trace.allowlist)?;
280 let second = canonicalize_run(&trace.second_run, &trace.allowlist)?;
281 let category_scores = category_scores(&first, &second, &oracle);
282 let metrics = replay_metrics(trace, &oracle, category_scores)?;
283 let canonical_first_sha256 = sha256_json(&first)?;
284 let canonical_second_sha256 = sha256_json(&second)?;
285 let receipt = fixture_receipt(
286 &trace.name,
287 &path,
288 &metrics,
289 &canonical_first_sha256,
290 &canonical_second_sha256,
291 )?;
292
293 Ok(ReplayBenchmarkFixtureReport {
294 path,
295 name: oracle.name,
296 description: trace.description.clone(),
297 expectation: oracle.expectation,
298 passed: oracle.passed,
299 deterministic: oracle.divergence.is_none(),
300 first_run_counts: oracle.first_run_counts,
301 second_run_counts: oracle.second_run_counts,
302 metrics,
303 first_divergence: oracle.divergence,
304 receipt,
305 })
306}
307
308fn summarize_replay_benchmark(fixtures: &[ReplayBenchmarkFixtureReport]) -> ReplayBenchmarkSummary {
309 let fixture_count = fixtures.len();
310 let passed = fixtures.iter().filter(|fixture| fixture.passed).count();
311 let deterministic_fixtures = fixtures
312 .iter()
313 .filter(|fixture| fixture.deterministic)
314 .count();
315 let runtime = fixtures
316 .iter()
317 .fold(ReplayRuntimeCostMetrics::default(), |mut acc, fixture| {
318 let runtime = &fixture.metrics.runtime_cost;
319 acc.observed_interactions += runtime.observed_interactions;
320 acc.event_log_entries += runtime.event_log_entries;
321 acc.trigger_firings += runtime.trigger_firings;
322 acc.llm_interactions += runtime.llm_interactions;
323 acc.protocol_interactions += runtime.protocol_interactions;
324 acc.approval_interactions += runtime.approval_interactions;
325 acc.effect_receipts += runtime.effect_receipts;
326 acc.persona_runtime_states += runtime.persona_runtime_states;
327 acc.agent_transcript_deltas += runtime.agent_transcript_deltas;
328 acc.final_artifacts += runtime.final_artifacts;
329 acc.policy_decisions += runtime.policy_decisions;
330 acc.channel_receipts += runtime.channel_receipts;
331 acc.llm_input_tokens += runtime.llm_input_tokens;
332 acc.llm_output_tokens += runtime.llm_output_tokens;
333 acc.observed_cost_usd =
334 sum_optional_cost(acc.observed_cost_usd, runtime.observed_cost_usd);
335 acc
336 });
337 ReplayBenchmarkSummary {
338 passed,
339 failed: fixture_count.saturating_sub(passed),
340 deterministic_fixtures,
341 drifted_fixtures: fixture_count.saturating_sub(deterministic_fixtures),
342 mean_replay_fidelity_score: average_metric(fixtures, |fixture| {
343 fixture.metrics.replay_fidelity_score
344 }),
345 mean_permission_decision_preservation_score: average_metric(fixtures, |fixture| {
346 fixture.metrics.permission_decision_preservation_score
347 }),
348 tool_call_drift_count: fixtures
349 .iter()
350 .map(|fixture| fixture.metrics.tool_call_drift_count)
351 .sum(),
352 transcript_drift_count: fixtures
353 .iter()
354 .map(|fixture| fixture.metrics.transcript_drift_count)
355 .sum(),
356 observed_interactions: runtime.observed_interactions,
357 llm_input_tokens: runtime.llm_input_tokens,
358 llm_output_tokens: runtime.llm_output_tokens,
359 }
360}
361
362fn replay_metrics(
363 trace: &ReplayOracleTrace,
364 oracle: &ReplayOracleReport,
365 category_scores: BTreeMap<String, ReplayCategoryMetric>,
366) -> Result<ReplayBenchmarkMetrics, ReplayBenchmarkError> {
367 let compared_categories = category_scores
368 .values()
369 .filter(|metric| metric.compared)
370 .count();
371 let matched_categories = category_scores
372 .values()
373 .filter(|metric| metric.compared && metric.matched)
374 .count();
375 let replay_fidelity_score = if compared_categories == 0 {
376 0.0
377 } else {
378 matched_categories as f64 / compared_categories as f64
379 };
380 let permission_decision_preservation_score =
381 section_score(&category_scores, &PERMISSION_SECTIONS);
382 let tool_call_drift_count = section_drift_count(&category_scores, &TOOL_DRIFT_SECTIONS);
383 let transcript_drift_count =
384 section_drift_count(&category_scores, &["agent_transcript_deltas"]);
385 let runtime_cost = runtime_cost_metrics(&trace.first_run, &trace.second_run);
386 let debugging_time_to_root_cause_proxy =
387 debugging_proxy_metrics(oracle.divergence.as_ref(), &category_scores);
388
389 Ok(ReplayBenchmarkMetrics {
390 determinism_score: if oracle.divergence.is_none() {
391 1.0
392 } else {
393 0.0
394 },
395 replay_fidelity_score,
396 permission_decision_preservation_score,
397 tool_call_drift_count,
398 transcript_drift_count,
399 runtime_cost,
400 debugging_time_to_root_cause_proxy,
401 category_scores,
402 })
403}
404
405fn category_scores(
406 first: &JsonValue,
407 second: &JsonValue,
408 oracle: &ReplayOracleReport,
409) -> BTreeMap<String, ReplayCategoryMetric> {
410 let first_counts = counts_by_section(&oracle.first_run_counts);
411 let second_counts = counts_by_section(&oracle.second_run_counts);
412 REPLAY_TRACE_SECTIONS
413 .iter()
414 .map(|section| {
415 let first_value = first.get(*section).unwrap_or(&JsonValue::Null);
416 let second_value = second.get(*section).unwrap_or(&JsonValue::Null);
417 let first_run_count = first_counts.get(*section).copied().unwrap_or_default();
418 let second_run_count = second_counts.get(*section).copied().unwrap_or_default();
419 let compared = first_run_count > 0 || second_run_count > 0;
420 let drift_count = if compared {
421 drift_count(first_value, second_value)
422 } else {
423 0
424 };
425 (
426 (*section).to_string(),
427 ReplayCategoryMetric {
428 compared,
429 matched: drift_count == 0,
430 drift_count,
431 first_run_count,
432 second_run_count,
433 },
434 )
435 })
436 .collect()
437}
438
439fn counts_by_section(counts: &ReplayTraceRunCounts) -> BTreeMap<&'static str, usize> {
440 BTreeMap::from([
441 ("event_log_entries", counts.event_log_entries),
442 ("trigger_firings", counts.trigger_firings),
443 ("llm_interactions", counts.llm_interactions),
444 ("protocol_interactions", counts.protocol_interactions),
445 ("approval_interactions", counts.approval_interactions),
446 ("effect_receipts", counts.effect_receipts),
447 ("persona_runtime_states", counts.persona_runtime_states),
448 ("agent_transcript_deltas", counts.agent_transcript_deltas),
449 ("final_artifacts", counts.final_artifacts),
450 ("policy_decisions", counts.policy_decisions),
451 ("channel_receipts", counts.channel_receipts),
453 ("lifecycle_receipts", counts.lifecycle_receipts),
454 ])
455}
456
457fn drift_count(first: &JsonValue, second: &JsonValue) -> usize {
458 if first == second {
459 return 0;
460 }
461 match (first, second) {
462 (JsonValue::Array(first_items), JsonValue::Array(second_items)) => {
463 let shared = first_items.len().min(second_items.len());
464 let item_drifts = (0..shared)
465 .filter(|index| first_items[*index] != second_items[*index])
466 .count();
467 item_drifts + first_items.len().abs_diff(second_items.len())
468 }
469 (JsonValue::Object(first_map), JsonValue::Object(second_map)) => {
470 let keys = first_map
471 .keys()
472 .chain(second_map.keys())
473 .collect::<BTreeSet<_>>();
474 keys.into_iter()
475 .filter(|key| first_map.get(*key) != second_map.get(*key))
476 .count()
477 }
478 _ => 1,
479 }
480}
481
482fn section_score(
483 category_scores: &BTreeMap<String, ReplayCategoryMetric>,
484 sections: &[&str],
485) -> f64 {
486 let compared = sections
487 .iter()
488 .filter_map(|section| category_scores.get(*section))
489 .filter(|metric| metric.compared)
490 .collect::<Vec<_>>();
491 if compared.is_empty() {
492 return 1.0;
493 }
494 compared.iter().filter(|metric| metric.matched).count() as f64 / compared.len() as f64
495}
496
497fn section_drift_count(
498 category_scores: &BTreeMap<String, ReplayCategoryMetric>,
499 sections: &[&str],
500) -> usize {
501 sections
502 .iter()
503 .filter_map(|section| category_scores.get(*section))
504 .map(|metric| metric.drift_count)
505 .sum()
506}
507
508fn runtime_cost_metrics(
509 first_run: &ReplayTraceRun,
510 second_run: &ReplayTraceRun,
511) -> ReplayRuntimeCostMetrics {
512 let first = first_run.counts();
513 let second = second_run.counts();
514 let observed_cost_usd =
515 sum_optional_cost(cost_usd_for_run(first_run), cost_usd_for_run(second_run));
516 ReplayRuntimeCostMetrics {
517 observed_interactions: trace_material_count(&first) + trace_material_count(&second),
518 event_log_entries: first.event_log_entries + second.event_log_entries,
519 trigger_firings: first.trigger_firings + second.trigger_firings,
520 llm_interactions: first.llm_interactions + second.llm_interactions,
521 protocol_interactions: first.protocol_interactions + second.protocol_interactions,
522 approval_interactions: first.approval_interactions + second.approval_interactions,
523 effect_receipts: first.effect_receipts + second.effect_receipts,
524 persona_runtime_states: first.persona_runtime_states + second.persona_runtime_states,
525 agent_transcript_deltas: first.agent_transcript_deltas + second.agent_transcript_deltas,
526 final_artifacts: first.final_artifacts + second.final_artifacts,
527 policy_decisions: first.policy_decisions + second.policy_decisions,
528 channel_receipts: first.channel_receipts + second.channel_receipts,
529 lifecycle_receipts: first.lifecycle_receipts + second.lifecycle_receipts,
530 llm_input_tokens: token_total(first_run, "input_tokens")
531 + token_total(second_run, "input_tokens"),
532 llm_output_tokens: token_total(first_run, "output_tokens")
533 + token_total(second_run, "output_tokens"),
534 observed_cost_usd,
535 }
536}
537
538fn trace_material_count(counts: &ReplayTraceRunCounts) -> usize {
539 counts.event_log_entries
540 + counts.trigger_firings
541 + counts.llm_interactions
542 + counts.protocol_interactions
543 + counts.approval_interactions
544 + counts.effect_receipts
545 + counts.persona_runtime_states
546 + counts.agent_transcript_deltas
547 + counts.final_artifacts
548 + counts.policy_decisions
549 + counts.channel_receipts
550 + counts.lifecycle_receipts
551}
552
553fn token_total(run: &ReplayTraceRun, token_key: &str) -> u64 {
554 run.llm_interactions
555 .iter()
556 .filter_map(|interaction| {
557 interaction
558 .get(token_key)
559 .and_then(JsonValue::as_u64)
560 .or_else(|| {
561 interaction
562 .get("usage")
563 .and_then(|usage| usage.get(token_key))
564 .and_then(JsonValue::as_u64)
565 })
566 })
567 .sum()
568}
569
570fn cost_usd_for_run(run: &ReplayTraceRun) -> Option<f64> {
571 let mut seen = false;
572 let mut total = 0.0;
573 for interaction in &run.llm_interactions {
574 if let Some(cost) = interaction
575 .get("cost_usd")
576 .and_then(JsonValue::as_f64)
577 .or_else(|| {
578 interaction
579 .get("usage")
580 .and_then(|usage| usage.get("cost_usd"))
581 .and_then(JsonValue::as_f64)
582 })
583 {
584 seen = true;
585 total += cost;
586 }
587 }
588 seen.then_some(total)
589}
590
/// Adds two optional costs: both present → their sum, one present → that
/// value unchanged, neither present → `None`.
fn sum_optional_cost(left: Option<f64>, right: Option<f64>) -> Option<f64> {
    left.zip(right)
        .map(|(a, b)| a + b)
        // At most one side is `Some` past this point; pass it through as-is.
        .or(left)
        .or(right)
}
598
599fn debugging_proxy_metrics(
600 divergence: Option<&ReplayDivergence>,
601 category_scores: &BTreeMap<String, ReplayCategoryMetric>,
602) -> ReplayDebuggingProxyMetrics {
603 let first_divergence_path = divergence.map(|divergence| divergence.path.clone());
604 let first_divergence_depth = first_divergence_path
605 .as_deref()
606 .map(json_path_depth)
607 .unwrap_or_default();
608 let drift_surface_count = category_scores
609 .values()
610 .filter(|metric| metric.compared && !metric.matched)
611 .count();
612 ReplayDebuggingProxyMetrics {
613 proxy_kind: "first_divergence_depth_plus_drift_surfaces".to_string(),
614 first_divergence_path,
615 first_divergence_depth,
616 drift_surface_count,
617 estimated_triage_steps: if drift_surface_count == 0 {
618 0
619 } else {
620 1 + first_divergence_depth + drift_surface_count
621 },
622 }
623}
624
/// Counts the segments of a path after trimming surrounding whitespace.
///
/// * `"$"` → 0.
/// * A path starting with `/` is split on `/`; empty segments are ignored.
/// * Any other path is split on `.`; empty segments and `$` are ignored.
fn json_path_depth(path: &str) -> usize {
    let trimmed = path.trim();
    if trimmed == "$" {
        return 0;
    }
    match trimmed.strip_prefix('/') {
        Some(pointer) => pointer.split('/').filter(|part| !part.is_empty()).count(),
        None => trimmed
            .split('.')
            .filter(|part| !matches!(*part, "" | "$"))
            .count(),
    }
}
640
641fn fixture_receipt(
642 name: &str,
643 path: &str,
644 metrics: &ReplayBenchmarkMetrics,
645 canonical_first_sha256: &str,
646 canonical_second_sha256: &str,
647) -> Result<ReplayBenchmarkFixtureReceipt, ReplayBenchmarkError> {
648 let receipt_material = json!({
649 "ingest_kind": REPLAY_BENCHMARK_CLOUD_INGEST_KIND,
650 "report_schema_version": REPLAY_BENCHMARK_REPORT_SCHEMA_VERSION,
651 "replay_trace_schema_version": REPLAY_TRACE_SCHEMA_VERSION,
652 "name": name,
653 "path": path,
654 "canonical_first_sha256": canonical_first_sha256,
655 "canonical_second_sha256": canonical_second_sha256,
656 "metrics": metrics,
657 });
658 Ok(ReplayBenchmarkFixtureReceipt {
659 ingest_kind: REPLAY_BENCHMARK_CLOUD_INGEST_KIND.to_string(),
660 report_schema_version: REPLAY_BENCHMARK_REPORT_SCHEMA_VERSION.to_string(),
661 replay_trace_schema_version: REPLAY_TRACE_SCHEMA_VERSION.to_string(),
662 canonical_first_sha256: canonical_first_sha256.to_string(),
663 canonical_second_sha256: canonical_second_sha256.to_string(),
664 benchmark_receipt_sha256: sha256_json(&receipt_material)?,
665 })
666}
667
668fn sha256_json(value: &JsonValue) -> Result<String, ReplayBenchmarkError> {
669 let bytes = serde_json::to_vec(value)
670 .map_err(|error| ReplayBenchmarkError::Serialization(error.to_string()))?;
671 Ok(format!("sha256:{}", hex::encode(Sha256::digest(bytes))))
672}
673
/// Alias for [`sha256_json`]; the adapter code uses it to hash tool-call
/// arguments and results.
fn sha256_value(value: &JsonValue) -> Result<String, ReplayBenchmarkError> {
    sha256_json(value)
}
677
678fn sha256_text(text: &str) -> String {
679 format!("sha256:{}", hex::encode(Sha256::digest(text.as_bytes())))
680}
681
682fn average_metric(
683 fixtures: &[ReplayBenchmarkFixtureReport],
684 metric: impl Fn(&ReplayBenchmarkFixtureReport) -> f64,
685) -> f64 {
686 if fixtures.is_empty() {
687 0.0
688 } else {
689 fixtures.iter().map(metric).sum::<f64>() / fixtures.len() as f64
690 }
691}
692
693fn adapt_opencode_jsonl(input: &str, run_id: &str) -> Result<ReplayTraceRun, ReplayBenchmarkError> {
694 let mut run = ReplayTraceRun {
695 run_id: run_id.to_string(),
696 ..ReplayTraceRun::default()
697 };
698 for (index, raw_line) in input.lines().enumerate() {
699 let line_no = index + 1;
700 let line = raw_line.trim();
701 if line.is_empty() {
702 continue;
703 }
704 let value: JsonValue = serde_json::from_str(line).map_err(|error| {
705 ReplayBenchmarkError::Adapter(format!(
706 "invalid {OPENCODE_JSONL_ADAPTER_ID} JSONL line {line_no}: {error}"
707 ))
708 })?;
709 let object = value.as_object().ok_or_else(|| {
710 ReplayBenchmarkError::Adapter(format!(
711 "{OPENCODE_JSONL_ADAPTER_ID} JSONL line {line_no} must be an object"
712 ))
713 })?;
714 let event_type = object
715 .get("type")
716 .or_else(|| object.get("event"))
717 .and_then(JsonValue::as_str)
718 .unwrap_or("event");
719 match event_type {
720 "message" | "session.message" => {
721 run.agent_transcript_deltas
722 .push(adapt_opencode_message(object, line_no));
723 }
724 "tool_call" | "tool" | "session.tool_call" => {
725 let (protocol, receipt) = adapt_opencode_tool_call(object, line_no)?;
726 run.protocol_interactions.push(protocol);
727 run.effect_receipts.push(receipt);
728 }
729 "permission" | "permission_decision" | "session.permission" => {
730 let (approval, policy) = adapt_opencode_permission(object, line_no);
731 run.approval_interactions.push(approval);
732 run.policy_decisions.push(policy);
733 }
734 "llm" | "model" | "session.llm" => {
735 run.llm_interactions
736 .push(adapt_opencode_llm(object, line_no));
737 }
738 _ => run
739 .event_log_entries
740 .push(adapt_opencode_event(object, event_type, line_no)),
741 }
742 }
743 if trace_material_count(&run.counts()) == 0 {
744 return Err(ReplayBenchmarkError::Adapter(format!(
745 "{OPENCODE_JSONL_ADAPTER_ID} input contained no adaptable events"
746 )));
747 }
748 Ok(run)
749}
750
751fn adapt_opencode_message(
752 object: &serde_json::Map<String, JsonValue>,
753 line_no: usize,
754) -> JsonValue {
755 let content = object.get("content").cloned().unwrap_or(JsonValue::Null);
756 json!({
757 "delta_id": object_string(object, "id").unwrap_or_else(|| format!("message-{line_no}")),
758 "agent": object_string(object, "agent").unwrap_or_else(|| "opencode".to_string()),
759 "role": object_string(object, "role").unwrap_or_else(|| "assistant".to_string()),
760 "content_sha256": sha256_text(&content.to_string()),
761 })
762}
763
764fn adapt_opencode_tool_call(
765 object: &serde_json::Map<String, JsonValue>,
766 line_no: usize,
767) -> Result<(JsonValue, JsonValue), ReplayBenchmarkError> {
768 let tool = object_string(object, "tool")
769 .or_else(|| object_string(object, "name"))
770 .unwrap_or_else(|| "unknown_tool".to_string());
771 let arguments = object
772 .get("arguments")
773 .or_else(|| object.get("args"))
774 .cloned()
775 .unwrap_or_else(|| json!({}));
776 let result = object
777 .get("result")
778 .or_else(|| object.get("output"))
779 .cloned()
780 .unwrap_or(JsonValue::Null);
781 let status = object_string(object, "status").unwrap_or_else(|| "completed".to_string());
782 let arguments_sha256 = sha256_value(&arguments)?;
783 let result_sha256 = sha256_value(&result)?;
784 Ok((
785 json!({
786 "protocol": "opencode",
787 "boundary": "tool_call",
788 "tool": tool,
789 "call_id": object_string(object, "id").unwrap_or_else(|| format!("tool-{line_no}")),
790 "arguments_sha256": arguments_sha256,
791 "status": status,
792 "result_sha256": result_sha256,
793 }),
794 json!({
795 "receipt_id": object_string(object, "receipt_id").unwrap_or_else(|| format!("tool-receipt-{line_no}")),
796 "kind": "tool_call",
797 "tool": tool,
798 "status": status,
799 "arguments_sha256": arguments_sha256,
800 "result_sha256": result_sha256,
801 }),
802 ))
803}
804
805fn adapt_opencode_permission(
806 object: &serde_json::Map<String, JsonValue>,
807 line_no: usize,
808) -> (JsonValue, JsonValue) {
809 let action = object_string(object, "action").unwrap_or_else(|| "unknown".to_string());
810 let decision = object_string(object, "decision")
811 .or_else(|| object_string(object, "response"))
812 .unwrap_or_else(|| "unknown".to_string());
813 (
814 json!({
815 "request_id": object_string(object, "id").unwrap_or_else(|| format!("permission-{line_no}")),
816 "principal": object_string(object, "principal").unwrap_or_else(|| "agent".to_string()),
817 "action": action,
818 "response": decision,
819 "reviewer": object.get("reviewer").cloned().unwrap_or(JsonValue::Null),
820 }),
821 json!({
822 "decision_id": object_string(object, "decision_id").unwrap_or_else(|| format!("policy-{line_no}")),
823 "capability": object_string(object, "capability").unwrap_or(action),
824 "decision": decision,
825 "approval_required": true,
826 }),
827 )
828}
829
830fn adapt_opencode_llm(object: &serde_json::Map<String, JsonValue>, line_no: usize) -> JsonValue {
831 let input_tokens = object
832 .get("input_tokens")
833 .and_then(JsonValue::as_u64)
834 .or_else(|| {
835 object
836 .get("usage")
837 .and_then(|usage| usage.get("input_tokens"))
838 .and_then(JsonValue::as_u64)
839 })
840 .unwrap_or_default();
841 let output_tokens = object
842 .get("output_tokens")
843 .and_then(JsonValue::as_u64)
844 .or_else(|| {
845 object
846 .get("usage")
847 .and_then(|usage| usage.get("output_tokens"))
848 .and_then(JsonValue::as_u64)
849 })
850 .unwrap_or_default();
851 let messages_sha256 = object
852 .get("messages")
853 .map(|value| sha256_text(&value.to_string()))
854 .unwrap_or_else(|| sha256_text(""));
855 let response_sha256 = object
856 .get("response")
857 .map(|value| sha256_text(&value.to_string()))
858 .unwrap_or_else(|| sha256_text(""));
859 json!({
860 "request_id": object_string(object, "id").unwrap_or_else(|| format!("llm-{line_no}")),
861 "provider": object_string(object, "provider").unwrap_or_else(|| "opencode".to_string()),
862 "model": object_string(object, "model").unwrap_or_else(|| "unknown".to_string()),
863 "messages_sha256": messages_sha256,
864 "response_sha256": response_sha256,
865 "usage": {
866 "input_tokens": input_tokens,
867 "output_tokens": output_tokens,
868 },
869 })
870}
871
872fn adapt_opencode_event(
873 object: &serde_json::Map<String, JsonValue>,
874 event_type: &str,
875 line_no: usize,
876) -> JsonValue {
877 json!({
878 "event_id": line_no,
879 "topic": object_string(object, "topic").unwrap_or_else(|| "opencode.session".to_string()),
880 "kind": event_type,
881 "payload": object.get("payload").cloned().unwrap_or_else(|| JsonValue::Object(object.clone())),
882 })
883}
884
885fn object_string(object: &serde_json::Map<String, JsonValue>, key: &str) -> Option<String> {
886 object
887 .get(key)
888 .and_then(JsonValue::as_str)
889 .map(str::to_string)
890}
891
#[cfg(test)]
mod tests {
    use super::*;

    // Builds a two-run trace whose runs are identical except for the `status`
    // of the single protocol interaction: `status.0` for the first run and
    // `status.1` for the second. `/run_id` is allowlisted.
    fn trace_pair(status: (&str, &str)) -> ReplayOracleTrace {
        ReplayOracleTrace {
            schema_version: REPLAY_TRACE_SCHEMA_VERSION.to_string(),
            name: "simple_tool_run".to_string(),
            description: Some("golden replay benchmark fixture".to_string()),
            expect: ReplayExpectation::Match,
            allowlist: vec![ReplayAllowlistRule {
                path: "/run_id".to_string(),
                reason: "run ids are allocated per execution".to_string(),
                replacement: None,
            }],
            first_run: ReplayTraceRun {
                run_id: "first".to_string(),
                protocol_interactions: vec![json!({
                    "protocol": "mcp",
                    "boundary": "tools/call",
                    "tool": "read_file",
                    "status": status.0,
                })],
                policy_decisions: vec![json!({
                    "capability": "fs.read",
                    "decision": "allow",
                })],
                ..ReplayTraceRun::default()
            },
            second_run: ReplayTraceRun {
                run_id: "second".to_string(),
                protocol_interactions: vec![json!({
                    "protocol": "mcp",
                    "boundary": "tools/call",
                    "tool": "read_file",
                    "status": status.1,
                })],
                policy_decisions: vec![json!({
                    "capability": "fs.read",
                    "decision": "allow",
                })],
                ..ReplayTraceRun::default()
            },
            protocol_fixture_refs: Vec::new(),
        }
    }

    // Identical runs: every score is perfect and the receipt digest carries
    // the `sha256:` prefix.
    #[test]
    fn replay_benchmark_reports_stable_golden_metrics_for_matching_trace() {
        let fixture =
            benchmark_replay_trace("benchmarks/replay/simple.json", &trace_pair(("ok", "ok")))
                .expect("benchmark fixture");

        assert!(fixture.passed);
        assert!(fixture.deterministic);
        assert_eq!(fixture.metrics.determinism_score, 1.0);
        assert_eq!(fixture.metrics.replay_fidelity_score, 1.0);
        assert_eq!(fixture.metrics.permission_decision_preservation_score, 1.0);
        assert_eq!(fixture.metrics.tool_call_drift_count, 0);
        assert!(fixture
            .receipt
            .benchmark_receipt_sha256
            .starts_with("sha256:"));
    }

    // A status change in the protocol interaction: one of the two compared
    // sections drifts (fidelity 0.5), and triage steps are
    // 1 + depth 3 + 1 drifted section = 5.
    #[test]
    fn replay_benchmark_reports_reduced_fidelity_for_meaningful_drift() {
        let fixture =
            benchmark_replay_trace("benchmarks/replay/drift.json", &trace_pair(("ok", "error")))
                .expect("benchmark fixture");

        assert!(!fixture.passed);
        assert!(!fixture.deterministic);
        assert_eq!(fixture.metrics.determinism_score, 0.0);
        assert_eq!(fixture.metrics.replay_fidelity_score, 0.5);
        assert_eq!(fixture.metrics.tool_call_drift_count, 1);
        assert_eq!(
            fixture
                .metrics
                .debugging_time_to_root_cause_proxy
                .first_divergence_path
                .as_deref(),
            Some("/protocol_interactions/0/status")
        );
        assert_eq!(
            fixture
                .metrics
                .debugging_time_to_root_cause_proxy
                .first_divergence_depth,
            3
        );
        assert_eq!(
            fixture
                .metrics
                .debugging_time_to_root_cause_proxy
                .estimated_triage_steps,
            5
        );
    }

    // Benchmarking the same trace twice must serialize to identical JSON.
    #[test]
    fn replay_benchmark_summary_is_stable_across_repeated_runs() {
        let first = benchmark_replay_trace("fixture.json", &trace_pair(("ok", "ok")))
            .expect("first benchmark");
        let second = benchmark_replay_trace("fixture.json", &trace_pair(("ok", "ok")))
            .expect("second benchmark");

        let first_json = serde_json::to_string(&first).expect("serialize first");
        let second_json = serde_json::to_string(&second).expect("serialize second");
        assert_eq!(first_json, second_json);
    }

    // One line of each recognised event type lands in the expected run
    // sections, and token usage nested under `usage` is picked up.
    #[test]
    fn opencode_jsonl_adapter_maps_messages_tools_permissions_and_llm_usage() {
        let input = concat!(
            "{\"type\":\"message\",\"id\":\"m1\",\"role\":\"assistant\",\"content\":\"done\"}\n",
            "{\"type\":\"tool_call\",\"id\":\"t1\",\"tool\":\"write_file\",\"arguments\":{\"path\":\"notes.md\"},\"result\":{\"ok\":true}}\n",
            "{\"type\":\"permission\",\"id\":\"p1\",\"action\":\"write_file\",\"decision\":\"approved\"}\n",
            "{\"type\":\"llm\",\"id\":\"l1\",\"model\":\"qwen\",\"usage\":{\"input_tokens\":7,\"output_tokens\":3}}\n"
        );

        let run = OpenCodeJsonlAdapter
            .adapt_run(input, "opencode-run")
            .expect("adapt opencode jsonl");

        assert_eq!(run.run_id, "opencode-run");
        assert_eq!(run.agent_transcript_deltas.len(), 1);
        assert_eq!(run.protocol_interactions.len(), 1);
        assert_eq!(run.effect_receipts.len(), 1);
        assert_eq!(run.approval_interactions.len(), 1);
        assert_eq!(run.policy_decisions.len(), 1);
        assert_eq!(run.llm_interactions.len(), 1);
        assert_eq!(token_total(&run, "input_tokens"), 7);
        assert_eq!(token_total(&run, "output_tokens"), 3);
    }

    // Two identical adapted inputs benchmark as a passing pair with no tool drift.
    #[test]
    fn adapted_trace_pair_can_be_benchmarked() {
        let first = "{\"type\":\"tool_call\",\"tool\":\"read_file\",\"result\":{\"ok\":true}}\n";
        let second = "{\"type\":\"tool_call\",\"tool\":\"read_file\",\"result\":{\"ok\":true}}\n";

        let fixture = benchmark_adapted_replay_pair(
            &OpenCodeJsonlAdapter,
            "external-tool-run",
            first,
            second,
        )
        .expect("benchmark adapted pair");

        assert!(fixture.passed);
        assert_eq!(fixture.name, "external-tool-run");
        assert_eq!(fixture.metrics.tool_call_drift_count, 0);
    }
}