1use serde::{Deserialize, Serialize};
4
5use agent_sdk_core::{
6 AgentError, EffectId, EffectKind, EntityKind, EntityRef, JournalRecord, JournalRecordPayload,
7 RunTrace, SessionTimeline, ToolCallId, ToolCallRecord, ToolCallRecordStatus, TurnTrace,
8};
9
10use crate::{EvaluationMetricDelta, EvaluationScope};
11
/// Aggregate metrics computed from the journal records of a turn, run, or session trace.
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct TraceMetrics {
    /// Scope (turn, run, or session) the metrics were computed for.
    pub scope: EvaluationScope,
    /// Smallest non-zero record timestamp, or `None` when no record has one.
    pub started_at_millis: Option<u64>,
    /// Largest non-zero record timestamp, or `None` when no record has one.
    pub ended_at_millis: Option<u64>,
    /// `ended_at_millis - started_at_millis` when both bounds are present.
    pub elapsed_ms: Option<u64>,
    /// Number of journal records the metrics were computed from.
    pub record_count: usize,
    /// Number of runs attributed to the scope.
    pub run_count: usize,
    /// Number of turns attributed to the scope.
    pub turn_count: usize,
    /// Number of model-attempt records that carry a stop reason.
    pub provider_call_count: u64,
    /// Sum of reported input tokens over counted model attempts (missing values count as 0).
    pub provider_input_tokens: u64,
    /// Sum of reported output tokens over counted model attempts (missing values count as 0).
    pub provider_output_tokens: u64,
    /// Sum of reported total tokens over counted model attempts (missing values count as 0).
    pub provider_total_tokens: u64,
    /// Number of distinct tool calls found in the records (length of `tools`).
    pub tool_call_count: u64,
    /// Tool calls whose latest status is `Completed`.
    pub tool_completed_count: u64,
    /// Tool calls whose latest status is `Failed`.
    pub tool_failed_count: u64,
    /// Tool calls whose latest status is `TimedOut`.
    pub tool_timed_out_count: u64,
    /// Tool calls whose latest status is `Cancelled`.
    pub tool_cancelled_count: u64,
    /// Tool calls whose latest status is `DeniedBeforeExecution`.
    pub tool_denied_count: u64,
    /// Tool calls whose latest status is `ResultRewritten`.
    pub tool_rewritten_count: u64,
    /// Tool calls whose latest status is `Unknown`.
    pub tool_unknown_count: u64,
    /// Tool calls whose latest status is `RecoveryRequired`.
    pub tool_recovery_required_count: u64,
    /// Saturating sum of the per-tool elapsed times; `None` when no tool has an elapsed time.
    pub tool_total_elapsed_ms: Option<u64>,
    /// Per-tool-call metrics, one entry per distinct tool call id.
    pub tools: Vec<ToolTraceMetric>,
}
60
61impl TraceMetrics {
62 pub fn from_turn_trace(trace: &TurnTrace) -> Result<Self, AgentError> {
64 let turn_id = trace.turn_id.clone().ok_or_else(|| {
65 AgentError::contract_violation("turn trace is missing turn id for metrics")
66 })?;
67 let run_count = if trace.run_ids.is_empty() {
68 unique_run_count(&trace.records)
69 } else {
70 trace.run_ids.len()
71 };
72 Ok(Self::from_records(
73 EvaluationScope::Turn {
74 session_id: trace.session_id.clone(),
75 turn_id,
76 },
77 run_count,
78 1,
79 &trace.records,
80 ))
81 }
82
83 pub fn from_run_trace(trace: &RunTrace) -> Result<Self, AgentError> {
85 let run_id = trace.run_id.clone().ok_or_else(|| {
86 AgentError::contract_violation("run trace is missing run id for metrics")
87 })?;
88 let records = if trace.records.is_empty() {
89 trace
90 .turn_traces
91 .iter()
92 .flat_map(|turn| turn.records.iter().cloned())
93 .collect::<Vec<_>>()
94 } else {
95 trace.records.clone()
96 };
97 let turn_count = if trace.turn_traces.is_empty() {
98 unique_turn_count(&records)
99 } else {
100 trace.turn_traces.len()
101 };
102 Ok(Self::from_records(
103 EvaluationScope::Run { run_id },
104 1,
105 turn_count,
106 &records,
107 ))
108 }
109
110 pub fn from_session_timeline(timeline: &SessionTimeline) -> Self {
112 let records = timeline
113 .turns
114 .iter()
115 .flat_map(|turn| turn.records.iter().cloned())
116 .collect::<Vec<_>>();
117 let run_count = timeline
118 .turns
119 .iter()
120 .flat_map(|turn| turn.run_ids.iter().cloned())
121 .fold(Vec::new(), |mut runs, run_id| {
122 push_unique(&mut runs, run_id);
123 runs
124 })
125 .len()
126 .max(unique_run_count(&records));
127 Self::from_records(
128 EvaluationScope::Session {
129 session_id: timeline.session_id.clone(),
130 },
131 run_count,
132 timeline.turns.len(),
133 &records,
134 )
135 }
136
137 fn from_records(
138 scope: EvaluationScope,
139 run_count: usize,
140 turn_count: usize,
141 records: &[JournalRecord],
142 ) -> Self {
143 let started_at_millis = records
144 .iter()
145 .filter_map(|record| non_zero_timestamp(record.timestamp_millis))
146 .min();
147 let ended_at_millis = records
148 .iter()
149 .filter_map(|record| non_zero_timestamp(record.timestamp_millis))
150 .max();
151 let elapsed_ms = started_at_millis
152 .zip(ended_at_millis)
153 .and_then(|(started_at, ended_at)| ended_at.checked_sub(started_at));
154 let mut provider_call_count = 0;
155 let mut provider_input_tokens = 0;
156 let mut provider_output_tokens = 0;
157 let mut provider_total_tokens = 0;
158
159 for record in records {
160 if let JournalRecordPayload::ModelAttempt(attempt) = &record.payload
161 && attempt.stop_reason.is_some()
162 {
163 provider_call_count += 1;
164 if let Some(usage) = &attempt.usage {
165 provider_input_tokens += u64::from(usage.input_tokens.unwrap_or_default());
166 provider_output_tokens += u64::from(usage.output_tokens.unwrap_or_default());
167 provider_total_tokens += u64::from(usage.total_tokens.unwrap_or_default());
168 }
169 }
170 }
171
172 let tools = tool_metrics(records);
173 let tool_total_elapsed_ms = tools
174 .iter()
175 .filter_map(|tool| tool.elapsed_ms)
176 .reduce(|left, right| left.saturating_add(right));
177 let mut metrics = Self {
178 scope,
179 started_at_millis,
180 ended_at_millis,
181 elapsed_ms,
182 record_count: records.len(),
183 run_count,
184 turn_count,
185 provider_call_count,
186 provider_input_tokens,
187 provider_output_tokens,
188 provider_total_tokens,
189 tool_call_count: tools.len() as u64,
190 tool_completed_count: 0,
191 tool_failed_count: 0,
192 tool_timed_out_count: 0,
193 tool_cancelled_count: 0,
194 tool_denied_count: 0,
195 tool_rewritten_count: 0,
196 tool_unknown_count: 0,
197 tool_recovery_required_count: 0,
198 tool_total_elapsed_ms,
199 tools,
200 };
201 metrics.count_tool_statuses();
202 metrics
203 }
204
205 fn count_tool_statuses(&mut self) {
206 for tool in &self.tools {
207 match tool.status {
208 ToolCallRecordStatus::Completed => self.tool_completed_count += 1,
209 ToolCallRecordStatus::Failed => self.tool_failed_count += 1,
210 ToolCallRecordStatus::TimedOut => self.tool_timed_out_count += 1,
211 ToolCallRecordStatus::Cancelled => self.tool_cancelled_count += 1,
212 ToolCallRecordStatus::DeniedBeforeExecution => self.tool_denied_count += 1,
213 ToolCallRecordStatus::ResultRewritten => self.tool_rewritten_count += 1,
214 ToolCallRecordStatus::Unknown => self.tool_unknown_count += 1,
215 ToolCallRecordStatus::RecoveryRequired => self.tool_recovery_required_count += 1,
216 ToolCallRecordStatus::Requested
217 | ToolCallRecordStatus::RequestModified
218 | ToolCallRecordStatus::IntentRecorded => {}
219 }
220 }
221 }
222}
223
/// Metrics for a single tool call, derived from all journal records sharing its id.
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct ToolTraceMetric {
    /// Entity reference of kind `ToolCall` built from the tool call id.
    pub tool_call_ref: EntityRef,
    /// Canonical tool name taken from the latest record (by journal sequence).
    pub canonical_tool_name: String,
    /// Status of the latest record (by journal sequence) for this tool call.
    pub status: ToolCallRecordStatus,
    /// Timestamp of the first intent-recorded entry that qualifies as an execution start.
    pub started_at_millis: Option<u64>,
    /// Latest timestamp of a terminal entry matching the start's effect id.
    pub ended_at_millis: Option<u64>,
    /// `ended_at_millis - started_at_millis` when both are present.
    pub elapsed_ms: Option<u64>,
}
240
/// Pairing of observed and baseline trace metrics together with their per-metric deltas.
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct TraceMetricsComparison {
    /// Metrics of the trace under evaluation.
    pub observed: TraceMetrics,
    /// Metrics of the trace used as the comparison baseline.
    pub baseline: TraceMetrics,
    /// Deltas (observed minus baseline) for the compared metrics.
    pub metric_deltas: Vec<EvaluationMetricDelta>,
}
251
252impl TraceMetricsComparison {
253 pub fn sessions(observed: &SessionTimeline, baseline: &SessionTimeline) -> Self {
255 Self::from_metrics(
256 TraceMetrics::from_session_timeline(observed),
257 TraceMetrics::from_session_timeline(baseline),
258 )
259 }
260
261 pub fn from_metrics(observed: TraceMetrics, baseline: TraceMetrics) -> Self {
263 let metric_deltas = metric_deltas(&observed, &baseline);
264 Self {
265 observed,
266 baseline,
267 metric_deltas,
268 }
269 }
270}
271
/// Accumulator grouping every journal entry that belongs to one tool call.
#[derive(Clone)]
struct ToolMetricState {
    /// Id shared by all entries in this group.
    tool_call_id: ToolCallId,
    /// Entries for the tool call, in the order they were encountered in the records.
    entries: Vec<ToolJournalEntry>,
}
277
/// A tool-call record paired with the journal position and timestamp it was stored at.
#[derive(Clone)]
struct ToolJournalEntry {
    /// Journal sequence number of the enclosing journal record; used for ordering.
    journal_seq: u64,
    /// Timestamp of the enclosing journal record (zero is treated as "no timestamp").
    timestamp_millis: u64,
    /// Copy of the tool-call payload.
    record: ToolCallRecord,
}
284
285fn tool_metrics(records: &[JournalRecord]) -> Vec<ToolTraceMetric> {
286 let mut states = Vec::<ToolMetricState>::new();
287 for record in records {
288 let JournalRecordPayload::Tool(tool_record) = &record.payload else {
289 continue;
290 };
291 if let Some(state) = states
292 .iter_mut()
293 .find(|state| state.tool_call_id == tool_record.tool_call_id)
294 {
295 state.entries.push(ToolJournalEntry {
296 journal_seq: record.journal_seq,
297 timestamp_millis: record.timestamp_millis,
298 record: tool_record.clone(),
299 });
300 } else {
301 states.push(ToolMetricState {
302 tool_call_id: tool_record.tool_call_id.clone(),
303 entries: vec![ToolJournalEntry {
304 journal_seq: record.journal_seq,
305 timestamp_millis: record.timestamp_millis,
306 record: tool_record.clone(),
307 }],
308 });
309 }
310 }
311
312 states
313 .into_iter()
314 .filter_map(|state| tool_metric(state.entries))
315 .collect()
316}
317
318fn tool_metric(mut entries: Vec<ToolJournalEntry>) -> Option<ToolTraceMetric> {
319 entries.sort_by_key(|entry| entry.journal_seq);
320 let latest = entries.last()?.record.clone();
321 let start = entries.iter().find_map(tool_start);
322 let end = start
323 .as_ref()
324 .and_then(|start| matching_tool_end(&entries, start));
325 let elapsed_ms = start
326 .as_ref()
327 .zip(end.as_ref())
328 .and_then(|(start, end)| end.timestamp_millis.checked_sub(start.timestamp_millis));
329
330 Some(ToolTraceMetric {
331 tool_call_ref: EntityRef::new(EntityKind::ToolCall, latest.tool_call_id),
332 canonical_tool_name: latest.canonical_tool_name.as_str().to_string(),
333 status: latest.status,
334 started_at_millis: start.map(|start| start.timestamp_millis),
335 ended_at_millis: end.map(|end| end.timestamp_millis),
336 elapsed_ms,
337 })
338}
339
/// Start marker of a tool execution, extracted from an intent-recorded entry.
struct ToolStart {
    /// Effect id of the recorded intent; used to pair the start with its end.
    effect_id: EffectId,
    /// Timestamp of the entry that recorded the intent.
    timestamp_millis: u64,
}
344
/// End marker of a tool execution.
struct ToolEnd {
    /// Timestamp of the terminal entry that was matched to the start.
    timestamp_millis: u64,
}
348
349fn tool_start(entry: &ToolJournalEntry) -> Option<ToolStart> {
350 if entry.timestamp_millis == 0 || entry.record.status != ToolCallRecordStatus::IntentRecorded {
351 return None;
352 }
353 let intent = entry.record.effect_intent.as_ref()?;
354 if intent.kind != EffectKind::ToolExecution {
355 return None;
356 }
357 Some(ToolStart {
358 effect_id: intent.effect_id.clone(),
359 timestamp_millis: entry.timestamp_millis,
360 })
361}
362
363fn matching_tool_end(entries: &[ToolJournalEntry], start: &ToolStart) -> Option<ToolEnd> {
364 entries
365 .iter()
366 .filter(|entry| {
367 entry.timestamp_millis > start.timestamp_millis && is_terminal_status(&entry.record)
368 })
369 .filter_map(|entry| {
370 let result = entry.record.effect_result.as_ref()?;
371 (result.effect_id == start.effect_id).then_some(ToolEnd {
372 timestamp_millis: entry.timestamp_millis,
373 })
374 })
375 .max_by_key(|end| end.timestamp_millis)
376}
377
378fn is_terminal_status(record: &ToolCallRecord) -> bool {
379 matches!(
380 record.status,
381 ToolCallRecordStatus::Completed
382 | ToolCallRecordStatus::Failed
383 | ToolCallRecordStatus::TimedOut
384 | ToolCallRecordStatus::Cancelled
385 | ToolCallRecordStatus::DeniedBeforeExecution
386 | ToolCallRecordStatus::ResultRewritten
387 | ToolCallRecordStatus::Unknown
388 | ToolCallRecordStatus::RecoveryRequired
389 )
390}
391
392fn metric_deltas(observed: &TraceMetrics, baseline: &TraceMetrics) -> Vec<EvaluationMetricDelta> {
393 let mut deltas = Vec::new();
394 push_count_delta(
395 &mut deltas,
396 "trace.elapsed_ms",
397 observed.elapsed_ms,
398 baseline.elapsed_ms,
399 );
400 push_delta(
401 &mut deltas,
402 "trace.provider_call_count",
403 observed.provider_call_count,
404 baseline.provider_call_count,
405 );
406 push_delta(
407 &mut deltas,
408 "trace.provider_input_tokens",
409 observed.provider_input_tokens,
410 baseline.provider_input_tokens,
411 );
412 push_delta(
413 &mut deltas,
414 "trace.provider_output_tokens",
415 observed.provider_output_tokens,
416 baseline.provider_output_tokens,
417 );
418 push_delta(
419 &mut deltas,
420 "trace.provider_total_tokens",
421 observed.provider_total_tokens,
422 baseline.provider_total_tokens,
423 );
424 push_delta(
425 &mut deltas,
426 "trace.tool_call_count",
427 observed.tool_call_count,
428 baseline.tool_call_count,
429 );
430 push_delta(
431 &mut deltas,
432 "trace.tool_completed_count",
433 observed.tool_completed_count,
434 baseline.tool_completed_count,
435 );
436 push_delta(
437 &mut deltas,
438 "trace.tool_failed_count",
439 observed.tool_failed_count,
440 baseline.tool_failed_count,
441 );
442 push_delta(
443 &mut deltas,
444 "trace.tool_denied_count",
445 observed.tool_denied_count,
446 baseline.tool_denied_count,
447 );
448 push_count_delta(
449 &mut deltas,
450 "trace.tool_total_elapsed_ms",
451 observed.tool_total_elapsed_ms,
452 baseline.tool_total_elapsed_ms,
453 );
454 deltas
455}
456
457fn push_delta(
458 deltas: &mut Vec<EvaluationMetricDelta>,
459 metric_ref: &'static str,
460 observed: u64,
461 baseline: u64,
462) {
463 deltas.push(EvaluationMetricDelta::new(
464 metric_ref,
465 signed_delta(observed, baseline),
466 format!("{metric_ref}: observed={observed}, baseline={baseline}"),
467 ));
468}
469
470fn push_count_delta(
471 deltas: &mut Vec<EvaluationMetricDelta>,
472 metric_ref: &'static str,
473 observed: Option<u64>,
474 baseline: Option<u64>,
475) {
476 if let Some((observed, baseline)) = observed.zip(baseline) {
477 push_delta(deltas, metric_ref, observed, baseline);
478 }
479}
480
/// Formats `observed - baseline` with an explicit sign.
///
/// Non-negative differences (including zero) are prefixed with `+`; negative ones with
/// `-`. The subtraction is done in `i128`, so it cannot overflow for any pair of `u64`s.
fn signed_delta(observed: u64, baseline: u64) -> String {
    let difference = i128::from(observed) - i128::from(baseline);
    // The `+` format flag always prints the sign, so zero renders as "+0".
    format!("{difference:+}")
}
489
490fn unique_run_count(records: &[JournalRecord]) -> usize {
491 records
492 .iter()
493 .fold(Vec::new(), |mut run_ids, record| {
494 push_unique(&mut run_ids, record.run_id.clone());
495 run_ids
496 })
497 .len()
498}
499
500fn unique_turn_count(records: &[JournalRecord]) -> usize {
501 records
502 .iter()
503 .filter_map(|record| record.turn_id.clone())
504 .fold(Vec::new(), |mut turn_ids, turn_id| {
505 push_unique(&mut turn_ids, turn_id);
506 turn_ids
507 })
508 .len()
509}
510
/// Maps a zero timestamp to `None` and any other value to `Some(value)`.
fn non_zero_timestamp(timestamp_millis: u64) -> Option<u64> {
    match timestamp_millis {
        0 => None,
        recorded => Some(recorded),
    }
}
514
/// Appends `value` to `items` unless an equal element is already present.
///
/// Existing order is preserved; membership is checked with a linear scan.
fn push_unique<T: Eq>(items: &mut Vec<T>, value: T) {
    let already_present = items.iter().any(|existing| *existing == value);
    if already_present {
        return;
    }
    items.push(value);
}