Skip to main content

agent_sdk_eval/
metrics.rs

1//! Deterministic metrics derived from core traces and journal records.
2
3use serde::{Deserialize, Serialize};
4
5use agent_sdk_core::{
6    AgentError, EffectId, EffectKind, EntityKind, EntityRef, JournalRecord, JournalRecordPayload,
7    RunTrace, SessionTimeline, ToolCallId, ToolCallRecord, ToolCallRecordStatus, TurnTrace,
8};
9
10use crate::{EvaluationMetricDelta, EvaluationScope};
11
12#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
13/// Deterministic metrics for a turn, run, session, or custom trace scope.
14pub struct TraceMetrics {
15    /// Scope these metrics describe.
16    pub scope: EvaluationScope,
17    /// Earliest non-zero journal timestamp observed for this scope.
18    pub started_at_millis: Option<u64>,
19    /// Latest non-zero journal timestamp observed for this scope.
20    pub ended_at_millis: Option<u64>,
21    /// `ended_at_millis - started_at_millis` when both are available.
22    pub elapsed_ms: Option<u64>,
23    /// Number of journal records inspected for this scope.
24    pub record_count: usize,
25    /// Number of distinct runs represented in this scope.
26    pub run_count: usize,
27    /// Number of distinct turns represented in this scope.
28    pub turn_count: usize,
29    /// Number of completed provider attempts.
30    pub provider_call_count: u64,
31    /// Sum of provider-reported input tokens.
32    pub provider_input_tokens: u64,
33    /// Sum of provider-reported output tokens.
34    pub provider_output_tokens: u64,
35    /// Sum of provider-reported total tokens.
36    pub provider_total_tokens: u64,
37    /// Number of distinct tool calls represented in this scope.
38    pub tool_call_count: u64,
39    /// Number of tools whose latest terminal status is completed.
40    pub tool_completed_count: u64,
41    /// Number of tools whose latest terminal status is failed.
42    pub tool_failed_count: u64,
43    /// Number of tools whose latest terminal status is timed out.
44    pub tool_timed_out_count: u64,
45    /// Number of tools whose latest terminal status is cancelled.
46    pub tool_cancelled_count: u64,
47    /// Number of tools denied before execution.
48    pub tool_denied_count: u64,
49    /// Number of tools whose result was rewritten by a hook.
50    pub tool_rewritten_count: u64,
51    /// Number of tools whose latest terminal status is unknown.
52    pub tool_unknown_count: u64,
53    /// Number of tools that require recovery.
54    pub tool_recovery_required_count: u64,
55    /// Sum of per-tool elapsed milliseconds when at least one tool has timing.
56    pub tool_total_elapsed_ms: Option<u64>,
57    /// Per-tool metrics, one row per distinct tool call id.
58    pub tools: Vec<ToolTraceMetric>,
59}
60
61impl TraceMetrics {
62    /// Builds deterministic metrics from a turn trace.
63    pub fn from_turn_trace(trace: &TurnTrace) -> Result<Self, AgentError> {
64        let turn_id = trace.turn_id.clone().ok_or_else(|| {
65            AgentError::contract_violation("turn trace is missing turn id for metrics")
66        })?;
67        let run_count = if trace.run_ids.is_empty() {
68            unique_run_count(&trace.records)
69        } else {
70            trace.run_ids.len()
71        };
72        Ok(Self::from_records(
73            EvaluationScope::Turn {
74                session_id: trace.session_id.clone(),
75                turn_id,
76            },
77            run_count,
78            1,
79            &trace.records,
80        ))
81    }
82
83    /// Builds deterministic metrics from a run trace.
84    pub fn from_run_trace(trace: &RunTrace) -> Result<Self, AgentError> {
85        let run_id = trace.run_id.clone().ok_or_else(|| {
86            AgentError::contract_violation("run trace is missing run id for metrics")
87        })?;
88        let records = if trace.records.is_empty() {
89            trace
90                .turn_traces
91                .iter()
92                .flat_map(|turn| turn.records.iter().cloned())
93                .collect::<Vec<_>>()
94        } else {
95            trace.records.clone()
96        };
97        let turn_count = if trace.turn_traces.is_empty() {
98            unique_turn_count(&records)
99        } else {
100            trace.turn_traces.len()
101        };
102        Ok(Self::from_records(
103            EvaluationScope::Run { run_id },
104            1,
105            turn_count,
106            &records,
107        ))
108    }
109
110    /// Builds deterministic metrics from a session timeline.
111    pub fn from_session_timeline(timeline: &SessionTimeline) -> Self {
112        let records = timeline
113            .turns
114            .iter()
115            .flat_map(|turn| turn.records.iter().cloned())
116            .collect::<Vec<_>>();
117        let run_count = timeline
118            .turns
119            .iter()
120            .flat_map(|turn| turn.run_ids.iter().cloned())
121            .fold(Vec::new(), |mut runs, run_id| {
122                push_unique(&mut runs, run_id);
123                runs
124            })
125            .len()
126            .max(unique_run_count(&records));
127        Self::from_records(
128            EvaluationScope::Session {
129                session_id: timeline.session_id.clone(),
130            },
131            run_count,
132            timeline.turns.len(),
133            &records,
134        )
135    }
136
137    fn from_records(
138        scope: EvaluationScope,
139        run_count: usize,
140        turn_count: usize,
141        records: &[JournalRecord],
142    ) -> Self {
143        let started_at_millis = records
144            .iter()
145            .filter_map(|record| non_zero_timestamp(record.timestamp_millis))
146            .min();
147        let ended_at_millis = records
148            .iter()
149            .filter_map(|record| non_zero_timestamp(record.timestamp_millis))
150            .max();
151        let elapsed_ms = started_at_millis
152            .zip(ended_at_millis)
153            .and_then(|(started_at, ended_at)| ended_at.checked_sub(started_at));
154        let mut provider_call_count = 0;
155        let mut provider_input_tokens = 0;
156        let mut provider_output_tokens = 0;
157        let mut provider_total_tokens = 0;
158
159        for record in records {
160            if let JournalRecordPayload::ModelAttempt(attempt) = &record.payload
161                && attempt.stop_reason.is_some()
162            {
163                provider_call_count += 1;
164                if let Some(usage) = &attempt.usage {
165                    provider_input_tokens += u64::from(usage.input_tokens.unwrap_or_default());
166                    provider_output_tokens += u64::from(usage.output_tokens.unwrap_or_default());
167                    provider_total_tokens += u64::from(usage.total_tokens.unwrap_or_default());
168                }
169            }
170        }
171
172        let tools = tool_metrics(records);
173        let tool_total_elapsed_ms = tools
174            .iter()
175            .filter_map(|tool| tool.elapsed_ms)
176            .reduce(|left, right| left.saturating_add(right));
177        let mut metrics = Self {
178            scope,
179            started_at_millis,
180            ended_at_millis,
181            elapsed_ms,
182            record_count: records.len(),
183            run_count,
184            turn_count,
185            provider_call_count,
186            provider_input_tokens,
187            provider_output_tokens,
188            provider_total_tokens,
189            tool_call_count: tools.len() as u64,
190            tool_completed_count: 0,
191            tool_failed_count: 0,
192            tool_timed_out_count: 0,
193            tool_cancelled_count: 0,
194            tool_denied_count: 0,
195            tool_rewritten_count: 0,
196            tool_unknown_count: 0,
197            tool_recovery_required_count: 0,
198            tool_total_elapsed_ms,
199            tools,
200        };
201        metrics.count_tool_statuses();
202        metrics
203    }
204
205    fn count_tool_statuses(&mut self) {
206        for tool in &self.tools {
207            match tool.status {
208                ToolCallRecordStatus::Completed => self.tool_completed_count += 1,
209                ToolCallRecordStatus::Failed => self.tool_failed_count += 1,
210                ToolCallRecordStatus::TimedOut => self.tool_timed_out_count += 1,
211                ToolCallRecordStatus::Cancelled => self.tool_cancelled_count += 1,
212                ToolCallRecordStatus::DeniedBeforeExecution => self.tool_denied_count += 1,
213                ToolCallRecordStatus::ResultRewritten => self.tool_rewritten_count += 1,
214                ToolCallRecordStatus::Unknown => self.tool_unknown_count += 1,
215                ToolCallRecordStatus::RecoveryRequired => self.tool_recovery_required_count += 1,
216                ToolCallRecordStatus::Requested
217                | ToolCallRecordStatus::RequestModified
218                | ToolCallRecordStatus::IntentRecorded => {}
219            }
220        }
221    }
222}
223
224#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
225/// Deterministic metrics for one tool call.
226pub struct ToolTraceMetric {
227    /// Tool call ref represented by this row.
228    pub tool_call_ref: EntityRef,
229    /// Canonical tool name recorded by the SDK.
230    pub canonical_tool_name: String,
231    /// Latest observed tool status.
232    pub status: ToolCallRecordStatus,
233    /// Timestamp of the tool-execution intent record when available.
234    pub started_at_millis: Option<u64>,
235    /// Timestamp of the matching terminal result record when available.
236    pub ended_at_millis: Option<u64>,
237    /// `ended_at_millis - started_at_millis` when durable evidence supports it.
238    pub elapsed_ms: Option<u64>,
239}
240
241#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
242/// Deterministic comparison between two trace metric sets.
243pub struct TraceMetricsComparison {
244    /// Observed metrics.
245    pub observed: TraceMetrics,
246    /// Baseline or comparison metrics.
247    pub baseline: TraceMetrics,
248    /// Deterministic metric deltas computed as observed minus baseline.
249    pub metric_deltas: Vec<EvaluationMetricDelta>,
250}
251
252impl TraceMetricsComparison {
253    /// Compares two session timelines.
254    pub fn sessions(observed: &SessionTimeline, baseline: &SessionTimeline) -> Self {
255        Self::from_metrics(
256            TraceMetrics::from_session_timeline(observed),
257            TraceMetrics::from_session_timeline(baseline),
258        )
259    }
260
261    /// Compares two already computed metric sets.
262    pub fn from_metrics(observed: TraceMetrics, baseline: TraceMetrics) -> Self {
263        let metric_deltas = metric_deltas(&observed, &baseline);
264        Self {
265            observed,
266            baseline,
267            metric_deltas,
268        }
269    }
270}
271
272#[derive(Clone)]
273struct ToolMetricState {
274    tool_call_id: ToolCallId,
275    entries: Vec<ToolJournalEntry>,
276}
277
278#[derive(Clone)]
279struct ToolJournalEntry {
280    journal_seq: u64,
281    timestamp_millis: u64,
282    record: ToolCallRecord,
283}
284
285fn tool_metrics(records: &[JournalRecord]) -> Vec<ToolTraceMetric> {
286    let mut states = Vec::<ToolMetricState>::new();
287    for record in records {
288        let JournalRecordPayload::Tool(tool_record) = &record.payload else {
289            continue;
290        };
291        if let Some(state) = states
292            .iter_mut()
293            .find(|state| state.tool_call_id == tool_record.tool_call_id)
294        {
295            state.entries.push(ToolJournalEntry {
296                journal_seq: record.journal_seq,
297                timestamp_millis: record.timestamp_millis,
298                record: tool_record.clone(),
299            });
300        } else {
301            states.push(ToolMetricState {
302                tool_call_id: tool_record.tool_call_id.clone(),
303                entries: vec![ToolJournalEntry {
304                    journal_seq: record.journal_seq,
305                    timestamp_millis: record.timestamp_millis,
306                    record: tool_record.clone(),
307                }],
308            });
309        }
310    }
311
312    states
313        .into_iter()
314        .filter_map(|state| tool_metric(state.entries))
315        .collect()
316}
317
318fn tool_metric(mut entries: Vec<ToolJournalEntry>) -> Option<ToolTraceMetric> {
319    entries.sort_by_key(|entry| entry.journal_seq);
320    let latest = entries.last()?.record.clone();
321    let start = entries.iter().find_map(tool_start);
322    let end = start
323        .as_ref()
324        .and_then(|start| matching_tool_end(&entries, start));
325    let elapsed_ms = start
326        .as_ref()
327        .zip(end.as_ref())
328        .and_then(|(start, end)| end.timestamp_millis.checked_sub(start.timestamp_millis));
329
330    Some(ToolTraceMetric {
331        tool_call_ref: EntityRef::new(EntityKind::ToolCall, latest.tool_call_id),
332        canonical_tool_name: latest.canonical_tool_name.as_str().to_string(),
333        status: latest.status,
334        started_at_millis: start.map(|start| start.timestamp_millis),
335        ended_at_millis: end.map(|end| end.timestamp_millis),
336        elapsed_ms,
337    })
338}
339
340struct ToolStart {
341    effect_id: EffectId,
342    timestamp_millis: u64,
343}
344
/// End evidence for a tool call, taken from its matching terminal result entry.
struct ToolEnd {
    /// Timestamp of the terminal result entry, in milliseconds.
    timestamp_millis: u64,
}
348
349fn tool_start(entry: &ToolJournalEntry) -> Option<ToolStart> {
350    if entry.timestamp_millis == 0 || entry.record.status != ToolCallRecordStatus::IntentRecorded {
351        return None;
352    }
353    let intent = entry.record.effect_intent.as_ref()?;
354    if intent.kind != EffectKind::ToolExecution {
355        return None;
356    }
357    Some(ToolStart {
358        effect_id: intent.effect_id.clone(),
359        timestamp_millis: entry.timestamp_millis,
360    })
361}
362
363fn matching_tool_end(entries: &[ToolJournalEntry], start: &ToolStart) -> Option<ToolEnd> {
364    entries
365        .iter()
366        .filter(|entry| {
367            entry.timestamp_millis > start.timestamp_millis && is_terminal_status(&entry.record)
368        })
369        .filter_map(|entry| {
370            let result = entry.record.effect_result.as_ref()?;
371            (result.effect_id == start.effect_id).then_some(ToolEnd {
372                timestamp_millis: entry.timestamp_millis,
373            })
374        })
375        .max_by_key(|end| end.timestamp_millis)
376}
377
378fn is_terminal_status(record: &ToolCallRecord) -> bool {
379    matches!(
380        record.status,
381        ToolCallRecordStatus::Completed
382            | ToolCallRecordStatus::Failed
383            | ToolCallRecordStatus::TimedOut
384            | ToolCallRecordStatus::Cancelled
385            | ToolCallRecordStatus::DeniedBeforeExecution
386            | ToolCallRecordStatus::ResultRewritten
387            | ToolCallRecordStatus::Unknown
388            | ToolCallRecordStatus::RecoveryRequired
389    )
390}
391
392fn metric_deltas(observed: &TraceMetrics, baseline: &TraceMetrics) -> Vec<EvaluationMetricDelta> {
393    let mut deltas = Vec::new();
394    push_count_delta(
395        &mut deltas,
396        "trace.elapsed_ms",
397        observed.elapsed_ms,
398        baseline.elapsed_ms,
399    );
400    push_delta(
401        &mut deltas,
402        "trace.provider_call_count",
403        observed.provider_call_count,
404        baseline.provider_call_count,
405    );
406    push_delta(
407        &mut deltas,
408        "trace.provider_input_tokens",
409        observed.provider_input_tokens,
410        baseline.provider_input_tokens,
411    );
412    push_delta(
413        &mut deltas,
414        "trace.provider_output_tokens",
415        observed.provider_output_tokens,
416        baseline.provider_output_tokens,
417    );
418    push_delta(
419        &mut deltas,
420        "trace.provider_total_tokens",
421        observed.provider_total_tokens,
422        baseline.provider_total_tokens,
423    );
424    push_delta(
425        &mut deltas,
426        "trace.tool_call_count",
427        observed.tool_call_count,
428        baseline.tool_call_count,
429    );
430    push_delta(
431        &mut deltas,
432        "trace.tool_completed_count",
433        observed.tool_completed_count,
434        baseline.tool_completed_count,
435    );
436    push_delta(
437        &mut deltas,
438        "trace.tool_failed_count",
439        observed.tool_failed_count,
440        baseline.tool_failed_count,
441    );
442    push_delta(
443        &mut deltas,
444        "trace.tool_denied_count",
445        observed.tool_denied_count,
446        baseline.tool_denied_count,
447    );
448    push_count_delta(
449        &mut deltas,
450        "trace.tool_total_elapsed_ms",
451        observed.tool_total_elapsed_ms,
452        baseline.tool_total_elapsed_ms,
453    );
454    deltas
455}
456
457fn push_delta(
458    deltas: &mut Vec<EvaluationMetricDelta>,
459    metric_ref: &'static str,
460    observed: u64,
461    baseline: u64,
462) {
463    deltas.push(EvaluationMetricDelta::new(
464        metric_ref,
465        signed_delta(observed, baseline),
466        format!("{metric_ref}: observed={observed}, baseline={baseline}"),
467    ));
468}
469
470fn push_count_delta(
471    deltas: &mut Vec<EvaluationMetricDelta>,
472    metric_ref: &'static str,
473    observed: Option<u64>,
474    baseline: Option<u64>,
475) {
476    if let Some((observed, baseline)) = observed.zip(baseline) {
477        push_delta(deltas, metric_ref, observed, baseline);
478    }
479}
480
/// Formats `observed - baseline` with an explicit sign, e.g. `+2`, `-2`, `+0`.
///
/// The subtraction is done in `i128`, so it cannot overflow for any pair of
/// `u64` inputs.
fn signed_delta(observed: u64, baseline: u64) -> String {
    let difference = i128::from(observed) - i128::from(baseline);
    // The `+` flag prints a leading sign for non-negative values as well.
    format!("{difference:+}")
}
489
490fn unique_run_count(records: &[JournalRecord]) -> usize {
491    records
492        .iter()
493        .fold(Vec::new(), |mut run_ids, record| {
494            push_unique(&mut run_ids, record.run_id.clone());
495            run_ids
496        })
497        .len()
498}
499
500fn unique_turn_count(records: &[JournalRecord]) -> usize {
501    records
502        .iter()
503        .filter_map(|record| record.turn_id.clone())
504        .fold(Vec::new(), |mut turn_ids, turn_id| {
505            push_unique(&mut turn_ids, turn_id);
506            turn_ids
507        })
508        .len()
509}
510
/// Maps a zero timestamp to `None` and any other value to `Some(value)`.
fn non_zero_timestamp(timestamp_millis: u64) -> Option<u64> {
    if timestamp_millis == 0 {
        None
    } else {
        Some(timestamp_millis)
    }
}
514
/// Appends `value` to `items` unless an equal element is already present.
fn push_unique<T: Eq>(items: &mut Vec<T>, value: T) {
    let already_present = items.iter().any(|existing| *existing == value);
    if already_present {
        return;
    }
    items.push(value);
}