// Source file: roder_evals/trace.rs

1use roder_api::events::{RoderEvent, ThreadId, TurnId};
2use roder_api::inference::InferenceEvent;
3use serde::{Deserialize, Serialize};
4use time::OffsetDateTime;
5
6#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
7#[serde(rename_all = "camelCase")]
8pub struct EvalRun {
9    pub suite_id: String,
10    pub run_id: String,
11    pub provider: String,
12    pub model: String,
13    #[serde(with = "time::serde::rfc3339")]
14    pub started_at: OffsetDateTime,
15    #[serde(default)]
16    pub tags: Vec<String>,
17}
18
19#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
20#[serde(rename_all = "camelCase")]
21pub struct EvalTrajectory {
22    pub thread_id: ThreadId,
23    pub turn_id: TurnId,
24    #[serde(default)]
25    pub events: Vec<EvalTrajectoryEvent>,
26}
27
28#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
29#[serde(rename_all = "camelCase")]
30pub struct EvalTrajectoryEvent {
31    #[serde(with = "time::serde::rfc3339")]
32    pub timestamp: OffsetDateTime,
33    pub event_type: String,
34    pub thread_id: ThreadId,
35    pub turn_id: TurnId,
36    #[serde(default, skip_serializing_if = "Option::is_none")]
37    pub tool_id: Option<String>,
38    #[serde(default, skip_serializing_if = "Option::is_none")]
39    pub tool_name: Option<String>,
40    #[serde(default, skip_serializing_if = "Option::is_none")]
41    pub token_usage: Option<EvalTokenUsage>,
42    #[serde(default, skip_serializing_if = "Option::is_none")]
43    pub runtime_profile: Option<String>,
44    #[serde(default, skip_serializing_if = "Option::is_none")]
45    pub speed_policy_phase: Option<String>,
46    #[serde(default, skip_serializing_if = "Option::is_none")]
47    pub speed_policy_reasoning: Option<String>,
48    #[serde(default)]
49    pub is_error: bool,
50}
51
52#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
53#[serde(rename_all = "camelCase")]
54pub struct EvalTokenUsage {
55    pub prompt_tokens: u32,
56    pub completion_tokens: u32,
57    pub total_tokens: u32,
58    pub cached_prompt_tokens: u32,
59    #[serde(default)]
60    pub cache_creation_prompt_tokens: u32,
61    #[serde(default, skip_serializing_if = "Option::is_none")]
62    pub cache_hit_rate: Option<f64>,
63}
64
65#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
66#[serde(rename_all = "camelCase")]
67pub struct EvalMetric {
68    pub name: String,
69    pub kind: EvalMetricKind,
70    pub value: f64,
71    #[serde(default, skip_serializing_if = "Option::is_none")]
72    pub unit: Option<String>,
73}
74
75#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
76#[serde(rename_all = "snake_case")]
77pub enum EvalMetricKind {
78    Outcome,
79    Count,
80    Duration,
81    Tokens,
82    Bytes,
83    Flag,
84}
85
86#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
87#[serde(rename_all = "snake_case")]
88pub enum EvalOutcome {
89    Pass,
90    Fail,
91    Timeout,
92    HarnessError,
93    VerifierUncertain,
94}
95
96#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
97#[serde(rename_all = "snake_case")]
98pub enum EvalFailureClass {
99    Model,
100    ToolSchema,
101    Runtime,
102    Environment,
103    Provider,
104    Verifier,
105    Unknown,
106}
107
108#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
109#[serde(rename_all = "camelCase")]
110pub struct EvalReport {
111    pub run: EvalRun,
112    pub outcome: EvalOutcome,
113    #[serde(default, skip_serializing_if = "Option::is_none")]
114    pub failure_class: Option<EvalFailureClass>,
115    pub trajectory: EvalTrajectory,
116    #[serde(default)]
117    pub metrics: Vec<EvalMetric>,
118}
119
120impl EvalTrajectory {
121    pub fn from_events(
122        thread_id: impl Into<ThreadId>,
123        turn_id: impl Into<TurnId>,
124        events: &[RoderEvent],
125    ) -> Self {
126        let thread_id = thread_id.into();
127        let turn_id = turn_id.into();
128        let events = events
129            .iter()
130            .filter_map(EvalTrajectoryEvent::from_event)
131            .collect();
132        Self {
133            thread_id,
134            turn_id,
135            events,
136        }
137    }
138}
139
140impl EvalTrajectoryEvent {
141    pub fn from_event(event: &RoderEvent) -> Option<Self> {
142        match event {
143            RoderEvent::TurnStarted(e) => {
144                let mut event = Self::basic("turn_started", &e.thread_id, &e.turn_id, e.timestamp);
145                event.runtime_profile = Some(e.runtime_profile.as_str().to_string());
146                Some(event)
147            }
148            RoderEvent::InferenceStarted(e) => {
149                let mut event =
150                    Self::basic("inference_started", &e.thread_id, &e.turn_id, e.timestamp);
151                if let Some(decision) = &e.speed_policy {
152                    event.speed_policy_phase = Some(decision.phase.as_str().to_string());
153                    event.speed_policy_reasoning = decision
154                        .applied_reasoning
155                        .clone()
156                        .or_else(|| Some(decision.desired_reasoning.clone()));
157                }
158                Some(event)
159            }
160            RoderEvent::ContextAssemblyCompleted(e) => Some(Self::basic(
161                "context_assembly_completed",
162                &e.thread_id,
163                &e.turn_id,
164                e.timestamp,
165            )),
166            RoderEvent::ContextEntrypointCandidatesInjected(e) => Some(Self::basic(
167                "entrypoint_candidates_injected",
168                &e.thread_id,
169                &e.turn_id,
170                e.timestamp,
171            )),
172            RoderEvent::ContextCompactionStarted(e) => Some(Self::basic(
173                "context_compaction_started",
174                &e.thread_id,
175                &e.turn_id,
176                e.timestamp,
177            )),
178            RoderEvent::ContextCompactionRecorded(e) => Some(Self::basic(
179                "context_compaction_recorded",
180                &e.thread_id,
181                &e.turn_id,
182                e.timestamp,
183            )),
184            RoderEvent::ContextCompactionSkipped(e) => Some(Self::basic(
185                "context_compaction_skipped",
186                &e.thread_id,
187                &e.turn_id,
188                e.timestamp,
189            )),
190            RoderEvent::RetrievalRoutePlanned(e) => Some(Self::basic(
191                "retrieval_route_planned",
192                &e.plan.thread_id,
193                &e.plan.turn_id,
194                e.plan.timestamp,
195            )),
196            RoderEvent::RetrievalRouteAccepted(e) => {
197                let mut event = Self::basic(
198                    "retrieval_route_accepted",
199                    &e.thread_id,
200                    &e.turn_id,
201                    e.timestamp,
202                );
203                event.tool_name = Some(e.tool.clone());
204                Some(event)
205            }
206            RoderEvent::RetrievalRouteIgnored(e) => {
207                let mut event = Self::basic(
208                    "retrieval_route_ignored",
209                    &e.thread_id,
210                    &e.turn_id,
211                    e.timestamp,
212                );
213                event.tool_name = Some(e.chosen_tool.clone());
214                Some(event)
215            }
216            RoderEvent::RetrievalRouteFailed(e) => {
217                let mut event = Self::basic(
218                    "retrieval_route_failed",
219                    &e.thread_id,
220                    &e.turn_id,
221                    e.timestamp,
222                );
223                event.tool_name = Some(e.tool.clone());
224                event.is_error = true;
225                Some(event)
226            }
227            RoderEvent::RetrievalResultUsed(e) => {
228                let mut event = Self::basic(
229                    "retrieval_result_used",
230                    &e.thread_id,
231                    &e.turn_id,
232                    e.timestamp,
233                );
234                event.tool_name = Some(e.outcome.tool.clone());
235                event.is_error = !matches!(
236                    e.outcome.outcome,
237                    roder_api::retrieval::RetrievalOutcomeKind::Useful
238                );
239                Some(event)
240            }
241            RoderEvent::RetrievalDiscoveryItemPromoted(e) => Some(Self::basic(
242                "retrieval_discovery_item_promoted",
243                &e.thread_id,
244                &e.turn_id,
245                e.timestamp,
246            )),
247            RoderEvent::RetrievalPromotionSkipped(e) => {
248                let mut event = Self::basic(
249                    "retrieval_promotion_skipped",
250                    &e.thread_id,
251                    &e.turn_id,
252                    e.timestamp,
253                );
254                event.is_error = true;
255                Some(event)
256            }
257            RoderEvent::InferenceEventReceived(e) => {
258                let mut event =
259                    Self::basic("inference_event", &e.thread_id, &e.turn_id, e.timestamp);
260                if let InferenceEvent::Usage(usage) = &e.event {
261                    event.token_usage = Some(EvalTokenUsage {
262                        prompt_tokens: usage.prompt_tokens,
263                        completion_tokens: usage.completion_tokens,
264                        total_tokens: usage.total_tokens,
265                        cached_prompt_tokens: usage.cached_prompt_tokens,
266                        cache_creation_prompt_tokens: usage.cache_creation_prompt_tokens,
267                        cache_hit_rate: usage.cache_hit_rate,
268                    });
269                }
270                Some(event)
271            }
272            RoderEvent::ToolCallRequested(e) => {
273                let mut event =
274                    Self::basic("tool_call_requested", &e.thread_id, &e.turn_id, e.timestamp);
275                event.tool_id = Some(e.tool_id.clone());
276                event.tool_name = Some(e.tool_name.clone());
277                Some(event)
278            }
279            RoderEvent::ToolCallStarted(e) => {
280                let mut event =
281                    Self::basic("tool_call_started", &e.thread_id, &e.turn_id, e.timestamp);
282                event.tool_id = Some(e.tool_id.clone());
283                event.tool_name = e.tool_name.clone();
284                Some(event)
285            }
286            RoderEvent::ToolCallCompleted(e) => {
287                let mut event =
288                    Self::basic("tool_call_completed", &e.thread_id, &e.turn_id, e.timestamp);
289                event.tool_id = Some(e.tool_id.clone());
290                event.tool_name = e.tool_name.clone();
291                event.is_error = e.is_error;
292                Some(event)
293            }
294            RoderEvent::ToolOutputTruncated(e) => {
295                let mut event = Self::basic(
296                    "tool_output_truncated",
297                    &e.thread_id,
298                    &e.turn_id,
299                    e.timestamp,
300                );
301                event.tool_id = Some(e.tool_id.clone());
302                event.tool_name = e.tool_name.clone();
303                Some(event)
304            }
305            RoderEvent::TaskLedgerUpdated(e) => Some(Self::basic(
306                "task_ledger_updated",
307                &e.thread_id,
308                &e.turn_id,
309                e.timestamp,
310            )),
311            RoderEvent::VerificationRequired(e) => Some(Self::basic(
312                "verification_required",
313                &e.thread_id,
314                &e.turn_id,
315                e.timestamp,
316            )),
317            RoderEvent::VerificationCompleted(e) => {
318                let mut event = Self::basic(
319                    "verification_completed",
320                    &e.thread_id,
321                    &e.turn_id,
322                    e.timestamp,
323                );
324                event.is_error = !e.passed;
325                Some(event)
326            }
327            RoderEvent::VerificationSkipped(e) => Some(Self::basic(
328                "verification_skipped",
329                &e.thread_id,
330                &e.turn_id,
331                e.timestamp,
332            )),
333            RoderEvent::ReliabilityFailureRecorded(e) => {
334                let mut event = Self::basic(
335                    "reliability_failure",
336                    &e.context.thread_id,
337                    &e.context.turn_id,
338                    e.timestamp,
339                );
340                event.tool_id = e.context.tool_id.clone();
341                event.tool_name = e.context.tool_name.clone();
342                event.is_error = true;
343                Some(event)
344            }
345            RoderEvent::ReliabilityRetryRecorded(e) => Some(Self::basic(
346                "reliability_retry",
347                &e.context.thread_id,
348                &e.context.turn_id,
349                e.timestamp,
350            )),
351            RoderEvent::ReliabilityLimitRecorded(e) => {
352                let mut event = Self::basic(
353                    "reliability_limit",
354                    &e.context.thread_id,
355                    &e.context.turn_id,
356                    e.timestamp,
357                );
358                event.is_error = true;
359                Some(event)
360            }
361            RoderEvent::TurnCompleted(e) => Some(Self::basic(
362                "turn_completed",
363                &e.thread_id,
364                &e.turn_id,
365                e.timestamp,
366            )),
367            RoderEvent::TurnFailed(e) => {
368                let mut event = Self::basic("turn_failed", &e.thread_id, &e.turn_id, e.timestamp);
369                event.is_error = true;
370                Some(event)
371            }
372            _ => None,
373        }
374    }
375
376    fn basic(
377        event_type: impl Into<String>,
378        thread_id: &ThreadId,
379        turn_id: &TurnId,
380        timestamp: OffsetDateTime,
381    ) -> Self {
382        Self {
383            timestamp,
384            event_type: event_type.into(),
385            thread_id: thread_id.clone(),
386            turn_id: turn_id.clone(),
387            tool_id: None,
388            tool_name: None,
389            token_usage: None,
390            runtime_profile: None,
391            speed_policy_phase: None,
392            speed_policy_reasoning: None,
393            is_error: false,
394        }
395    }
396}
397
#[cfg(test)]
mod tests {
    use roder_api::events::{
        InferenceEventReceived, RoderEvent, ToolCallCompleted, ToolCallRequested, TurnStarted,
    };
    use roder_api::inference::{InferenceEvent, RuntimeProfile, TokenUsage};

    use super::*;

    /// Converting a small event stream keeps the runtime profile, tool ids,
    /// error flag and token usage, and emits camelCase JSON keys.
    #[test]
    fn trajectory_preserves_turn_tool_and_token_usage_ids() {
        let events = vec![
            RoderEvent::TurnStarted(TurnStarted {
                thread_id: "thread-1".to_string(),
                turn_id: "turn-1".to_string(),
                runtime_profile: RuntimeProfile::Eval,
                timestamp: OffsetDateTime::UNIX_EPOCH,
            }),
            RoderEvent::ToolCallRequested(ToolCallRequested {
                thread_id: "thread-1".to_string(),
                turn_id: "turn-1".to_string(),
                tool_id: "tool-1".to_string(),
                tool_name: "exec_command".to_string(),
                display_payload: None,
                timestamp: OffsetDateTime::UNIX_EPOCH,
            }),
            RoderEvent::ToolCallCompleted(ToolCallCompleted {
                thread_id: "thread-1".to_string(),
                turn_id: "turn-1".to_string(),
                tool_id: "tool-1".to_string(),
                tool_name: Some("exec_command".to_string()),
                display_payload: None,
                is_error: true,
                output: Some("missing cmd".to_string()),
                timestamp: OffsetDateTime::UNIX_EPOCH,
            }),
            RoderEvent::InferenceEventReceived(InferenceEventReceived {
                thread_id: "thread-1".to_string(),
                turn_id: "turn-1".to_string(),
                event: InferenceEvent::Usage(TokenUsage {
                    prompt_tokens: 10,
                    completion_tokens: 5,
                    total_tokens: 15,
                    cached_prompt_tokens: 9,
                    cache_creation_prompt_tokens: 1,
                    cache_hit_rate: Some(0.9),
                }),
                timestamp: OffsetDateTime::UNIX_EPOCH,
            }),
        ];

        let trajectory = EvalTrajectory::from_events("thread-1", "turn-1", &events);

        assert_eq!(trajectory.events.len(), 4);
        assert_eq!(
            trajectory.events[0].runtime_profile.as_deref(),
            Some("eval")
        );
        assert_eq!(trajectory.events[1].tool_id.as_deref(), Some("tool-1"));
        assert!(trajectory.events[2].is_error);
        assert_eq!(
            trajectory.events[3]
                .token_usage
                .as_ref()
                .unwrap()
                .total_tokens,
            15
        );
        let json = serde_json::to_value(&trajectory).unwrap();
        assert_eq!(json["events"][1]["toolName"], "exec_command");
    }

    /// A failing report survives a JSON string round trip unchanged.
    #[test]
    fn eval_reports_round_trip_failure_classes() {
        let report = EvalReport {
            run: EvalRun {
                suite_id: "tool-schema".to_string(),
                run_id: "run-1".to_string(),
                provider: "mock".to_string(),
                model: "mock".to_string(),
                started_at: OffsetDateTime::UNIX_EPOCH,
                tags: vec!["offline".to_string()],
            },
            outcome: EvalOutcome::Fail,
            failure_class: Some(EvalFailureClass::ToolSchema),
            trajectory: EvalTrajectory {
                thread_id: "thread-1".to_string(),
                turn_id: "turn-1".to_string(),
                events: Vec::new(),
            },
            metrics: vec![EvalMetric {
                name: "tool_errors".to_string(),
                kind: EvalMetricKind::Count,
                value: 1.0,
                unit: None,
            }],
        };

        let json = serde_json::to_string(&report).unwrap();
        let round_trip: EvalReport = serde_json::from_str(&json).unwrap();

        assert_eq!(round_trip.outcome, EvalOutcome::Fail);
        assert_eq!(round_trip.failure_class, Some(EvalFailureClass::ToolSchema));
        assert_eq!(round_trip.metrics[0].name, "tool_errors");
        // Whole-value comparison catches regressions in fields the targeted
        // assertions above do not look at (run metadata, trajectory, units).
        assert_eq!(round_trip, report);
    }

    /// Each core outcome / failure-class pairing survives a `Value` round trip.
    #[test]
    fn eval_report_serde_fixtures_cover_core_outcomes() {
        let cases = [
            (EvalOutcome::Pass, None),
            (EvalOutcome::Fail, Some(EvalFailureClass::ToolSchema)),
            (EvalOutcome::Timeout, Some(EvalFailureClass::Runtime)),
            (
                EvalOutcome::VerifierUncertain,
                Some(EvalFailureClass::Verifier),
            ),
        ];

        for (index, (outcome, failure_class)) in cases.into_iter().enumerate() {
            let report = EvalReport {
                run: EvalRun {
                    suite_id: "phase44-fixtures".to_string(),
                    run_id: format!("run-{index}"),
                    provider: "mock".to_string(),
                    model: "mock".to_string(),
                    started_at: OffsetDateTime::UNIX_EPOCH,
                    tags: vec!["offline".to_string()],
                },
                outcome,
                failure_class,
                trajectory: EvalTrajectory {
                    thread_id: "thread-1".to_string(),
                    turn_id: "turn-1".to_string(),
                    events: Vec::new(),
                },
                metrics: vec![EvalMetric {
                    name: "wall_time_ms".to_string(),
                    kind: EvalMetricKind::Duration,
                    value: 12.0,
                    unit: Some("ms".to_string()),
                }],
            };

            let value = serde_json::to_value(&report).unwrap();
            // `failureClass` is skipped on output exactly when it is `None`.
            assert_eq!(
                value.get("failureClass").is_some(),
                report.failure_class.is_some()
            );
            let round_trip: EvalReport = serde_json::from_value(value).unwrap();

            assert_eq!(round_trip.outcome, report.outcome);
            assert_eq!(round_trip.failure_class, report.failure_class);
            assert_eq!(round_trip, report);
        }
    }
}