1use roder_api::events::{RoderEvent, ThreadId, TurnId};
2use roder_api::inference::InferenceEvent;
3use serde::{Deserialize, Serialize};
4use time::OffsetDateTime;
5
6#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
7#[serde(rename_all = "camelCase")]
8pub struct EvalRun {
9 pub suite_id: String,
10 pub run_id: String,
11 pub provider: String,
12 pub model: String,
13 #[serde(with = "time::serde::rfc3339")]
14 pub started_at: OffsetDateTime,
15 #[serde(default)]
16 pub tags: Vec<String>,
17}
18
19#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
20#[serde(rename_all = "camelCase")]
21pub struct EvalTrajectory {
22 pub thread_id: ThreadId,
23 pub turn_id: TurnId,
24 #[serde(default)]
25 pub events: Vec<EvalTrajectoryEvent>,
26}
27
28#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
29#[serde(rename_all = "camelCase")]
30pub struct EvalTrajectoryEvent {
31 #[serde(with = "time::serde::rfc3339")]
32 pub timestamp: OffsetDateTime,
33 pub event_type: String,
34 pub thread_id: ThreadId,
35 pub turn_id: TurnId,
36 #[serde(default, skip_serializing_if = "Option::is_none")]
37 pub tool_id: Option<String>,
38 #[serde(default, skip_serializing_if = "Option::is_none")]
39 pub tool_name: Option<String>,
40 #[serde(default, skip_serializing_if = "Option::is_none")]
41 pub token_usage: Option<EvalTokenUsage>,
42 #[serde(default, skip_serializing_if = "Option::is_none")]
43 pub runtime_profile: Option<String>,
44 #[serde(default, skip_serializing_if = "Option::is_none")]
45 pub speed_policy_phase: Option<String>,
46 #[serde(default, skip_serializing_if = "Option::is_none")]
47 pub speed_policy_reasoning: Option<String>,
48 #[serde(default)]
49 pub is_error: bool,
50}
51
52#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
53#[serde(rename_all = "camelCase")]
54pub struct EvalTokenUsage {
55 pub prompt_tokens: u32,
56 pub completion_tokens: u32,
57 pub total_tokens: u32,
58 pub cached_prompt_tokens: u32,
59 #[serde(default)]
60 pub cache_creation_prompt_tokens: u32,
61 #[serde(default, skip_serializing_if = "Option::is_none")]
62 pub cache_hit_rate: Option<f64>,
63}
64
65#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
66#[serde(rename_all = "camelCase")]
67pub struct EvalMetric {
68 pub name: String,
69 pub kind: EvalMetricKind,
70 pub value: f64,
71 #[serde(default, skip_serializing_if = "Option::is_none")]
72 pub unit: Option<String>,
73}
74
75#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
76#[serde(rename_all = "snake_case")]
77pub enum EvalMetricKind {
78 Outcome,
79 Count,
80 Duration,
81 Tokens,
82 Bytes,
83 Flag,
84}
85
86#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
87#[serde(rename_all = "snake_case")]
88pub enum EvalOutcome {
89 Pass,
90 Fail,
91 Timeout,
92 HarnessError,
93 VerifierUncertain,
94}
95
96#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
97#[serde(rename_all = "snake_case")]
98pub enum EvalFailureClass {
99 Model,
100 ToolSchema,
101 Runtime,
102 Environment,
103 Provider,
104 Verifier,
105 Unknown,
106}
107
108#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
109#[serde(rename_all = "camelCase")]
110pub struct EvalReport {
111 pub run: EvalRun,
112 pub outcome: EvalOutcome,
113 #[serde(default, skip_serializing_if = "Option::is_none")]
114 pub failure_class: Option<EvalFailureClass>,
115 pub trajectory: EvalTrajectory,
116 #[serde(default)]
117 pub metrics: Vec<EvalMetric>,
118}
119
120impl EvalTrajectory {
121 pub fn from_events(
122 thread_id: impl Into<ThreadId>,
123 turn_id: impl Into<TurnId>,
124 events: &[RoderEvent],
125 ) -> Self {
126 let thread_id = thread_id.into();
127 let turn_id = turn_id.into();
128 let events = events
129 .iter()
130 .filter_map(EvalTrajectoryEvent::from_event)
131 .collect();
132 Self {
133 thread_id,
134 turn_id,
135 events,
136 }
137 }
138}
139
140impl EvalTrajectoryEvent {
141 pub fn from_event(event: &RoderEvent) -> Option<Self> {
142 match event {
143 RoderEvent::TurnStarted(e) => {
144 let mut event = Self::basic("turn_started", &e.thread_id, &e.turn_id, e.timestamp);
145 event.runtime_profile = Some(e.runtime_profile.as_str().to_string());
146 Some(event)
147 }
148 RoderEvent::InferenceStarted(e) => {
149 let mut event =
150 Self::basic("inference_started", &e.thread_id, &e.turn_id, e.timestamp);
151 if let Some(decision) = &e.speed_policy {
152 event.speed_policy_phase = Some(decision.phase.as_str().to_string());
153 event.speed_policy_reasoning = decision
154 .applied_reasoning
155 .clone()
156 .or_else(|| Some(decision.desired_reasoning.clone()));
157 }
158 Some(event)
159 }
160 RoderEvent::ContextAssemblyCompleted(e) => Some(Self::basic(
161 "context_assembly_completed",
162 &e.thread_id,
163 &e.turn_id,
164 e.timestamp,
165 )),
166 RoderEvent::ContextEntrypointCandidatesInjected(e) => Some(Self::basic(
167 "entrypoint_candidates_injected",
168 &e.thread_id,
169 &e.turn_id,
170 e.timestamp,
171 )),
172 RoderEvent::ContextCompactionStarted(e) => Some(Self::basic(
173 "context_compaction_started",
174 &e.thread_id,
175 &e.turn_id,
176 e.timestamp,
177 )),
178 RoderEvent::ContextCompactionRecorded(e) => Some(Self::basic(
179 "context_compaction_recorded",
180 &e.thread_id,
181 &e.turn_id,
182 e.timestamp,
183 )),
184 RoderEvent::ContextCompactionSkipped(e) => Some(Self::basic(
185 "context_compaction_skipped",
186 &e.thread_id,
187 &e.turn_id,
188 e.timestamp,
189 )),
190 RoderEvent::RetrievalRoutePlanned(e) => Some(Self::basic(
191 "retrieval_route_planned",
192 &e.plan.thread_id,
193 &e.plan.turn_id,
194 e.plan.timestamp,
195 )),
196 RoderEvent::RetrievalRouteAccepted(e) => {
197 let mut event = Self::basic(
198 "retrieval_route_accepted",
199 &e.thread_id,
200 &e.turn_id,
201 e.timestamp,
202 );
203 event.tool_name = Some(e.tool.clone());
204 Some(event)
205 }
206 RoderEvent::RetrievalRouteIgnored(e) => {
207 let mut event = Self::basic(
208 "retrieval_route_ignored",
209 &e.thread_id,
210 &e.turn_id,
211 e.timestamp,
212 );
213 event.tool_name = Some(e.chosen_tool.clone());
214 Some(event)
215 }
216 RoderEvent::RetrievalRouteFailed(e) => {
217 let mut event = Self::basic(
218 "retrieval_route_failed",
219 &e.thread_id,
220 &e.turn_id,
221 e.timestamp,
222 );
223 event.tool_name = Some(e.tool.clone());
224 event.is_error = true;
225 Some(event)
226 }
227 RoderEvent::RetrievalResultUsed(e) => {
228 let mut event = Self::basic(
229 "retrieval_result_used",
230 &e.thread_id,
231 &e.turn_id,
232 e.timestamp,
233 );
234 event.tool_name = Some(e.outcome.tool.clone());
235 event.is_error = !matches!(
236 e.outcome.outcome,
237 roder_api::retrieval::RetrievalOutcomeKind::Useful
238 );
239 Some(event)
240 }
241 RoderEvent::RetrievalDiscoveryItemPromoted(e) => Some(Self::basic(
242 "retrieval_discovery_item_promoted",
243 &e.thread_id,
244 &e.turn_id,
245 e.timestamp,
246 )),
247 RoderEvent::RetrievalPromotionSkipped(e) => {
248 let mut event = Self::basic(
249 "retrieval_promotion_skipped",
250 &e.thread_id,
251 &e.turn_id,
252 e.timestamp,
253 );
254 event.is_error = true;
255 Some(event)
256 }
257 RoderEvent::InferenceEventReceived(e) => {
258 let mut event =
259 Self::basic("inference_event", &e.thread_id, &e.turn_id, e.timestamp);
260 if let InferenceEvent::Usage(usage) = &e.event {
261 event.token_usage = Some(EvalTokenUsage {
262 prompt_tokens: usage.prompt_tokens,
263 completion_tokens: usage.completion_tokens,
264 total_tokens: usage.total_tokens,
265 cached_prompt_tokens: usage.cached_prompt_tokens,
266 cache_creation_prompt_tokens: usage.cache_creation_prompt_tokens,
267 cache_hit_rate: usage.cache_hit_rate,
268 });
269 }
270 Some(event)
271 }
272 RoderEvent::ToolCallRequested(e) => {
273 let mut event =
274 Self::basic("tool_call_requested", &e.thread_id, &e.turn_id, e.timestamp);
275 event.tool_id = Some(e.tool_id.clone());
276 event.tool_name = Some(e.tool_name.clone());
277 Some(event)
278 }
279 RoderEvent::ToolCallStarted(e) => {
280 let mut event =
281 Self::basic("tool_call_started", &e.thread_id, &e.turn_id, e.timestamp);
282 event.tool_id = Some(e.tool_id.clone());
283 event.tool_name = e.tool_name.clone();
284 Some(event)
285 }
286 RoderEvent::ToolCallCompleted(e) => {
287 let mut event =
288 Self::basic("tool_call_completed", &e.thread_id, &e.turn_id, e.timestamp);
289 event.tool_id = Some(e.tool_id.clone());
290 event.tool_name = e.tool_name.clone();
291 event.is_error = e.is_error;
292 Some(event)
293 }
294 RoderEvent::ToolOutputTruncated(e) => {
295 let mut event = Self::basic(
296 "tool_output_truncated",
297 &e.thread_id,
298 &e.turn_id,
299 e.timestamp,
300 );
301 event.tool_id = Some(e.tool_id.clone());
302 event.tool_name = e.tool_name.clone();
303 Some(event)
304 }
305 RoderEvent::TaskLedgerUpdated(e) => Some(Self::basic(
306 "task_ledger_updated",
307 &e.thread_id,
308 &e.turn_id,
309 e.timestamp,
310 )),
311 RoderEvent::VerificationRequired(e) => Some(Self::basic(
312 "verification_required",
313 &e.thread_id,
314 &e.turn_id,
315 e.timestamp,
316 )),
317 RoderEvent::VerificationCompleted(e) => {
318 let mut event = Self::basic(
319 "verification_completed",
320 &e.thread_id,
321 &e.turn_id,
322 e.timestamp,
323 );
324 event.is_error = !e.passed;
325 Some(event)
326 }
327 RoderEvent::VerificationSkipped(e) => Some(Self::basic(
328 "verification_skipped",
329 &e.thread_id,
330 &e.turn_id,
331 e.timestamp,
332 )),
333 RoderEvent::ReliabilityFailureRecorded(e) => {
334 let mut event = Self::basic(
335 "reliability_failure",
336 &e.context.thread_id,
337 &e.context.turn_id,
338 e.timestamp,
339 );
340 event.tool_id = e.context.tool_id.clone();
341 event.tool_name = e.context.tool_name.clone();
342 event.is_error = true;
343 Some(event)
344 }
345 RoderEvent::ReliabilityRetryRecorded(e) => Some(Self::basic(
346 "reliability_retry",
347 &e.context.thread_id,
348 &e.context.turn_id,
349 e.timestamp,
350 )),
351 RoderEvent::ReliabilityLimitRecorded(e) => {
352 let mut event = Self::basic(
353 "reliability_limit",
354 &e.context.thread_id,
355 &e.context.turn_id,
356 e.timestamp,
357 );
358 event.is_error = true;
359 Some(event)
360 }
361 RoderEvent::TurnCompleted(e) => Some(Self::basic(
362 "turn_completed",
363 &e.thread_id,
364 &e.turn_id,
365 e.timestamp,
366 )),
367 RoderEvent::TurnFailed(e) => {
368 let mut event = Self::basic("turn_failed", &e.thread_id, &e.turn_id, e.timestamp);
369 event.is_error = true;
370 Some(event)
371 }
372 _ => None,
373 }
374 }
375
376 fn basic(
377 event_type: impl Into<String>,
378 thread_id: &ThreadId,
379 turn_id: &TurnId,
380 timestamp: OffsetDateTime,
381 ) -> Self {
382 Self {
383 timestamp,
384 event_type: event_type.into(),
385 thread_id: thread_id.clone(),
386 turn_id: turn_id.clone(),
387 tool_id: None,
388 tool_name: None,
389 token_usage: None,
390 runtime_profile: None,
391 speed_policy_phase: None,
392 speed_policy_reasoning: None,
393 is_error: false,
394 }
395 }
396}
397
#[cfg(test)]
mod tests {
    use roder_api::events::{
        InferenceEventReceived, RoderEvent, ToolCallCompleted, ToolCallRequested, TurnStarted,
    };
    use roder_api::inference::{InferenceEvent, RuntimeProfile, TokenUsage};

    use super::*;

    /// Thread id shared by every fixture.
    const THREAD: &str = "thread-1";
    /// Turn id shared by every fixture.
    const TURN: &str = "turn-1";

    /// Builds a report against the mock provider with an empty trajectory.
    fn mock_report(
        suite_id: &str,
        run_id: String,
        outcome: EvalOutcome,
        failure_class: Option<EvalFailureClass>,
        metric: EvalMetric,
    ) -> EvalReport {
        EvalReport {
            run: EvalRun {
                suite_id: suite_id.to_string(),
                run_id,
                provider: "mock".to_string(),
                model: "mock".to_string(),
                started_at: OffsetDateTime::UNIX_EPOCH,
                tags: vec!["offline".to_string()],
            },
            outcome,
            failure_class,
            trajectory: EvalTrajectory {
                thread_id: THREAD.to_string(),
                turn_id: TURN.to_string(),
                events: Vec::new(),
            },
            metrics: vec![metric],
        }
    }

    #[test]
    fn trajectory_preserves_turn_tool_and_token_usage_ids() {
        let epoch = OffsetDateTime::UNIX_EPOCH;

        let turn_started = RoderEvent::TurnStarted(TurnStarted {
            thread_id: THREAD.to_string(),
            turn_id: TURN.to_string(),
            runtime_profile: RuntimeProfile::Eval,
            timestamp: epoch,
        });
        let tool_requested = RoderEvent::ToolCallRequested(ToolCallRequested {
            thread_id: THREAD.to_string(),
            turn_id: TURN.to_string(),
            tool_id: "tool-1".to_string(),
            tool_name: "exec_command".to_string(),
            display_payload: None,
            timestamp: epoch,
        });
        let tool_completed = RoderEvent::ToolCallCompleted(ToolCallCompleted {
            thread_id: THREAD.to_string(),
            turn_id: TURN.to_string(),
            tool_id: "tool-1".to_string(),
            tool_name: Some("exec_command".to_string()),
            display_payload: None,
            is_error: true,
            output: Some("missing cmd".to_string()),
            timestamp: epoch,
        });
        let usage_reported = RoderEvent::InferenceEventReceived(InferenceEventReceived {
            thread_id: THREAD.to_string(),
            turn_id: TURN.to_string(),
            event: InferenceEvent::Usage(TokenUsage {
                prompt_tokens: 10,
                completion_tokens: 5,
                total_tokens: 15,
                cached_prompt_tokens: 9,
                cache_creation_prompt_tokens: 1,
                cache_hit_rate: Some(0.9),
            }),
            timestamp: epoch,
        });
        let source = [turn_started, tool_requested, tool_completed, usage_reported];

        let trajectory = EvalTrajectory::from_events(THREAD, TURN, &source);
        let recorded = &trajectory.events;

        assert_eq!(recorded.len(), 4);
        assert_eq!(recorded[0].runtime_profile.as_deref(), Some("eval"));
        assert_eq!(recorded[1].tool_id.as_deref(), Some("tool-1"));
        assert!(recorded[2].is_error);
        assert_eq!(
            recorded[3]
                .token_usage
                .as_ref()
                .map(|usage| usage.total_tokens),
            Some(15)
        );

        // Serialized keys must be camelCase.
        let json = serde_json::to_value(&trajectory).unwrap();
        assert_eq!(json["events"][1]["toolName"], "exec_command");
    }

    #[test]
    fn eval_reports_round_trip_failure_classes() {
        let report = mock_report(
            "tool-schema",
            "run-1".to_string(),
            EvalOutcome::Fail,
            Some(EvalFailureClass::ToolSchema),
            EvalMetric {
                name: "tool_errors".to_string(),
                kind: EvalMetricKind::Count,
                value: 1.0,
                unit: None,
            },
        );

        let encoded = serde_json::to_string(&report).unwrap();
        let decoded: EvalReport = serde_json::from_str(&encoded).unwrap();

        assert_eq!(decoded.outcome, EvalOutcome::Fail);
        assert_eq!(decoded.failure_class, Some(EvalFailureClass::ToolSchema));
        assert_eq!(decoded.metrics[0].name, "tool_errors");
    }

    #[test]
    fn eval_report_serde_fixtures_cover_core_outcomes() {
        let cases = [
            (EvalOutcome::Pass, None),
            (EvalOutcome::Fail, Some(EvalFailureClass::ToolSchema)),
            (EvalOutcome::Timeout, Some(EvalFailureClass::Runtime)),
            (
                EvalOutcome::VerifierUncertain,
                Some(EvalFailureClass::Verifier),
            ),
        ];

        for (index, (outcome, failure_class)) in cases.into_iter().enumerate() {
            let report = mock_report(
                "phase44-fixtures",
                format!("run-{index}"),
                outcome,
                failure_class,
                EvalMetric {
                    name: "wall_time_ms".to_string(),
                    kind: EvalMetricKind::Duration,
                    value: 12.0,
                    unit: Some("ms".to_string()),
                },
            );

            let encoded = serde_json::to_value(&report).unwrap();
            let decoded: EvalReport = serde_json::from_value(encoded).unwrap();

            assert_eq!(decoded.outcome, report.outcome);
            assert_eq!(decoded.failure_class, report.failure_class);
        }
    }
}