// simple_agents_workflow/evals/models.rs

use std::path::Path;

use serde::{ser::SerializeStruct, Deserialize, Serialize};
use serde_json::Value;
use thiserror::Error;

use crate::yaml_runner::{
    YamlWorkflowCustomWorkerExecutor, YamlWorkflowExecutionFlags, YamlWorkflowExecutorBinding,
    YamlWorkflowRunOptions, YamlWorkflowRunOutput,
};

12#[derive(Debug, Error)]
13pub enum EvalError {
14    #[error("failed to read eval dataset '{path}': {source}")]
15    ReadDataset {
16        path: String,
17        source: std::io::Error,
18    },
19    #[error("failed to parse eval dataset '{path}' line {line}: {source}")]
20    ParseDatasetLine {
21        path: String,
22        line: usize,
23        source: serde_json::Error,
24    },
25    #[error("invalid eval dataset: {message}")]
26    InvalidDataset { message: String },
27}
28
29impl Serialize for EvalError {
30    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
31    where
32        S: serde::Serializer,
33    {
34        let mut state = serializer.serialize_struct("EvalError", 3)?;
35        let (code, path) = match self {
36            EvalError::ReadDataset { path, .. } => ("read_dataset_failed", Some(path)),
37            EvalError::ParseDatasetLine { path, .. } => ("parse_dataset_line_failed", Some(path)),
38            EvalError::InvalidDataset { .. } => ("invalid_dataset", None),
39        };
40        state.serialize_field("code", code)?;
41        state.serialize_field("message", &self.to_string())?;
42        if let Some(path) = path {
43            state.serialize_field("path", path)?;
44        }
45        state.end()
46    }
47}
48
49pub struct EvalSuiteRunRequest<'a> {
50    pub suite_id: Option<&'a str>,
51    pub workflow_path: &'a Path,
52    pub dataset_path: &'a Path,
53    pub executor: YamlWorkflowExecutorBinding<'a>,
54    pub custom_worker: Option<&'a dyn YamlWorkflowCustomWorkerExecutor>,
55    pub execution: YamlWorkflowExecutionFlags,
56    pub workflow_options: YamlWorkflowRunOptions,
57    pub max_concurrency: usize,
58}
59
60#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
61#[serde(deny_unknown_fields)]
62pub struct EvalDatasetRecord {
63    pub id: String,
64    pub input: Value,
65    pub expected_output: Value,
66    #[serde(default)]
67    pub rubric: Option<Value>,
68    #[serde(default)]
69    pub custom: Option<Value>,
70    #[serde(default)]
71    pub metadata: Option<Value>,
72}
73
74#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
75#[serde(rename_all = "snake_case")]
76pub enum EvalRunStatus {
77    Passed,
78    Failed,
79    Error,
80}
81
82#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
83pub struct EvalReport {
84    pub suite_id: String,
85    pub status: EvalRunStatus,
86    pub summary: EvalSummary,
87    pub cases: Vec<EvalCaseResult>,
88}
89
90#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
91pub struct EvalSummary {
92    pub total_cases: usize,
93    pub passed_cases: usize,
94    pub failed_cases: usize,
95    pub error_cases: usize,
96    pub pass_rate: f64,
97}
98
99#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
100pub struct EvalCaseResult {
101    pub case_id: String,
102    pub status: EvalRunStatus,
103    #[serde(skip_serializing_if = "Option::is_none")]
104    pub expected: Option<Value>,
105    #[serde(skip_serializing_if = "Option::is_none")]
106    pub actual: Option<Value>,
107    #[serde(default, skip_serializing_if = "Vec::is_empty")]
108    pub evaluations: Vec<EvalResult>,
109    #[serde(skip_serializing_if = "Option::is_none")]
110    pub workflow_output: Option<YamlWorkflowRunOutput>,
111    #[serde(skip_serializing_if = "Option::is_none")]
112    pub error: Option<EvalErrorInfo>,
113}
114
115#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
116pub struct EvalResult {
117    pub id: String,
118    pub status: EvalRunStatus,
119    pub passed: bool,
120    #[serde(skip_serializing_if = "Option::is_none")]
121    pub score: Option<f64>,
122    #[serde(skip_serializing_if = "Option::is_none")]
123    pub expected: Option<Value>,
124    #[serde(skip_serializing_if = "Option::is_none")]
125    pub actual: Option<Value>,
126    #[serde(skip_serializing_if = "Option::is_none")]
127    pub reason: Option<String>,
128    #[serde(skip_serializing_if = "Option::is_none")]
129    pub metadata: Option<Value>,
130}
131
132#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
133pub struct EvalErrorInfo {
134    pub code: String,
135    pub message: String,
136}