Skip to main content

skilltest_core/
testcase.rs

1//! Test cases: the YAML a user writes to describe one test of a skill — the
2//! initial data to hand the skill, an optional simulated user for multi-turn
3//! runs, and the evals that decide pass/fail.
4
5use std::path::{Path, PathBuf};
6
7use serde::{Deserialize, Serialize};
8
9use crate::error::{Error, Result};
10use crate::eval::Eval;
11
12/// The simulated-user block that turns a single-turn case into a multi-turn one.
13/// When present, after each assistant turn the runner asks the provider to play
14/// the user (guided by `persona`) until `done_when` holds or `max_turns` is hit.
15#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
16pub struct SimulatedUser {
17    /// Instructions describing how the simulated user should behave.
18    pub persona: String,
19    /// A plain-English condition; when the judge decides it holds, the
20    /// conversation ends. Optional — without it the run ends at `max_turns` or
21    /// when the skill reports itself done.
22    #[serde(default)]
23    pub done_when: Option<String>,
24    /// Per-case override of the global assistant-turn cap.
25    #[serde(default)]
26    pub max_turns: Option<u32>,
27}
28
29/// One test case.
30#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
31#[serde(deny_unknown_fields)]
32pub struct TestCase {
33    /// Human-readable name (defaults to the file stem when loaded from a file).
34    #[serde(default)]
35    pub name: String,
36    /// Path to the skill directory under test, relative to the test-case file.
37    pub skill: PathBuf,
38    /// The initial data/prompt handed to the skill as the first user message.
39    pub input: String,
40    /// Present for multi-turn cases; absent for single-turn.
41    #[serde(default)]
42    pub user: Option<SimulatedUser>,
43    /// The evals that decide whether this case passes. Must be non-empty.
44    pub evals: Vec<Eval>,
45}
46
47impl TestCase {
48    /// Load a test case from a YAML file. The `name` defaults to the file stem
49    /// and `skill` is resolved relative to the file's directory.
50    ///
51    /// # Errors
52    /// [`Error::Io`] if the file cannot be read, [`Error::Yaml`] on parse
53    /// failure, and [`Error::Invalid`] if the case is internally inconsistent.
54    pub fn load(path: &Path) -> Result<Self> {
55        let text = std::fs::read_to_string(path).map_err(|source| Error::Io {
56            path: path.to_path_buf(),
57            source,
58        })?;
59        let mut case: TestCase = serde_yaml::from_str(&text).map_err(|source| Error::Yaml {
60            path: path.to_path_buf(),
61            source,
62        })?;
63        if case.name.is_empty() {
64            case.name = path
65                .file_stem()
66                .and_then(|s| s.to_str())
67                .unwrap_or("case")
68                .to_string();
69        }
70        if let Some(parent) = path.parent() {
71            if case.skill.is_relative() {
72                case.skill = parent.join(&case.skill);
73            }
74        }
75        case.validate()?;
76        Ok(case)
77    }
78
79    /// Whether this is a multi-turn case (has a simulated user).
80    #[must_use]
81    pub fn is_multi_turn(&self) -> bool {
82        self.user.is_some()
83    }
84
85    /// Validate the case's structure and every eval.
86    ///
87    /// # Errors
88    /// [`Error::Invalid`] when input/evals are empty or an eval is malformed.
89    pub fn validate(&self) -> Result<()> {
90        if self.input.trim().is_empty() {
91            return Err(Error::Invalid(format!(
92                "test case `{}` has an empty `input`",
93                self.name
94            )));
95        }
96        if self.evals.is_empty() {
97            return Err(Error::Invalid(format!(
98                "test case `{}` defines no `evals`",
99                self.name
100            )));
101        }
102        for eval in &self.evals {
103            eval.validate()?;
104        }
105        if let Some(user) = &self.user {
106            if user.persona.trim().is_empty() {
107                return Err(Error::Invalid(format!(
108                    "test case `{}` has a `user` block with an empty `persona`",
109                    self.name
110                )));
111            }
112            if user.max_turns == Some(0) {
113                return Err(Error::Invalid(format!(
114                    "test case `{}` sets `user.max_turns` to 0",
115                    self.name
116                )));
117            }
118        }
119        Ok(())
120    }
121}
122
123/// Discover test-case files: either a single `.yaml`/`.yml` file or every such
124/// file directly inside a directory (sorted for deterministic ordering).
125///
126/// # Errors
127/// [`Error::Io`] if a directory cannot be read, [`Error::Invalid`] if the path
128/// matches nothing usable.
129pub fn discover_cases(path: &Path) -> Result<Vec<PathBuf>> {
130    if path.is_file() {
131        return Ok(vec![path.to_path_buf()]);
132    }
133    if path.is_dir() {
134        let entries = std::fs::read_dir(path).map_err(|source| Error::Io {
135            path: path.to_path_buf(),
136            source,
137        })?;
138        let mut files: Vec<PathBuf> = entries
139            .filter_map(std::result::Result::ok)
140            .map(|e| e.path())
141            .filter(|p| {
142                p.is_file()
143                    && matches!(p.extension().and_then(|s| s.to_str()), Some("yaml" | "yml"))
144            })
145            .collect();
146        files.sort();
147        if files.is_empty() {
148            return Err(Error::Invalid(format!(
149                "no .yaml test cases found in {}",
150                path.display()
151            )));
152        }
153        return Ok(files);
154    }
155    Err(Error::Invalid(format!(
156        "path does not exist: {}",
157        path.display()
158    )))
159}
160
#[cfg(test)]
mod tests {
    use super::*;
    use crate::eval::Eval;

    /// Deserialize `yaml` into a [`TestCase`], panicking if it does not parse.
    fn parse(yaml: &str) -> TestCase {
        serde_yaml::from_str(yaml).unwrap()
    }

    /// A case with no `user` block is single-turn and keeps its one eval.
    #[test]
    fn parses_single_turn_case() {
        let case = parse(
            r#"
skill: ./greeter
input: "Greet Dr. Smith"
evals:
  - type: boolean
    criterion: "greets Dr. Smith by name"
"#,
        );
        assert!(!case.is_multi_turn());
        assert_eq!(case.evals.len(), 1);
        assert!(matches!(case.evals.first(), Some(Eval::Boolean { .. })));
    }

    /// A `user` block makes the case multi-turn, and its fields are read.
    #[test]
    fn parses_multi_turn_case() {
        let case = parse(
            r#"
name: booking
skill: ./booker
input: "I want to book an appointment"
user:
  persona: "You are a terse patient."
  done_when: "the assistant has confirmed a booking"
  max_turns: 5
evals:
  - type: numeric
    criterion: "how clearly was the appointment confirmed"
    min: 0
    max: 10
    threshold: 7
"#,
        );
        assert!(case.is_multi_turn());
        let turn_cap = case.user.as_ref().and_then(|user| user.max_turns);
        assert_eq!(turn_cap, Some(5));
        case.validate().unwrap();
    }

    /// An empty `evals` list parses but does not pass validation.
    #[test]
    fn empty_evals_is_invalid() {
        let case = parse("skill: ./x\ninput: hi\nevals: []\n");
        assert!(case.validate().is_err());
    }

    /// Keys outside the schema are refused at parse time.
    #[test]
    fn unknown_field_is_rejected() {
        let outcome: std::result::Result<TestCase, _> =
            serde_yaml::from_str("skill: ./x\ninput: hi\nbogus: 1\nevals: []\n");
        assert!(outcome.is_err());
    }
}
216}