Skip to main content

skilltest_core/
runner.rs

1//! The runner: orchestrates a test case into a conversation, drives the
2//! provider across turns, scores the transcript with evals, and fans out over
3//! the configured platform × model matrix.
4
5use crate::config::Config;
6use crate::conversation::{Message, Transcript};
7use crate::error::Result;
8use crate::eval::{Eval, JudgeValue};
9use crate::provider::{JudgeKind, JudgeQuery, Provider, SkillRef, Usage};
10use crate::report::{CaseRun, Report};
11use crate::skill::{load_skill, SkillDefinition};
12use crate::testcase::TestCase;
13
14/// Runs test cases against a provider using a configuration.
15pub struct Runner<'a> {
16    provider: &'a dyn Provider,
17    config: &'a Config,
18}
19
20impl<'a> Runner<'a> {
21    /// Build a runner.
22    #[must_use]
23    pub fn new(provider: &'a dyn Provider, config: &'a Config) -> Self {
24        Self { provider, config }
25    }
26
27    /// Run every supplied case across the full platform × model matrix and
28    /// collect a [`Report`].
29    ///
30    /// # Errors
31    /// Propagates the first [`crate::Error`] from loading a skill or a provider
32    /// failure. Eval *failures* are not errors — they are recorded in the report.
33    pub fn run_all(&self, cases: &[TestCase]) -> Result<Report> {
34        let mut runs = Vec::new();
35        for case in cases {
36            runs.extend(self.run_case(case)?);
37        }
38        Ok(Report::new(runs))
39    }
40
41    /// Run a single case across the matrix.
42    ///
43    /// # Errors
44    /// As [`Runner::run_all`].
45    pub fn run_case(&self, case: &TestCase) -> Result<Vec<CaseRun>> {
46        let skill = load_skill(&case.skill)?;
47        let mut runs = Vec::new();
48        for platform in &self.config.platforms {
49            for model in &self.config.models {
50                runs.push(self.run_case_on(case, &skill, platform, model)?);
51            }
52        }
53        Ok(runs)
54    }
55
56    /// Run a single case on one platform/model pair.
57    fn run_case_on(
58        &self,
59        case: &TestCase,
60        skill: &SkillDefinition,
61        platform: &str,
62        model: &str,
63    ) -> Result<CaseRun> {
64        let mut totals = Usage::default();
65        let transcript = self.converse(case, skill, platform, model, &mut totals)?;
66        let evals = self.score(case, &transcript, &mut totals)?;
67        let passed = evals.iter().all(|e| e.passed);
68        Ok(CaseRun {
69            case: case.name.clone(),
70            skill: skill.dir.to_string_lossy().into_owned(),
71            platform: platform.to_string(),
72            model: model.to_string(),
73            passed,
74            turns: transcript.assistant_turns(),
75            evals,
76            transcript,
77            usage: (!totals.is_empty()).then_some(totals),
78        })
79    }
80
81    /// Drive the conversation: a single assistant turn for single-turn cases, or
82    /// a simulated-user loop for multi-turn cases.
83    fn converse(
84        &self,
85        case: &TestCase,
86        skill: &SkillDefinition,
87        platform: &str,
88        model: &str,
89        totals: &mut Usage,
90    ) -> Result<Transcript> {
91        let dir = skill.dir.to_string_lossy().into_owned();
92        let skill_ref = SkillRef {
93            name: &skill.name,
94            dir: &dir,
95            instructions: &skill.instructions,
96        };
97        let judge_model = self.config.effective_judge_model();
98        let max_turns = case
99            .user
100            .as_ref()
101            .and_then(|u| u.max_turns)
102            .unwrap_or(self.config.max_turns) as usize;
103        let resume_supported = self.provider.supports_resume(platform);
104
105        let mut transcript = Transcript::from_input(&case.input);
106        // On harnesses that support it, thread the session_id from each
107        // respond into the next one so the harness keeps real state instead of
108        // being re-prompted with a stringified transcript.
109        let mut session: Option<String> = None;
110
111        loop {
112            let session_arg = if resume_supported {
113                session.as_deref()
114            } else {
115                None
116            };
117            let turn = self.provider.respond(
118                platform,
119                model,
120                &skill_ref,
121                &transcript.messages,
122                session_arg,
123            )?;
124            if let Some(u) = &turn.usage {
125                totals.add(u);
126            }
127            // Capture or refresh the session handle for the next turn.
128            if let Some(id) = turn.session_id {
129                session = Some(id);
130            }
131            let skill_done = turn.done;
132            transcript.push(Message::assistant(turn.message));
133
134            // Single-turn cases stop after the first assistant turn.
135            let Some(user) = &case.user else {
136                break;
137            };
138
139            if skill_done || transcript.assistant_turns() >= max_turns {
140                break;
141            }
142
143            // Stop early if the configured done-condition holds.
144            if let Some(done_when) = &user.done_when {
145                let query = JudgeQuery {
146                    kind: JudgeKind::Boolean,
147                    criterion: done_when,
148                    scale: None,
149                };
150                let verdict = self
151                    .provider
152                    .judge(judge_model, &query, &transcript.messages)?;
153                if let Some(u) = &verdict.usage {
154                    totals.add(u);
155                }
156                if matches!(verdict.value, JudgeValue::Bool(true)) {
157                    break;
158                }
159            }
160
161            // Otherwise the simulated user replies and the loop continues.
162            let user_turn =
163                self.provider
164                    .simulate_user(judge_model, &user.persona, &transcript.messages)?;
165            if let Some(u) = &user_turn.usage {
166                totals.add(u);
167            }
168            let stop = user_turn.stop;
169            transcript.push(Message::user(user_turn.message));
170            if stop {
171                break;
172            }
173        }
174
175        Ok(transcript)
176    }
177
178    /// Run every eval against the finished transcript.
179    fn score(
180        &self,
181        case: &TestCase,
182        transcript: &Transcript,
183        totals: &mut Usage,
184    ) -> Result<Vec<crate::eval::EvalOutcome>> {
185        let judge_model = self.config.effective_judge_model();
186        let mut outcomes = Vec::with_capacity(case.evals.len());
187        for eval in &case.evals {
188            let query = match eval {
189                Eval::Boolean { criterion, .. } => JudgeQuery {
190                    kind: JudgeKind::Boolean,
191                    criterion,
192                    scale: None,
193                },
194                Eval::Numeric {
195                    criterion,
196                    min,
197                    max,
198                    ..
199                } => JudgeQuery {
200                    kind: JudgeKind::Numeric,
201                    criterion,
202                    scale: Some((*min, *max)),
203                },
204            };
205            let verdict = self
206                .provider
207                .judge(judge_model, &query, &transcript.messages)?;
208            if let Some(u) = &verdict.usage {
209                totals.add(u);
210            }
211            outcomes.push(eval.outcome(&verdict.value, verdict.reason)?);
212        }
213        Ok(outcomes)
214    }
215}
216
217#[cfg(test)]
218mod tests {
219    use super::*;
220    use crate::conversation::Message;
221    use crate::provider::{AssistantTurn, JudgeVerdict, UserTurn};
222    use std::cell::RefCell;
223
224    /// An in-memory provider scripted with canned turns and verdicts, so the
225    /// runner's orchestration can be tested without any subprocess.
226    struct ScriptedProvider {
227        assistant: Vec<AssistantTurn>,
228        user: Vec<UserTurn>,
229        judge: Vec<JudgeVerdict>,
230        calls: RefCell<Calls>,
231    }
232
233    #[derive(Default)]
234    struct Calls {
235        assistant: usize,
236        user: usize,
237        judge: usize,
238    }
239
240    impl Provider for ScriptedProvider {
241        fn respond(
242            &self,
243            _platform: &str,
244            _model: &str,
245            _skill: &SkillRef<'_>,
246            _messages: &[Message],
247            _session: Option<&str>,
248        ) -> Result<AssistantTurn> {
249            let i = self.calls.borrow().assistant;
250            self.calls.borrow_mut().assistant += 1;
251            Ok(self.assistant[i.min(self.assistant.len() - 1)].clone())
252        }
253
254        fn simulate_user(
255            &self,
256            _model: &str,
257            _persona: &str,
258            _messages: &[Message],
259        ) -> Result<UserTurn> {
260            let i = self.calls.borrow().user;
261            self.calls.borrow_mut().user += 1;
262            Ok(self.user[i.min(self.user.len() - 1)].clone())
263        }
264
265        fn judge(
266            &self,
267            _model: &str,
268            _query: &JudgeQuery<'_>,
269            _messages: &[Message],
270        ) -> Result<JudgeVerdict> {
271            let i = self.calls.borrow().judge;
272            self.calls.borrow_mut().judge += 1;
273            let v = &self.judge[i.min(self.judge.len() - 1)];
274            Ok(JudgeVerdict {
275                value: v.value,
276                reason: v.reason.clone(),
277                usage: v.usage.clone(),
278            })
279        }
280    }
281
282    /// Create a throwaway skill directory with a minimal SKILL.md so the runner
283    /// (which loads the skill from disk) has something real to read.
284    fn temp_skill(tag: &str) -> std::path::PathBuf {
285        let dir = std::env::temp_dir().join(format!("skilltest-ut-{}-{tag}", std::process::id()));
286        std::fs::create_dir_all(&dir).unwrap();
287        std::fs::write(
288            dir.join("SKILL.md"),
289            "---\nname: greeter\ndescription: a test skill\n---\nfake-reply: hi\n",
290        )
291        .unwrap();
292        dir
293    }
294
295    fn boolean_case(skill: std::path::PathBuf) -> TestCase {
296        TestCase {
297            name: "greets".into(),
298            skill,
299            input: "Greet Dr. Smith".into(),
300            user: None,
301            evals: vec![Eval::Boolean {
302                criterion: "greets Dr. Smith".into(),
303                expected: true,
304                name: None,
305            }],
306        }
307    }
308
309    #[test]
310    fn single_turn_runs_one_assistant_turn_and_scores() {
311        let provider = ScriptedProvider {
312            assistant: vec![AssistantTurn {
313                message: "Hello, Dr. Smith!".into(),
314                done: false,
315                ..Default::default()
316            }],
317            user: vec![],
318            judge: vec![JudgeVerdict {
319                value: JudgeValue::Bool(true),
320                reason: "names her".into(),
321                usage: None,
322            }],
323            calls: RefCell::new(Calls::default()),
324        };
325        let config = Config::default();
326        let runner = Runner::new(&provider, &config);
327        let runs = runner
328            .run_case(&boolean_case(temp_skill("single")))
329            .unwrap();
330        assert_eq!(runs.len(), 1);
331        assert!(runs[0].passed);
332        assert_eq!(runs[0].turns, 1);
333        assert_eq!(provider.calls.borrow().assistant, 1);
334    }
335
336    #[test]
337    fn multi_turn_stops_when_done_when_holds() {
338        let mut case = boolean_case(temp_skill("multi"));
339        case.user = Some(crate::testcase::SimulatedUser {
340            persona: "a terse patient".into(),
341            done_when: Some("the assistant has greeted".into()),
342            max_turns: Some(5),
343        });
344        let provider = ScriptedProvider {
345            assistant: vec![AssistantTurn {
346                message: "Hi there".into(),
347                done: false,
348                ..Default::default()
349            }],
350            user: vec![UserTurn {
351                message: "continue".into(),
352                stop: false,
353                ..Default::default()
354            }],
355            // First judge call is the done_when check (true -> stop), second is
356            // the eval.
357            judge: vec![
358                JudgeVerdict {
359                    value: JudgeValue::Bool(true),
360                    reason: "done".into(),
361                    usage: None,
362                },
363                JudgeVerdict {
364                    value: JudgeValue::Bool(true),
365                    reason: "greeted".into(),
366                    usage: None,
367                },
368            ],
369            calls: RefCell::new(Calls::default()),
370        };
371        let config = Config::default();
372        let runner = Runner::new(&provider, &config);
373        let runs = runner.run_case(&case).unwrap();
374        assert!(runs[0].passed);
375        // One assistant turn, the simulated user never had to speak.
376        assert_eq!(provider.calls.borrow().assistant, 1);
377        assert_eq!(provider.calls.borrow().user, 0);
378    }
379
380    #[test]
381    fn failing_eval_marks_run_failed() {
382        let provider = ScriptedProvider {
383            assistant: vec![AssistantTurn {
384                message: "Hello".into(),
385                done: false,
386                ..Default::default()
387            }],
388            user: vec![],
389            judge: vec![JudgeVerdict {
390                value: JudgeValue::Bool(false),
391                reason: "no name".into(),
392                usage: None,
393            }],
394            calls: RefCell::new(Calls::default()),
395        };
396        let config = Config::default();
397        let runner = Runner::new(&provider, &config);
398        let report = runner
399            .run_all(&[boolean_case(temp_skill("faileval"))])
400            .unwrap();
401        assert!(!report.passed);
402        assert_eq!(report.summary.failed, 1);
403    }
404
405    #[test]
406    fn matrix_fans_out_over_platforms_and_models() {
407        let provider = ScriptedProvider {
408            assistant: vec![AssistantTurn {
409                message: "Hello".into(),
410                done: false,
411                ..Default::default()
412            }],
413            user: vec![],
414            judge: vec![JudgeVerdict {
415                value: JudgeValue::Bool(true),
416                reason: String::new(),
417                usage: None,
418            }],
419            calls: RefCell::new(Calls::default()),
420        };
421        let config = Config {
422            platforms: vec!["a".into(), "b".into()],
423            models: vec!["m1".into(), "m2".into()],
424            ..Config::default()
425        };
426        let runner = Runner::new(&provider, &config);
427        let runs = runner
428            .run_case(&boolean_case(temp_skill("matrix")))
429            .unwrap();
430        assert_eq!(runs.len(), 4);
431    }
432}