1use crate::config::Config;
6use crate::conversation::{Message, Transcript};
7use crate::error::Result;
8use crate::eval::{Eval, JudgeValue};
9use crate::provider::{JudgeKind, JudgeQuery, Provider, SkillRef, Usage};
10use crate::report::{CaseRun, Report};
11use crate::skill::{load_skill, SkillDefinition};
12use crate::testcase::TestCase;
13
14pub struct Runner<'a> {
16 provider: &'a dyn Provider,
17 config: &'a Config,
18}
19
20impl<'a> Runner<'a> {
21 #[must_use]
23 pub fn new(provider: &'a dyn Provider, config: &'a Config) -> Self {
24 Self { provider, config }
25 }
26
27 pub fn run_all(&self, cases: &[TestCase]) -> Result<Report> {
34 let mut runs = Vec::new();
35 for case in cases {
36 runs.extend(self.run_case(case)?);
37 }
38 Ok(Report::new(runs))
39 }
40
41 pub fn run_case(&self, case: &TestCase) -> Result<Vec<CaseRun>> {
46 let skill = load_skill(&case.skill)?;
47 let mut runs = Vec::new();
48 for platform in &self.config.platforms {
49 for model in &self.config.models {
50 runs.push(self.run_case_on(case, &skill, platform, model)?);
51 }
52 }
53 Ok(runs)
54 }
55
56 fn run_case_on(
58 &self,
59 case: &TestCase,
60 skill: &SkillDefinition,
61 platform: &str,
62 model: &str,
63 ) -> Result<CaseRun> {
64 let mut totals = Usage::default();
65 let transcript = self.converse(case, skill, platform, model, &mut totals)?;
66 let evals = self.score(case, &transcript, &mut totals)?;
67 let passed = evals.iter().all(|e| e.passed);
68 Ok(CaseRun {
69 case: case.name.clone(),
70 skill: skill.dir.to_string_lossy().into_owned(),
71 platform: platform.to_string(),
72 model: model.to_string(),
73 passed,
74 turns: transcript.assistant_turns(),
75 evals,
76 transcript,
77 usage: (!totals.is_empty()).then_some(totals),
78 })
79 }
80
81 fn converse(
84 &self,
85 case: &TestCase,
86 skill: &SkillDefinition,
87 platform: &str,
88 model: &str,
89 totals: &mut Usage,
90 ) -> Result<Transcript> {
91 let dir = skill.dir.to_string_lossy().into_owned();
92 let skill_ref = SkillRef {
93 name: &skill.name,
94 dir: &dir,
95 instructions: &skill.instructions,
96 };
97 let judge_model = self.config.effective_judge_model();
98 let max_turns = case
99 .user
100 .as_ref()
101 .and_then(|u| u.max_turns)
102 .unwrap_or(self.config.max_turns) as usize;
103 let resume_supported = self.provider.supports_resume(platform);
104
105 let mut transcript = Transcript::from_input(&case.input);
106 let mut session: Option<String> = None;
110
111 loop {
112 let session_arg = if resume_supported {
113 session.as_deref()
114 } else {
115 None
116 };
117 let turn = self.provider.respond(
118 platform,
119 model,
120 &skill_ref,
121 &transcript.messages,
122 session_arg,
123 )?;
124 if let Some(u) = &turn.usage {
125 totals.add(u);
126 }
127 if let Some(id) = turn.session_id {
129 session = Some(id);
130 }
131 let skill_done = turn.done;
132 transcript.push(Message::assistant(turn.message));
133
134 let Some(user) = &case.user else {
136 break;
137 };
138
139 if skill_done || transcript.assistant_turns() >= max_turns {
140 break;
141 }
142
143 if let Some(done_when) = &user.done_when {
145 let query = JudgeQuery {
146 kind: JudgeKind::Boolean,
147 criterion: done_when,
148 scale: None,
149 };
150 let verdict = self
151 .provider
152 .judge(judge_model, &query, &transcript.messages)?;
153 if let Some(u) = &verdict.usage {
154 totals.add(u);
155 }
156 if matches!(verdict.value, JudgeValue::Bool(true)) {
157 break;
158 }
159 }
160
161 let user_turn =
163 self.provider
164 .simulate_user(judge_model, &user.persona, &transcript.messages)?;
165 if let Some(u) = &user_turn.usage {
166 totals.add(u);
167 }
168 let stop = user_turn.stop;
169 transcript.push(Message::user(user_turn.message));
170 if stop {
171 break;
172 }
173 }
174
175 Ok(transcript)
176 }
177
178 fn score(
180 &self,
181 case: &TestCase,
182 transcript: &Transcript,
183 totals: &mut Usage,
184 ) -> Result<Vec<crate::eval::EvalOutcome>> {
185 let judge_model = self.config.effective_judge_model();
186 let mut outcomes = Vec::with_capacity(case.evals.len());
187 for eval in &case.evals {
188 let query = match eval {
189 Eval::Boolean { criterion, .. } => JudgeQuery {
190 kind: JudgeKind::Boolean,
191 criterion,
192 scale: None,
193 },
194 Eval::Numeric {
195 criterion,
196 min,
197 max,
198 ..
199 } => JudgeQuery {
200 kind: JudgeKind::Numeric,
201 criterion,
202 scale: Some((*min, *max)),
203 },
204 };
205 let verdict = self
206 .provider
207 .judge(judge_model, &query, &transcript.messages)?;
208 if let Some(u) = &verdict.usage {
209 totals.add(u);
210 }
211 outcomes.push(eval.outcome(&verdict.value, verdict.reason)?);
212 }
213 Ok(outcomes)
214 }
215}
216
217#[cfg(test)]
218mod tests {
219 use super::*;
220 use crate::conversation::Message;
221 use crate::provider::{AssistantTurn, JudgeVerdict, UserTurn};
222 use std::cell::RefCell;
223
224 struct ScriptedProvider {
227 assistant: Vec<AssistantTurn>,
228 user: Vec<UserTurn>,
229 judge: Vec<JudgeVerdict>,
230 calls: RefCell<Calls>,
231 }
232
233 #[derive(Default)]
234 struct Calls {
235 assistant: usize,
236 user: usize,
237 judge: usize,
238 }
239
240 impl Provider for ScriptedProvider {
241 fn respond(
242 &self,
243 _platform: &str,
244 _model: &str,
245 _skill: &SkillRef<'_>,
246 _messages: &[Message],
247 _session: Option<&str>,
248 ) -> Result<AssistantTurn> {
249 let i = self.calls.borrow().assistant;
250 self.calls.borrow_mut().assistant += 1;
251 Ok(self.assistant[i.min(self.assistant.len() - 1)].clone())
252 }
253
254 fn simulate_user(
255 &self,
256 _model: &str,
257 _persona: &str,
258 _messages: &[Message],
259 ) -> Result<UserTurn> {
260 let i = self.calls.borrow().user;
261 self.calls.borrow_mut().user += 1;
262 Ok(self.user[i.min(self.user.len() - 1)].clone())
263 }
264
265 fn judge(
266 &self,
267 _model: &str,
268 _query: &JudgeQuery<'_>,
269 _messages: &[Message],
270 ) -> Result<JudgeVerdict> {
271 let i = self.calls.borrow().judge;
272 self.calls.borrow_mut().judge += 1;
273 let v = &self.judge[i.min(self.judge.len() - 1)];
274 Ok(JudgeVerdict {
275 value: v.value,
276 reason: v.reason.clone(),
277 usage: v.usage.clone(),
278 })
279 }
280 }
281
/// Creates a throwaway skill directory containing a minimal `SKILL.md` and
/// returns its path.
///
/// The directory lives under the system temp dir and is named from the
/// current process id plus `tag`, so tests using distinct tags do not
/// collide. It is not removed afterwards.
///
/// # Panics
///
/// Panics if the directory or the file cannot be written.
fn temp_skill(tag: &str) -> std::path::PathBuf {
    const SKILL_MD: &str = "---\nname: greeter\ndescription: a test skill\n---\nfake-reply: hi\n";

    let pid = std::process::id();
    let mut skill_dir = std::env::temp_dir();
    skill_dir.push(format!("skilltest-ut-{pid}-{tag}"));
    std::fs::create_dir_all(&skill_dir).unwrap();

    let manifest = skill_dir.join("SKILL.md");
    std::fs::write(manifest, SKILL_MD).unwrap();
    skill_dir
}
294
295 fn boolean_case(skill: std::path::PathBuf) -> TestCase {
296 TestCase {
297 name: "greets".into(),
298 skill,
299 input: "Greet Dr. Smith".into(),
300 user: None,
301 evals: vec![Eval::Boolean {
302 criterion: "greets Dr. Smith".into(),
303 expected: true,
304 name: None,
305 }],
306 }
307 }
308
/// Without a simulated user the runner asks for exactly one assistant reply
/// and then scores it.
#[test]
fn single_turn_runs_one_assistant_turn_and_scores() {
    let greeting = AssistantTurn {
        message: "Hello, Dr. Smith!".into(),
        done: false,
        ..Default::default()
    };
    let approval = JudgeVerdict {
        value: JudgeValue::Bool(true),
        reason: "names her".into(),
        usage: None,
    };
    let provider = ScriptedProvider {
        assistant: vec![greeting],
        user: Vec::new(),
        judge: vec![approval],
        calls: RefCell::default(),
    };
    let config = Config::default();
    let case = boolean_case(temp_skill("single"));

    let runs = Runner::new(&provider, &config).run_case(&case).unwrap();

    assert_eq!(runs.len(), 1);
    let run = &runs[0];
    assert!(run.passed);
    assert_eq!(run.turns, 1);
    assert_eq!(provider.calls.borrow().assistant, 1);
}
335
/// With a simulated user whose `done_when` criterion is judged true right
/// after the first reply, the conversation ends without asking the user to
/// speak.
#[test]
fn multi_turn_stops_when_done_when_holds() {
    let case = TestCase {
        user: Some(crate::testcase::SimulatedUser {
            persona: "a terse patient".into(),
            done_when: Some("the assistant has greeted".into()),
            max_turns: Some(5),
        }),
        ..boolean_case(temp_skill("multi"))
    };

    let reply = AssistantTurn {
        message: "Hi there".into(),
        done: false,
        ..Default::default()
    };
    let never_used_user_turn = UserTurn {
        message: "continue".into(),
        stop: false,
        ..Default::default()
    };
    // The judge is consulted twice: first for the `done_when` check during
    // the conversation, then for the case's eval during scoring.
    let done_when_verdict = JudgeVerdict {
        value: JudgeValue::Bool(true),
        reason: "done".into(),
        usage: None,
    };
    let eval_verdict = JudgeVerdict {
        value: JudgeValue::Bool(true),
        reason: "greeted".into(),
        usage: None,
    };
    let provider = ScriptedProvider {
        assistant: vec![reply],
        user: vec![never_used_user_turn],
        judge: vec![done_when_verdict, eval_verdict],
        calls: RefCell::default(),
    };
    let config = Config::default();

    let runs = Runner::new(&provider, &config).run_case(&case).unwrap();

    assert!(runs[0].passed);
    let calls = provider.calls.borrow();
    assert_eq!(calls.assistant, 1);
    assert_eq!(calls.user, 0);
}
379
/// A judge verdict that contradicts the eval's expectation fails the run,
/// and the report reflects it.
#[test]
fn failing_eval_marks_run_failed() {
    let reply = AssistantTurn {
        message: "Hello".into(),
        done: false,
        ..Default::default()
    };
    let rejection = JudgeVerdict {
        value: JudgeValue::Bool(false),
        reason: "no name".into(),
        usage: None,
    };
    let provider = ScriptedProvider {
        assistant: vec![reply],
        user: Vec::new(),
        judge: vec![rejection],
        calls: RefCell::default(),
    };
    let config = Config::default();
    let cases = [boolean_case(temp_skill("faileval"))];

    let report = Runner::new(&provider, &config).run_all(&cases).unwrap();

    assert!(!report.passed);
    assert_eq!(report.summary.failed, 1);
}
404
/// One case is run once for every platform/model combination.
#[test]
fn matrix_fans_out_over_platforms_and_models() {
    let reply = AssistantTurn {
        message: "Hello".into(),
        done: false,
        ..Default::default()
    };
    let approval = JudgeVerdict {
        value: JudgeValue::Bool(true),
        reason: String::new(),
        usage: None,
    };
    let provider = ScriptedProvider {
        assistant: vec![reply],
        user: Vec::new(),
        judge: vec![approval],
        calls: RefCell::default(),
    };
    let config = Config {
        models: vec!["m1".into(), "m2".into()],
        platforms: vec!["a".into(), "b".into()],
        ..Default::default()
    };
    let case = boolean_case(temp_skill("matrix"));

    let runs = Runner::new(&provider, &config).run_case(&case).unwrap();

    // Two platforms times two models.
    assert_eq!(runs.len(), 4);
}
432}