Skip to main content

skilltest_core/
config.rs

1//! Configuration: which provider runs skills, the default platforms and models a
2//! run fans out across, and the model used for natural-language evals.
3//!
4//! Config is loaded from a YAML file (default `skilltest.yaml`) and then refined
5//! by CLI overrides (see [`Config::apply_overrides`]).
6
7use std::path::Path;
8
9use serde::{Deserialize, Serialize};
10
11use crate::error::{Error, Result};
12
/// Serde default for [`OneharnessConfig::bin`]: the bare `oneharness` name.
fn default_oneharness_bin() -> String {
    String::from("oneharness")
}
16
/// Serde default for [`OneharnessConfig::judge_harness`]: `claude-code`.
fn default_judge_harness() -> String {
    String::from("claude-code")
}
20
/// Serde default for [`OneharnessConfig::timeout_secs`], in seconds.
fn default_timeout_secs() -> u64 {
    const DEFAULT_TIMEOUT_SECS: u64 = 120;
    DEFAULT_TIMEOUT_SECS
}
24
25/// Settings for the default [`oneharness`](https://github.com/nickderobertis/oneharness)
26/// provider, which runs each prompt on a harness via `oneharness run`.
27#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
28#[serde(deny_unknown_fields)]
29pub struct OneharnessConfig {
30    /// The `oneharness` binary (resolved on `PATH`).
31    #[serde(default = "default_oneharness_bin")]
32    pub bin: String,
33    /// The harness used for evals and the simulated user (kept independent of the
34    /// harness under test, so the evaluator does not vary with the matrix).
35    #[serde(default = "default_judge_harness")]
36    pub judge_harness: String,
37    /// Per-call timeout passed through to `oneharness run --timeout`.
38    #[serde(default = "default_timeout_secs")]
39    pub timeout_secs: u64,
40}
41
42impl Default for OneharnessConfig {
43    fn default() -> Self {
44        Self {
45            bin: default_oneharness_bin(),
46            judge_harness: default_judge_harness(),
47            timeout_secs: default_timeout_secs(),
48        }
49    }
50}
51
52/// Settings for a custom provider command speaking the JSON-lines protocol (see
53/// `docs/protocol.md`). Used by the bundled `skilltest-fake-provider` and any
54/// provider you write yourself.
55#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
56#[serde(deny_unknown_fields)]
57pub struct CommandConfig {
58    /// The provider command as an argv vector.
59    pub command: Vec<String>,
60}
61
62/// Which provider backs a run.
63#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
64#[serde(tag = "kind", rename_all = "lowercase")]
65pub enum ProviderConfig {
66    /// Run skills through `oneharness` (the default).
67    Oneharness(OneharnessConfig),
68    /// Run a custom command speaking the JSON-lines protocol.
69    Command(CommandConfig),
70}
71
72impl Default for ProviderConfig {
73    fn default() -> Self {
74        ProviderConfig::Oneharness(OneharnessConfig::default())
75    }
76}
77
78/// The full configuration for a run.
79#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
80#[serde(default, deny_unknown_fields)]
81pub struct Config {
82    /// The provider that executes skills and evals.
83    pub provider: ProviderConfig,
84    /// Harness platforms a case runs on (e.g. `claude-code`, `codex`).
85    pub platforms: Vec<String>,
86    /// Models a case runs on (must be valid for the chosen harness, e.g.
87    /// `sonnet`/`haiku` for `claude-code`).
88    pub models: Vec<String>,
89    /// Model used for natural-language evals and the simulated user. Falls back
90    /// to the first entry of `models` when empty.
91    pub judge_model: String,
92    /// Default cap on assistant turns for multi-turn cases. A case may lower it.
93    pub max_turns: u32,
94}
95
96impl Default for Config {
97    fn default() -> Self {
98        Self {
99            provider: ProviderConfig::default(),
100            platforms: vec!["claude-code".to_string()],
101            models: vec!["claude-opus-4-8".to_string()],
102            judge_model: String::new(),
103            max_turns: 8,
104        }
105    }
106}
107
/// Overrides supplied on the command line. A field that is `None` (or an
/// empty list) leaves the corresponding config value untouched.
#[derive(Debug, Clone, Default)]
pub struct Overrides {
    /// When set, replace the provider with a [`ProviderConfig::Command`]
    /// using this argv.
    pub command_provider: Option<Vec<String>>,
    /// Replacement for the `oneharness` binary (only applies to the
    /// oneharness provider).
    pub oneharness_bin: Option<String>,
    /// Replacement for the judge harness (only applies to the oneharness
    /// provider).
    pub judge_harness: Option<String>,
    /// Replacement for the per-call timeout (only applies to the oneharness
    /// provider).
    pub timeout_secs: Option<u64>,
    /// When non-empty, replaces the configured platform list wholesale.
    pub platforms: Vec<String>,
    /// When non-empty, replaces the configured model list wholesale.
    pub models: Vec<String>,
    /// When set, replaces the configured judge model.
    pub judge_model: Option<String>,
    /// When set, replaces the configured turn cap.
    pub max_turns: Option<u32>,
}
124
125impl Config {
126    /// Load configuration from `path`. The standard config filename is
127    /// `skilltest.yaml`.
128    ///
129    /// # Errors
130    /// [`Error::Io`] if the file cannot be read, [`Error::Yaml`] if it does not
131    /// parse, and [`Error::Invalid`] if it parses but is internally
132    /// inconsistent (see [`Config::validate`]).
133    pub fn load(path: &Path) -> Result<Self> {
134        let text = std::fs::read_to_string(path).map_err(|source| Error::Io {
135            path: path.to_path_buf(),
136            source,
137        })?;
138        let config: Config = serde_yaml::from_str(&text).map_err(|source| Error::Yaml {
139            path: path.to_path_buf(),
140            source,
141        })?;
142        config.validate()?;
143        Ok(config)
144    }
145
146    /// Load `path` if it exists, otherwise return [`Config::default`].
147    ///
148    /// # Errors
149    /// Same as [`Config::load`] when the file is present but invalid.
150    pub fn load_or_default(path: &Path) -> Result<Self> {
151        if path.is_file() {
152            Self::load(path)
153        } else {
154            Ok(Self::default())
155        }
156    }
157
158    /// Apply CLI overrides in place, then re-validate.
159    ///
160    /// # Errors
161    /// [`Error::Invalid`] if the merged configuration is inconsistent.
162    pub fn apply_overrides(&mut self, overrides: Overrides) -> Result<()> {
163        if let Some(command) = overrides.command_provider {
164            self.provider = ProviderConfig::Command(CommandConfig { command });
165        } else if let ProviderConfig::Oneharness(oh) = &mut self.provider {
166            if let Some(bin) = overrides.oneharness_bin {
167                oh.bin = bin;
168            }
169            if let Some(judge_harness) = overrides.judge_harness {
170                oh.judge_harness = judge_harness;
171            }
172            if let Some(timeout) = overrides.timeout_secs {
173                oh.timeout_secs = timeout;
174            }
175        }
176        if !overrides.platforms.is_empty() {
177            self.platforms = overrides.platforms;
178        }
179        if !overrides.models.is_empty() {
180            self.models = overrides.models;
181        }
182        if let Some(judge) = overrides.judge_model {
183            self.judge_model = judge;
184        }
185        if let Some(max_turns) = overrides.max_turns {
186            self.max_turns = max_turns;
187        }
188        self.validate()
189    }
190
191    /// The model used for evals and the simulated user: `judge_model` if set,
192    /// otherwise the first configured model.
193    #[must_use]
194    pub fn effective_judge_model(&self) -> &str {
195        if self.judge_model.is_empty() {
196            self.models.first().map_or("", String::as_str)
197        } else {
198            &self.judge_model
199        }
200    }
201
202    /// Check internal consistency.
203    ///
204    /// # Errors
205    /// [`Error::Invalid`] when the provider is misconfigured or no
206    /// platform/model is set.
207    pub fn validate(&self) -> Result<()> {
208        match &self.provider {
209            ProviderConfig::Oneharness(oh) => {
210                if oh.bin.trim().is_empty() {
211                    return Err(Error::Invalid(
212                        "config `provider.bin` must name the oneharness binary".into(),
213                    ));
214                }
215                if oh.judge_harness.trim().is_empty() {
216                    return Err(Error::Invalid(
217                        "config `provider.judge_harness` must name a harness".into(),
218                    ));
219                }
220                if oh.timeout_secs == 0 {
221                    return Err(Error::Invalid(
222                        "config `provider.timeout_secs` must be at least 1".into(),
223                    ));
224                }
225            }
226            ProviderConfig::Command(c) => {
227                if c.command.is_empty() {
228                    return Err(Error::Invalid(
229                        "config `provider.command` must name a command".into(),
230                    ));
231                }
232            }
233        }
234        if self.platforms.is_empty() {
235            return Err(Error::Invalid(
236                "config `platforms` must list at least one harness platform".into(),
237            ));
238        }
239        if self.models.is_empty() {
240            return Err(Error::Invalid(
241                "config `models` must list at least one model".into(),
242            ));
243        }
244        if self.max_turns == 0 {
245            return Err(Error::Invalid(
246                "config `max_turns` must be at least 1".into(),
247            ));
248        }
249        Ok(())
250    }
251}
252
#[cfg(test)]
mod tests {
    use super::*;

    /// Applies `overrides` to a default config, panicking if the merged
    /// result does not validate.
    fn default_with(overrides: Overrides) -> Config {
        let mut config = Config::default();
        config.apply_overrides(overrides).unwrap();
        config
    }

    /// Borrows the oneharness settings of `config`, panicking for any other
    /// provider.
    fn oneharness_of(config: &Config) -> &OneharnessConfig {
        match &config.provider {
            ProviderConfig::Oneharness(oh) => oh,
            ProviderConfig::Command(_) => panic!("expected oneharness provider"),
        }
    }

    /// Builds the command provider expected for the given argv.
    fn command_provider(argv: &[&str]) -> ProviderConfig {
        ProviderConfig::Command(CommandConfig {
            command: argv.iter().map(|arg| (*arg).to_string()).collect(),
        })
    }

    #[test]
    fn defaults_are_valid_and_use_oneharness() {
        let config = Config::default();
        config.validate().unwrap();
        assert!(matches!(config.provider, ProviderConfig::Oneharness(_)));
    }

    #[test]
    fn command_override_switches_provider() {
        let config = default_with(Overrides {
            command_provider: Some(vec!["fake".into()]),
            ..Default::default()
        });
        assert_eq!(config.provider, command_provider(&["fake"]));
    }

    #[test]
    fn oneharness_bin_override_applies() {
        let config = default_with(Overrides {
            oneharness_bin: Some("/tmp/oneharness".into()),
            ..Default::default()
        });
        assert_eq!(oneharness_of(&config).bin, "/tmp/oneharness");
    }

    #[test]
    fn parses_command_provider_yaml() {
        let yaml = "provider:\n  kind: command\n  command: [\"prov\", \"--flag\"]\n";
        let parsed: Config = serde_yaml::from_str(yaml).unwrap();
        assert_eq!(parsed.provider, command_provider(&["prov", "--flag"]));
    }

    #[test]
    fn parses_oneharness_provider_yaml() {
        let yaml = "provider:\n  kind: oneharness\n  bin: oh\n  judge_harness: codex\n";
        let parsed: Config = serde_yaml::from_str(yaml).unwrap();
        let oh = oneharness_of(&parsed);
        assert_eq!(oh.bin, "oh");
        assert_eq!(oh.judge_harness, "codex");
        // The YAML omits `timeout_secs`, so the serde default must kick in.
        assert_eq!(oh.timeout_secs, 120);
    }

    #[test]
    fn judge_model_falls_back_to_first_model() {
        // The default config leaves `judge_model` empty.
        assert_eq!(
            Config::default().effective_judge_model(),
            "claude-opus-4-8"
        );
    }

    #[test]
    fn empty_models_is_invalid() {
        let config = Config {
            models: Vec::new(),
            ..Config::default()
        };
        assert!(config.validate().is_err());
    }
}