Skip to main content

skilltest_core/
config.rs

//! Configuration: which provider runs skills, the default platforms and models a
//! run fans out across, and the model used for natural-language evals.
//!
//! Config is loaded from a YAML file (default `skilltest.yaml`) and then refined
//! by CLI overrides (see [`Config::apply_overrides`]).
6
7use std::path::Path;
8
9use serde::{Deserialize, Serialize};
10
11use crate::error::{Error, Result};
12
/// Serde default for [`OneharnessConfig::bin`]: the bare `oneharness` command name.
fn default_oneharness_bin() -> String {
    String::from("oneharness")
}
16
/// Serde default for [`OneharnessConfig::judge_harness`]: the `claude-code` harness.
fn default_judge_harness() -> String {
    String::from("claude-code")
}
20
/// Serde default for [`OneharnessConfig::timeout_secs`]: 120 seconds per call.
fn default_timeout_secs() -> u64 {
    const ONEHARNESS_TIMEOUT_SECS: u64 = 120;
    ONEHARNESS_TIMEOUT_SECS
}
24
/// Serde default for [`ApiJudgeConfig::timeout_secs`]: 60 seconds per API call.
fn default_api_timeout_secs() -> u64 {
    const API_TIMEOUT_SECS: u64 = 60;
    API_TIMEOUT_SECS
}
28
/// Serde default for [`ApiJudgeConfig::curl_bin`]: the bare `curl` command name.
fn default_curl_bin() -> String {
    String::from("curl")
}
32
/// Serde default for boolean settings that are on unless the config turns
/// them off (used by [`ApiJudgeConfig::strict_json`]).
fn default_true() -> bool {
    true
}
36
37/// Settings for the default [`oneharness`](https://github.com/nickderobertis/oneharness)
38/// provider, which runs each prompt on a harness via `oneharness run`.
39#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
40#[serde(deny_unknown_fields)]
41pub struct OneharnessConfig {
42    /// The `oneharness` binary (resolved on `PATH`).
43    #[serde(default = "default_oneharness_bin")]
44    pub bin: String,
45    /// The harness used for evals and the simulated user (kept independent of the
46    /// harness under test, so the evaluator does not vary with the matrix).
47    #[serde(default = "default_judge_harness")]
48    pub judge_harness: String,
49    /// Per-call timeout passed through to `oneharness run --timeout`.
50    #[serde(default = "default_timeout_secs")]
51    pub timeout_secs: u64,
52}
53
54impl Default for OneharnessConfig {
55    fn default() -> Self {
56        Self {
57            bin: default_oneharness_bin(),
58            judge_harness: default_judge_harness(),
59            timeout_secs: default_timeout_secs(),
60        }
61    }
62}
63
64/// Settings for a custom provider command speaking the JSON-lines protocol (see
65/// `docs/protocol.md`). Used by the bundled `skilltest-fake-provider` and any
66/// provider you write yourself.
67#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
68#[serde(deny_unknown_fields)]
69pub struct CommandConfig {
70    /// The provider command as an argv vector.
71    pub command: Vec<String>,
72}
73
74/// Which provider backs a run.
75#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
76#[serde(tag = "kind", rename_all = "lowercase")]
77pub enum ProviderConfig {
78    /// Run skills through `oneharness` (the default).
79    Oneharness(OneharnessConfig),
80    /// Run a custom command speaking the JSON-lines protocol.
81    Command(CommandConfig),
82}
83
84impl Default for ProviderConfig {
85    fn default() -> Self {
86        ProviderConfig::Oneharness(OneharnessConfig::default())
87    }
88}
89
90/// Which model vendor's API the direct-API judge talks to.
91#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
92#[serde(rename_all = "lowercase")]
93pub enum ApiVendor {
94    /// Anthropic Messages API (`POST /v1/messages`).
95    Anthropic,
96    /// OpenAI Chat Completions API (`POST /v1/chat/completions`).
97    Openai,
98}
99
100/// Settings for judging evals and the simulated user with a direct model API
101/// call instead of running them through a harness. This trades the harness's
102/// auth-portability for a single fast HTTP round trip per judge call (no
103/// agent-loop cold start), with normalized token usage surfaced into the report.
104///
105/// The judge *model* is the run's `judge_model` (it must be a valid API model
106/// id for the chosen `vendor`, e.g. `claude-opus-4-8` or `gpt-4o`); only the
107/// transport is configured here.
108#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
109#[serde(deny_unknown_fields)]
110pub struct ApiJudgeConfig {
111    /// Which vendor's API to call.
112    pub vendor: ApiVendor,
113    /// Environment variable holding the API key. Defaults to `ANTHROPIC_API_KEY`
114    /// or `OPENAI_API_KEY` by vendor. The key is read at run time and never
115    /// stored in config.
116    #[serde(default)]
117    pub api_key_env: Option<String>,
118    /// Override the API endpoint (e.g. a proxy or an OpenAI-compatible gateway).
119    /// Defaults to the vendor's standard endpoint.
120    #[serde(default)]
121    pub base_url: Option<String>,
122    /// Per-call timeout in seconds, passed to `curl --max-time`.
123    #[serde(default = "default_api_timeout_secs")]
124    pub timeout_secs: u64,
125    /// The `curl` binary (resolved on `PATH`).
126    #[serde(default = "default_curl_bin")]
127    pub curl_bin: String,
128    /// Constrain the judge's verdict to the `{value, reason}` JSON schema via the
129    /// vendor's structured-outputs feature (Anthropic `output_config.format`,
130    /// OpenAI `response_format: json_schema`). On by default — it removes a class
131    /// of judge-parse fragility. Turn it off for a model/endpoint that doesn't
132    /// support structured outputs (the tolerant `{…}` extraction still applies).
133    #[serde(default = "default_true")]
134    pub strict_json: bool,
135}
136
137/// How evals and the simulated user are judged, independent of the provider that
138/// runs the skill. Absent (the default) means the run's provider judges too
139/// (e.g. the oneharness `judge_harness`).
140#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
141#[serde(tag = "kind", rename_all = "lowercase")]
142pub enum JudgeConfig {
143    /// Judge with a direct model API call (see [`ApiJudgeConfig`]).
144    Api(ApiJudgeConfig),
145}
146
147/// The full configuration for a run.
148#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
149#[serde(default, deny_unknown_fields)]
150pub struct Config {
151    /// The provider that executes skills and evals.
152    pub provider: ProviderConfig,
153    /// Harness platforms a case runs on (e.g. `claude-code`, `codex`).
154    pub platforms: Vec<String>,
155    /// Models a case runs on (must be valid for the chosen harness, e.g.
156    /// `sonnet`/`haiku` for `claude-code`).
157    pub models: Vec<String>,
158    /// Model used for natural-language evals and the simulated user. Falls back
159    /// to the first entry of `models` when empty.
160    pub judge_model: String,
161    /// Default cap on assistant turns for multi-turn cases. A case may lower it.
162    pub max_turns: u32,
163    /// Optional judge backend that overrides how evals and the simulated user are
164    /// scored, independent of the skill-running provider. When `None`, the
165    /// provider judges (e.g. the oneharness `judge_harness`).
166    #[serde(default, skip_serializing_if = "Option::is_none")]
167    pub judge: Option<JudgeConfig>,
168}
169
170impl Default for Config {
171    fn default() -> Self {
172        Self {
173            provider: ProviderConfig::default(),
174            platforms: vec!["claude-code".to_string()],
175            models: vec!["claude-opus-4-8".to_string()],
176            judge_model: String::new(),
177            max_turns: 8,
178            judge: None,
179        }
180    }
181}
182
/// Overrides supplied on the command line. A field that is `None` or empty
/// keeps whatever value the config already has.
#[derive(Debug, Clone, Default)]
pub struct Overrides {
    /// When set, the provider becomes a [`ProviderConfig::Command`] running
    /// this argv.
    pub command_provider: Option<Vec<String>>,
    /// Replacement `oneharness` binary; applies only to the oneharness provider.
    pub oneharness_bin: Option<String>,
    /// Replacement judge harness; applies only to the oneharness provider.
    pub judge_harness: Option<String>,
    /// Replacement per-call timeout; applies only to the oneharness provider.
    pub timeout_secs: Option<u64>,
    /// Replacement platform list; an empty list keeps the configured platforms.
    pub platforms: Vec<String>,
    /// Replacement model list; an empty list keeps the configured models.
    pub models: Vec<String>,
    /// Replacement judge model.
    pub judge_model: Option<String>,
    /// Replacement cap on assistant turns.
    pub max_turns: Option<u32>,
}
199
200impl Config {
201    /// Load configuration from `path`. The standard config filename is
202    /// `skilltest.yaml`.
203    ///
204    /// # Errors
205    /// [`Error::Io`] if the file cannot be read, [`Error::Yaml`] if it does not
206    /// parse, and [`Error::Invalid`] if it parses but is internally
207    /// inconsistent (see [`Config::validate`]).
208    pub fn load(path: &Path) -> Result<Self> {
209        let text = std::fs::read_to_string(path).map_err(|source| Error::Io {
210            path: path.to_path_buf(),
211            source,
212        })?;
213        let config: Config = serde_yaml::from_str(&text).map_err(|source| Error::Yaml {
214            path: path.to_path_buf(),
215            source,
216        })?;
217        config.validate()?;
218        Ok(config)
219    }
220
221    /// Load `path` if it exists, otherwise return [`Config::default`].
222    ///
223    /// # Errors
224    /// Same as [`Config::load`] when the file is present but invalid.
225    pub fn load_or_default(path: &Path) -> Result<Self> {
226        if path.is_file() {
227            Self::load(path)
228        } else {
229            Ok(Self::default())
230        }
231    }
232
233    /// Apply CLI overrides in place, then re-validate.
234    ///
235    /// # Errors
236    /// [`Error::Invalid`] if the merged configuration is inconsistent.
237    pub fn apply_overrides(&mut self, overrides: Overrides) -> Result<()> {
238        if let Some(command) = overrides.command_provider {
239            self.provider = ProviderConfig::Command(CommandConfig { command });
240        } else if let ProviderConfig::Oneharness(oh) = &mut self.provider {
241            if let Some(bin) = overrides.oneharness_bin {
242                oh.bin = bin;
243            }
244            if let Some(judge_harness) = overrides.judge_harness {
245                oh.judge_harness = judge_harness;
246            }
247            if let Some(timeout) = overrides.timeout_secs {
248                oh.timeout_secs = timeout;
249            }
250        }
251        if !overrides.platforms.is_empty() {
252            self.platforms = overrides.platforms;
253        }
254        if !overrides.models.is_empty() {
255            self.models = overrides.models;
256        }
257        if let Some(judge) = overrides.judge_model {
258            self.judge_model = judge;
259        }
260        if let Some(max_turns) = overrides.max_turns {
261            self.max_turns = max_turns;
262        }
263        self.validate()
264    }
265
266    /// The model used for evals and the simulated user: `judge_model` if set,
267    /// otherwise the first configured model.
268    #[must_use]
269    pub fn effective_judge_model(&self) -> &str {
270        if self.judge_model.is_empty() {
271            self.models.first().map_or("", String::as_str)
272        } else {
273            &self.judge_model
274        }
275    }
276
277    /// Check internal consistency.
278    ///
279    /// # Errors
280    /// [`Error::Invalid`] when the provider is misconfigured or no
281    /// platform/model is set.
282    pub fn validate(&self) -> Result<()> {
283        match &self.provider {
284            ProviderConfig::Oneharness(oh) => {
285                if oh.bin.trim().is_empty() {
286                    return Err(Error::Invalid(
287                        "config `provider.bin` must name the oneharness binary".into(),
288                    ));
289                }
290                if oh.judge_harness.trim().is_empty() {
291                    return Err(Error::Invalid(
292                        "config `provider.judge_harness` must name a harness".into(),
293                    ));
294                }
295                if oh.timeout_secs == 0 {
296                    return Err(Error::Invalid(
297                        "config `provider.timeout_secs` must be at least 1".into(),
298                    ));
299                }
300            }
301            ProviderConfig::Command(c) => {
302                if c.command.is_empty() {
303                    return Err(Error::Invalid(
304                        "config `provider.command` must name a command".into(),
305                    ));
306                }
307            }
308        }
309        if self.platforms.is_empty() {
310            return Err(Error::Invalid(
311                "config `platforms` must list at least one harness platform".into(),
312            ));
313        }
314        if self.models.is_empty() {
315            return Err(Error::Invalid(
316                "config `models` must list at least one model".into(),
317            ));
318        }
319        if self.max_turns == 0 {
320            return Err(Error::Invalid(
321                "config `max_turns` must be at least 1".into(),
322            ));
323        }
324        if let Some(JudgeConfig::Api(api)) = &self.judge {
325            if api.timeout_secs == 0 {
326                return Err(Error::Invalid(
327                    "config `judge.timeout_secs` must be at least 1".into(),
328                ));
329            }
330            if api.curl_bin.trim().is_empty() {
331                return Err(Error::Invalid(
332                    "config `judge.curl_bin` must name the curl binary".into(),
333                ));
334            }
335        }
336        Ok(())
337    }
338}
339
#[cfg(test)]
mod tests {
    use super::*;

    /// Borrow the oneharness settings, failing the test for any other provider.
    fn oneharness_settings(config: &Config) -> &OneharnessConfig {
        match &config.provider {
            ProviderConfig::Oneharness(settings) => settings,
            ProviderConfig::Command(_) => panic!("expected oneharness provider"),
        }
    }

    /// Parse a YAML document into a [`Config`] without validating it.
    fn parse(yaml: &str) -> Config {
        serde_yaml::from_str(yaml).unwrap()
    }

    #[test]
    fn defaults_are_valid_and_use_oneharness() {
        let config = Config::default();
        config.validate().unwrap();
        assert!(matches!(config.provider, ProviderConfig::Oneharness(_)));
    }

    #[test]
    fn command_override_switches_provider() {
        let overrides = Overrides {
            command_provider: Some(vec!["fake".into()]),
            ..Default::default()
        };
        let mut config = Config::default();
        config.apply_overrides(overrides).unwrap();

        let expected = ProviderConfig::Command(CommandConfig {
            command: vec!["fake".into()],
        });
        assert_eq!(config.provider, expected);
    }

    #[test]
    fn oneharness_bin_override_applies() {
        let overrides = Overrides {
            oneharness_bin: Some("/tmp/oneharness".into()),
            ..Default::default()
        };
        let mut config = Config::default();
        config.apply_overrides(overrides).unwrap();

        assert_eq!(oneharness_settings(&config).bin, "/tmp/oneharness");
    }

    #[test]
    fn parses_command_provider_yaml() {
        let config = parse("provider:\n  kind: command\n  command: [\"prov\", \"--flag\"]\n");

        let expected = ProviderConfig::Command(CommandConfig {
            command: vec!["prov".into(), "--flag".into()],
        });
        assert_eq!(config.provider, expected);
    }

    #[test]
    fn parses_oneharness_provider_yaml() {
        let config = parse("provider:\n  kind: oneharness\n  bin: oh\n  judge_harness: codex\n");

        let settings = oneharness_settings(&config);
        assert_eq!(settings.bin, "oh");
        assert_eq!(settings.judge_harness, "codex");
        // The timeout was left out of the YAML, so the default applies.
        assert_eq!(settings.timeout_secs, 120);
    }

    #[test]
    fn judge_model_falls_back_to_first_model() {
        assert_eq!(Config::default().effective_judge_model(), "claude-opus-4-8");
    }

    #[test]
    fn empty_models_is_invalid() {
        let config = Config {
            models: Vec::new(),
            ..Config::default()
        };
        assert!(config.validate().is_err());
    }

    #[test]
    fn parses_api_judge_config() {
        let yaml = "\
provider:\n  kind: oneharness\njudge:\n  kind: api\n  vendor: anthropic\n  timeout_secs: 30\n";
        let config = parse(yaml);

        let api = match &config.judge {
            Some(JudgeConfig::Api(api)) => api,
            None => panic!("expected an api judge"),
        };
        assert_eq!(api.vendor, ApiVendor::Anthropic);
        assert_eq!(api.timeout_secs, 30);
        // Everything the YAML left out takes its default value.
        assert_eq!(api.curl_bin, "curl");
        assert!(api.api_key_env.is_none());
        assert!(api.strict_json, "strict JSON is on by default");
        config.validate().unwrap();
    }

    #[test]
    fn api_judge_zero_timeout_is_invalid() {
        let config = parse("judge:\n  kind: api\n  vendor: openai\n  timeout_secs: 0\n");
        assert!(config.validate().is_err());
    }

    #[test]
    fn default_config_has_no_judge_override() {
        let config = Config::default();
        assert!(config.judge.is_none());
    }
}