skilltest_core/
provider.rs

1//! The provider boundary. `skilltest` never talks to a model directly; a
2//! [`Provider`] runs the skill, plays the simulated user, and judges the
3//! transcript.
4//!
5//! There are two real implementations. [`OneharnessProvider`] (the default) runs
6//! each prompt on a harness through the
7//! [`oneharness`](https://github.com/nickderobertis/oneharness) CLI and parses
8//! its JSON. [`CommandProvider`] speaks a small JSON-lines protocol (see
9//! `docs/protocol.md`) and backs both the deterministic `skilltest-fake-provider`
10//! used by the gate and any custom provider you write. The [`Provider`] trait
11//! also lets the runner be unit-tested against an in-memory fake.
12
13use std::io::{BufRead as _, BufReader, Write as _};
14use std::ops::ControlFlow;
15use std::path::{Path, PathBuf};
16use std::process::{Command, Stdio};
17
18use serde::{Deserialize, Serialize};
19
20use crate::config::{ApiJudgeConfig, ApiVendor, OneharnessConfig};
21use crate::conversation::{Message, Role, ToolEvent};
22use crate::error::{Error, ProviderErrorKind, Result};
23use crate::eval::JudgeValue;
24use crate::mock::{parse_spy_log, MockCall, MockPlan};
25
/// A borrowed view of the skill under test, as sent to the provider.
///
/// Every field borrows from the caller, so building one copies no skill data.
pub struct SkillRef<'a> {
    /// The skill's name; sent as `skill.name` on the command protocol.
    pub name: &'a str,
    /// The skill's directory; sent as `skill.path` on the command protocol.
    pub dir: &'a str,
    /// The skill's instruction text; sent as `skill.instructions` on the
    /// command protocol.
    pub instructions: &'a str,
}
32
/// The kind of judgement requested.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum JudgeKind {
    /// Sent as `"boolean"` on the wire. Presumably answered with a
    /// true/false verdict — confirm against `JudgeValue`.
    Boolean,
    /// Sent as `"numeric"` on the wire; the query may carry a scale (see
    /// [`JudgeQuery::scale`]).
    Numeric,
}
39
40impl JudgeKind {
41    fn as_str(self) -> &'static str {
42        match self {
43            JudgeKind::Boolean => "boolean",
44            JudgeKind::Numeric => "numeric",
45        }
46    }
47}
48
/// A judge query: the criterion, its kind, and (for numeric) the scale.
pub struct JudgeQuery<'a> {
    /// Which kind of verdict is requested.
    pub kind: JudgeKind,
    /// The criterion text the conversation is judged against.
    pub criterion: &'a str,
    /// The numeric scale as a `(min, max)` pair. [`CommandProvider`] forwards
    /// the pair as the request's `min`/`max` fields and omits both when this
    /// is `None`.
    pub scale: Option<(f64, f64)>,
}
55
/// Token / cost usage for one provider call.
///
/// Each field is independently optional because not every harness reports every
/// signal (cost is commonly absent on subscription auth; some harnesses report
/// no usage at all). The whole struct is `Option<Usage>` on a turn — `None`
/// means "no signal," not "zero."
///
/// On the wire, a missing field deserializes to `None` and a `None` field is
/// left out when serializing.
#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize, schemars::JsonSchema)]
pub struct Usage {
    /// Input tokens reported for the call, if any.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub input_tokens: Option<u64>,
    /// Output tokens reported for the call, if any.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub output_tokens: Option<u64>,
    /// Cost of the call in US dollars, if reported.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub cost_usd: Option<f64>,
}
71
72impl Usage {
73    /// True iff every field is `None`.
74    #[must_use]
75    pub fn is_empty(&self) -> bool {
76        self.input_tokens.is_none() && self.output_tokens.is_none() && self.cost_usd.is_none()
77    }
78
79    /// Add another sample into this total. `None` values stay `None` until
80    /// something reports a real number, at which point they accumulate.
81    pub fn add(&mut self, other: &Usage) {
82        if let Some(v) = other.input_tokens {
83            self.input_tokens = Some(self.input_tokens.unwrap_or(0) + v);
84        }
85        if let Some(v) = other.output_tokens {
86            self.output_tokens = Some(self.output_tokens.unwrap_or(0) + v);
87        }
88        if let Some(v) = other.cost_usd {
89            self.cost_usd = Some(self.cost_usd.unwrap_or(0.0) + v);
90        }
91    }
92}
93
/// An assistant/skill turn produced by the provider.
#[derive(Debug, Clone, Default)]
pub struct AssistantTurn {
    /// The assistant's reply text for this turn.
    pub message: String,
    /// The skill signalled it considers the task complete.
    pub done: bool,
    /// Cost/token usage for this call, if the provider reported it.
    pub usage: Option<Usage>,
    /// A session handle the runner can pass back on the next `respond` call to
    /// continue the same conversation against the real harness (only some
    /// harnesses expose this — see `OneharnessProvider::supports_resume`).
    pub session_id: Option<String>,
    /// Normalized tool events the skill took this turn (shell commands, file
    /// edits, tool uses), from oneharness `--events`. Empty when the harness
    /// exposed no tool transcript. Attached to the assistant message so consumers
    /// can analyze — and stream — what the skill *did*.
    pub events: Vec<ToolEvent>,
    /// The mock/spy channel's records for this turn — every observed tool call
    /// with its original input and the verdict applied. `None` when the channel
    /// was off (or the provider has no channel); `Some(vec![])` when it was on
    /// and the turn made no tool calls. The distinction matters: a spy on a
    /// channel-less run must err loudly, not read as "zero calls".
    pub mock_calls: Option<Vec<MockCall>>,
    /// A ready-to-run command that replays this run's recorded history (e.g.
    /// `oneharness history show <name> --history-dir <dir>`), set when the
    /// provider recorded run history for the turn. `None` for providers/configs
    /// that record none. Only the skill-running provider (oneharness, with
    /// history enabled) sets it; the runner lifts it onto the
    /// [`crate::report::CaseRun`] so a past run can be reviewed.
    pub history_command: Option<String>,
}
125
/// A simulated-user turn produced by the provider.
#[derive(Debug, Clone, Default)]
pub struct UserTurn {
    /// The simulated user's message text.
    pub message: String,
    /// The simulated user chose to end the conversation.
    pub stop: bool,
    /// Cost/token usage for this call, if the provider reported it.
    pub usage: Option<Usage>,
}
134
/// A judge verdict: the raw value (bool or number) plus the stated reason.
#[derive(Debug, Clone)]
pub struct JudgeVerdict {
    /// The raw verdict value.
    pub value: JudgeValue,
    /// The judge's stated reason; [`CommandProvider`] leaves it empty when the
    /// response omits `reason`.
    pub reason: String,
    /// Cost/token usage for this call, if the provider reported it.
    pub usage: Option<Usage>,
}
142
143/// The provider boundary.
144pub trait Provider {
145    /// Run one assistant/skill turn given the conversation so far. `session`,
146    /// when `Some`, is a handle returned by a previous `respond` call on this
147    /// run that the provider may use to continue the same harness session
148    /// (e.g. via `oneharness run --resume`); providers that don't support
149    /// continuation should ignore it.
150    ///
151    /// # Errors
152    /// [`Error::Provider`] if the command fails or returns malformed output.
153    fn respond(
154        &self,
155        platform: &str,
156        model: &str,
157        skill: &SkillRef<'_>,
158        messages: &[Message],
159        session: Option<&str>,
160    ) -> Result<AssistantTurn>;
161
162    /// Like [`Provider::respond`], but delivers each normalized tool event to
163    /// `on_event` as it is observed, so a caller can stream events live and
164    /// short-circuit. `on_event` returns [`ControlFlow::Break`] to abort the
165    /// turn — the provider tears down the harness and returns the partial turn.
166    ///
167    /// The default implementation runs the buffered [`Provider::respond`] and
168    /// replays the finished turn's events once; providers that can stream (like
169    /// [`OneharnessProvider`], via `oneharness --stream`) override it so events
170    /// arrive — and an abort takes effect — mid-turn.
171    ///
172    /// # Errors
173    /// [`Error::Provider`] if the command fails or returns malformed output.
174    fn respond_streaming(
175        &self,
176        platform: &str,
177        model: &str,
178        skill: &SkillRef<'_>,
179        messages: &[Message],
180        session: Option<&str>,
181        on_event: &mut dyn FnMut(&ToolEvent) -> ControlFlow<()>,
182    ) -> Result<AssistantTurn> {
183        let turn = self.respond(platform, model, skill, messages, session)?;
184        for event in &turn.events {
185            if on_event(event).is_break() {
186                break;
187            }
188        }
189        Ok(turn)
190    }
191
192    /// Like [`Provider::respond`], but with a tool mock/spy plan: the provider
193    /// must enforce the plan's compiled ruleset on the turn's tool calls and
194    /// return the observed-call records on the turn (`mock_calls`).
195    ///
196    /// The default implementation supports **no** mocking: a present plan is a
197    /// loud error — a provider silently ignoring mocks would let a mocked suite
198    /// pass vacuously — and an absent one delegates to [`Provider::respond`].
199    /// [`CommandProvider`] and [`OneharnessProvider`] override this.
200    ///
201    /// # Errors
202    /// [`Error::Provider`] if the command fails, returns malformed output, or a
203    /// plan was given and this provider cannot enforce it.
204    fn respond_with_mocks(
205        &self,
206        platform: &str,
207        model: &str,
208        skill: &SkillRef<'_>,
209        messages: &[Message],
210        session: Option<&str>,
211        mocks: Option<&MockPlan<'_>>,
212    ) -> Result<AssistantTurn> {
213        if mocks.is_some() {
214            return Err(Error::provider(
215                "mocks",
216                "this provider does not support tool mocking/spying; remove the `mocks` \
217                 declarations or use the oneharness/command provider",
218            ));
219        }
220        self.respond(platform, model, skill, messages, session)
221    }
222
223    /// Like [`Provider::respond_streaming`], with a tool mock/spy plan. Same
224    /// contract as [`Provider::respond_with_mocks`]: the default supports no
225    /// mocking and errs loudly on a present plan.
226    ///
227    /// # Errors
228    /// As [`Provider::respond_with_mocks`].
229    // One over clippy's arg limit; the signature is respond_streaming's plus
230    // the mock plan, and a params struct would obscure the trait symmetry.
231    #[allow(clippy::too_many_arguments)]
232    fn respond_streaming_with_mocks(
233        &self,
234        platform: &str,
235        model: &str,
236        skill: &SkillRef<'_>,
237        messages: &[Message],
238        session: Option<&str>,
239        mocks: Option<&MockPlan<'_>>,
240        on_event: &mut dyn FnMut(&ToolEvent) -> ControlFlow<()>,
241    ) -> Result<AssistantTurn> {
242        if mocks.is_some() {
243            return Err(Error::provider(
244                "mocks",
245                "this provider does not support tool mocking/spying; remove the `mocks` \
246                 declarations or use the oneharness/command provider",
247            ));
248        }
249        self.respond_streaming(platform, model, skill, messages, session, on_event)
250    }
251
252    /// Produce one simulated-user turn.
253    ///
254    /// # Errors
255    /// [`Error::Provider`] if the command fails or returns malformed output.
256    fn simulate_user(&self, model: &str, persona: &str, messages: &[Message]) -> Result<UserTurn>;
257
258    /// Score a criterion against the conversation.
259    ///
260    /// # Errors
261    /// [`Error::Provider`] if the command fails or returns malformed output.
262    fn judge(
263        &self,
264        model: &str,
265        query: &JudgeQuery<'_>,
266        messages: &[Message],
267    ) -> Result<JudgeVerdict>;
268
269    /// True iff `respond` on `platform` will faithfully continue a prior
270    /// session when given its `session_id`. The default is `false`; providers
271    /// that support resume override this so the runner knows to thread the
272    /// session id through.
273    fn supports_resume(&self, _platform: &str) -> bool {
274        false
275    }
276}
277
278// ---------------------------------------------------------------------------
279// Wire types (CommandProvider JSON-lines protocol)
280// ---------------------------------------------------------------------------
281
/// The `skill` object of a `respond` request: the wire form of [`SkillRef`].
#[derive(Serialize)]
struct SkillPayload<'a> {
    /// The skill's name ([`SkillRef::name`]).
    name: &'a str,
    /// The skill's directory ([`SkillRef::dir`]); note the wire key is `path`.
    path: &'a str,
    /// The skill's instruction text ([`SkillRef::instructions`]).
    instructions: &'a str,
}
288
/// The mock/spy block of a `respond` request: its presence turns the channel
/// on (the provider must return `mock_calls`); `rules` carries the compiled
/// ruleset to enforce, or `null` for a spy-only run.
#[derive(Serialize)]
struct MocksPayload<'a> {
    /// The compiled ruleset, borrowed from the mock plan. There is no
    /// skip-if-`None` attribute here, so `None` is serialized as an explicit
    /// `null`.
    rules: Option<&'a serde_json::Value>,
}
296
/// One request of the command protocol. It serializes as a JSON object whose
/// `op` field names the variant in lowercase (`respond`, `user`, `judge`),
/// with the variant's fields alongside it.
#[derive(Serialize)]
#[serde(tag = "op", rename_all = "lowercase")]
enum Request<'a> {
    /// Run one assistant/skill turn.
    Respond {
        platform: &'a str,
        model: &'a str,
        skill: SkillPayload<'a>,
        messages: &'a [Message],
        /// A prior session handle to continue; omitted from the JSON when `None`.
        #[serde(skip_serializing_if = "Option::is_none")]
        session: Option<&'a str>,
        /// The mock/spy block; omitted from the JSON when `None` (channel off).
        #[serde(skip_serializing_if = "Option::is_none")]
        mocks: Option<MocksPayload<'a>>,
    },
    /// Produce one simulated-user turn.
    User {
        model: &'a str,
        persona: &'a str,
        messages: &'a [Message],
    },
    /// Score a criterion against the conversation.
    Judge {
        model: &'a str,
        /// The judgement kind's wire name (`"boolean"` or `"numeric"`).
        kind: &'a str,
        criterion: &'a str,
        /// Lower end of the numeric scale; omitted from the JSON when `None`.
        #[serde(skip_serializing_if = "Option::is_none")]
        min: Option<f64>,
        /// Upper end of the numeric scale; omitted from the JSON when `None`.
        #[serde(skip_serializing_if = "Option::is_none")]
        max: Option<f64>,
        messages: &'a [Message],
    },
}
326
/// The response object for a `respond` request. Only `message` is required;
/// every other field falls back to its default when absent.
#[derive(Deserialize)]
struct RespondPayload {
    /// The assistant's reply text.
    message: String,
    /// Whether the skill considers the task complete; `false` when absent.
    #[serde(default)]
    done: bool,
    /// Cost/token usage for the call, if the provider reported it.
    #[serde(default)]
    usage: Option<Usage>,
    /// A session handle the runner may pass back on a later `respond` request.
    #[serde(default)]
    session_id: Option<String>,
    /// Optional normalized tool events a custom provider may report (parallel to
    /// oneharness's `events`); absent/`null` when the provider surfaces none.
    #[serde(default)]
    events: Option<Vec<ToolEvent>>,
    /// The mock/spy records for the turn; required (may be `[]`) whenever the
    /// request carried a `mocks` block, absent otherwise.
    #[serde(default)]
    mock_calls: Option<Vec<MockCall>>,
}
345
/// The response object for a `user` request. Only `message` is required.
#[derive(Deserialize)]
struct UserPayload {
    /// The simulated user's message text.
    message: String,
    /// Whether the simulated user ends the conversation; `false` when absent.
    #[serde(default)]
    stop: bool,
    /// Cost/token usage for the call, if the provider reported it.
    #[serde(default)]
    usage: Option<Usage>,
}
354
/// The response object for a `judge` request. Only `value` is required.
#[derive(Deserialize)]
struct JudgePayload {
    /// The raw verdict value.
    value: JudgeValue,
    /// The judge's stated reason; an empty string when absent.
    #[serde(default)]
    reason: String,
    /// Cost/token usage for the call, if the provider reported it.
    #[serde(default)]
    usage: Option<Usage>,
}
363
364// ---------------------------------------------------------------------------
365// CommandProvider
366// ---------------------------------------------------------------------------
367
/// A [`Provider`] backed by an external command speaking the JSON protocol.
///
/// A fresh process is spawned for every request; no state is kept between
/// calls.
pub struct CommandProvider {
    /// The program followed by its arguments. Never empty —
    /// [`CommandProvider::new`] rejects an empty vector.
    argv: Vec<String>,
}
372
373impl CommandProvider {
374    /// Build a provider from an argv vector (program + args). The program is
375    /// resolved on `PATH`.
376    ///
377    /// # Errors
378    /// [`Error::Invalid`] if `argv` is empty.
379    pub fn new(argv: Vec<String>) -> Result<Self> {
380        if argv.is_empty() {
381            return Err(Error::Invalid("provider command is empty".into()));
382        }
383        Ok(Self { argv })
384    }
385
386    /// Send one request and parse the single response object from stdout.
387    fn call<T: for<'de> Deserialize<'de>>(&self, request: &Request<'_>, op: &str) -> Result<T> {
388        let payload = serde_json::to_vec(request).map_err(|e| {
389            Error::provider(op.to_string(), format!("could not encode request: {e}"))
390        })?;
391
392        let mut child = Command::new(&self.argv[0])
393            .args(&self.argv[1..])
394            .stdin(Stdio::piped())
395            .stdout(Stdio::piped())
396            .stderr(Stdio::piped())
397            .spawn()
398            .map_err(|e| {
399                Error::provider(
400                    op.to_string(),
401                    format!(
402                        "could not run provider `{}`: {e}. Is it installed and on PATH?",
403                        self.argv[0]
404                    ),
405                )
406            })?;
407
408        // Write the request, then close stdin so the child can finish. Writing
409        // before reading stdout is safe here because responses are small.
410        {
411            let stdin = child
412                .stdin
413                .as_mut()
414                .ok_or_else(|| Error::provider(op.to_string(), "could not open provider stdin"))?;
415            stdin
416                .write_all(&payload)
417                .and_then(|()| stdin.write_all(b"\n"))
418                .map_err(|e| {
419                    Error::provider(op.to_string(), format!("could not write request: {e}"))
420                })?;
421        }
422
423        let output = child.wait_with_output().map_err(|e| {
424            Error::provider(op.to_string(), format!("provider did not complete: {e}"))
425        })?;
426
427        if !output.status.success() {
428            let stderr = String::from_utf8_lossy(&output.stderr);
429            return Err(Error::provider(
430                op.to_string(),
431                format!("provider exited with {}: {}", output.status, stderr.trim()),
432            ));
433        }
434
435        let stdout = String::from_utf8_lossy(&output.stdout);
436        let line = stdout.trim();
437        if line.is_empty() {
438            return Err(Error::provider(
439                op.to_string(),
440                "provider produced no output (expected one JSON response object)",
441            ));
442        }
443        serde_json::from_str(line).map_err(|e| {
444            Error::provider(
445                op.to_string(),
446                format!("provider response was not valid JSON for `{op}`: {e}; got: {line}"),
447            )
448        })
449    }
450}
451
452impl CommandProvider {
453    /// The shared `respond` path: build the request (with the optional mock
454    /// block), call the command, and lift the payload onto a turn. A provider
455    /// that was handed a plan but returned no `mock_calls` is a loud error —
456    /// it silently ignored the mocks, which must never pass vacuously.
457    fn respond_impl(
458        &self,
459        platform: &str,
460        model: &str,
461        skill: &SkillRef<'_>,
462        messages: &[Message],
463        session: Option<&str>,
464        mocks: Option<&MockPlan<'_>>,
465    ) -> Result<AssistantTurn> {
466        let request = Request::Respond {
467            platform,
468            model,
469            skill: SkillPayload {
470                name: skill.name,
471                path: skill.dir,
472                instructions: skill.instructions,
473            },
474            messages,
475            session,
476            mocks: mocks.map(|plan| MocksPayload { rules: plan.rules }),
477        };
478        let payload: RespondPayload = self.call(&request, "respond")?;
479        if mocks.is_some() && payload.mock_calls.is_none() {
480            return Err(Error::provider(
481                "respond",
482                "the provider ignored the request's `mocks` block (no `mock_calls` in its \
483                 response); it does not support tool mocking/spying",
484            ));
485        }
486        Ok(AssistantTurn {
487            message: payload.message,
488            done: payload.done,
489            usage: payload.usage,
490            session_id: payload.session_id,
491            events: payload.events.unwrap_or_default(),
492            mock_calls: payload.mock_calls,
493            // The JSON-lines protocol has no history channel; a custom provider
494            // that wants review-ability records it out of band.
495            history_command: None,
496        })
497    }
498}
499
500impl Provider for CommandProvider {
501    fn respond(
502        &self,
503        platform: &str,
504        model: &str,
505        skill: &SkillRef<'_>,
506        messages: &[Message],
507        session: Option<&str>,
508    ) -> Result<AssistantTurn> {
509        self.respond_impl(platform, model, skill, messages, session, None)
510    }
511
512    fn respond_with_mocks(
513        &self,
514        platform: &str,
515        model: &str,
516        skill: &SkillRef<'_>,
517        messages: &[Message],
518        session: Option<&str>,
519        mocks: Option<&MockPlan<'_>>,
520    ) -> Result<AssistantTurn> {
521        self.respond_impl(platform, model, skill, messages, session, mocks)
522    }
523
524    // One over clippy's arg limit; the signature is respond_streaming's plus
525    // the mock plan, and a params struct would obscure the trait symmetry.
526    #[allow(clippy::too_many_arguments)]
527    fn respond_streaming_with_mocks(
528        &self,
529        platform: &str,
530        model: &str,
531        skill: &SkillRef<'_>,
532        messages: &[Message],
533        session: Option<&str>,
534        mocks: Option<&MockPlan<'_>>,
535        on_event: &mut dyn FnMut(&ToolEvent) -> ControlFlow<()>,
536    ) -> Result<AssistantTurn> {
537        // The command protocol is buffered (one request/response per op), so
538        // stream by replaying the finished turn's events, exactly like the
539        // trait's mock-less default.
540        let turn = self.respond_impl(platform, model, skill, messages, session, mocks)?;
541        for event in &turn.events {
542            if on_event(event).is_break() {
543                break;
544            }
545        }
546        Ok(turn)
547    }
548
549    fn simulate_user(&self, model: &str, persona: &str, messages: &[Message]) -> Result<UserTurn> {
550        let request = Request::User {
551            model,
552            persona,
553            messages,
554        };
555        let payload: UserPayload = self.call(&request, "user")?;
556        Ok(UserTurn {
557            message: payload.message,
558            stop: payload.stop,
559            usage: payload.usage,
560        })
561    }
562
563    fn judge(
564        &self,
565        model: &str,
566        query: &JudgeQuery<'_>,
567        messages: &[Message],
568    ) -> Result<JudgeVerdict> {
569        let (min, max) = match query.scale {
570            Some((lo, hi)) => (Some(lo), Some(hi)),
571            None => (None, None),
572        };
573        let request = Request::Judge {
574            model,
575            kind: query.kind.as_str(),
576            criterion: query.criterion,
577            min,
578            max,
579            messages,
580        };
581        let payload: JudgePayload = self.call(&request, "judge")?;
582        Ok(JudgeVerdict {
583            value: payload.value,
584            reason: payload.reason,
585            usage: payload.usage,
586        })
587    }
588}
589
590// ---------------------------------------------------------------------------
591// OneharnessProvider
592// ---------------------------------------------------------------------------
593
594/// The default [`Provider`]: runs each prompt on a harness through the
595/// `oneharness` CLI (targets **v0.3.8+** — the release carrying opt-in run
596/// history: `run --history`/`--history-dir`/`--history-name`, a `history_file`
597/// in the report, and the `oneharness history` verb).
598///
599/// Wires six real oneharness features:
600///
601/// * `--system <skill instructions>` — the skill becomes a *real* system prompt
602///   on the underlying harness (e.g. `--append-system-prompt` for claude-code),
603///   instead of being inlined into the user message.
604/// * `--resume <session>` — multi-turn `respond` calls thread the previous
605///   `session_id` so the harness sees a continuing conversation (and keeps its
606///   tool state, files, etc.) instead of being re-prompted with a stringified
607///   transcript. Used only for harnesses that report `supports_resume` in the
608///   registry (claude-code, opencode, cursor today); other harnesses fall back
609///   to the inline-transcript path.
610/// * `--events` — normalized tool events (`{kind, name, input, output, index}`)
611///   lifted from each harness's transcript, so consumers can analyze *what the
612///   skill did*, not just its final text. Attached to the assistant turn.
613/// * Normalized `usage` (`input_tokens`, `output_tokens`, `cost_usd`) — surfaced
614///   on every turn so cross-model cost reporting is portable.
615/// * Normalized `failure_kind` (`auth`, `rate_limit`, `model_not_found`, …) —
616///   classified provider errors so the CLI can distinguish a broken environment
617///   from a broken skill.
618/// * `--history --history-dir <dir> --history-name <name>` — each *skill* run is
619///   recorded to a centralized history directory shared across every skilltest
620///   invocation, so past runs can be reviewed with `oneharness history`. The
621///   run's `history_file` is echoed back; the provider turns it into a
622///   ready-to-run `oneharness history show …` command on the assistant turn. The
623///   judge and simulated-user calls are deliberately never recorded.
624///
625/// Note on approval mode: skilltest deliberately passes **no `--mode`** flag, so
626/// oneharness applies its own default (v0.3.0+ normalized approval modes). Users
627/// who need a different mode — e.g. `bypass` to let the skill take every action
628/// without prompting — configure it through oneharness's own config
629/// (`ONEHARNESS_MODE` / its config file), keeping approval policy in one place.
630///
631/// Evals and the simulated user always run on the configured `judge_harness`,
632/// independent of the harness under test, so the evaluator does not drift with
633/// the matrix.
pub struct OneharnessProvider {
    /// NOTE(review): presumably the `oneharness` executable to invoke —
    /// confirm against the constructor.
    bin: String,
    /// The harness that evals and the simulated user always run on,
    /// independent of the harness under test.
    judge_harness: String,
    /// NOTE(review): presumably a per-run time limit in seconds — confirm
    /// where it is applied.
    timeout_secs: u64,
    /// Record skill runs to oneharness history when true.
    history: bool,
    /// The centralized directory history is written to (`--history-dir`).
    history_dir: PathBuf,
}
643
/// The subset of the `oneharness run` JSON envelope we consume.
#[derive(Deserialize)]
struct OhEnvelope {
    /// The run's results. This field is required: an envelope without
    /// `results` fails to deserialize.
    results: Vec<OhResult>,
    /// Absolute path of the JSONL session file oneharness recorded this run to
    /// (its `--history` output). It lives on the run **report**, alongside
    /// `results` — not per result; `null`/absent when history was off or the
    /// oneharness build predates the feature. Its presence is how the provider
    /// knows a reviewable session exists before offering a `history show`
    /// command.
    #[serde(default)]
    history_file: Option<String>,
}
657
/// One entry of the envelope's `results`. Only `status` is required; every
/// other field falls back to its default when absent.
#[derive(Deserialize)]
struct OhResult {
    /// The result's terminal status. `ok` is success; any other value (for
    /// example `timeout`) is treated as a failure.
    status: String,
    /// oneharness's best-effort extracted reply text; may be null.
    #[serde(default)]
    text: Option<String>,
    /// Raw harness stdout. oneharness's `text` extraction is best-effort and may
    /// be null when a harness's output shape defeats it, with stdout as the
    /// documented fallback; we honor that rather than hard-failing. No harness in
    /// the live matrix relies on it today (OpenCode's JSONL — the case that
    /// motivated this — is extracted natively as of oneharness v0.2.37), but the
    /// contract holds for any harness, so the fallback stays as defense-in-depth.
    #[serde(default)]
    stdout: String,
    /// NOTE(review): presumably the raw harness stderr, parallel to
    /// `stdout` — confirm against the oneharness report schema.
    #[serde(default)]
    stderr: String,
    /// NOTE(review): presumably oneharness's error message for a failed
    /// result — confirm against the oneharness report schema.
    #[serde(default)]
    error: Option<String>,
    /// The harness session handle, when the harness reports one.
    #[serde(default)]
    session_id: Option<String>,
    /// Normalized token/cost usage, when reported.
    #[serde(default)]
    usage: Option<Usage>,
    /// Normalized tool events oneharness lifted from the harness transcript (its
    /// `--events` output); `null`/absent when the harness exposes none.
    #[serde(default)]
    events: Option<Vec<ToolEvent>>,
    /// oneharness's normalized failure classification (`auth`, `rate_limit`,
    /// `model_not_found`, …), set on classified failures.
    #[serde(default)]
    failure_kind: Option<String>,
}
686
/// Parameters for one `oneharness run` invocation.
///
/// The three plain fields are always set; the optional ones are only used for
/// skill (`respond`) runs — see [`RunArgs::plain`] for the judge /
/// simulated-user shape.
struct RunArgs<'a> {
    /// The harness to run the prompt on.
    harness: &'a str,
    /// The model to run the prompt with.
    model: &'a str,
    /// The prompt text for this run.
    prompt: &'a str,
    /// Becomes `--system <text>`; only set on `respond` so the skill is the
    /// system prompt rather than inlined into the user turn.
    system: Option<&'a str>,
    /// Becomes `--resume <id>`; only set when the runner wants to continue a
    /// prior harness session.
    resume: Option<&'a str>,
    /// Becomes `--mock-rules <file>` (when the plan carries rules) plus
    /// `--spy-file <file>` (always, so every tool call is recorded); only set
    /// on `respond` — the judge and simulated user are never mocked.
    mocks: Option<&'a MockPlan<'a>>,
    /// Becomes `--history --history-dir <dir> --history-name <name>`; only set
    /// on `respond` so the skill run is recorded to the centralized history — the
    /// judge and simulated user are deliberately never recorded.
    history: Option<HistoryArgs<'a>>,
}
707
/// The history-recording flags for one skill `oneharness run`: the values
/// behind `--history-dir` and `--history-name`. Both are borrowed.
struct HistoryArgs<'a> {
    /// The centralized directory history is written to (`--history-dir`).
    dir: &'a Path,
    /// A stable, review-friendly session name (`--history-name`) the provider
    /// also embeds in the `history show` command it surfaces.
    name: &'a str,
}
716
717impl<'a> RunArgs<'a> {
718    /// The common mock-less, history-less shape (judge / simulated-user calls).
719    fn plain(harness: &'a str, model: &'a str, prompt: &'a str) -> Self {
720        RunArgs {
721            harness,
722            model,
723            prompt,
724            system: None,
725            resume: None,
726            mocks: None,
727            history: None,
728        }
729    }
730}
731
/// What we get back from one `oneharness run`.
struct RunOutcome {
    /// The harness's reply text.
    text: String,
    /// The harness session handle, when one was reported.
    session_id: Option<String>,
    /// Normalized token/cost usage, when reported.
    usage: Option<Usage>,
    /// Normalized tool events for the run.
    events: Vec<ToolEvent>,
    /// The spy-log records (present iff the run had a mock plan; empty when
    /// the hook observed no tool calls).
    mock_calls: Option<Vec<MockCall>>,
    /// The absolute path oneharness recorded this run's history to, when history
    /// was on and the run was recorded; `None` otherwise.
    history_file: Option<String>,
}
745
/// The per-run temp files a mock plan needs: the rules JSON `--mock-rules`
/// reads and the JSONL path `--spy-file` appends to. The directory is removed
/// on drop, so every exit path that holds a `MockFiles` (including errors)
/// cleans up.
struct MockFiles {
    /// The private temp directory holding both files; removed on drop.
    dir: std::path::PathBuf,
    /// Path of the written rules file; `None` when the plan carries no rules
    /// (a spy-only run).
    rules: Option<std::path::PathBuf>,
    /// Path the spy log is expected at. The file itself is not created here —
    /// it may never exist if no tool call is observed.
    spy: std::path::PathBuf,
}
754
755impl MockFiles {
756    /// Write the plan's compiled ruleset into a fresh private temp dir.
757    fn prepare(plan: &MockPlan<'_>) -> Result<MockFiles> {
758        let dir = std::env::temp_dir().join(format!(
759            "skilltest-mocks-{}-{}",
760            std::process::id(),
761            curl_config_nonce()
762        ));
763        std::fs::create_dir_all(&dir).map_err(|e| {
764            Error::provider("oneharness", format!("could not create mock temp dir: {e}"))
765        })?;
766        let rules = match plan.rules {
767            Some(rules) => {
768                let path = dir.join("rules.json");
769                std::fs::write(&path, rules.to_string()).map_err(|e| {
770                    Error::provider("oneharness", format!("could not write mock rules: {e}"))
771                })?;
772                Some(path)
773            }
774            None => None,
775        };
776        Ok(MockFiles {
777            spy: dir.join("spy.jsonl"),
778            rules,
779            dir,
780        })
781    }
782
783    /// Parse the spy log the run left behind. A missing file means the hook
784    /// never fired (the turn made no tool calls) — an empty record set, not an
785    /// error; a malformed line is loud (see [`parse_spy_log`]).
786    fn records(&self) -> Result<Vec<MockCall>> {
787        match std::fs::read_to_string(&self.spy) {
788            Ok(text) => parse_spy_log(&text),
789            Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(Vec::new()),
790            Err(e) => Err(Error::provider(
791                "oneharness",
792                format!("could not read spy log `{}`: {e}", self.spy.display()),
793            )),
794        }
795    }
796}
797
798impl Drop for MockFiles {
799    fn drop(&mut self) {
800        let _ = std::fs::remove_dir_all(&self.dir);
801    }
802}
803
/// Pick the text to treat as the harness's reply.
///
/// oneharness's extracted `text` wins when it is non-blank; otherwise the raw
/// `stdout` is used when that is non-blank. Extraction is best-effort and, per
/// oneharness's contract, `text` may be null when a harness's output shape
/// defeats it — the reply still survives in stdout. (OpenCode's JSONL once hit
/// this; oneharness v0.2.37 extracts it natively, so the fallback is now
/// defense-in-depth.)
///
/// Returns `None` only when both are blank — the one case that is a genuine
/// "the harness said nothing" error. The chosen value is returned untrimmed.
fn select_reply_text(text: Option<String>, stdout: &str) -> Option<String> {
    match text {
        Some(extracted) if !extracted.trim().is_empty() => Some(extracted),
        _ if stdout.trim().is_empty() => None,
        _ => Some(stdout.to_string()),
    }
}
815
816/// Build the provider error for a non-`ok` oneharness result, classifying it
817/// structurally. oneharness's `failure_kind` (`auth`/`rate_limit`/… , set on
818/// classified failures) wins; absent that, a terminal `status` skilltest
819/// recognizes — today `timeout` — still yields a category, so a deadline is
820/// [`ProviderErrorKind::Timeout`] rather than an unclassified error whose only
821/// signal is the word "timeout" in the message.
822fn oh_failure(context: String, message: String, failure_kind: Option<&str>, status: &str) -> Error {
823    match oh_failure_kind(failure_kind, status) {
824        Some(kind) => Error::provider_classified(context, message, kind),
825        None => Error::provider(context, message),
826    }
827}
828
829/// Classify a non-`ok` oneharness result: its `failure_kind` when set, else a
830/// category inferred from the terminal `status`.
831fn oh_failure_kind(failure_kind: Option<&str>, status: &str) -> Option<ProviderErrorKind> {
832    if let Some(raw) = failure_kind.filter(|k| !k.is_empty()) {
833        return Some(ProviderErrorKind::classify(raw));
834    }
835    match status {
836        "timeout" => Some(ProviderErrorKind::Timeout),
837        _ => None,
838    }
839}
840
841impl OneharnessProvider {
842    /// Build a provider from its configuration. The history directory is
843    /// resolved once here: the configured `history_dir` if set, otherwise the
844    /// centralized [`default_history_dir`] shared across skilltest invocations.
845    #[must_use]
846    pub fn new(config: &OneharnessConfig) -> Self {
847        Self {
848            bin: config.bin.clone(),
849            judge_harness: config.judge_harness.clone(),
850            timeout_secs: config.timeout_secs,
851            history: config.history,
852            history_dir: config
853                .history_dir
854                .as_ref()
855                .map_or_else(default_history_dir, PathBuf::from),
856        }
857    }
858
859    /// The history session name for a skill run, or `None` when recording is
860    /// disabled. Stable across a case's turns (it seeds off the opening user
861    /// prompt plus the platform/model), so every turn of one run lands in the
862    /// same reviewable session and one `history show` command covers the run.
863    fn history_name(&self, platform: &str, model: &str, messages: &[Message]) -> Option<String> {
864        self.history
865            .then(|| history_session_name(platform, model, messages))
866    }
867
868    /// Turn a finished run into a reviewable command, given the history name it
869    /// was recorded under. Returns `None` unless recording was on *and*
870    /// oneharness confirmed it wrote the session (`history_file`), so we never
871    /// surface a command that would resolve to nothing.
872    fn history_command(&self, name: Option<&str>, outcome: &RunOutcome) -> Option<String> {
873        match (name, &outcome.history_file) {
874            (Some(name), Some(_)) => Some(history_view_command(&self.bin, &self.history_dir, name)),
875            _ => None,
876        }
877    }
878
879    /// Run one prompt on `harness` and return the normalized text plus the
880    /// session id and usage (when oneharness lifted them from the harness's
881    /// output).
882    fn run(&self, args: &RunArgs<'_>) -> Result<RunOutcome> {
883        let timeout = self.timeout_secs.to_string();
884        let mut cmd = Command::new(&self.bin);
885        // Intentionally no `--output-format` override: oneharness already requests
886        // each harness's *default* format (json for claude-code/opencode,
887        // stream-json for cursor, text for codex/goose/qwen/crush/copilot) and
888        // extracts the reply accordingly. Forcing `json` everywhere broke the
889        // text-native harnesses — oneharness would json-extract their plain-text
890        // reply and find nothing ("harness produced no extractable text").
891        //
892        // `--events` asks oneharness to surface normalized tool events. It is safe
893        // for text extraction: oneharness only upgrades a harness whose default
894        // format carries no tool transcript to its events-capable format
895        // (claude→stream-json, codex→exec --json, qwen→stream-json) and still
896        // extracts the reply from it; harnesses whose default already carries a
897        // transcript (opencode, cursor) or expose none (goose/crush/copilot) are
898        // left on their default. So the reply keeps working everywhere and
899        // `events` is populated wherever the harness can express it.
900        //
901        // No `--mode`: oneharness applies its own default approval mode; users
902        // tune it (e.g. `bypass`) via oneharness config, not from here.
903        cmd.args([
904            "run",
905            "--harness",
906            args.harness,
907            "--compact",
908            "--events",
909            "--timeout",
910            &timeout,
911            "--prompt-file",
912            "-",
913        ]);
914        // An empty model means "unspecified" — omit `--model` so the harness uses
915        // its own default (cursor/crush/copilot) or an env-selected model (qwen
916        // via OPENAI_MODEL, goose via GOOSE_MODEL), exactly as oneharness's own
917        // smoke scripts do. Forwarding `--model ""` would push a broken empty
918        // model flag to the harness CLI.
919        if !args.model.is_empty() {
920            cmd.args(["--model", args.model]);
921        }
922        if let Some(system) = args.system {
923            cmd.args(["--system", system]);
924        }
925        if let Some(resume) = args.resume {
926            cmd.args(["--resume", resume]);
927        }
928        // Skill runs are recorded to the centralized history so past runs are
929        // reviewable; the judge/simulated-user calls carry no history args.
930        push_history_args(&mut cmd, args.history.as_ref());
931        // A mock plan rides oneharness's ephemeral per-run delivery: the
932        // compiled ruleset via `--mock-rules`, and always a `--spy-file` so
933        // every observed call (mocked or allowed) is recorded.
934        let mock_files = args.mocks.map(MockFiles::prepare).transpose()?;
935        if let Some(files) = &mock_files {
936            if let Some(rules) = &files.rules {
937                cmd.arg("--mock-rules");
938                cmd.arg(rules);
939            }
940            cmd.arg("--spy-file");
941            cmd.arg(&files.spy);
942        }
943
944        let mut child = cmd
945            .stdin(Stdio::piped())
946            .stdout(Stdio::piped())
947            .stderr(Stdio::piped())
948            .spawn()
949            .map_err(|e| {
950                Error::provider(
951                    "oneharness",
952                    format!(
953                        "could not run `{}`: {e}. Is oneharness installed and on PATH?",
954                        self.bin
955                    ),
956                )
957            })?;
958
959        child
960            .stdin
961            .as_mut()
962            .ok_or_else(|| Error::provider("oneharness", "could not open oneharness stdin"))?
963            .write_all(args.prompt.as_bytes())
964            .map_err(|e| Error::provider("oneharness", format!("could not write prompt: {e}")))?;
965
966        let output = child.wait_with_output().map_err(|e| {
967            Error::provider("oneharness", format!("oneharness did not complete: {e}"))
968        })?;
969
970        let stdout = String::from_utf8_lossy(&output.stdout);
971        let envelope: OhEnvelope = serde_json::from_str(stdout.trim()).map_err(|e| {
972            Error::provider(
973                "oneharness",
974                format!(
975                    "could not parse oneharness output: {e}; stderr: {}",
976                    String::from_utf8_lossy(&output.stderr).trim()
977                ),
978            )
979        })?;
980
981        // `history_file` is on the run report, not per result — capture it
982        // before consuming `results`.
983        let history_file = envelope.history_file;
984        let result = envelope
985            .results
986            .into_iter()
987            .next()
988            .ok_or_else(|| Error::provider("oneharness", "oneharness returned no results"))?;
989
990        if result.status != "ok" {
991            let detail = result
992                .error
993                .filter(|e| !e.is_empty())
994                .or_else(|| Some(result.stderr.clone()).filter(|s| !s.is_empty()))
995                .unwrap_or_else(|| format!("status `{}`", result.status));
996            let context = format!("oneharness:{}", args.harness);
997            let message = format!("harness run failed: {detail}");
998            return Err(oh_failure(
999                context,
1000                message,
1001                result.failure_kind.as_deref(),
1002                &result.status,
1003            ));
1004        }
1005
1006        // Prefer oneharness's extracted `text`; fall back to raw stdout when a
1007        // harness's output shape defeats extraction (oneharness's documented
1008        // contract — see OhResult::stdout). Only a run that produced *neither* is
1009        // a real error.
1010        let text = select_reply_text(result.text, &result.stdout).ok_or_else(|| {
1011            Error::provider(
1012                format!("oneharness:{}", args.harness),
1013                "harness produced neither extractable text nor stdout",
1014            )
1015        })?;
1016        let mock_calls = mock_files.as_ref().map(MockFiles::records).transpose()?;
1017        Ok(RunOutcome {
1018            text,
1019            session_id: result.session_id,
1020            usage: result.usage,
1021            events: result.events.unwrap_or_default(),
1022            mock_calls,
1023            history_file,
1024        })
1025    }
1026
1027    /// Like [`OneharnessProvider::run`], but drives `oneharness run --stream`,
1028    /// forwarding each normalized tool event to `on_event` the instant it is
1029    /// observed. When `on_event` returns [`ControlFlow::Break`], the oneharness
1030    /// child is killed — closing its stream tears the harness down, so a bad turn
1031    /// is cut off instead of paid for in full — and the partial outcome (the
1032    /// events seen so far) is returned.
1033    fn run_streaming(
1034        &self,
1035        args: &RunArgs<'_>,
1036        on_event: &mut dyn FnMut(&ToolEvent) -> ControlFlow<()>,
1037    ) -> Result<RunOutcome> {
1038        let timeout = self.timeout_secs.to_string();
1039        let mut cmd = Command::new(&self.bin);
1040        // `--stream` emits NDJSON: one `{"type":"event",…}` line per tool event
1041        // as observed, then a terminal `{"type":"result","report":{…}}`. It
1042        // implies `--events`; no `--compact` (the stream is line-oriented) and no
1043        // `--mode` (oneharness's default applies — see `run`).
1044        cmd.args([
1045            "run",
1046            "--harness",
1047            args.harness,
1048            "--stream",
1049            "--events",
1050            "--timeout",
1051            &timeout,
1052            "--prompt-file",
1053            "-",
1054        ]);
1055        if !args.model.is_empty() {
1056            cmd.args(["--model", args.model]);
1057        }
1058        if let Some(system) = args.system {
1059            cmd.args(["--system", system]);
1060        }
1061        if let Some(resume) = args.resume {
1062            cmd.args(["--resume", resume]);
1063        }
1064        push_history_args(&mut cmd, args.history.as_ref());
1065        let mock_files = args.mocks.map(MockFiles::prepare).transpose()?;
1066        if let Some(files) = &mock_files {
1067            if let Some(rules) = &files.rules {
1068                cmd.arg("--mock-rules");
1069                cmd.arg(rules);
1070            }
1071            cmd.arg("--spy-file");
1072            cmd.arg(&files.spy);
1073        }
1074
1075        let mut child = cmd
1076            .stdin(Stdio::piped())
1077            .stdout(Stdio::piped())
1078            .stderr(Stdio::piped())
1079            .spawn()
1080            .map_err(|e| {
1081                Error::provider(
1082                    "oneharness",
1083                    format!(
1084                        "could not run `{}`: {e}. Is oneharness installed and on PATH?",
1085                        self.bin
1086                    ),
1087                )
1088            })?;
1089
1090        // Write the prompt and close stdin so oneharness starts, then read its
1091        // NDJSON incrementally — events arrive live and we never deadlock on a
1092        // full stdout pipe.
1093        {
1094            let mut stdin = child
1095                .stdin
1096                .take()
1097                .ok_or_else(|| Error::provider("oneharness", "could not open oneharness stdin"))?;
1098            stdin.write_all(args.prompt.as_bytes()).map_err(|e| {
1099                Error::provider("oneharness", format!("could not write prompt: {e}"))
1100            })?;
1101        }
1102
1103        let stdout = child
1104            .stdout
1105            .take()
1106            .ok_or_else(|| Error::provider("oneharness", "could not open oneharness stdout"))?;
1107        let reader = BufReader::new(stdout);
1108
1109        let mut events: Vec<ToolEvent> = Vec::new();
1110        let mut result_env: Option<OhEnvelope> = None;
1111        let mut aborted = false;
1112
1113        for line in reader.lines() {
1114            let line = line.map_err(|e| {
1115                Error::provider("oneharness", format!("could not read stream: {e}"))
1116            })?;
1117            let trimmed = line.trim();
1118            if trimmed.is_empty() {
1119                continue;
1120            }
1121            // Tolerate non-JSON log lines interleaved on the stream.
1122            let Ok(value) = serde_json::from_str::<serde_json::Value>(trimmed) else {
1123                continue;
1124            };
1125            match value.get("type").and_then(serde_json::Value::as_str) {
1126                Some("event") => {
1127                    if let Ok(event) = serde_json::from_value::<ToolEvent>(value["event"].clone()) {
1128                        let flow = on_event(&event);
1129                        events.push(event);
1130                        if flow.is_break() {
1131                            aborted = true;
1132                            let _ = child.kill();
1133                            break;
1134                        }
1135                    }
1136                }
1137                Some("result") => {
1138                    if let Ok(env) = serde_json::from_value::<OhEnvelope>(value["report"].clone()) {
1139                        result_env = Some(env);
1140                    }
1141                }
1142                _ => {}
1143            }
1144        }
1145
1146        let output = child.wait_with_output().map_err(|e| {
1147            Error::provider("oneharness", format!("oneharness did not complete: {e}"))
1148        })?;
1149
1150        if aborted {
1151            // Torn down on purpose; return the partial turn (events seen so
1152            // far). The spy log may be torn mid-line by the kill, and an
1153            // aborted run is never scored, so no records are reported.
1154            return Ok(RunOutcome {
1155                text: String::new(),
1156                session_id: None,
1157                usage: None,
1158                events,
1159                mock_calls: None,
1160                // An aborted run is never scored or reviewed.
1161                history_file: None,
1162            });
1163        }
1164
1165        let envelope = result_env.ok_or_else(|| {
1166            Error::provider(
1167                "oneharness",
1168                format!(
1169                    "oneharness stream produced no result; stderr: {}",
1170                    String::from_utf8_lossy(&output.stderr).trim()
1171                ),
1172            )
1173        })?;
1174        // `history_file` is on the run report, not per result — capture it
1175        // before consuming `results`.
1176        let history_file = envelope.history_file;
1177        let result = envelope
1178            .results
1179            .into_iter()
1180            .next()
1181            .ok_or_else(|| Error::provider("oneharness", "oneharness returned no results"))?;
1182        if result.status != "ok" {
1183            let detail = result
1184                .error
1185                .filter(|e| !e.is_empty())
1186                .or_else(|| Some(result.stderr.clone()).filter(|s| !s.is_empty()))
1187                .unwrap_or_else(|| format!("status `{}`", result.status));
1188            let context = format!("oneharness:{}", args.harness);
1189            let message = format!("harness run failed: {detail}");
1190            return Err(oh_failure(
1191                context,
1192                message,
1193                result.failure_kind.as_deref(),
1194                &result.status,
1195            ));
1196        }
1197        let text = select_reply_text(result.text, &result.stdout).ok_or_else(|| {
1198            Error::provider(
1199                format!("oneharness:{}", args.harness),
1200                "harness produced neither extractable text nor stdout",
1201            )
1202        })?;
1203        // Prefer the events we streamed; fall back to the result's events only if
1204        // the stream carried none.
1205        let events = if events.is_empty() {
1206            result.events.unwrap_or_default()
1207        } else {
1208            events
1209        };
1210        let mock_calls = mock_files.as_ref().map(MockFiles::records).transpose()?;
1211        Ok(RunOutcome {
1212            text,
1213            session_id: result.session_id,
1214            usage: result.usage,
1215            events,
1216            mock_calls,
1217            history_file,
1218        })
1219    }
1220}
1221
1222impl Provider for OneharnessProvider {
1223    fn respond(
1224        &self,
1225        platform: &str,
1226        model: &str,
1227        skill: &SkillRef<'_>,
1228        messages: &[Message],
1229        session: Option<&str>,
1230    ) -> Result<AssistantTurn> {
1231        self.respond_with_mocks(platform, model, skill, messages, session, None)
1232    }
1233
1234    fn respond_with_mocks(
1235        &self,
1236        platform: &str,
1237        model: &str,
1238        skill: &SkillRef<'_>,
1239        messages: &[Message],
1240        session: Option<&str>,
1241        mocks: Option<&MockPlan<'_>>,
1242    ) -> Result<AssistantTurn> {
1243        // If we have a real session to continue on a supporting harness, only
1244        // send the last user message — the harness still has its prior state.
1245        // Otherwise inline the whole transcript so harnesses without resume
1246        // still see the conversation.
1247        let prompt = if session.is_some() {
1248            latest_user_message(messages).unwrap_or_default()
1249        } else {
1250            render_transcript_for_respond(messages)
1251        };
1252        let history_name = self.history_name(platform, model, messages);
1253        let outcome = self.run(&RunArgs {
1254            harness: platform,
1255            model,
1256            prompt: &prompt,
1257            system: Some(skill.instructions),
1258            resume: session,
1259            mocks,
1260            history: history_name.as_deref().map(|name| HistoryArgs {
1261                dir: &self.history_dir,
1262                name,
1263            }),
1264        })?;
1265        let history_command = self.history_command(history_name.as_deref(), &outcome);
1266        Ok(AssistantTurn {
1267            message: outcome.text.trim().to_string(),
1268            done: false,
1269            usage: outcome.usage,
1270            session_id: outcome.session_id,
1271            events: outcome.events,
1272            mock_calls: outcome.mock_calls,
1273            history_command,
1274        })
1275    }
1276
1277    fn respond_streaming(
1278        &self,
1279        platform: &str,
1280        model: &str,
1281        skill: &SkillRef<'_>,
1282        messages: &[Message],
1283        session: Option<&str>,
1284        on_event: &mut dyn FnMut(&ToolEvent) -> ControlFlow<()>,
1285    ) -> Result<AssistantTurn> {
1286        self.respond_streaming_with_mocks(platform, model, skill, messages, session, None, on_event)
1287    }
1288
1289    // One over clippy's arg limit; the signature is respond_streaming's plus
1290    // the mock plan, and a params struct would obscure the trait symmetry.
1291    #[allow(clippy::too_many_arguments)]
1292    fn respond_streaming_with_mocks(
1293        &self,
1294        platform: &str,
1295        model: &str,
1296        skill: &SkillRef<'_>,
1297        messages: &[Message],
1298        session: Option<&str>,
1299        mocks: Option<&MockPlan<'_>>,
1300        on_event: &mut dyn FnMut(&ToolEvent) -> ControlFlow<()>,
1301    ) -> Result<AssistantTurn> {
1302        let prompt = if session.is_some() {
1303            latest_user_message(messages).unwrap_or_default()
1304        } else {
1305            render_transcript_for_respond(messages)
1306        };
1307        let history_name = self.history_name(platform, model, messages);
1308        let outcome = self.run_streaming(
1309            &RunArgs {
1310                harness: platform,
1311                model,
1312                prompt: &prompt,
1313                system: Some(skill.instructions),
1314                resume: session,
1315                mocks,
1316                history: history_name.as_deref().map(|name| HistoryArgs {
1317                    dir: &self.history_dir,
1318                    name,
1319                }),
1320            },
1321            on_event,
1322        )?;
1323        let history_command = self.history_command(history_name.as_deref(), &outcome);
1324        Ok(AssistantTurn {
1325            message: outcome.text.trim().to_string(),
1326            done: false,
1327            usage: outcome.usage,
1328            session_id: outcome.session_id,
1329            events: outcome.events,
1330            mock_calls: outcome.mock_calls,
1331            history_command,
1332        })
1333    }
1334
1335    fn simulate_user(&self, model: &str, persona: &str, messages: &[Message]) -> Result<UserTurn> {
1336        let prompt = build_user_prompt(persona, messages);
1337        let outcome = self.run(&RunArgs::plain(&self.judge_harness, model, &prompt))?;
1338        Ok(UserTurn {
1339            message: outcome.text.trim().to_string(),
1340            stop: false,
1341            usage: outcome.usage,
1342        })
1343    }
1344
1345    fn judge(
1346        &self,
1347        model: &str,
1348        query: &JudgeQuery<'_>,
1349        messages: &[Message],
1350    ) -> Result<JudgeVerdict> {
1351        let prompt = build_judge_prompt(query, messages);
1352        let outcome = self.run(&RunArgs::plain(&self.judge_harness, model, &prompt))?;
1353        let mut verdict = parse_verdict(query.kind, &outcome.text)?;
1354        verdict.usage = outcome.usage;
1355        Ok(verdict)
1356    }
1357
1358    fn supports_resume(&self, platform: &str) -> bool {
1359        supports_resume(platform)
1360    }
1361}
1362
/// Whether `harness` is one that oneharness's adapter table marks
/// `supports_resume = true` (claude-code's `--resume`, opencode's `--session`,
/// cursor's `--resume`).
///
/// Keep the list in step with the `oneharness list` registry: when a new
/// harness ships session continuation, add it here so the runner threads
/// `session_id`. Matching is exact and case-sensitive.
#[must_use]
pub fn supports_resume(harness: &str) -> bool {
    const RESUMABLE: [&str; 3] = ["claude-code", "opencode", "cursor"];
    RESUMABLE.contains(&harness)
}
1371
1372// ---------------------------------------------------------------------------
1373// Run history helpers
1374// ---------------------------------------------------------------------------
1375
1376/// The centralized directory oneharness run history is written to, shared across
1377/// every skilltest invocation so past runs accumulate in one reviewable place
1378/// (rather than scattering per project the way oneharness's own default would).
1379///
1380/// Resolution: `SKILLTEST_HISTORY_DIR` if set (the escape hatch tests use);
1381/// otherwise `<state dir>/skilltest/oneharness-history`, where the state dir is
1382/// `$XDG_STATE_HOME` or, failing that, `$HOME/.local/state` — the same
1383/// convention oneharness follows across the Linux/macOS matrix. A last-resort
1384/// fallback uses the temp dir so the path is always absolute.
1385#[must_use]
1386pub fn default_history_dir() -> PathBuf {
1387    resolve_history_dir(
1388        std::env::var_os("SKILLTEST_HISTORY_DIR"),
1389        std::env::var_os("XDG_STATE_HOME"),
1390        std::env::var_os("HOME"),
1391    )
1392}
1393
/// Environment-free core of [`default_history_dir`]: the three env values are
/// passed in so the resolution can be tested without touching the process
/// environment.
///
/// An empty value counts as unset. A non-empty `override_dir` is returned
/// as-is. Otherwise the state root is `xdg_state`, else `<home>/.local/state`,
/// else the system temp dir, and `skilltest/oneharness-history` is appended.
fn resolve_history_dir(
    override_dir: Option<std::ffi::OsString>,
    xdg_state: Option<std::ffi::OsString>,
    home: Option<std::ffi::OsString>,
) -> PathBuf {
    let non_empty =
        |value: Option<std::ffi::OsString>| value.filter(|candidate| !candidate.is_empty());

    if let Some(explicit) = non_empty(override_dir) {
        return PathBuf::from(explicit);
    }
    let state_root = match (non_empty(xdg_state), non_empty(home)) {
        (Some(xdg), _) => PathBuf::from(xdg),
        (None, Some(home_dir)) => PathBuf::from(home_dir).join(".local").join("state"),
        (None, None) => std::env::temp_dir(),
    };
    state_root.join("skilltest").join("oneharness-history")
}
1414
1415/// Append the history-recording flags for a skill run, when set.
1416fn push_history_args(cmd: &mut Command, history: Option<&HistoryArgs<'_>>) {
1417    if let Some(h) = history {
1418        cmd.arg("--history");
1419        cmd.arg("--history-dir");
1420        cmd.arg(h.dir);
1421        cmd.arg("--history-name");
1422        cmd.arg(h.name);
1423    }
1424}
1425
1426/// A stable, review-friendly `oneharness --history-name` for a skill run.
1427///
1428/// It combines the platform, model, and a slug + short hash of the case's
1429/// opening user prompt so that: distinct cases (different prompts) get distinct
1430/// sessions; every turn of one case reuses the same name (the opening prompt is
1431/// constant across a run's turns), so a resumed multi-turn run stays one
1432/// reviewable session; and re-running a case reuses the name, so `history show`
1433/// (newest match wins) lands on the latest run. The result is filesystem- and
1434/// shell-safe by construction (lowercase ascii, digits, and `-`).
1435fn history_session_name(platform: &str, model: &str, messages: &[Message]) -> String {
1436    let seed = messages
1437        .iter()
1438        .find(|m| m.role == Role::User)
1439        .map_or("", |m| m.content.as_str());
1440    let model = if model.is_empty() { "default" } else { model };
1441    format!(
1442        "skilltest-{}-{}-{}-{:08x}",
1443        slug(platform),
1444        slug(model),
1445        slug(seed),
1446        (fnv1a(seed.as_bytes()) & 0xFFFF_FFFF) as u32,
1447    )
1448}
1449
1450/// The command that replays a recorded run: `<bin> history show <name>
1451/// --history-dir <dir>`. `bin` and `dir` are shell-quoted in case of spaces;
1452/// `name` is safe by construction (see [`history_session_name`]).
1453fn history_view_command(bin: &str, dir: &Path, name: &str) -> String {
1454    format!(
1455        "{} history show {name} --history-dir {}",
1456        shell_quote(bin),
1457        shell_quote(&dir.to_string_lossy()),
1458    )
1459}
1460
/// Reduce `s` to a lowercase-ascii/digit/`-` slug for use as a history-name
/// segment.
///
/// ASCII letters and digits are kept (lowercased); every run of other
/// characters becomes a single `-`. The result is capped at 32 characters and
/// carries no leading or trailing `-`. Input with nothing to keep yields `x`,
/// so a name never has an empty segment.
fn slug(s: &str) -> String {
    const MAX_LEN: usize = 32;

    let mut out = String::with_capacity(s.len().min(MAX_LEN));
    for c in s.chars() {
        // Every push below is one ASCII byte, so the byte length is also the
        // character count and never overshoots the cap.
        if out.len() >= MAX_LEN {
            break;
        }
        match c {
            c if c.is_ascii_alphanumeric() => out.push(c.to_ascii_lowercase()),
            // A separator at the very start, or straight after another
            // separator, is dropped.
            _ if out.is_empty() || out.ends_with('-') => {}
            _ => out.push('-'),
        }
    }
    // A leading `-` can never be produced, so only the tail needs trimming.
    match out.trim_end_matches('-') {
        "" => "x".to_string(),
        kept => kept.to_string(),
    }
}
1487
/// 64-bit FNV-1a hash of `bytes`.
///
/// Deterministic and dependency-free; its only job is to give a history name a
/// short, collision-resistant suffix per opening prompt.
fn fnv1a(bytes: &[u8]) -> u64 {
    const OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const PRIME: u64 = 0x0000_0100_0000_01b3;

    // Per byte: xor it in, then multiply by the prime (wrapping).
    bytes.iter().fold(OFFSET_BASIS, |hash, &byte| {
        (hash ^ u64::from(byte)).wrapping_mul(PRIME)
    })
}
1498
/// POSIX single-quote `s` so it can be shown inside a runnable command.
///
/// A non-empty token made up only of ASCII letters, digits, and
/// `-_/.:@+=` is returned untouched for readability. Anything else —
/// including the empty string — is wrapped in single quotes, with each
/// embedded `'` written as `'\''`.
fn shell_quote(s: &str) -> String {
    fn is_plain(c: char) -> bool {
        c.is_ascii_alphanumeric() || "-_/.:@+=".contains(c)
    }

    if !s.is_empty() && s.chars().all(is_plain) {
        return s.to_owned();
    }
    format!("'{}'", s.replace('\'', "'\\''"))
}
1522
1523// ---------------------------------------------------------------------------
1524// ApiJudgeProvider + SplitProvider
1525// ---------------------------------------------------------------------------
1526
/// A judge-only [`Provider`] that scores evals and plays the simulated user with
/// a *direct* model API call (Anthropic or OpenAI), rather than running them
/// through a harness.
///
/// Why this exists: routing the judge through a full agentic harness pays an
/// agent-loop cold start on every short verdict. A direct API call is one HTTP
/// round trip — faster and cheaper on API-key auth — and still reuses the exact
/// same judge/user prompts and tolerant verdict parsing as
/// [`OneharnessProvider`], so the two are directly comparable.
///
/// It does not run skills: `respond` returns an error. Compose it with a
/// skill-running provider via [`SplitProvider`] so the harness under test still
/// drives `respond`, while the judge runs on the API.
///
/// The request is sent with `curl` (Rust has no official vendor SDK). The API
/// key is read from an env var and passed through a private (`0600`) `curl`
/// config file, so it never appears in `argv` / `ps`.
pub struct ApiJudgeProvider {
    // Which vendor API is called; also selects the per-vendor defaults for the
    // two fields below when the config leaves them unset.
    vendor: ApiVendor,
    // Name of the environment variable the API key is read from (the config's
    // `api_key_env`, else the vendor default).
    api_key_env: String,
    // Where the request is sent (the config's `base_url`, else the vendor
    // default).
    endpoint: String,
    // NOTE(review): presumably the per-request timeout in seconds — confirm
    // against the request code.
    timeout_secs: u64,
    // NOTE(review): presumably the `curl` executable to invoke — confirm
    // against the request code.
    curl_bin: String,
    // NOTE(review): semantics are not visible from this declaration — confirm
    // where it is read.
    strict_json: bool,
}
1552
/// Retry budget for a transient API failure (rate limit / overload): the
/// number of additional attempts made, with exponential backoff in between,
/// before the error is returned.
const MAX_RETRIES: u32 = 2;
1556
1557/// One model reply plus the usage the API reported for it.
1558#[derive(Debug)]
1559struct ChatOutcome {
1560    text: String,
1561    usage: Option<Usage>,
1562}
1563
/// Deliberately minimal system prompt. The real judge / user-simulation
/// instructions come from the shared prompt builders, which keeps this string
/// identical across vendors.
const JUDGE_SYSTEM: &str =
    "Follow the user's instructions exactly and respond with only what they ask for.";
1568
1569impl ApiJudgeProvider {
1570    /// Build a provider from its configuration, resolving per-vendor defaults
1571    /// for the API-key env var and endpoint.
1572    #[must_use]
1573    pub fn new(config: &ApiJudgeConfig) -> Self {
1574        let api_key_env = config
1575            .api_key_env
1576            .clone()
1577            .unwrap_or_else(|| match config.vendor {
1578                ApiVendor::Anthropic => "ANTHROPIC_API_KEY".to_string(),
1579                ApiVendor::Openai => "OPENAI_API_KEY".to_string(),
1580            });
1581        let endpoint = config
1582            .base_url
1583            .clone()
1584            .unwrap_or_else(|| match config.vendor {
1585                ApiVendor::Anthropic => "https://api.anthropic.com/v1/messages".to_string(),
1586                ApiVendor::Openai => "https://api.openai.com/v1/chat/completions".to_string(),
1587            });
1588        Self {
1589            vendor: config.vendor,
1590            api_key_env,
1591            endpoint,
1592            timeout_secs: config.timeout_secs,
1593            curl_bin: config.curl_bin.clone(),
1594            strict_json: config.strict_json,
1595        }
1596    }
1597
1598    /// One chat round trip: build the vendor request, POST it, parse the reply.
1599    /// `schema`, when set, constrains the reply to that JSON schema via the
1600    /// vendor's structured-outputs feature. Transient failures (rate limit /
1601    /// overload) are retried with exponential backoff.
1602    fn chat(
1603        &self,
1604        model: &str,
1605        system: &str,
1606        user: &str,
1607        schema: Option<serde_json::Value>,
1608    ) -> Result<ChatOutcome> {
1609        let key = std::env::var(&self.api_key_env).map_err(|_| {
1610            Error::provider_classified(
1611                "api-judge",
1612                format!("API key env var `{}` is not set", self.api_key_env),
1613                ProviderErrorKind::Auth,
1614            )
1615        })?;
1616        let body = build_chat_body(self.vendor, model, system, user, schema);
1617        let payload = serde_json::to_vec(&body)
1618            .map_err(|e| Error::provider("api-judge", format!("could not encode request: {e}")))?;
1619
1620        let mut attempt = 0;
1621        loop {
1622            let result = self
1623                .run_curl(&key, &payload)
1624                .and_then(|raw| parse_chat_response(self.vendor, &raw));
1625            match result {
1626                Ok(outcome) => return Ok(outcome),
1627                Err(err) if attempt < MAX_RETRIES && is_retryable(&err) => {
1628                    attempt += 1;
1629                    std::thread::sleep(std::time::Duration::from_millis(500 * (1 << attempt)));
1630                }
1631                Err(err) => return Err(err),
1632            }
1633        }
1634    }
1635
1636    /// Per-vendor request headers.
1637    fn headers(&self, key: &str) -> Vec<(String, String)> {
1638        match self.vendor {
1639            ApiVendor::Anthropic => vec![
1640                ("x-api-key".to_string(), key.to_string()),
1641                ("anthropic-version".to_string(), "2023-06-01".to_string()),
1642                ("content-type".to_string(), "application/json".to_string()),
1643            ],
1644            ApiVendor::Openai => vec![
1645                ("authorization".to_string(), format!("Bearer {key}")),
1646                ("content-type".to_string(), "application/json".to_string()),
1647            ],
1648        }
1649    }
1650
1651    /// POST `body` via `curl`, with the URL + headers (including the API key) in
1652    /// a private config file so the key stays out of `argv`. Returns stdout.
1653    fn run_curl(&self, key: &str, body: &[u8]) -> Result<String> {
1654        let path = std::env::temp_dir().join(format!(
1655            "skilltest-judge-{}-{}.cfg",
1656            std::process::id(),
1657            curl_config_nonce()
1658        ));
1659        write_curl_config(&path, &self.endpoint, &self.headers(key), self.timeout_secs)?;
1660        let outcome = self.exec_curl(&path, body);
1661        // The key-bearing config is needed only for this one invocation.
1662        let _ = std::fs::remove_file(&path);
1663        outcome
1664    }
1665
1666    fn exec_curl(&self, config_path: &std::path::Path, body: &[u8]) -> Result<String> {
1667        let mut child = Command::new(&self.curl_bin)
1668            .arg("--config")
1669            .arg(config_path)
1670            .arg("--data-binary")
1671            .arg("@-")
1672            .stdin(Stdio::piped())
1673            .stdout(Stdio::piped())
1674            .stderr(Stdio::piped())
1675            .spawn()
1676            .map_err(|e| {
1677                Error::provider(
1678                    "api-judge",
1679                    format!(
1680                        "could not run `{}`: {e}. Is curl installed and on PATH?",
1681                        self.curl_bin
1682                    ),
1683                )
1684            })?;
1685
1686        child
1687            .stdin
1688            .as_mut()
1689            .ok_or_else(|| Error::provider("api-judge", "could not open curl stdin"))?
1690            .write_all(body)
1691            .map_err(|e| Error::provider("api-judge", format!("could not write request: {e}")))?;
1692
1693        let output = child
1694            .wait_with_output()
1695            .map_err(|e| Error::provider("api-judge", format!("curl did not complete: {e}")))?;
1696
1697        if !output.status.success() {
1698            let stderr = String::from_utf8_lossy(&output.stderr);
1699            let message = format!("curl failed ({}): {}", output.status, stderr.trim());
1700            // curl exit 28 is "operation timed out" (`--max-time` elapsed) — the
1701            // one curl status skilltest can classify structurally.
1702            return Err(match output.status.code() {
1703                Some(28) => {
1704                    Error::provider_classified("api-judge", message, ProviderErrorKind::Timeout)
1705                }
1706                _ => Error::provider("api-judge", message),
1707            });
1708        }
1709        Ok(String::from_utf8_lossy(&output.stdout).into_owned())
1710    }
1711}
1712
1713impl Provider for ApiJudgeProvider {
1714    fn respond(
1715        &self,
1716        _platform: &str,
1717        _model: &str,
1718        _skill: &SkillRef<'_>,
1719        _messages: &[Message],
1720        _session: Option<&str>,
1721    ) -> Result<AssistantTurn> {
1722        Err(Error::provider(
1723            "api-judge",
1724            "the API judge does not run skills; use it as the judge in a SplitProvider",
1725        ))
1726    }
1727
1728    fn simulate_user(&self, model: &str, persona: &str, messages: &[Message]) -> Result<UserTurn> {
1729        let prompt = build_user_prompt(persona, messages);
1730        // Free-form text reply — never schema-constrained.
1731        let outcome = self.chat(model, JUDGE_SYSTEM, &prompt, None)?;
1732        Ok(UserTurn {
1733            message: outcome.text.trim().to_string(),
1734            stop: false,
1735            usage: outcome.usage,
1736        })
1737    }
1738
1739    fn judge(
1740        &self,
1741        model: &str,
1742        query: &JudgeQuery<'_>,
1743        messages: &[Message],
1744    ) -> Result<JudgeVerdict> {
1745        let prompt = build_judge_prompt(query, messages);
1746        // Constrain the verdict to the `{value, reason}` schema when strict JSON
1747        // is on, so the reply is guaranteed parseable rather than scraped.
1748        let schema = self.strict_json.then(|| verdict_schema(query.kind));
1749        let outcome = self.chat(model, JUDGE_SYSTEM, &prompt, schema)?;
1750        let mut verdict = parse_verdict(query.kind, &outcome.text)?;
1751        verdict.usage = outcome.usage;
1752        Ok(verdict)
1753    }
1754}
1755
1756/// A [`Provider`] that runs skills with one provider and judges with another:
1757/// `respond` (and `supports_resume`) go to the skill-running provider; `judge`
1758/// and `simulate_user` go to the judge. This keeps harness fidelity for the
1759/// thing under test while letting the judge run on a fast, cheap, deterministic
1760/// backend (typically [`ApiJudgeProvider`]).
1761pub struct SplitProvider {
1762    responder: Box<dyn Provider>,
1763    judge: ApiJudgeProvider,
1764}
1765
1766impl SplitProvider {
1767    /// Compose a skill-running `responder` with an API `judge`.
1768    #[must_use]
1769    pub fn new(responder: Box<dyn Provider>, judge: ApiJudgeProvider) -> Self {
1770        Self { responder, judge }
1771    }
1772}
1773
1774impl Provider for SplitProvider {
1775    fn respond(
1776        &self,
1777        platform: &str,
1778        model: &str,
1779        skill: &SkillRef<'_>,
1780        messages: &[Message],
1781        session: Option<&str>,
1782    ) -> Result<AssistantTurn> {
1783        self.responder
1784            .respond(platform, model, skill, messages, session)
1785    }
1786
1787    fn respond_with_mocks(
1788        &self,
1789        platform: &str,
1790        model: &str,
1791        skill: &SkillRef<'_>,
1792        messages: &[Message],
1793        session: Option<&str>,
1794        mocks: Option<&MockPlan<'_>>,
1795    ) -> Result<AssistantTurn> {
1796        self.responder
1797            .respond_with_mocks(platform, model, skill, messages, session, mocks)
1798    }
1799
1800    // One over clippy's arg limit; the signature is respond_streaming's plus
1801    // the mock plan, and a params struct would obscure the trait symmetry.
1802    #[allow(clippy::too_many_arguments)]
1803    fn respond_streaming_with_mocks(
1804        &self,
1805        platform: &str,
1806        model: &str,
1807        skill: &SkillRef<'_>,
1808        messages: &[Message],
1809        session: Option<&str>,
1810        mocks: Option<&MockPlan<'_>>,
1811        on_event: &mut dyn FnMut(&ToolEvent) -> ControlFlow<()>,
1812    ) -> Result<AssistantTurn> {
1813        self.responder.respond_streaming_with_mocks(
1814            platform, model, skill, messages, session, mocks, on_event,
1815        )
1816    }
1817
1818    fn simulate_user(&self, model: &str, persona: &str, messages: &[Message]) -> Result<UserTurn> {
1819        self.judge.simulate_user(model, persona, messages)
1820    }
1821
1822    fn judge(
1823        &self,
1824        model: &str,
1825        query: &JudgeQuery<'_>,
1826        messages: &[Message],
1827    ) -> Result<JudgeVerdict> {
1828        self.judge.judge(model, query, messages)
1829    }
1830
1831    fn supports_resume(&self, platform: &str) -> bool {
1832        self.responder.supports_resume(platform)
1833    }
1834}
1835
/// Next value of a process-local, monotonically increasing counter.
///
/// Combined with the pid, it gives each concurrent `curl` config its own
/// temp-file name.
fn curl_config_nonce() -> u64 {
    static NEXT: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(0);
    NEXT.fetch_add(1, std::sync::atomic::Ordering::Relaxed)
}
1843
/// Backslash-escape `\` and `"` so `value` can sit inside a double-quoted
/// `curl` config entry.
fn curl_escape(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for c in value.chars() {
        if matches!(c, '\\' | '"') {
            escaped.push('\\');
        }
        escaped.push(c);
    }
    escaped
}
1848
1849/// Write a `curl` config file (`0600` on Unix) carrying the URL, headers, and
1850/// timeout. The request body is streamed separately on stdin (`--data-binary
1851/// @-`), so it never needs escaping into this file.
1852fn write_curl_config(
1853    path: &std::path::Path,
1854    url: &str,
1855    headers: &[(String, String)],
1856    timeout_secs: u64,
1857) -> Result<()> {
1858    let mut config = String::new();
1859    config.push_str(&format!("url = \"{}\"\n", curl_escape(url)));
1860    config.push_str("request = \"POST\"\n");
1861    for (name, value) in headers {
1862        config.push_str(&format!("header = \"{}: {}\"\n", name, curl_escape(value)));
1863    }
1864    config.push_str(&format!("max-time = {timeout_secs}\n"));
1865    config.push_str("silent\nshow-error\n");
1866
1867    let mut options = std::fs::OpenOptions::new();
1868    options.write(true).create(true).truncate(true);
1869    #[cfg(unix)]
1870    {
1871        use std::os::unix::fs::OpenOptionsExt as _;
1872        options.mode(0o600);
1873    }
1874    let mut file = options
1875        .open(path)
1876        .map_err(|e| Error::provider("api-judge", format!("could not write curl config: {e}")))?;
1877    file.write_all(config.as_bytes())
1878        .map_err(|e| Error::provider("api-judge", format!("could not write curl config: {e}")))?;
1879    Ok(())
1880}
1881
1882/// The JSON schema a judge verdict must match: `{value, reason}` with `value`
1883/// typed by the eval kind. Numeric bounds are intentionally omitted — vendor
1884/// structured outputs don't enforce `minimum`/`maximum`, and the runner already
1885/// range-checks the parsed value.
1886fn verdict_schema(kind: JudgeKind) -> serde_json::Value {
1887    let value_type = match kind {
1888        JudgeKind::Boolean => "boolean",
1889        JudgeKind::Numeric => "number",
1890    };
1891    serde_json::json!({
1892        "type": "object",
1893        "properties": {
1894            "value": { "type": value_type },
1895            "reason": { "type": "string" },
1896        },
1897        "required": ["value", "reason"],
1898        "additionalProperties": false,
1899    })
1900}
1901
1902/// Build the JSON request body for one chat completion. Outgoing data, so it is
1903/// constructed directly; responses are parsed into typed models below. When
1904/// `schema` is set, the vendor's structured-outputs field is added so the reply
1905/// is guaranteed to match it.
1906fn build_chat_body(
1907    vendor: ApiVendor,
1908    model: &str,
1909    system: &str,
1910    user: &str,
1911    schema: Option<serde_json::Value>,
1912) -> serde_json::Value {
1913    match vendor {
1914        ApiVendor::Anthropic => {
1915            let mut body = serde_json::json!({
1916                "model": model,
1917                "max_tokens": 1024,
1918                "system": system,
1919                "messages": [{ "role": "user", "content": user }],
1920            });
1921            if let Some(schema) = schema {
1922                body["output_config"] =
1923                    serde_json::json!({ "format": { "type": "json_schema", "schema": schema } });
1924            }
1925            body
1926        }
1927        ApiVendor::Openai => {
1928            let mut body = serde_json::json!({
1929                "model": model,
1930                "max_tokens": 1024,
1931                "messages": [
1932                    { "role": "system", "content": system },
1933                    { "role": "user", "content": user },
1934                ],
1935            });
1936            if let Some(schema) = schema {
1937                body["response_format"] = serde_json::json!({
1938                    "type": "json_schema",
1939                    "json_schema": { "name": "verdict", "strict": true, "schema": schema },
1940                });
1941            }
1942            body
1943        }
1944    }
1945}
1946
1947/// True iff the error is a transient API condition worth retrying.
1948fn is_retryable(err: &Error) -> bool {
1949    matches!(
1950        err,
1951        Error::Provider {
1952            kind: Some(ProviderErrorKind::RateLimit | ProviderErrorKind::Overloaded),
1953            ..
1954        }
1955    )
1956}
1957
1958// Typed views of the vendor responses (trust-boundary input — always parsed,
1959// never string-matched).
1960
1961#[derive(Deserialize)]
1962struct ApiErrorBody {
1963    #[serde(rename = "type", default)]
1964    kind: Option<String>,
1965    #[serde(default)]
1966    message: Option<String>,
1967}
1968
1969#[derive(Deserialize)]
1970struct AnthropicBlock {
1971    #[serde(rename = "type")]
1972    kind: String,
1973    #[serde(default)]
1974    text: Option<String>,
1975}
1976
1977#[derive(Deserialize)]
1978struct AnthropicUsage {
1979    #[serde(default)]
1980    input_tokens: Option<u64>,
1981    #[serde(default)]
1982    output_tokens: Option<u64>,
1983}
1984
1985#[derive(Deserialize)]
1986struct AnthropicResponse {
1987    #[serde(default)]
1988    content: Vec<AnthropicBlock>,
1989    #[serde(default)]
1990    usage: Option<AnthropicUsage>,
1991    #[serde(default)]
1992    stop_reason: Option<String>,
1993    #[serde(default)]
1994    error: Option<ApiErrorBody>,
1995}
1996
1997#[derive(Deserialize)]
1998struct OpenAiMessage {
1999    #[serde(default)]
2000    content: Option<String>,
2001}
2002
2003#[derive(Deserialize)]
2004struct OpenAiChoice {
2005    #[serde(default)]
2006    message: Option<OpenAiMessage>,
2007}
2008
2009#[derive(Deserialize)]
2010struct OpenAiUsage {
2011    #[serde(default)]
2012    prompt_tokens: Option<u64>,
2013    #[serde(default)]
2014    completion_tokens: Option<u64>,
2015}
2016
2017#[derive(Deserialize)]
2018struct OpenAiResponse {
2019    #[serde(default)]
2020    choices: Vec<OpenAiChoice>,
2021    #[serde(default)]
2022    usage: Option<OpenAiUsage>,
2023    #[serde(default)]
2024    error: Option<ApiErrorBody>,
2025}
2026
2027/// Map a vendor error `type` onto skilltest's classified provider-error kinds so
2028/// consumers get the same categories (and the CLI the same pointed hints) it
2029/// gives for harness failures.
2030fn classify_api_error(kind: Option<&str>) -> Option<ProviderErrorKind> {
2031    match kind? {
2032        "authentication_error" | "invalid_api_key" | "permission_error" => {
2033            Some(ProviderErrorKind::Auth)
2034        }
2035        "rate_limit_error" | "rate_limit_exceeded" => Some(ProviderErrorKind::RateLimit),
2036        "insufficient_quota" | "billing_error" => Some(ProviderErrorKind::Quota),
2037        "not_found_error" => Some(ProviderErrorKind::ModelNotFound),
2038        // Transient server-side conditions — surfaced as `overloaded` so the
2039        // runner retries them (see `is_retryable`).
2040        "overloaded_error" | "api_error" | "server_error" | "service_unavailable" => {
2041            Some(ProviderErrorKind::Overloaded)
2042        }
2043        _ => None,
2044    }
2045}
2046
2047fn api_error(err: ApiErrorBody) -> Error {
2048    let message = err
2049        .message
2050        .unwrap_or_else(|| "API returned an error".to_string());
2051    match classify_api_error(err.kind.as_deref()) {
2052        Some(kind) => Error::provider_classified("api-judge", message, kind),
2053        None => Error::provider("api-judge", message),
2054    }
2055}
2056
/// Keep at most the first 500 characters of `raw` for use in an error message,
/// always cutting on a UTF-8 character boundary.
fn truncate_for_error(raw: &str) -> String {
    const MAX_CHARS: usize = 500;
    // Byte offset where the 501st character starts, or the whole string when
    // it is short enough.
    let cut = raw
        .char_indices()
        .nth(MAX_CHARS)
        .map_or(raw.len(), |(offset, _)| offset);
    raw[..cut].to_owned()
}
2061
2062/// Parse a vendor chat response into the reply text plus normalized usage.
2063fn parse_chat_response(vendor: ApiVendor, raw: &str) -> Result<ChatOutcome> {
2064    match vendor {
2065        ApiVendor::Anthropic => {
2066            let resp: AnthropicResponse = serde_json::from_str(raw.trim()).map_err(|e| {
2067                Error::provider(
2068                    "api-judge",
2069                    format!(
2070                        "could not parse API response: {e}; got: {}",
2071                        truncate_for_error(raw)
2072                    ),
2073                )
2074            })?;
2075            if let Some(err) = resp.error {
2076                return Err(api_error(err));
2077            }
2078            let text = resp
2079                .content
2080                .iter()
2081                .filter(|b| b.kind == "text")
2082                .filter_map(|b| b.text.as_deref())
2083                .collect::<String>();
2084            if text.trim().is_empty() {
2085                return Err(Error::provider(
2086                    "api-judge",
2087                    format!(
2088                        "judge returned no text (stop_reason: {:?})",
2089                        resp.stop_reason
2090                    ),
2091                ));
2092            }
2093            let usage = resp.usage.map(|u| Usage {
2094                input_tokens: u.input_tokens,
2095                output_tokens: u.output_tokens,
2096                cost_usd: None,
2097            });
2098            Ok(ChatOutcome { text, usage })
2099        }
2100        ApiVendor::Openai => {
2101            let resp: OpenAiResponse = serde_json::from_str(raw.trim()).map_err(|e| {
2102                Error::provider(
2103                    "api-judge",
2104                    format!(
2105                        "could not parse API response: {e}; got: {}",
2106                        truncate_for_error(raw)
2107                    ),
2108                )
2109            })?;
2110            if let Some(err) = resp.error {
2111                return Err(api_error(err));
2112            }
2113            let text = resp
2114                .choices
2115                .into_iter()
2116                .next()
2117                .and_then(|c| c.message)
2118                .and_then(|m| m.content)
2119                .unwrap_or_default();
2120            if text.trim().is_empty() {
2121                return Err(Error::provider("api-judge", "judge returned no text"));
2122            }
2123            let usage = resp.usage.map(|u| Usage {
2124                input_tokens: u.prompt_tokens,
2125                output_tokens: u.completion_tokens,
2126                cost_usd: None,
2127            });
2128            Ok(ChatOutcome { text, usage })
2129        }
2130    }
2131}
2132
2133/// Render the conversation as `Role: content` lines for inlining in a prompt.
2134/// Used by the judge, the simulated user, and the no-resume fallback path of
2135/// `respond`.
2136fn render_transcript(messages: &[Message]) -> String {
2137    messages
2138        .iter()
2139        .map(|m| {
2140            let role = match m.role {
2141                Role::User => "User",
2142                Role::Assistant => "Assistant",
2143                Role::System => "System",
2144            };
2145            format!("{role}: {}", m.content)
2146        })
2147        .collect::<Vec<_>>()
2148        .join("\n")
2149}
2150
2151/// The prompt for `respond` when we cannot resume a harness session: inline the
2152/// whole conversation so the stateless harness call sees it. The skill is
2153/// passed separately as `--system`, so it does *not* appear here.
2154fn render_transcript_for_respond(messages: &[Message]) -> String {
2155    format!(
2156        "Conversation so far (most recent last):\n{}\n\n\
2157         Write only the assistant's next reply, following your system \
2158         instructions. Output the reply text and nothing else.",
2159        render_transcript(messages),
2160    )
2161}
2162
2163/// The most recent user message in the transcript — used as the next-turn
2164/// prompt when resuming a real harness session.
2165fn latest_user_message(messages: &[Message]) -> Option<String> {
2166    messages
2167        .iter()
2168        .rev()
2169        .find(|m| m.role == Role::User)
2170        .map(|m| m.content.clone())
2171}
2172
2173fn build_user_prompt(persona: &str, messages: &[Message]) -> String {
2174    format!(
2175        "You are role-playing the USER in a conversation with an AI assistant. \
2176         Stay in character:\n\n{persona}\n\n\
2177         Conversation so far (most recent last):\n{transcript}\n\n\
2178         Write only the user's next message. Output the message text and nothing \
2179         else.",
2180        transcript = render_transcript(messages),
2181    )
2182}
2183
2184fn build_judge_prompt(query: &JudgeQuery<'_>, messages: &[Message]) -> String {
2185    let transcript = render_transcript(messages);
2186    match query.kind {
2187        JudgeKind::Boolean => format!(
2188            "You are a strict, careful evaluator of an AI assistant's behavior.\n\n\
2189             Criterion: {criterion}\n\n\
2190             Transcript:\n{transcript}\n\n\
2191             Decide whether the criterion is satisfied. Respond with ONLY a \
2192             single-line JSON object and nothing else:\n\
2193             {{\"value\": true or false, \"reason\": \"<one short sentence>\"}}",
2194            criterion = query.criterion,
2195        ),
2196        JudgeKind::Numeric => {
2197            let (min, max) = query.scale.unwrap_or((0.0, 10.0));
2198            format!(
2199                "You are a strict, careful evaluator of an AI assistant's behavior.\n\n\
2200                 Criterion: {criterion}\n\n\
2201                 Transcript:\n{transcript}\n\n\
2202                 Score how well the criterion is satisfied on a scale from {min} to \
2203                 {max} (inclusive). Respond with ONLY a single-line JSON object and \
2204                 nothing else:\n\
2205                 {{\"value\": <number between {min} and {max}>, \"reason\": \"<one short sentence>\"}}",
2206                criterion = query.criterion,
2207            )
2208        }
2209    }
2210}
2211
/// Extract the first JSON object from `text`, tolerating code fences and prose
/// around it (real models do not always emit bare JSON).
///
/// Scanning starts at the first `{` and stops at the `}` that balances it.
/// Braces inside JSON string literals, and escaped quotes inside those
/// strings, are ignored while counting. This keeps trailing prose that happens
/// to contain a `}` — or a second object — from being swept into the result.
///
/// If the braces never balance (malformed or truncated output), the span from
/// the first `{` to the last `}` is returned so the JSON parser can report the
/// precise problem. Returns `None` when there is no `{`, or no `}` after it.
fn extract_json_object(text: &str) -> Option<&str> {
    let start = text.find('{')?;

    let mut depth = 0usize;
    let mut in_string = false;
    let mut escaped = false;
    // Byte-wise scanning is safe: every delimiter is ASCII, and ASCII bytes
    // never occur inside a multi-byte UTF-8 sequence.
    for (index, byte) in text.bytes().enumerate().skip(start) {
        if in_string {
            if escaped {
                escaped = false;
            } else if byte == b'\\' {
                escaped = true;
            } else if byte == b'"' {
                in_string = false;
            }
            continue;
        }
        match byte {
            b'"' => in_string = true,
            b'{' => depth += 1,
            b'}' => {
                // `depth` is at least 1 here: the scan starts on a `{` and
                // returns as soon as the count reaches zero.
                depth -= 1;
                if depth == 0 {
                    return Some(&text[start..=index]);
                }
            }
            _ => {}
        }
    }

    // Unbalanced input: fall back to the widest brace-delimited span.
    let end = text.rfind('}')?;
    (end > start).then(|| &text[start..=end])
}
2223
2224fn parse_verdict(kind: JudgeKind, text: &str) -> Result<JudgeVerdict> {
2225    let json = extract_json_object(text).ok_or_else(|| {
2226        Error::provider(
2227            "oneharness:judge",
2228            format!("judge did not return a JSON object; got: {text}"),
2229        )
2230    })?;
2231    let value: serde_json::Value = serde_json::from_str(json).map_err(|e| {
2232        Error::provider(
2233            "oneharness:judge",
2234            format!("judge verdict was not valid JSON: {e}; got: {json}"),
2235        )
2236    })?;
2237    let reason = value
2238        .get("reason")
2239        .and_then(serde_json::Value::as_str)
2240        .unwrap_or("")
2241        .to_string();
2242    let raw = value
2243        .get("value")
2244        .ok_or_else(|| Error::provider("oneharness:judge", "judge verdict has no `value` field"))?;
2245
2246    let verdict_value = match kind {
2247        JudgeKind::Boolean => JudgeValue::Bool(raw.as_bool().ok_or_else(|| {
2248            Error::provider(
2249                "oneharness:judge",
2250                format!("boolean judge `value` was not a bool: {raw}"),
2251            )
2252        })?),
2253        JudgeKind::Numeric => JudgeValue::Number(raw.as_f64().ok_or_else(|| {
2254            Error::provider(
2255                "oneharness:judge",
2256                format!("numeric judge `value` was not a number: {raw}"),
2257            )
2258        })?),
2259    };
2260
2261    Ok(JudgeVerdict {
2262        value: verdict_value,
2263        reason,
2264        usage: None,
2265    })
2266}
2267
2268#[cfg(test)]
2269mod tests {
2270    use super::*;
2271
2272    #[test]
2273    fn empty_argv_is_rejected() {
2274        assert!(CommandProvider::new(vec![]).is_err());
2275    }
2276
2277    #[test]
2278    fn request_serializes_with_op_tag() {
2279        let req = Request::Judge {
2280            model: "m",
2281            kind: "numeric",
2282            criterion: "polite",
2283            min: Some(0.0),
2284            max: Some(10.0),
2285            messages: &[],
2286        };
2287        let json = serde_json::to_string(&req).unwrap();
2288        assert!(json.contains("\"op\":\"judge\""));
2289        assert!(json.contains("\"kind\":\"numeric\""));
2290    }
2291
2292    #[test]
2293    fn respond_no_session_inlines_transcript_but_not_skill() {
2294        // The skill is passed via --system now, so the prompt the harness sees
2295        // for respond carries only the transcript.
2296        let messages = [
2297            Message::user("Hi"),
2298            Message::assistant("Hello"),
2299            Message::user("Again?"),
2300        ];
2301        let prompt = render_transcript_for_respond(&messages);
2302        assert!(prompt.contains("User: Hi"));
2303        assert!(prompt.contains("Assistant: Hello"));
2304        assert!(prompt.contains("User: Again?"));
2305        // The skill body must not leak here — it belongs in --system.
2306        assert!(!prompt.contains("SKILL"));
2307    }
2308
2309    #[test]
2310    fn respond_with_session_sends_only_latest_user_message() {
2311        let messages = [
2312            Message::user("Hi"),
2313            Message::assistant("Hello"),
2314            Message::user("Again?"),
2315        ];
2316        assert_eq!(latest_user_message(&messages).as_deref(), Some("Again?"));
2317    }
2318
2319    #[test]
2320    fn extracts_json_from_fenced_or_prose_text() {
2321        assert_eq!(
2322            extract_json_object("```json\n{\"value\": true}\n```"),
2323            Some("{\"value\": true}")
2324        );
2325        assert_eq!(
2326            extract_json_object("Sure! {\"value\": 8, \"reason\": \"x\"} done"),
2327            Some("{\"value\": 8, \"reason\": \"x\"}")
2328        );
2329        assert_eq!(extract_json_object("no json here"), None);
2330    }
2331
2332    #[test]
2333    fn parses_boolean_and_numeric_verdicts() {
2334        let b = parse_verdict(JudgeKind::Boolean, "{\"value\": true, \"reason\": \"ok\"}").unwrap();
2335        assert!(matches!(b.value, JudgeValue::Bool(true)));
2336        assert_eq!(b.reason, "ok");
2337
2338        let n =
2339            parse_verdict(JudgeKind::Numeric, "{\"value\": 8.5, \"reason\": \"good\"}").unwrap();
2340        assert!(matches!(n.value, JudgeValue::Number(v) if (v - 8.5).abs() < f64::EPSILON));
2341    }
2342
2343    #[test]
2344    fn verdict_with_wrong_value_type_errors() {
2345        assert!(parse_verdict(JudgeKind::Boolean, "{\"value\": 3}").is_err());
2346        assert!(parse_verdict(JudgeKind::Numeric, "{\"value\": true}").is_err());
2347        assert!(parse_verdict(JudgeKind::Boolean, "no json").is_err());
2348    }
2349
#[test]
fn usage_accumulates_independently_per_field() {
    // The two samples report different subsets of the fields, so each total
    // has to be accumulated on its own.
    let first = Usage {
        input_tokens: Some(10),
        output_tokens: None,
        cost_usd: Some(0.01),
    };
    let second = Usage {
        input_tokens: Some(5),
        output_tokens: Some(3),
        cost_usd: None,
    };

    let mut total = Usage::default();
    for sample in [&first, &second] {
        total.add(sample);
    }

    assert_eq!(total.input_tokens, Some(15));
    assert_eq!(total.output_tokens, Some(3));
    let cost = total.cost_usd.unwrap();
    assert!((cost - 0.01).abs() < f64::EPSILON);
    assert!(!total.is_empty());
}
2368
#[test]
fn reply_text_prefers_extracted_then_falls_back_to_stdout() {
    // Non-blank extracted text is returned in preference to stdout.
    let preferred = select_reply_text(Some(String::from("clean reply")), "raw noise");
    assert_eq!(preferred, Some("clean reply".into()));

    // Missing or whitespace-only extracted text defers to the raw stdout.
    let raw_event = "{\"type\":\"text\",\"part\":{\"text\":\"pong\"}}";
    let from_stdout = select_reply_text(None, raw_event);
    assert_eq!(
        from_stdout,
        Some("{\"type\":\"text\",\"part\":{\"text\":\"pong\"}}".into())
    );
    let blank_extracted = select_reply_text(Some(String::from("   ")), "fallback");
    assert_eq!(blank_extracted, Some("fallback".into()));

    // With nothing usable on either side there is no reply at all.
    assert!(select_reply_text(None, "   \n").is_none());
    assert!(select_reply_text(Some(String::new()), "").is_none());
}
2390
#[test]
fn supports_resume_covers_known_harnesses() {
    // Harnesses expected to report resume support.
    for harness in ["claude-code", "opencode", "cursor"] {
        assert!(supports_resume(harness));
    }
    // Harnesses expected to report no resume support.
    for harness in ["codex", "goose"] {
        assert!(!supports_resume(harness));
    }
}
2399
#[test]
fn slug_is_lowercase_hyphenated_and_bounded() {
    // (input, expected slug). The last two rows pin that empty and
    // punctuation-only input still produce a non-empty segment.
    let expectations = [
        ("Greet Dr. Smith!", "greet-dr-smith"),
        ("  --weird__name-- ", "weird-name"),
        ("", "x"),
        ("!!!", "x"),
    ];
    for (input, expected) in expectations {
        assert_eq!(slug(input), expected);
    }

    // Long input is capped at 32 chars and never ends on a hyphen.
    let oversized = "a b ".repeat(40);
    let long = slug(&oversized);
    assert!(long.len() <= 32, "len {}", long.len());
    assert!(!long.ends_with('-'));
}
2412
#[test]
fn history_session_name_is_stable_and_case_distinguishing() {
    let greeting = [Message::user("Greet Dr. Smith")];
    let booking = [Message::user("Book an appointment")];

    // Identical inputs must produce an identical name, so re-runs and later
    // turns land on the same session.
    let name = history_session_name("claude-code", "sonnet", &greeting);
    let repeat = history_session_name("claude-code", "sonnet", &greeting);
    assert_eq!(name, repeat);

    // A different opening prompt must produce a different name.
    let other = history_session_name("claude-code", "sonnet", &booking);
    assert_ne!(name, other);

    assert!(name.starts_with("skilltest-claude-code-sonnet-greet-dr-smith-"));
    // Only lowercase ASCII letters, digits and hyphens may appear.
    assert!(name
        .chars()
        .all(|c| matches!(c, 'a'..='z' | '0'..='9' | '-')));

    // An empty model is rendered as a concrete `default` segment.
    let unnamed_model = history_session_name("cursor", "", &greeting);
    assert!(unnamed_model.contains("-default-"));
}
2437
#[test]
fn shell_quote_leaves_safe_tokens_and_wraps_the_rest() {
    // (raw token, expected quoting). Safe tokens pass through untouched;
    // anything else is single-quoted, with an embedded single quote written
    // using the POSIX '\'' idiom.
    let cases = [
        ("oneharness", "oneharness"),
        ("/a/b-c.d", "/a/b-c.d"),
        ("has space", "'has space'"),
        ("a'b", "'a'\\''b'"),
        ("", "''"),
    ];
    for (raw, quoted) in cases {
        assert_eq!(shell_quote(raw), quoted);
    }
}
2447
#[test]
fn history_view_command_is_runnable() {
    let dir = "/home/u/.local/state/skilltest/oneharness-history";
    let session = "skilltest-claude-code-sonnet-greet-1a2b3c4d";

    // With no characters that need quoting, the command is the plain
    // `history show <session> --history-dir <dir>` invocation.
    let cmd = history_view_command("oneharness", Path::new(dir), session);
    let expected = format!("oneharness history show {session} --history-dir {dir}");
    assert_eq!(cmd, expected.as_str());

    // A directory containing a space comes back single-quoted.
    let spaced = history_view_command("oneharness", Path::new("/tmp/my runs"), "n");
    assert!(spaced.contains("--history-dir '/tmp/my runs'"), "{spaced}");
}
2464
#[test]
fn new_resolves_default_history_dir_when_config_unset() {
    // Built from the default config, the provider has history enabled and a
    // non-empty history directory.
    let defaulted = OneharnessProvider::new(&OneharnessConfig::default());
    assert!(defaulted.history);
    assert!(!defaulted.history_dir.as_os_str().is_empty());

    // A directory named in the config is taken verbatim.
    let config = OneharnessConfig {
        history_dir: Some(String::from("/shared/hist")),
        ..Default::default()
    };
    let explicit = OneharnessProvider::new(&config);
    assert_eq!(explicit.history_dir, PathBuf::from("/shared/hist"));
}
2480
#[test]
fn resolve_history_dir_prefers_override_then_xdg_then_home() {
    use std::ffi::OsString;
    let os = |s: &str| Some(OsString::from(s));
    // Path expected whenever only the home directory is usable.
    let home_default = PathBuf::from("/home/u/.local/state/skilltest/oneharness-history");

    // 1. An explicit override beats both other inputs.
    let with_override = resolve_history_dir(os("/shared/history"), os("/xdg"), os("/home/u"));
    assert_eq!(with_override, PathBuf::from("/shared/history"));

    // 2. Without an override, the XDG state dir is used, under a skilltest
    //    namespace.
    let with_xdg = resolve_history_dir(None, os("/xdg/state"), os("/home/u"));
    assert_eq!(
        with_xdg,
        PathBuf::from("/xdg/state/skilltest/oneharness-history")
    );

    // 3. Without either, it falls back to ~/.local/state.
    let with_home = resolve_history_dir(None, None, os("/home/u"));
    assert_eq!(with_home, home_default);

    // Empty values count as unset.
    let with_empties = resolve_history_dir(os(""), Some(OsString::new()), os("/home/u"));
    assert_eq!(with_empties, home_default);

    // With nothing set at all, the result still ends in the skilltest
    // namespace.
    let bare = resolve_history_dir(None, None, None);
    assert!(bare.ends_with("skilltest/oneharness-history"));
}
2509
2510    fn api_config(vendor: ApiVendor) -> ApiJudgeConfig {
2511        ApiJudgeConfig {
2512            vendor,
2513            api_key_env: None,
2514            base_url: None,
2515            timeout_secs: 60,
2516            curl_bin: "curl".to_string(),
2517            strict_json: true,
2518        }
2519    }
2520
#[test]
fn api_judge_resolves_vendor_defaults() {
    // (vendor, default API-key env var, default endpoint).
    let defaults = [
        (
            ApiVendor::Anthropic,
            "ANTHROPIC_API_KEY",
            "https://api.anthropic.com/v1/messages",
        ),
        (
            ApiVendor::Openai,
            "OPENAI_API_KEY",
            "https://api.openai.com/v1/chat/completions",
        ),
    ];
    for (vendor, key_env, endpoint) in defaults {
        let provider = ApiJudgeProvider::new(&api_config(vendor));
        assert_eq!(provider.api_key_env, key_env);
        assert_eq!(provider.endpoint, endpoint);
    }
}
2534
#[test]
fn api_judge_honors_overrides() {
    // Both the key env var and the endpoint are overridden in the config.
    let config = ApiJudgeConfig {
        vendor: ApiVendor::Openai,
        api_key_env: Some(String::from("MY_KEY")),
        base_url: Some(String::from("https://proxy.example/v1/chat/completions")),
        curl_bin: String::from("curl"),
        timeout_secs: 5,
        strict_json: true,
    };
    let provider = ApiJudgeProvider::new(&config);

    assert_eq!(provider.api_key_env, "MY_KEY");
    assert_eq!(
        provider.endpoint,
        "https://proxy.example/v1/chat/completions"
    );
}
2551
#[test]
fn build_chat_body_shapes_per_vendor() {
    // Anthropic: the system prompt sits in its own top-level field, leaving
    // a single user message, and no schema means no `output_config`.
    let anthropic = build_chat_body(ApiVendor::Anthropic, "claude-x", "sys", "hi", None);
    assert_eq!(anthropic["model"], "claude-x");
    assert_eq!(anthropic["system"], "sys");
    let anthropic_messages = &anthropic["messages"];
    assert_eq!(anthropic_messages[0]["role"], "user");
    assert_eq!(anthropic_messages.as_array().unwrap().len(), 1);
    assert!(anthropic.get("output_config").is_none());

    // OpenAI: the system prompt is the first message, there is no top-level
    // `system`, and no schema means no `response_format`.
    let openai = build_chat_body(ApiVendor::Openai, "gpt-x", "sys", "hi", None);
    let openai_messages = &openai["messages"];
    assert_eq!(openai_messages[0]["role"], "system");
    assert_eq!(openai_messages[1]["role"], "user");
    assert!(openai.get("system").is_none());
    assert!(openai.get("response_format").is_none());
}
2569
#[test]
fn build_chat_body_attaches_strict_schema_per_vendor() {
    // Anthropic carries the schema under `output_config.format`.
    let boolean_schema = verdict_schema(JudgeKind::Boolean);
    let anthropic = build_chat_body(
        ApiVendor::Anthropic,
        "claude-x",
        "sys",
        "hi",
        Some(boolean_schema),
    );
    let output_format = &anthropic["output_config"]["format"];
    assert_eq!(output_format["type"], "json_schema");
    assert_eq!(
        output_format["schema"]["properties"]["value"]["type"],
        "boolean"
    );

    // OpenAI carries it under `response_format.json_schema`, with
    // `strict: true`.
    let numeric_schema = verdict_schema(JudgeKind::Numeric);
    let openai = build_chat_body(
        ApiVendor::Openai,
        "gpt-x",
        "sys",
        "hi",
        Some(numeric_schema),
    );
    let response_format = &openai["response_format"];
    assert_eq!(response_format["type"], "json_schema");
    let json_schema = &response_format["json_schema"];
    assert_eq!(json_schema["strict"], true);
    assert_eq!(json_schema["schema"]["properties"]["value"]["type"], "number");
}
2597
#[test]
fn verdict_schema_requires_value_and_reason_with_no_extras() {
    let schema = verdict_schema(JudgeKind::Numeric);

    // Extra properties are disallowed.
    assert_eq!(schema["additionalProperties"], false);
    // `required` is exactly the two verdict fields, in this order.
    assert_eq!(schema["required"], serde_json::json!(["value", "reason"]));
}
2610
#[test]
fn parses_anthropic_success_with_usage() {
    // Anthropic-shaped success body: one text content block plus token usage.
    let raw = r#"{"content":[{"type":"text","text":"{\"value\": true}"}],
            "stop_reason":"end_turn","usage":{"input_tokens":12,"output_tokens":3}}"#;
    let outcome = parse_chat_response(ApiVendor::Anthropic, raw).unwrap();

    assert_eq!(outcome.text, r#"{"value": true}"#);

    // Token counts are reported; the body has no cost, so none is set.
    let usage = outcome.usage.unwrap();
    assert_eq!(usage.input_tokens, Some(12));
    assert_eq!(usage.output_tokens, Some(3));
    assert_eq!(usage.cost_usd, None);
}
2622
#[test]
fn parses_openai_success_with_usage() {
    // OpenAI-shaped success body: one choice plus prompt/completion counts.
    let raw = r#"{"choices":[{"message":{"content":"{\"value\": 8}"}}],
            "usage":{"prompt_tokens":20,"completion_tokens":4}}"#;
    let outcome = parse_chat_response(ApiVendor::Openai, raw).unwrap();

    assert_eq!(outcome.text, r#"{"value": 8}"#);

    // prompt/completion counts surface as input/output tokens.
    let usage = outcome.usage.unwrap();
    assert_eq!(usage.input_tokens, Some(20));
    assert_eq!(usage.output_tokens, Some(4));
}
2633
#[test]
fn parses_and_classifies_api_errors() {
    // An authentication error body becomes a provider error of kind `Auth`.
    let auth_body = r#"{"error":{"type":"authentication_error","message":"bad key"}}"#;
    let auth_result = parse_chat_response(ApiVendor::Anthropic, auth_body);
    assert!(matches!(
        auth_result,
        Err(Error::Provider {
            kind: Some(ProviderErrorKind::Auth),
            ..
        })
    ));

    // A rate-limit error body becomes a provider error of kind `RateLimit`.
    let rate_body = r#"{"error":{"type":"rate_limit_exceeded","message":"slow down"}}"#;
    let rate_result = parse_chat_response(ApiVendor::Openai, rate_body);
    assert!(matches!(
        rate_result,
        Err(Error::Provider {
            kind: Some(ProviderErrorKind::RateLimit),
            ..
        })
    ));
}
2656
#[test]
fn empty_reply_is_an_error() {
    // A body with an empty content array carries no reply text and must be
    // rejected.
    let empty_body = r#"{"content":[],"stop_reason":"refusal"}"#;
    let parsed = parse_chat_response(ApiVendor::Anthropic, empty_body);
    assert!(parsed.is_err());
}
2662
#[test]
fn classify_api_error_maps_known_kinds() {
    // (error type reported by the API, expected classification). Unknown
    // and missing types both classify as `None`.
    let cases = [
        (Some("invalid_api_key"), Some(ProviderErrorKind::Auth)),
        (Some("insufficient_quota"), Some(ProviderErrorKind::Quota)),
        (
            Some("not_found_error"),
            Some(ProviderErrorKind::ModelNotFound),
        ),
        (
            Some("overloaded_error"),
            Some(ProviderErrorKind::Overloaded),
        ),
        (Some("something_else"), None),
        (None, None),
    ];
    for (error_type, expected) in cases {
        assert_eq!(classify_api_error(error_type), expected);
    }
}
2684
#[test]
fn retryable_covers_transient_errors_only() {
    // (error body, whether it should be retried, failure note).
    let cases = [
        (
            r#"{"error":{"type":"overloaded_error","message":"busy"}}"#,
            true,
            "overload should retry",
        ),
        (
            r#"{"error":{"type":"rate_limit_error","message":"slow"}}"#,
            true,
            "rate limit should retry",
        ),
        (
            r#"{"error":{"type":"authentication_error","message":"bad key"}}"#,
            false,
            "auth must not retry",
        ),
    ];
    for (body, should_retry, why) in cases {
        let err = parse_chat_response(ApiVendor::Anthropic, body).unwrap_err();
        assert_eq!(is_retryable(&err), should_retry, "{why}");
    }
}
2699
#[test]
fn curl_escape_handles_quotes_and_backslashes() {
    // Input is  a"b\c ; each double quote and backslash gains a leading
    // backslash, giving  a\"b\\c .
    let escaped = curl_escape("a\"b\\c");
    assert_eq!(escaped, "a\\\"b\\\\c");
}
2704
2705    /// A skill-running provider stub so the SplitProvider's delegation can be
2706    /// checked without touching the network.
2707    struct StubResponder;
2708
2709    impl Provider for StubResponder {
2710        fn respond(
2711            &self,
2712            _platform: &str,
2713            _model: &str,
2714            _skill: &SkillRef<'_>,
2715            _messages: &[Message],
2716            _session: Option<&str>,
2717        ) -> Result<AssistantTurn> {
2718            Ok(AssistantTurn {
2719                message: "stub reply".to_string(),
2720                ..Default::default()
2721            })
2722        }
2723
2724        fn simulate_user(
2725            &self,
2726            _model: &str,
2727            _persona: &str,
2728            _messages: &[Message],
2729        ) -> Result<UserTurn> {
2730            unreachable!("split provider routes user simulation to the judge")
2731        }
2732
2733        fn judge(
2734            &self,
2735            _model: &str,
2736            _query: &JudgeQuery<'_>,
2737            _messages: &[Message],
2738        ) -> Result<JudgeVerdict> {
2739            unreachable!("split provider routes judging to the judge")
2740        }
2741
2742        fn supports_resume(&self, platform: &str) -> bool {
2743            platform == "claude-code"
2744        }
2745    }
2746
#[test]
fn split_provider_delegates_respond_and_resume() {
    let judge = ApiJudgeProvider::new(&api_config(ApiVendor::Anthropic));
    let split = SplitProvider::new(Box::new(StubResponder), judge);

    // `supports_resume` mirrors the stub responder's answers...
    for (platform, expected) in [("claude-code", true), ("codex", false)] {
        assert_eq!(split.supports_resume(platform), expected);
    }

    // ...and `respond` returns the stub responder's canned reply.
    let skill = SkillRef {
        name: "s",
        dir: "/tmp/s",
        instructions: "do things",
    };
    let no_messages: &[Message] = &[];
    let turn = split
        .respond("claude-code", "m", &skill, no_messages, None)
        .unwrap();
    assert_eq!(turn.message, "stub reply");
}
2766
#[test]
fn api_judge_does_not_run_skills() {
    // The API judge is not a skill runner, so `respond` must fail.
    let skill = SkillRef {
        name: "s",
        dir: "/tmp/s",
        instructions: "x",
    };
    let judge_only = ApiJudgeProvider::new(&api_config(ApiVendor::Anthropic));
    let outcome = judge_only.respond("p", "m", &skill, &[], None);
    assert!(outcome.is_err());
}
2777
2778    // -----------------------------------------------------------------------
2779    // Subprocess-driven coverage: these spawn small shell scripts standing in
2780    // for the provider command / oneharness / curl, so the actual process
2781    // plumbing (`CommandProvider::call`, `OneharnessProvider::run`,
2782    // `ApiJudgeProvider::run_curl`/`exec_curl`/`write_curl_config`) is exercised
2783    // end to end without any network. Unix-only; the whole crate ships to a
2784    // Linux/macOS matrix (see AGENTS.md "Stack and composition").
2785
2786    #[cfg(unix)]
2787    mod subprocess {
2788        // `std::io::Write` is already in scope via `super::*` (the module-level
2789        // `use std::io::Write as _`), so `write_all` resolves without a re-import.
2790        use super::*;
2791        use std::os::unix::fs::PermissionsExt as _;
2792        use std::path::PathBuf;
2793
/// Create `script.sh` in a fresh temp directory, holding a `#!/bin/sh` line
/// followed by `body`, mark it `0o755`, and return its path.
///
/// The directory name combines the process id, `tag`, and a process-wide
/// counter, so no two calls in one process share a directory.
fn script(tag: &str, body: &str) -> PathBuf {
    use std::sync::atomic::{AtomicU64, Ordering};

    // Bumped once per call; this is what keeps directory names distinct.
    static NEXT: AtomicU64 = AtomicU64::new(0);

    let pid = std::process::id();
    let seq = NEXT.fetch_add(1, Ordering::Relaxed);
    let dir = std::env::temp_dir().join(format!("skilltest-prov-{pid}-{tag}-{seq}"));
    std::fs::create_dir_all(&dir).unwrap();

    let path = dir.join("script.sh");
    let contents = format!("#!/bin/sh\n{body}");
    let mut file = std::fs::File::create(&path).unwrap();
    file.write_all(contents.as_bytes()).unwrap();

    // rwxr-xr-x, so the script can be spawned directly.
    std::fs::set_permissions(&path, std::fs::Permissions::from_mode(0o755)).unwrap();
    path
}
2814
2815        fn skill_ref() -> SkillRef<'static> {
2816            SkillRef {
2817                name: "greeter",
2818                dir: "/tmp/greeter",
2819                instructions: "Be nice.",
2820            }
2821        }
2822
2823        // ---- CommandProvider over a real subprocess ----
2824
#[test]
fn command_provider_respond_parses_response() {
    // The fake drains stdin, then prints one fixed respond payload.
    let bin = script(
        "respond",
        concat!(
            "cat >/dev/null\n",
            r#"echo '{"message":"hi there","done":true,"#,
            r#""usage":{"input_tokens":4,"output_tokens":2},"session_id":"s1"}'"#,
            "\n",
        ),
    );
    let argv = vec![bin.to_string_lossy().into_owned()];
    let provider = CommandProvider::new(argv).unwrap();

    let history = [Message::user("hi")];
    let turn = provider
        .respond("demo", "fake", &skill_ref(), &history, None)
        .unwrap();

    assert_eq!(turn.message, "hi there");
    assert!(turn.done);
    assert_eq!(turn.session_id.as_deref(), Some("s1"));
    let usage = turn.usage.unwrap();
    assert_eq!(usage.input_tokens, Some(4));
}
2842
#[test]
fn command_provider_user_and_judge_parse_responses() {
    // Simulated-user op: the fake answers with a message and a stop flag.
    let user_bin = script(
        "user",
        concat!(
            "cat >/dev/null\n",
            r#"echo '{"message":"more please","stop":true}'"#,
            "\n",
        ),
    );
    let user_argv = vec![user_bin.to_string_lossy().into_owned()];
    let user_provider = CommandProvider::new(user_argv).unwrap();
    let user = user_provider.simulate_user("m", "persona", &[]).unwrap();
    assert_eq!(user.message, "more please");
    assert!(user.stop);

    // Judge op: the fake answers with a numeric value and a reason.
    let judge_bin = script(
        "judge",
        concat!(
            "cat >/dev/null\n",
            r#"echo '{"value":7.5,"reason":"ok"}'"#,
            "\n",
        ),
    );
    let judge_argv = vec![judge_bin.to_string_lossy().into_owned()];
    let judge_provider = CommandProvider::new(judge_argv).unwrap();
    let query = JudgeQuery {
        kind: JudgeKind::Numeric,
        criterion: "polite",
        scale: Some((0.0, 10.0)),
    };
    let verdict = judge_provider.judge("m", &query, &[]).unwrap();
    let is_expected_score = match verdict.value {
        JudgeValue::Number(score) => (score - 7.5).abs() < 1e-9,
        _ => false,
    };
    assert!(is_expected_score);
    assert_eq!(verdict.reason, "ok");
}
2870
#[test]
fn command_provider_surfaces_nonzero_exit() {
    // The fake writes `boom` to stderr and exits with status 2.
    let bin = script("fail", "cat >/dev/null\necho 'boom' 1>&2\nexit 2\n");
    let argv = vec![bin.to_string_lossy().into_owned()];
    let provider = CommandProvider::new(argv).unwrap();

    let msg = provider
        .simulate_user("m", "p", &[])
        .unwrap_err()
        .to_string();
    assert!(msg.contains("provider exited"), "got: {msg}");
    assert!(msg.contains("boom"), "stderr is surfaced: {msg}");
}
2880
#[test]
fn command_provider_rejects_empty_and_bad_output() {
    // A provider that prints nothing is reported as producing no output.
    let empty = script("empty", "cat >/dev/null\n");
    let silent = CommandProvider::new(vec![empty.to_string_lossy().into_owned()]).unwrap();
    let query = JudgeQuery {
        kind: JudgeKind::Boolean,
        criterion: "x",
        scale: None,
    };
    let no_output = silent.judge("m", &query, &[]).unwrap_err().to_string();
    assert!(no_output.contains("no output"));

    // A provider that prints non-JSON text is reported as invalid JSON.
    let garbage = script("garbage", "cat >/dev/null\necho 'not json'\n");
    let noisy = CommandProvider::new(vec![garbage.to_string_lossy().into_owned()]).unwrap();
    let bad_json = noisy
        .respond("demo", "m", &skill_ref(), &[], None)
        .unwrap_err()
        .to_string();
    assert!(bad_json.contains("not valid JSON"));
}
2909
#[test]
fn command_provider_reports_missing_binary() {
    // Construction succeeds with a nonexistent path; the failure shows up
    // when an operation is actually attempted.
    let argv = vec![String::from("/no/such/skilltest-provider-binary")];
    let provider = CommandProvider::new(argv).unwrap();

    let failure = provider.simulate_user("m", "p", &[]).unwrap_err();
    let text = failure.to_string();
    assert!(text.contains("could not run provider"));
}
2918
#[test]
fn command_provider_session_is_threaded_into_request() {
    // The fake copies the request from stdin into a sidecar file, so the
    // test can check what actually went over the wire.
    let pid = std::process::id();
    let dir = std::env::temp_dir().join(format!("skilltest-prov-sess-{pid}"));
    std::fs::create_dir_all(&dir).unwrap();
    let seen = dir.join("seen.json");
    let sidecar = seen.display();
    let body = format!("cat > '{sidecar}'\necho '{{\"message\":\"ok\"}}'\n");
    let bin = script("session", &body);

    let argv = vec![bin.to_string_lossy().into_owned()];
    let provider = CommandProvider::new(argv).unwrap();
    let history = [Message::user("hi")];
    provider
        .respond("demo", "m", &skill_ref(), &history, Some("session-xyz"))
        .unwrap();

    // Both the session name and the op appear in the captured request.
    let request = std::fs::read_to_string(&seen).unwrap();
    assert!(
        request.contains("\"session\":\"session-xyz\""),
        "got: {request}"
    );
    assert!(request.contains("\"op\":\"respond\""));
}
2951
#[test]
fn command_provider_threads_mocks_and_parses_records() {
    // The fake copies the request into a sidecar file and answers with one
    // mock_calls record.
    let pid = std::process::id();
    let dir = std::env::temp_dir().join(format!("skilltest-prov-mocks-{pid}"));
    std::fs::create_dir_all(&dir).unwrap();
    let seen = dir.join("seen.json");
    let sidecar = seen.display();
    let body = format!(
        "cat > '{sidecar}'\necho '{{\"message\":\"ok\",\"mock_calls\":[{{\"tool\":\"bash\",                     \"input\":{{\"command\":\"git push\"}},\"action\":\"stub\",\"rule\":0}}]}}'\n"
    );
    let bin = script("mocks", &body);

    let argv = vec![bin.to_string_lossy().into_owned()];
    let provider = CommandProvider::new(argv).unwrap();
    let rules = serde_json::json!({ "rules": [] });
    let plan = MockPlan {
        rules: Some(&rules),
    };
    let turn = provider
        .respond_with_mocks("demo", "m", &skill_ref(), &[], None, Some(&plan))
        .unwrap();

    // The single record from the reply is parsed onto the turn.
    let records = turn.mock_calls.expect("channel was on");
    assert_eq!(records.len(), 1);
    let record = &records[0];
    assert_eq!(record.action, "stub");
    assert_eq!(record.rule, Some(0));

    // The outgoing request carried the mocks block with the compiled rules.
    let request = std::fs::read_to_string(&seen).unwrap();
    assert!(
        request.contains("\"mocks\":{\"rules\":{\"rules\":[]}}"),
        "got: {request}"
    );
}
2985
#[test]
fn command_provider_ignoring_mocks_is_loud() {
    // The fake replies without `mock_calls` even though a plan was sent. That
    // means the mocks were silently ignored, which must be an error rather
    // than a vacuous pass.
    let bin = script(
        "mocks-ignored",
        concat!("cat >/dev/null\n", r#"echo '{"message":"ok"}'"#, "\n"),
    );
    let argv = vec![bin.to_string_lossy().into_owned()];
    let provider = CommandProvider::new(argv).unwrap();

    let plan = MockPlan { rules: None };
    let outcome = provider.respond_with_mocks("demo", "m", &skill_ref(), &[], None, Some(&plan));
    let err = outcome.unwrap_err();
    assert!(
        err.to_string().contains("ignored the request's `mocks`"),
        "{err}"
    );
}
3004
#[test]
fn default_provider_rejects_mocks_loudly() {
    // `StubResponder` does not define `respond_with_mocks` itself, so this
    // exercises the trait's default behaviour.
    let responder = super::StubResponder;
    let plan = MockPlan { rules: None };

    // Given a plan, it must refuse rather than silently drop it.
    let outcome = responder.respond_with_mocks("p", "m", &skill_ref(), &[], None, Some(&plan));
    let err = outcome.unwrap_err();
    assert!(
        err.to_string().contains("does not support tool mocking"),
        "{err}"
    );

    // Given no plan, it behaves like the plain `respond`.
    let turn = responder
        .respond_with_mocks("p", "m", &skill_ref(), &[], None, None)
        .unwrap();
    assert_eq!(turn.message, "stub reply");
}
3023
#[test]
fn default_streaming_rejects_mocks_and_delegates_without() {
    // Streaming counterpart of the buffered default: an error when a plan
    // is supplied, the ordinary reply when it is not.
    let responder = super::StubResponder;
    let plan = MockPlan { rules: None };

    let refused = responder.respond_streaming_with_mocks(
        "p",
        "m",
        &skill_ref(),
        &[],
        None,
        Some(&plan),
        &mut |_| ControlFlow::Continue(()),
    );
    let message = refused.unwrap_err().to_string();
    assert!(message.contains("does not support tool mocking"));

    let turn = responder
        .respond_streaming_with_mocks("p", "m", &skill_ref(), &[], None, None, &mut |_| {
            ControlFlow::Continue(())
        })
        .unwrap();
    assert_eq!(turn.message, "stub reply");
}
3048
#[test]
fn command_provider_streaming_with_mocks_replays_events() {
    // The fake returns one tool_call event and an empty mock_calls list in a
    // single buffered reply. The streaming entry point must still pass that
    // event to the callback and keep the records on the turn.
    let bin = script(
        "mocks-stream",
        concat!(
            "cat >/dev/null\n",
            r#"echo '{"message":"ok","events":[{"kind":"tool_call","name":"bash","input":{"command":"ls"},"index":0}],"mock_calls":[]}'"#,
            "\n",
        ),
    );
    let argv = vec![bin.to_string_lossy().into_owned()];
    let provider = CommandProvider::new(argv).unwrap();
    let plan = MockPlan { rules: None };

    // Count callback invocations; the callback breaks on the first event.
    let mut delivered = 0usize;
    let turn = provider
        .respond_streaming_with_mocks(
            "demo",
            "m",
            &skill_ref(),
            &[],
            None,
            Some(&plan),
            &mut |event| {
                delivered += 1;
                assert_eq!(event.name.as_deref(), Some("bash"));
                ControlFlow::Break(())
            },
        )
        .unwrap();

    assert_eq!(delivered, 1);
    assert_eq!(turn.mock_calls, Some(Vec::new()));
}
3078
3079        // ---- OneharnessProvider over a fake oneharness ----
3080
3081        /// A fixed, easily-asserted history directory for the fake-oneharness
3082        /// tests. Nothing is actually written there — the fake echoes JSON — so a
3083        /// constant string keeps the argv assertions simple.
3084        const TEST_HISTORY_DIR: &str = "/tmp/skilltest-test-history";
3085
3086        fn oh_provider(bin: PathBuf) -> OneharnessProvider {
3087            oh_provider_cfg(bin, true)
3088        }
3089
3090        fn oh_provider_cfg(bin: PathBuf, history: bool) -> OneharnessProvider {
3091            OneharnessProvider::new(&OneharnessConfig {
3092                bin: bin.to_string_lossy().into_owned(),
3093                judge_harness: "claude-code".to_string(),
3094                timeout_secs: 30,
3095                history,
3096                history_dir: Some(TEST_HISTORY_DIR.to_string()),
3097            })
3098        }
3099
3100        #[test]
3101        fn oneharness_respond_extracts_text_and_session() {
3102            let bin = script(
3103                "oh-ok",
3104                "cat >/dev/null\necho '{\"results\":[{\"status\":\"ok\",\
3105                 \"text\":\"  hello back  \",\"session_id\":\"oh1\",\
3106                 \"usage\":{\"input_tokens\":5}}]}'\n",
3107            );
3108            let turn = oh_provider(bin)
3109                .respond(
3110                    "claude-code",
3111                    "sonnet",
3112                    &skill_ref(),
3113                    &[Message::user("hi")],
3114                    None,
3115                )
3116                .unwrap();
3117            assert_eq!(turn.message, "hello back");
3118            assert_eq!(turn.session_id.as_deref(), Some("oh1"));
3119            assert_eq!(turn.usage.unwrap().input_tokens, Some(5));
3120            assert!(turn.events.is_empty());
3121        }
3122
3123        #[test]
3124        fn oneharness_respond_surfaces_normalized_events() {
3125            // oneharness `--events` populates a per-result `events` array; the
3126            // provider lifts it onto the assistant turn so consumers can analyze
3127            // what the skill did.
3128            let bin = script(
3129                "oh-events",
3130                "cat >/dev/null\necho '{\"results\":[{\"status\":\"ok\",\
3131                 \"text\":\"done\",\"events\":[{\"kind\":\"tool_call\",\"name\":\"bash\",\
3132                 \"input\":{\"command\":\"git commit -m x\"},\"output\":\"ok\",\"index\":0}]}]}'\n",
3133            );
3134            let turn = oh_provider(bin)
3135                .respond(
3136                    "claude-code",
3137                    "sonnet",
3138                    &skill_ref(),
3139                    &[Message::user("hi")],
3140                    None,
3141                )
3142                .unwrap();
3143            assert_eq!(turn.events.len(), 1);
3144            assert_eq!(turn.events[0].kind, "tool_call");
3145            assert_eq!(turn.events[0].name.as_deref(), Some("bash"));
3146            assert_eq!(
3147                turn.events[0].input,
3148                Some(serde_json::json!({"command": "git commit -m x"}))
3149            );
3150            assert_eq!(turn.events[0].output.as_deref(), Some("ok"));
3151        }
3152
3153        #[test]
3154        fn oneharness_respond_events_absent_is_empty_not_error() {
3155            // A harness that exposes no tool transcript yields no `events`; the
3156            // turn simply carries an empty list (never an error).
3157            let bin = script(
3158                "oh-noevents",
3159                "cat >/dev/null\necho '{\"results\":[{\"status\":\"ok\",\"text\":\"hi\"}]}'\n",
3160            );
3161            let turn = oh_provider(bin)
3162                .respond("goose", "m", &skill_ref(), &[Message::user("hi")], None)
3163                .unwrap();
3164            assert!(turn.events.is_empty());
3165        }
3166
3167        #[test]
3168        fn oneharness_stream_forwards_events_then_parses_the_result() {
3169            // `oneharness run --stream` emits one NDJSON `{"type":"event",…}` line
3170            // per tool event, then a terminal `{"type":"result","report":{…}}`.
3171            // `respond_streaming` forwards each event live and returns the turn
3172            // parsed from the result.
3173            let bin = script(
3174                "oh-stream",
3175                "cat >/dev/null\n\
3176                 printf '%s\\n' '{\"type\":\"event\",\"event\":{\"kind\":\"tool_call\",\
3177                 \"name\":\"bash\",\"input\":{\"command\":\"ls\"},\"index\":0}}'\n\
3178                 printf '%s\\n' '{\"type\":\"result\",\"report\":{\"results\":[{\"status\":\"ok\",\
3179                 \"text\":\"  done  \",\"session_id\":\"s1\",\"usage\":{\"input_tokens\":7}}]}}'\n",
3180            );
3181            let mut seen = Vec::new();
3182            let turn = oh_provider(bin)
3183                .respond_streaming(
3184                    "claude-code",
3185                    "sonnet",
3186                    &skill_ref(),
3187                    &[Message::user("hi")],
3188                    None,
3189                    &mut |event| {
3190                        seen.push(event.name.clone());
3191                        ControlFlow::Continue(())
3192                    },
3193                )
3194                .unwrap();
3195            // The event was forwarded live, and the result was parsed for the turn.
3196            assert_eq!(seen, vec![Some("bash".to_string())]);
3197            assert_eq!(turn.message, "done");
3198            assert_eq!(turn.session_id.as_deref(), Some("s1"));
3199            assert_eq!(turn.usage.unwrap().input_tokens, Some(7));
3200            assert_eq!(turn.events.len(), 1);
3201            assert_eq!(turn.events[0].name.as_deref(), Some("bash"));
3202        }
3203
3204        #[test]
3205        fn oneharness_stream_short_circuits_on_break() {
3206            // The sink breaks on the first event; the oneharness child is killed
3207            // and the later events/result are never delivered. The turn carries
3208            // only the events seen before the abort.
3209            let bin = script(
3210                "oh-stream-abort",
3211                "cat >/dev/null\n\
3212                 printf '%s\\n' '{\"type\":\"event\",\"event\":{\"kind\":\"tool_call\",\
3213                 \"name\":\"rm\",\"input\":{\"command\":\"rm -rf /\"},\"index\":0}}'\n\
3214                 printf '%s\\n' '{\"type\":\"event\",\"event\":{\"kind\":\"tool_call\",\
3215                 \"name\":\"bash\",\"input\":{\"command\":\"ls\"},\"index\":1}}'\n\
3216                 printf '%s\\n' '{\"type\":\"result\",\"report\":{\"results\":[{\"status\":\"ok\",\
3217                 \"text\":\"done\"}]}}'\n",
3218            );
3219            let mut seen = 0usize;
3220            let turn = oh_provider(bin)
3221                .respond_streaming(
3222                    "claude-code",
3223                    "sonnet",
3224                    &skill_ref(),
3225                    &[Message::user("hi")],
3226                    None,
3227                    &mut |event| {
3228                        seen += 1;
3229                        assert_eq!(event.name.as_deref(), Some("rm"));
3230                        ControlFlow::Break(())
3231                    },
3232                )
3233                .unwrap();
3234            assert_eq!(seen, 1, "aborted after the first event");
3235            assert_eq!(turn.events.len(), 1);
3236            assert_eq!(turn.events[0].name.as_deref(), Some("rm"));
3237            // Torn off before the result line, so no reply text.
3238            assert!(turn.message.is_empty());
3239        }
3240
3241        #[test]
3242        fn oneharness_stream_errors_when_no_result_line() {
3243            // A stream that ends without a terminal `result` line is a protocol
3244            // error (distinct from a deliberate abort).
3245            let bin = script(
3246                "oh-stream-noresult",
3247                "cat >/dev/null\n\
3248                 printf '%s\\n' '{\"type\":\"event\",\"event\":{\"kind\":\"tool_call\",\
3249                 \"name\":\"bash\",\"input\":{},\"index\":0}}'\n",
3250            );
3251            let err = oh_provider(bin)
3252                .respond_streaming(
3253                    "claude-code",
3254                    "sonnet",
3255                    &skill_ref(),
3256                    &[Message::user("hi")],
3257                    None,
3258                    &mut |_| ControlFlow::Continue(()),
3259                )
3260                .unwrap_err();
3261            assert!(
3262                matches!(err, Error::Provider { .. }),
3263                "expected a provider error, got: {err:?}"
3264            );
3265        }
3266
3267        #[test]
3268        fn oneharness_buffered_run_passes_events_and_omits_mode() {
3269            // The buffered path uses `--compact --events` and — deliberately —
3270            // passes no `--mode` (oneharness's default applies).
3271            let bin = script(
3272                "oh-args",
3273                "d=$(dirname \"$0\"); printf '%s\\n' \"$@\" > \"$d/args\"\n\
3274                 cat >/dev/null\necho '{\"results\":[{\"status\":\"ok\",\"text\":\"hi\"}]}'\n",
3275            );
3276            let dir = bin.parent().unwrap().to_path_buf();
3277            oh_provider(bin)
3278                .respond(
3279                    "claude-code",
3280                    "sonnet",
3281                    &skill_ref(),
3282                    &[Message::user("hi")],
3283                    None,
3284                )
3285                .unwrap();
3286            let args: Vec<String> = std::fs::read_to_string(dir.join("args"))
3287                .unwrap()
3288                .lines()
3289                .map(str::to_string)
3290                .collect();
3291            assert!(args.iter().any(|a| a == "--events"), "got: {args:?}");
3292            assert!(args.iter().any(|a| a == "--compact"), "got: {args:?}");
3293            assert!(!args.iter().any(|a| a == "--mode"), "got: {args:?}");
3294            assert!(!args.iter().any(|a| a == "--stream"), "got: {args:?}");
3295        }
3296
3297        #[test]
3298        fn oneharness_stream_run_passes_stream_and_omits_mode() {
3299            // The streaming path uses `--stream --events` and — like the buffered
3300            // path — passes no `--mode`.
3301            let bin = script(
3302                "oh-args-stream",
3303                "d=$(dirname \"$0\"); printf '%s\\n' \"$@\" > \"$d/args\"\n\
3304                 cat >/dev/null\n\
3305                 printf '%s\\n' '{\"type\":\"result\",\"report\":{\"results\":[{\"status\":\"ok\",\
3306                 \"text\":\"hi\"}]}}'\n",
3307            );
3308            let dir = bin.parent().unwrap().to_path_buf();
3309            oh_provider(bin)
3310                .respond_streaming(
3311                    "claude-code",
3312                    "sonnet",
3313                    &skill_ref(),
3314                    &[Message::user("hi")],
3315                    None,
3316                    &mut |_| ControlFlow::Continue(()),
3317                )
3318                .unwrap();
3319            let args: Vec<String> = std::fs::read_to_string(dir.join("args"))
3320                .unwrap()
3321                .lines()
3322                .map(str::to_string)
3323                .collect();
3324            assert!(args.iter().any(|a| a == "--stream"), "got: {args:?}");
3325            assert!(args.iter().any(|a| a == "--events"), "got: {args:?}");
3326            assert!(!args.iter().any(|a| a == "--mode"), "got: {args:?}");
3327            assert!(!args.iter().any(|a| a == "--compact"), "got: {args:?}");
3328        }
3329
3330        /// Split a whitespace-free argv dump (one arg per line) into a vector.
3331        fn argv_lines(path: &std::path::Path) -> Vec<String> {
3332            std::fs::read_to_string(path)
3333                .unwrap()
3334                .lines()
3335                .map(str::to_string)
3336                .collect()
3337        }
3338
3339        #[test]
3340        fn oneharness_respond_records_history_and_surfaces_command() {
3341            // The fake oneharness dumps its argv and echoes a `history_file`,
3342            // standing in for a recorded session. The provider must pass
3343            // --history/--history-dir/--history-name and turn the confirmed
3344            // history_file into a runnable `history show` command.
3345            let bin = script(
3346                "oh-history",
3347                "d=$(dirname \"$0\"); printf '%s\\n' \"$@\" > \"$d/args\"\n\
3348                 cat >/dev/null\n\
3349                 echo '{\"results\":[{\"status\":\"ok\",\"text\":\"hi\"}],\
3350                 \"history_file\":\"/tmp/skilltest-test-history/p/s.jsonl\"}'\n",
3351            );
3352            let dir = bin.parent().unwrap().to_path_buf();
3353            let turn = oh_provider(bin)
3354                .respond(
3355                    "claude-code",
3356                    "sonnet",
3357                    &skill_ref(),
3358                    &[Message::user("Greet Dr. Smith")],
3359                    None,
3360                )
3361                .unwrap();
3362            let args = argv_lines(&dir.join("args"));
3363            assert!(args.iter().any(|a| a == "--history"), "got: {args:?}");
3364            // --history-dir and --history-name are passed as flag/value pairs.
3365            let hd = args
3366                .iter()
3367                .position(|a| a == "--history-dir")
3368                .expect("--history-dir");
3369            assert_eq!(args[hd + 1], TEST_HISTORY_DIR);
3370            let hn = args
3371                .iter()
3372                .position(|a| a == "--history-name")
3373                .expect("--history-name");
3374            assert!(
3375                args[hn + 1].starts_with("skilltest-claude-code-sonnet-greet-dr-smith-"),
3376                "name: {}",
3377                args[hn + 1]
3378            );
3379            // The surfaced command replays exactly that session — same name, same
3380            // dir — so it is directly runnable.
3381            let cmd = turn.history_command.expect("history recorded");
3382            assert!(
3383                cmd.contains(&format!("history show {}", args[hn + 1])),
3384                "cmd: {cmd}"
3385            );
3386            assert!(
3387                cmd.contains(&format!("--history-dir {TEST_HISTORY_DIR}")),
3388                "cmd: {cmd}"
3389            );
3390        }
3391
3392        #[test]
3393        fn oneharness_history_command_absent_when_not_recorded() {
3394            // History is requested (the flags go out), but oneharness reports no
3395            // `history_file` (e.g. an older build) — so no command is surfaced,
3396            // rather than one that would resolve to nothing.
3397            let bin = script(
3398                "oh-history-none",
3399                "d=$(dirname \"$0\"); printf '%s\\n' \"$@\" > \"$d/args\"\n\
3400                 cat >/dev/null\necho '{\"results\":[{\"status\":\"ok\",\"text\":\"hi\"}]}'\n",
3401            );
3402            let dir = bin.parent().unwrap().to_path_buf();
3403            let turn = oh_provider(bin)
3404                .respond(
3405                    "claude-code",
3406                    "sonnet",
3407                    &skill_ref(),
3408                    &[Message::user("hi")],
3409                    None,
3410                )
3411                .unwrap();
3412            assert!(
3413                std::fs::read_to_string(dir.join("args"))
3414                    .unwrap()
3415                    .contains("--history"),
3416                "flags were still requested"
3417            );
3418            assert!(
3419                turn.history_command.is_none(),
3420                "no confirmed session → no command"
3421            );
3422        }
3423
3424        #[test]
3425        fn oneharness_history_disabled_omits_flags_and_command() {
3426            // With recording off, no history flags are passed and no command is
3427            // surfaced — even if the ambient oneharness emitted a history_file.
3428            let bin = script(
3429                "oh-history-off",
3430                "d=$(dirname \"$0\"); printf '%s\\n' \"$@\" > \"$d/args\"\n\
3431                 cat >/dev/null\n\
3432                 echo '{\"results\":[{\"status\":\"ok\",\"text\":\"hi\"}],\
3433                 \"history_file\":\"/x/s.jsonl\"}'\n",
3434            );
3435            let dir = bin.parent().unwrap().to_path_buf();
3436            let turn = oh_provider_cfg(bin, false)
3437                .respond(
3438                    "claude-code",
3439                    "sonnet",
3440                    &skill_ref(),
3441                    &[Message::user("hi")],
3442                    None,
3443                )
3444                .unwrap();
3445            assert!(
3446                !std::fs::read_to_string(dir.join("args"))
3447                    .unwrap()
3448                    .contains("--history"),
3449                "history was disabled"
3450            );
3451            assert!(turn.history_command.is_none());
3452        }
3453
3454        #[test]
3455        fn oneharness_judge_and_user_runs_are_never_recorded() {
3456            // Only the skill under test is recorded; the judge / simulated-user
3457            // calls must carry no history flags.
3458            let bin = script(
3459                "oh-history-judge",
3460                "d=$(dirname \"$0\"); printf '%s\\n' \"$@\" > \"$d/args\"\n\
3461                 cat >/dev/null\necho '{\"results\":[{\"status\":\"ok\",\
3462                 \"text\":\"{\\\"value\\\": true, \\\"reason\\\": \\\"ok\\\"}\"}]}'\n",
3463            );
3464            let dir = bin.parent().unwrap().to_path_buf();
3465            let query = JudgeQuery {
3466                kind: JudgeKind::Boolean,
3467                criterion: "polite",
3468                scale: None,
3469            };
3470            oh_provider(bin).judge("m", &query, &[]).unwrap();
3471            assert!(
3472                !std::fs::read_to_string(dir.join("args"))
3473                    .unwrap()
3474                    .contains("--history"),
3475                "the judge run must not be recorded"
3476            );
3477        }
3478
3479        #[test]
3480        fn oneharness_stream_records_history() {
3481            let bin = script(
3482                "oh-history-stream",
3483                "d=$(dirname \"$0\"); printf '%s\\n' \"$@\" > \"$d/args\"\n\
3484                 cat >/dev/null\n\
3485                 printf '%s\\n' '{\"type\":\"result\",\"report\":{\"results\":[{\"status\":\"ok\",\
3486                 \"text\":\"hi\"}],\"history_file\":\"/tmp/skilltest-test-history/p/s.jsonl\"}}'\n",
3487            );
3488            let dir = bin.parent().unwrap().to_path_buf();
3489            let turn = oh_provider(bin)
3490                .respond_streaming(
3491                    "claude-code",
3492                    "sonnet",
3493                    &skill_ref(),
3494                    &[Message::user("hi")],
3495                    None,
3496                    &mut |_| ControlFlow::Continue(()),
3497                )
3498                .unwrap();
3499            assert!(
3500                std::fs::read_to_string(dir.join("args"))
3501                    .unwrap()
3502                    .contains("--history"),
3503                "the streaming skill run is recorded"
3504            );
3505            assert!(turn.history_command.is_some());
3506        }
3507
3508        #[test]
3509        fn oneharness_falls_back_to_stdout_when_text_null() {
3510            let bin = script(
3511                "oh-fallback",
3512                "cat >/dev/null\necho '{\"results\":[{\"status\":\"ok\",\
3513                 \"stdout\":\"raw reply\"}]}'\n",
3514            );
3515            let user = oh_provider(bin).simulate_user("m", "persona", &[]).unwrap();
3516            assert_eq!(user.message, "raw reply");
3517        }
3518
3519        #[test]
3520        fn oneharness_judge_parses_verdict() {
3521            let bin = script(
3522                "oh-judge",
3523                "cat >/dev/null\necho '{\"results\":[{\"status\":\"ok\",\
3524                 \"text\":\"{\\\"value\\\": true, \\\"reason\\\": \\\"good\\\"}\"}]}'\n",
3525            );
3526            let query = JudgeQuery {
3527                kind: JudgeKind::Boolean,
3528                criterion: "polite",
3529                scale: None,
3530            };
3531            let verdict = oh_provider(bin).judge("m", &query, &[]).unwrap();
3532            assert!(matches!(verdict.value, JudgeValue::Bool(true)));
3533            assert_eq!(verdict.reason, "good");
3534        }
3535
3536        #[test]
3537        fn oneharness_classifies_failure_kind() {
3538            let bin = script(
3539                "oh-auth",
3540                "cat >/dev/null\necho '{\"results\":[{\"status\":\"error\",\
3541                 \"failure_kind\":\"auth\",\"error\":\"no creds\"}]}'\n",
3542            );
3543            let err = oh_provider(bin)
3544                .respond("claude-code", "m", &skill_ref(), &[], None)
3545                .unwrap_err();
3546            assert!(matches!(
3547                err,
3548                Error::Provider {
3549                    kind: Some(ProviderErrorKind::Auth),
3550                    ..
3551                }
3552            ));
3553        }
3554
3555        #[test]
3556        fn oneharness_classifies_timeout_status_without_failure_kind() {
3557            // oneharness reports a deadline as `status: "timeout"` with no
3558            // `failure_kind`. skilltest still classifies it structurally, so the
3559            // consuming SDK sees a Timeout kind rather than only the word
3560            // "timeout" in the message.
3561            let bin = script(
3562                "oh-err",
3563                "cat >/dev/null\necho '{\"results\":[{\"status\":\"timeout\",\
3564                 \"stderr\":\"deadline\"}]}'\n",
3565            );
3566            let err = oh_provider(bin).simulate_user("m", "p", &[]).unwrap_err();
3567            let msg = err.to_string();
3568            assert!(msg.contains("harness run failed"), "got: {msg}");
3569            assert!(msg.contains("deadline"));
3570            assert!(matches!(
3571                err,
3572                Error::Provider {
3573                    kind: Some(ProviderErrorKind::Timeout),
3574                    ..
3575                }
3576            ));
3577        }
3578
3579        #[test]
3580        fn oneharness_failure_without_classifiable_signal_is_unclassified() {
3581            // A non-`ok` status skilltest can't map (no `failure_kind`, an
3582            // unrecognized status) stays an unclassified provider error — kind
3583            // `None`, distinct from the `Other` catch-all.
3584            let bin = script(
3585                "oh-plain",
3586                "cat >/dev/null\necho '{\"results\":[{\"status\":\"error\",\
3587                 \"error\":\"something broke\"}]}'\n",
3588            );
3589            let err = oh_provider(bin)
3590                .respond("claude-code", "m", &skill_ref(), &[], None)
3591                .unwrap_err();
3592            assert!(matches!(err, Error::Provider { kind: None, .. }));
3593            assert!(err.to_string().contains("something broke"));
3594        }
3595
3596        #[test]
3597        fn oneharness_errors_on_unparseable_output_and_no_results() {
3598            let garbage = script(
3599                "oh-garbage",
3600                "cat >/dev/null\necho 'not json' 1>&2\necho 'x'\n",
3601            );
3602            assert!(oh_provider(garbage)
3603                .simulate_user("m", "p", &[])
3604                .unwrap_err()
3605                .to_string()
3606                .contains("could not parse oneharness output"));
3607
3608            let empty = script("oh-empty", "cat >/dev/null\necho '{\"results\":[]}'\n");
3609            assert!(oh_provider(empty)
3610                .simulate_user("m", "p", &[])
3611                .unwrap_err()
3612                .to_string()
3613                .contains("no results"));
3614        }
3615
3616        #[test]
3617        fn oneharness_errors_when_no_text_or_stdout() {
3618            let bin = script(
3619                "oh-silent",
3620                "cat >/dev/null\necho '{\"results\":[{\"status\":\"ok\"}]}'\n",
3621            );
3622            assert!(oh_provider(bin)
3623                .respond("claude-code", "m", &skill_ref(), &[], None)
3624                .unwrap_err()
3625                .to_string()
3626                .contains("neither extractable text nor stdout"));
3627        }
3628
3629        #[test]
3630        fn oneharness_respond_resume_sends_only_latest_message() {
3631            // Capture the prompt (stdin) and the argv to assert resume behavior:
3632            // with a session, only the last user message is sent and --resume is
3633            // forwarded; without, the whole transcript is inlined.
3634            let dir =
3635                std::env::temp_dir().join(format!("skilltest-oh-resume-{}", std::process::id()));
3636            std::fs::create_dir_all(&dir).unwrap();
3637            let prompt_file = dir.join("prompt.txt");
3638            let argv_file = dir.join("argv.txt");
3639            let bin = script(
3640                "oh-resume",
3641                &format!(
3642                    "echo \"$@\" > '{}'\ncat > '{}'\necho '{{\"results\":[{{\"status\":\"ok\",\"text\":\"ok\"}}]}}'\n",
3643                    argv_file.display(),
3644                    prompt_file.display(),
3645                ),
3646            );
3647            let messages = [
3648                Message::user("first"),
3649                Message::assistant("reply"),
3650                Message::user("second"),
3651            ];
3652            oh_provider(bin.clone())
3653                .respond(
3654                    "claude-code",
3655                    "sonnet",
3656                    &skill_ref(),
3657                    &messages,
3658                    Some("sess-1"),
3659                )
3660                .unwrap();
3661            let prompt = std::fs::read_to_string(&prompt_file).unwrap();
3662            assert_eq!(
3663                prompt.trim(),
3664                "second",
3665                "resume sends only the latest user message"
3666            );
3667            let argv = std::fs::read_to_string(&argv_file).unwrap();
3668            assert!(argv.contains("--resume sess-1"), "argv: {argv}");
3669            assert!(
3670                argv.contains("--system"),
3671                "skill is the system prompt: {argv}"
3672            );
3673        }
3674
3675        #[test]
3676        fn oneharness_omits_model_flag_when_model_empty() {
3677            let dir =
3678                std::env::temp_dir().join(format!("skilltest-oh-model-{}", std::process::id()));
3679            std::fs::create_dir_all(&dir).unwrap();
3680            let argv_file = dir.join("argv.txt");
3681            let bin = script(
3682                "oh-nomodel",
3683                &format!(
3684                    "echo \"$@\" > '{}'\ncat >/dev/null\necho '{{\"results\":[{{\"status\":\"ok\",\"text\":\"ok\"}}]}}'\n",
3685                    argv_file.display(),
3686                ),
3687            );
3688            oh_provider(bin)
3689                .respond("cursor", "", &skill_ref(), &[Message::user("hi")], None)
3690                .unwrap();
3691            let argv = std::fs::read_to_string(&argv_file).unwrap();
3692            assert!(
3693                !argv.contains("--model"),
3694                "empty model omits the flag: {argv}"
3695            );
3696        }
3697
3698        #[test]
3699        fn oneharness_reports_missing_binary() {
3700            let provider = oh_provider(PathBuf::from("/no/such/oneharness-binary"));
3701            let err = provider
3702                .respond("claude-code", "m", &skill_ref(), &[], None)
3703                .unwrap_err();
3704            assert!(err.to_string().contains("could not run"));
3705        }
3706
3707        #[test]
3708        fn oneharness_respond_with_mocks_passes_flags_and_reads_spy_log() {
3709            // The fake oneharness extracts --mock-rules/--spy-file from its
3710            // argv, copies the rules it was handed to a sidecar, and appends
3711            // spy lines the way `oneharness mock` would.
3712            let dir =
3713                std::env::temp_dir().join(format!("skilltest-oh-mocks-{}", std::process::id()));
3714            std::fs::create_dir_all(&dir).unwrap();
3715            let rules_seen = dir.join("rules-seen.json");
3716            let bin = script(
3717                "oh-mocks",
3718                &format!(
3719                    r#"rules=""; spy=""
3720while [ $# -gt 0 ]; do
3721  [ "$1" = "--mock-rules" ] && rules="$2"
3722  [ "$1" = "--spy-file" ] && spy="$2"
3723  shift
3724done
3725cat >/dev/null
3726cp "$rules" '{seen}'
3727printf '%s
3728' '{{"harness":"claude-code","event":{{"tool_name":"Bash","tool_input":{{"command":"git push"}}}},"action":"stub","rule":0}}' >> "$spy"
3729printf '%s
3730' '{{"harness":"claude-code","event":{{"tool_name":"Bash","tool_input":{{"command":"ls"}}}},"action":"allow","rule":null}}' >> "$spy"
3731echo '{{"results":[{{"status":"ok","text":"done"}}]}}'
3732"#,
3733                    seen = rules_seen.display(),
3734                ),
3735            );
3736            let rules = serde_json::json!({ "rules": [
3737                { "match": { "event_contains": "git push" },
3738                  "action": { "stub": { "output": "up-to-date", "exit_code": 0 } } }
3739            ]});
3740            let plan = MockPlan {
3741                rules: Some(&rules),
3742            };
3743            let turn = oh_provider(bin)
3744                .respond_with_mocks(
3745                    "claude-code",
3746                    "sonnet",
3747                    &skill_ref(),
3748                    &[Message::user("hi")],
3749                    None,
3750                    Some(&plan),
3751                )
3752                .unwrap();
3753            // The compiled rules reached oneharness verbatim.
3754            let seen: serde_json::Value =
3755                serde_json::from_str(&std::fs::read_to_string(&rules_seen).unwrap()).unwrap();
3756            assert_eq!(seen, rules);
3757            // The spy log came back as records, original inputs intact.
3758            let records = turn.mock_calls.expect("channel was on");
3759            assert_eq!(records.len(), 2);
3760            assert_eq!(records[0].action, "stub");
3761            assert_eq!(records[0].rule, Some(0));
3762            assert_eq!(records[0].input.as_ref().unwrap()["command"], "git push");
3763            assert_eq!(records[1].action, "allow");
3764        }
3765
3766        #[test]
3767        fn oneharness_spy_only_plan_omits_rules_flag_and_missing_log_is_empty() {
3768            // A spy-only plan (no rules): no --mock-rules flag, --spy-file
3769            // still passed; a run whose hook never fired leaves no log, which
3770            // reads as zero records — the channel stays Some.
3771            let dir =
3772                std::env::temp_dir().join(format!("skilltest-oh-spyonly-{}", std::process::id()));
3773            std::fs::create_dir_all(&dir).unwrap();
3774            let argv_file = dir.join("argv.txt");
3775            let bin = script(
3776                "oh-spyonly",
3777                &format!(
3778                    "echo \"$@\" > '{}'\ncat >/dev/null\necho '{{\"results\":[{{\"status\":\"ok\",\"text\":\"ok\"}}]}}'\n",
3779                    argv_file.display(),
3780                ),
3781            );
3782            let plan = MockPlan { rules: None };
3783            let turn = oh_provider(bin)
3784                .respond_with_mocks(
3785                    "claude-code",
3786                    "sonnet",
3787                    &skill_ref(),
3788                    &[Message::user("hi")],
3789                    None,
3790                    Some(&plan),
3791                )
3792                .unwrap();
3793            assert_eq!(turn.mock_calls, Some(Vec::new()));
3794            let argv = std::fs::read_to_string(&argv_file).unwrap();
3795            assert!(argv.contains("--spy-file"), "argv: {argv}");
3796            assert!(!argv.contains("--mock-rules"), "argv: {argv}");
3797        }
3798
3799        // ---- ApiJudgeProvider over a fake curl ----
3800
3801        fn api_provider_with_curl(curl: PathBuf, vendor: ApiVendor) -> ApiJudgeProvider {
3802            ApiJudgeProvider::new(&ApiJudgeConfig {
3803                vendor,
3804                api_key_env: Some("SKILLTEST_TEST_API_KEY".to_string()),
3805                base_url: Some("https://example.invalid/v1".to_string()),
3806                timeout_secs: 5,
3807                curl_bin: curl.to_string_lossy().into_owned(),
3808                strict_json: true,
3809            })
3810        }
3811
3812        #[test]
3813        fn api_judge_judges_through_fake_curl() {
3814            // The fake curl echoes an Anthropic-shaped success body.
3815            let curl = script(
3816                "curl-ok",
3817                "cat >/dev/null\necho '{\"content\":[{\"type\":\"text\",\
3818                 \"text\":\"{\\\"value\\\": true, \\\"reason\\\": \\\"polite\\\"}\"}],\
3819                 \"usage\":{\"input_tokens\":9,\"output_tokens\":3}}'\n",
3820            );
3821            std::env::set_var("SKILLTEST_TEST_API_KEY", "sk-test");
3822            let provider = api_provider_with_curl(curl, ApiVendor::Anthropic);
3823            let query = JudgeQuery {
3824                kind: JudgeKind::Boolean,
3825                criterion: "polite",
3826                scale: None,
3827            };
3828            let verdict = provider
3829                .judge("claude-x", &query, &[Message::user("hi")])
3830                .unwrap();
3831            assert!(matches!(verdict.value, JudgeValue::Bool(true)));
3832            assert_eq!(verdict.usage.unwrap().input_tokens, Some(9));
3833            std::env::remove_var("SKILLTEST_TEST_API_KEY");
3834        }
3835
3836        #[test]
3837        fn api_judge_simulates_user_through_fake_curl() {
3838            let curl = script(
3839                "curl-user",
3840                "cat >/dev/null\necho '{\"choices\":[{\"message\":\
3841                 {\"content\":\"sure, go on\"}}]}'\n",
3842            );
3843            std::env::set_var("SKILLTEST_TEST_API_KEY", "sk-test");
3844            let provider = api_provider_with_curl(curl, ApiVendor::Openai);
3845            let user = provider.simulate_user("gpt-x", "a patient", &[]).unwrap();
3846            assert_eq!(user.message, "sure, go on");
3847            std::env::remove_var("SKILLTEST_TEST_API_KEY");
3848        }
3849
3850        #[test]
3851        fn api_judge_errors_when_key_absent() {
3852            let curl = script("curl-unused", "cat >/dev/null\necho '{}'\n");
3853            std::env::remove_var("SKILLTEST_TEST_API_KEY");
3854            let provider = api_provider_with_curl(curl, ApiVendor::Anthropic);
3855            let err = provider
3856                .judge(
3857                    "m",
3858                    &JudgeQuery {
3859                        kind: JudgeKind::Boolean,
3860                        criterion: "x",
3861                        scale: None,
3862                    },
3863                    &[],
3864                )
3865                .unwrap_err();
3866            assert!(matches!(
3867                err,
3868                Error::Provider {
3869                    kind: Some(ProviderErrorKind::Auth),
3870                    ..
3871                }
3872            ));
3873        }
3874
3875        #[test]
3876        fn api_judge_surfaces_curl_failure() {
3877            let curl = script(
3878                "curl-fail",
3879                "cat >/dev/null\necho 'curl: (6) bad host' 1>&2\nexit 6\n",
3880            );
3881            std::env::set_var("SKILLTEST_TEST_API_KEY", "sk-test");
3882            let provider = api_provider_with_curl(curl, ApiVendor::Anthropic);
3883            let err = provider
3884                .judge(
3885                    "m",
3886                    &JudgeQuery {
3887                        kind: JudgeKind::Boolean,
3888                        criterion: "x",
3889                        scale: None,
3890                    },
3891                    &[],
3892                )
3893                .unwrap_err();
3894            assert!(err.to_string().contains("curl failed"));
3895            std::env::remove_var("SKILLTEST_TEST_API_KEY");
3896        }
3897
3898        #[test]
3899        fn write_curl_config_sets_private_mode_and_headers() {
3900            let dir = std::env::temp_dir().join(format!("skilltest-cfg-{}", std::process::id()));
3901            std::fs::create_dir_all(&dir).unwrap();
3902            let path = dir.join("c.cfg");
3903            write_curl_config(
3904                &path,
3905                "https://api.example/v1",
3906                &[("x-api-key".to_string(), "secret\"quote".to_string())],
3907                42,
3908            )
3909            .unwrap();
3910            let text = std::fs::read_to_string(&path).unwrap();
3911            assert!(text.contains("url = \"https://api.example/v1\""));
3912            assert!(text.contains("max-time = 42"));
3913            // The quote in the header value is escaped.
3914            assert!(text.contains("secret\\\"quote"), "escaped header: {text}");
3915            let mode = std::fs::metadata(&path).unwrap().permissions().mode();
3916            assert_eq!(mode & 0o777, 0o600, "config is private");
3917        }
3918
3919        #[test]
3920        fn api_judge_retries_a_transient_error_then_succeeds() {
3921            // The fake curl returns an overloaded error on its first invocation
3922            // and a success on the second, so the retry path in `chat` is taken.
3923            let dir = std::env::temp_dir().join(format!("skilltest-retry-{}", std::process::id()));
3924            std::fs::create_dir_all(&dir).unwrap();
3925            let counter = dir.join("n");
3926            let curl = script(
3927                "curl-retry",
3928                &format!(
3929                    "cat >/dev/null\nif [ -f '{c}' ]; then \
3930                       echo '{{\"content\":[{{\"type\":\"text\",\"text\":\"{{\\\"value\\\": true, \\\"reason\\\": \\\"ok\\\"}}\"}}]}}'; \
3931                     else touch '{c}'; \
3932                       echo '{{\"error\":{{\"type\":\"overloaded_error\",\"message\":\"busy\"}}}}'; \
3933                     fi\n",
3934                    c = counter.display(),
3935                ),
3936            );
3937            std::env::set_var("SKILLTEST_TEST_API_KEY", "sk-test");
3938            let provider = api_provider_with_curl(curl, ApiVendor::Anthropic);
3939            let verdict = provider
3940                .judge(
3941                    "m",
3942                    &JudgeQuery {
3943                        kind: JudgeKind::Boolean,
3944                        criterion: "x",
3945                        scale: None,
3946                    },
3947                    &[],
3948                )
3949                .unwrap();
3950            assert!(matches!(verdict.value, JudgeValue::Bool(true)));
3951            std::env::remove_var("SKILLTEST_TEST_API_KEY");
3952        }
3953
3954        #[test]
3955        fn api_judge_gives_up_after_max_retries() {
3956            // Always overloaded: the loop exhausts MAX_RETRIES and surfaces it.
3957            let curl = script(
3958                "curl-busy",
3959                "cat >/dev/null\necho '{\"error\":{\"type\":\"overloaded_error\",\"message\":\"busy\"}}'\n",
3960            );
3961            std::env::set_var("SKILLTEST_TEST_API_KEY", "sk-test");
3962            let provider = api_provider_with_curl(curl, ApiVendor::Anthropic);
3963            let err = provider
3964                .judge(
3965                    "m",
3966                    &JudgeQuery {
3967                        kind: JudgeKind::Boolean,
3968                        criterion: "x",
3969                        scale: None,
3970                    },
3971                    &[],
3972                )
3973                .unwrap_err();
3974            assert!(matches!(
3975                err,
3976                Error::Provider {
3977                    kind: Some(ProviderErrorKind::Overloaded),
3978                    ..
3979                }
3980            ));
3981            std::env::remove_var("SKILLTEST_TEST_API_KEY");
3982        }
3983
3984        #[test]
3985        fn api_judge_classifies_curl_timeout() {
3986            // curl exit 28 is a `--max-time` timeout; skilltest classifies it so
3987            // a slow judge surfaces as a Timeout kind, not an opaque curl failure.
3988            let curl = script(
3989                "curl-timeout",
3990                "cat >/dev/null\necho 'curl: (28) Operation timed out' 1>&2\nexit 28\n",
3991            );
3992            std::env::set_var("SKILLTEST_TEST_API_KEY", "sk-test");
3993            let provider = api_provider_with_curl(curl, ApiVendor::Anthropic);
3994            let err = provider
3995                .judge(
3996                    "m",
3997                    &JudgeQuery {
3998                        kind: JudgeKind::Boolean,
3999                        criterion: "x",
4000                        scale: None,
4001                    },
4002                    &[],
4003                )
4004                .unwrap_err();
4005            assert!(matches!(
4006                err,
4007                Error::Provider {
4008                    kind: Some(ProviderErrorKind::Timeout),
4009                    ..
4010                }
4011            ));
4012            std::env::remove_var("SKILLTEST_TEST_API_KEY");
4013        }
4014
4015        #[test]
4016        fn api_judge_reports_missing_curl_binary() {
4017            std::env::set_var("SKILLTEST_TEST_API_KEY", "sk-test");
4018            let provider =
4019                api_provider_with_curl(PathBuf::from("/no/such/curl-binary"), ApiVendor::Anthropic);
4020            let err = provider
4021                .judge(
4022                    "m",
4023                    &JudgeQuery {
4024                        kind: JudgeKind::Boolean,
4025                        criterion: "x",
4026                        scale: None,
4027                    },
4028                    &[],
4029                )
4030                .unwrap_err();
4031            assert!(err.to_string().contains("could not run"));
4032            std::env::remove_var("SKILLTEST_TEST_API_KEY");
4033        }
4034
4035        #[test]
4036        fn api_judge_surfaces_unparseable_response() {
4037            let curl = script("curl-garbage", "cat >/dev/null\necho 'not json at all'\n");
4038            std::env::set_var("SKILLTEST_TEST_API_KEY", "sk-test");
4039            let provider = api_provider_with_curl(curl, ApiVendor::Openai);
4040            let err = provider
4041                .judge(
4042                    "m",
4043                    &JudgeQuery {
4044                        kind: JudgeKind::Boolean,
4045                        criterion: "x",
4046                        scale: None,
4047                    },
4048                    &[],
4049                )
4050                .unwrap_err();
4051            assert!(err.to_string().contains("could not parse API response"));
4052            std::env::remove_var("SKILLTEST_TEST_API_KEY");
4053        }
4054
4055        #[test]
4056        fn split_provider_routes_judge_and_user_through_the_api() {
4057            // A SplitProvider's judge/simulate_user must hit the API judge (the
4058            // fake curl), while respond goes to the stub responder.
4059            let curl = script(
4060                "split-curl",
4061                "cat >/dev/null\necho '{\"content\":[{\"type\":\"text\",\
4062                 \"text\":\"{\\\"value\\\": true, \\\"reason\\\": \\\"ok\\\"}\"}]}'\n",
4063            );
4064            std::env::set_var("SKILLTEST_TEST_API_KEY", "sk-test");
4065            let judge = api_provider_with_curl(curl, ApiVendor::Anthropic);
4066            let split = SplitProvider::new(Box::new(super::StubResponder), judge);
4067            let verdict = split
4068                .judge(
4069                    "m",
4070                    &JudgeQuery {
4071                        kind: JudgeKind::Boolean,
4072                        criterion: "polite",
4073                        scale: None,
4074                    },
4075                    &[],
4076                )
4077                .unwrap();
4078            assert!(matches!(verdict.value, JudgeValue::Bool(true)));
4079            std::env::remove_var("SKILLTEST_TEST_API_KEY");
4080        }
4081
4082        #[test]
4083        fn oneharness_numeric_judge_uses_numeric_prompt() {
4084            // A numeric judge exercises the numeric branch of build_judge_prompt
4085            // (the scale text) and the numeric verdict parse path.
4086            let dir = std::env::temp_dir().join(format!("skilltest-ohnum-{}", std::process::id()));
4087            std::fs::create_dir_all(&dir).unwrap();
4088            let prompt_file = dir.join("prompt.txt");
4089            let bin = script(
4090                "oh-numeric",
4091                &format!(
4092                    "cat > '{}'\necho '{{\"results\":[{{\"status\":\"ok\",\"text\":\"{{\\\"value\\\": 8.5, \\\"reason\\\": \\\"warm\\\"}}\"}}]}}'\n",
4093                    prompt_file.display(),
4094                ),
4095            );
4096            let query = JudgeQuery {
4097                kind: JudgeKind::Numeric,
4098                criterion: "warmth",
4099                scale: Some((0.0, 10.0)),
4100            };
4101            let verdict = oh_provider(bin)
4102                .judge("m", &query, &[Message::assistant("hi")])
4103                .unwrap();
4104            assert!(matches!(verdict.value, JudgeValue::Number(v) if (v - 8.5).abs() < 1e-9));
4105            let prompt = std::fs::read_to_string(&prompt_file).unwrap();
4106            assert!(
4107                prompt.contains("scale from 0 to 10"),
4108                "numeric prompt: {prompt}"
4109            );
4110        }
4111
4112        #[test]
4113        fn supports_resume_method_matches_free_function() {
4114            let provider = oh_provider(PathBuf::from("/bin/true"));
4115            assert!(provider.supports_resume("claude-code"));
4116            assert!(!provider.supports_resume("codex"));
4117        }
4118    }
4119
4120    // Non-subprocess error-path coverage for the verdict parser and the
4121    // classified-error fallback.
4122
4123    #[test]
4124    fn parse_verdict_rejects_missing_object_and_value() {
4125        // No JSON object at all.
4126        assert!(parse_verdict(JudgeKind::Boolean, "just prose, no braces").is_err());
4127        // A JSON object with no `value` field.
4128        assert!(parse_verdict(JudgeKind::Boolean, "{\"reason\": \"x\"}").is_err());
4129        // Malformed JSON inside the braces.
4130        assert!(parse_verdict(JudgeKind::Numeric, "{not: valid}").is_err());
4131    }
4132
4133    #[test]
4134    fn extract_json_object_handles_reversed_braces() {
4135        // A stray `}` before `{` is not a valid object span.
4136        assert_eq!(extract_json_object("} then {"), None);
4137    }
4138
4139    #[test]
4140    fn unclassified_api_error_falls_back_to_plain_provider_error() {
4141        // An error type we don't classify becomes an unclassified provider error.
4142        let raw = r#"{"error":{"type":"some_new_error","message":"odd"}}"#;
4143        let err = parse_chat_response(ApiVendor::Openai, raw).unwrap_err();
4144        assert!(matches!(err, Error::Provider { kind: None, .. }));
4145        assert!(err.to_string().contains("odd"));
4146    }
4147
4148    #[test]
4149    fn openai_empty_choice_text_is_an_error() {
4150        let raw = r#"{"choices":[]}"#;
4151        assert!(parse_chat_response(ApiVendor::Openai, raw).is_err());
4152    }
4153
4154    #[test]
4155    fn truncate_for_error_caps_length_on_a_char_boundary() {
4156        let long = "x".repeat(1000);
4157        assert_eq!(truncate_for_error(&long).chars().count(), 500);
4158    }
4159}
skilltest_core/provider.rs

skilltest_core/
provider.rs