Skip to main content

skilltest_core/
provider.rs

1//! The provider boundary. `skilltest` never talks to a model directly; a
2//! [`Provider`] runs the skill, plays the simulated user, and judges the
3//! transcript.
4//!
5//! There are two real implementations. [`OneharnessProvider`] (the default) runs
6//! each prompt on a harness through the
7//! [`oneharness`](https://github.com/nickderobertis/oneharness) CLI and parses
8//! its JSON. [`CommandProvider`] speaks a small JSON-lines protocol (see
9//! `docs/protocol.md`) and backs both the deterministic `skilltest-fake-provider`
10//! used by the gate and any custom provider you write. The [`Provider`] trait
11//! also lets the runner be unit-tested against an in-memory fake.
12
13use std::io::Write as _;
14use std::process::{Command, Stdio};
15
16use serde::{Deserialize, Serialize};
17
18use crate::config::{ApiJudgeConfig, ApiVendor, OneharnessConfig};
19use crate::conversation::{Message, Role};
20use crate::error::{Error, Result};
21use crate::eval::JudgeValue;
22
/// A borrowed view of the skill under test, as sent to the provider.
///
/// Every field borrows from the caller for `'a`; constructing a `SkillRef`
/// copies none of the skill's text.
pub struct SkillRef<'a> {
    /// The skill's name; forwarded as `name` in a `CommandProvider` request.
    pub name: &'a str,
    /// The skill's location (presumably its directory path — confirm against
    /// the caller); forwarded as `path` in a `CommandProvider` request.
    pub dir: &'a str,
    /// The skill's instruction text; `OneharnessProvider` passes it to the
    /// harness as the `--system` prompt.
    pub instructions: &'a str,
}
29
/// Which flavour of verdict a judge is asked to produce.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum JudgeKind {
    /// A yes/no verdict.
    Boolean,
    /// A number-valued verdict.
    Numeric,
}

impl JudgeKind {
    /// The lowercase name used for this kind on the wire (the `kind` field of
    /// a judge request).
    fn as_str(self) -> &'static str {
        match self {
            Self::Boolean => "boolean",
            Self::Numeric => "numeric",
        }
    }
}
45
/// A judge query: the criterion, its kind, and (for numeric) the scale.
pub struct JudgeQuery<'a> {
    /// Whether a boolean or a numeric verdict is requested.
    pub kind: JudgeKind,
    /// The criterion text the conversation is scored against.
    pub criterion: &'a str,
    /// The `(min, max)` bounds of the scale; `CommandProvider` forwards them
    /// as the request's `min` / `max` fields and omits both when `None`.
    pub scale: Option<(f64, f64)>,
}
52
/// Token / cost usage for one provider call.
///
/// Each field is independently optional because not every harness reports every
/// signal (cost is commonly absent on subscription auth; some harnesses report
/// no usage at all). The whole struct is `Option<Usage>` on a turn — `None`
/// means "no signal," not "zero."
// NOTE(review): this type derives `schemars::JsonSchema`, and doc comments
// may be lifted into the generated schema's descriptions — the notes below
// are deliberately plain `//` comments so the schema output stays unchanged.
#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize, schemars::JsonSchema)]
pub struct Usage {
    // Every field: a missing key deserializes to `None` (`default`), and a
    // `None` value is left out of the serialized object.

    // Input token count, when reported.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub input_tokens: Option<u64>,
    // Output token count, when reported.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub output_tokens: Option<u64>,
    // Cost of the call, when reported (US dollars, going by the field name).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub cost_usd: Option<f64>,
}
68
69impl Usage {
70    /// True iff every field is `None`.
71    #[must_use]
72    pub fn is_empty(&self) -> bool {
73        self.input_tokens.is_none() && self.output_tokens.is_none() && self.cost_usd.is_none()
74    }
75
76    /// Add another sample into this total. `None` values stay `None` until
77    /// something reports a real number, at which point they accumulate.
78    pub fn add(&mut self, other: &Usage) {
79        if let Some(v) = other.input_tokens {
80            self.input_tokens = Some(self.input_tokens.unwrap_or(0) + v);
81        }
82        if let Some(v) = other.output_tokens {
83            self.output_tokens = Some(self.output_tokens.unwrap_or(0) + v);
84        }
85        if let Some(v) = other.cost_usd {
86            self.cost_usd = Some(self.cost_usd.unwrap_or(0.0) + v);
87        }
88    }
89}
90
/// An assistant/skill turn produced by the provider.
#[derive(Debug, Clone, Default)]
pub struct AssistantTurn {
    /// The assistant's reply text for this turn.
    pub message: String,
    /// The skill signalled it considers the task complete.
    ///
    /// `CommandProvider` forwards whatever the external command reported;
    /// `OneharnessProvider` always reports `false`.
    pub done: bool,
    /// Cost/token usage for this call, if the provider reported it.
    pub usage: Option<Usage>,
    /// A session handle the runner can pass back on the next `respond` call to
    /// continue the same conversation against the real harness (only some
    /// harnesses expose this — see `OneharnessProvider::supports_resume`).
    pub session_id: Option<String>,
}
104
/// A simulated-user turn produced by the provider.
#[derive(Debug, Clone, Default)]
pub struct UserTurn {
    /// The simulated user's message text.
    pub message: String,
    /// The simulated user chose to end the conversation.
    ///
    /// `CommandProvider` forwards whatever the external command reported;
    /// `OneharnessProvider` always reports `false`.
    pub stop: bool,
    /// Cost/token usage for this call, if the provider reported it.
    pub usage: Option<Usage>,
}
113
/// A judge verdict: the raw value (bool or number) plus the stated reason.
#[derive(Debug, Clone)]
pub struct JudgeVerdict {
    /// The verdict itself.
    pub value: JudgeValue,
    /// The judge's stated justification; empty when a `CommandProvider` reply
    /// omits it.
    pub reason: String,
    /// Cost/token usage for the judging call, if the provider reported it.
    pub usage: Option<Usage>,
}
121
/// The provider boundary.
///
/// One implementation covers the three roles the runner needs: running the
/// skill (`respond`), playing the simulated user (`simulate_user`), and scoring
/// the transcript (`judge`). All methods take `&self` and return the crate's
/// [`Result`].
pub trait Provider {
    /// Run one assistant/skill turn given the conversation so far. `session`,
    /// when `Some`, is a handle returned by a previous `respond` call on this
    /// run that the provider may use to continue the same harness session
    /// (e.g. via `oneharness run --resume`); providers that don't support
    /// continuation should ignore it.
    ///
    /// `platform` names the harness to run on, `model` the model to use, and
    /// `skill` carries the skill's name, location and instructions.
    ///
    /// # Errors
    /// [`Error::Provider`] if the command fails or returns malformed output.
    fn respond(
        &self,
        platform: &str,
        model: &str,
        skill: &SkillRef<'_>,
        messages: &[Message],
        session: Option<&str>,
    ) -> Result<AssistantTurn>;

    /// Produce one simulated-user turn.
    ///
    /// `persona` describes the user being simulated and `messages` is the
    /// conversation so far.
    ///
    /// # Errors
    /// [`Error::Provider`] if the command fails or returns malformed output.
    fn simulate_user(&self, model: &str, persona: &str, messages: &[Message]) -> Result<UserTurn>;

    /// Score a criterion against the conversation.
    ///
    /// `query` carries the criterion, the kind of verdict wanted and, for
    /// numeric verdicts, the scale.
    ///
    /// # Errors
    /// [`Error::Provider`] if the command fails or returns malformed output.
    fn judge(
        &self,
        model: &str,
        query: &JudgeQuery<'_>,
        messages: &[Message],
    ) -> Result<JudgeVerdict>;

    /// True iff `respond` on `platform` will faithfully continue a prior
    /// session when given its `session_id`. The default is `false`; providers
    /// that support resume override this so the runner knows to thread the
    /// session id through.
    fn supports_resume(&self, _platform: &str) -> bool {
        false
    }
}
166
167// ---------------------------------------------------------------------------
168// Wire types (CommandProvider JSON-lines protocol)
169// ---------------------------------------------------------------------------
170
/// The skill as it appears inside a `respond` request on the wire.
#[derive(Serialize)]
struct SkillPayload<'a> {
    /// Filled from `SkillRef::name`.
    name: &'a str,
    /// Filled from `SkillRef::dir` (note the different wire name).
    path: &'a str,
    /// Filled from `SkillRef::instructions`.
    instructions: &'a str,
}
177
/// One request sent to a `CommandProvider` command.
///
/// Serialized as a single JSON object whose `op` field holds the lowercased
/// variant name (`"respond"`, `"user"` or `"judge"`), with the variant's
/// fields alongside it.
#[derive(Serialize)]
#[serde(tag = "op", rename_all = "lowercase")]
enum Request<'a> {
    /// Run one assistant/skill turn.
    Respond {
        platform: &'a str,
        model: &'a str,
        skill: SkillPayload<'a>,
        messages: &'a [Message],
        /// Session handle to continue; left out of the JSON when `None`.
        #[serde(skip_serializing_if = "Option::is_none")]
        session: Option<&'a str>,
    },
    /// Produce one simulated-user turn.
    User {
        model: &'a str,
        persona: &'a str,
        messages: &'a [Message],
    },
    /// Score a criterion against the conversation.
    Judge {
        model: &'a str,
        /// `"boolean"` or `"numeric"` (see `JudgeKind::as_str`).
        kind: &'a str,
        criterion: &'a str,
        /// Lower bound of the scale; left out of the JSON when `None`.
        #[serde(skip_serializing_if = "Option::is_none")]
        min: Option<f64>,
        /// Upper bound of the scale; left out of the JSON when `None`.
        #[serde(skip_serializing_if = "Option::is_none")]
        max: Option<f64>,
        messages: &'a [Message],
    },
}
205
/// The reply to a `respond` request. Only `message` is required; every other
/// field falls back to its default (`false` / `None`) when absent.
#[derive(Deserialize)]
struct RespondPayload {
    message: String,
    #[serde(default)]
    done: bool,
    #[serde(default)]
    usage: Option<Usage>,
    #[serde(default)]
    session_id: Option<String>,
}
216
/// The reply to a `user` request. Only `message` is required; `stop` defaults
/// to `false` and `usage` to `None` when absent.
#[derive(Deserialize)]
struct UserPayload {
    message: String,
    #[serde(default)]
    stop: bool,
    #[serde(default)]
    usage: Option<Usage>,
}
225
/// The reply to a `judge` request. Only `value` is required; `reason` defaults
/// to the empty string and `usage` to `None` when absent.
#[derive(Deserialize)]
struct JudgePayload {
    value: JudgeValue,
    #[serde(default)]
    reason: String,
    #[serde(default)]
    usage: Option<Usage>,
}
234
235// ---------------------------------------------------------------------------
236// CommandProvider
237// ---------------------------------------------------------------------------
238
/// A [`Provider`] backed by an external command speaking the JSON protocol.
///
/// The command is spawned afresh for every request: it receives one JSON
/// request on stdin and must print one JSON response object on stdout.
pub struct CommandProvider {
    /// Program followed by its arguments; `CommandProvider::new` guarantees it
    /// is non-empty.
    argv: Vec<String>,
}
243
244impl CommandProvider {
245    /// Build a provider from an argv vector (program + args). The program is
246    /// resolved on `PATH`.
247    ///
248    /// # Errors
249    /// [`Error::Invalid`] if `argv` is empty.
250    pub fn new(argv: Vec<String>) -> Result<Self> {
251        if argv.is_empty() {
252            return Err(Error::Invalid("provider command is empty".into()));
253        }
254        Ok(Self { argv })
255    }
256
257    /// Send one request and parse the single response object from stdout.
258    fn call<T: for<'de> Deserialize<'de>>(&self, request: &Request<'_>, op: &str) -> Result<T> {
259        let payload = serde_json::to_vec(request).map_err(|e| {
260            Error::provider(op.to_string(), format!("could not encode request: {e}"))
261        })?;
262
263        let mut child = Command::new(&self.argv[0])
264            .args(&self.argv[1..])
265            .stdin(Stdio::piped())
266            .stdout(Stdio::piped())
267            .stderr(Stdio::piped())
268            .spawn()
269            .map_err(|e| {
270                Error::provider(
271                    op.to_string(),
272                    format!(
273                        "could not run provider `{}`: {e}. Is it installed and on PATH?",
274                        self.argv[0]
275                    ),
276                )
277            })?;
278
279        // Write the request, then close stdin so the child can finish. Writing
280        // before reading stdout is safe here because responses are small.
281        {
282            let stdin = child
283                .stdin
284                .as_mut()
285                .ok_or_else(|| Error::provider(op.to_string(), "could not open provider stdin"))?;
286            stdin
287                .write_all(&payload)
288                .and_then(|()| stdin.write_all(b"\n"))
289                .map_err(|e| {
290                    Error::provider(op.to_string(), format!("could not write request: {e}"))
291                })?;
292        }
293
294        let output = child.wait_with_output().map_err(|e| {
295            Error::provider(op.to_string(), format!("provider did not complete: {e}"))
296        })?;
297
298        if !output.status.success() {
299            let stderr = String::from_utf8_lossy(&output.stderr);
300            return Err(Error::provider(
301                op.to_string(),
302                format!("provider exited with {}: {}", output.status, stderr.trim()),
303            ));
304        }
305
306        let stdout = String::from_utf8_lossy(&output.stdout);
307        let line = stdout.trim();
308        if line.is_empty() {
309            return Err(Error::provider(
310                op.to_string(),
311                "provider produced no output (expected one JSON response object)",
312            ));
313        }
314        serde_json::from_str(line).map_err(|e| {
315            Error::provider(
316                op.to_string(),
317                format!("provider response was not valid JSON for `{op}`: {e}; got: {line}"),
318            )
319        })
320    }
321}
322
323impl Provider for CommandProvider {
324    fn respond(
325        &self,
326        platform: &str,
327        model: &str,
328        skill: &SkillRef<'_>,
329        messages: &[Message],
330        session: Option<&str>,
331    ) -> Result<AssistantTurn> {
332        let request = Request::Respond {
333            platform,
334            model,
335            skill: SkillPayload {
336                name: skill.name,
337                path: skill.dir,
338                instructions: skill.instructions,
339            },
340            messages,
341            session,
342        };
343        let payload: RespondPayload = self.call(&request, "respond")?;
344        Ok(AssistantTurn {
345            message: payload.message,
346            done: payload.done,
347            usage: payload.usage,
348            session_id: payload.session_id,
349        })
350    }
351
352    fn simulate_user(&self, model: &str, persona: &str, messages: &[Message]) -> Result<UserTurn> {
353        let request = Request::User {
354            model,
355            persona,
356            messages,
357        };
358        let payload: UserPayload = self.call(&request, "user")?;
359        Ok(UserTurn {
360            message: payload.message,
361            stop: payload.stop,
362            usage: payload.usage,
363        })
364    }
365
366    fn judge(
367        &self,
368        model: &str,
369        query: &JudgeQuery<'_>,
370        messages: &[Message],
371    ) -> Result<JudgeVerdict> {
372        let (min, max) = match query.scale {
373            Some((lo, hi)) => (Some(lo), Some(hi)),
374            None => (None, None),
375        };
376        let request = Request::Judge {
377            model,
378            kind: query.kind.as_str(),
379            criterion: query.criterion,
380            min,
381            max,
382            messages,
383        };
384        let payload: JudgePayload = self.call(&request, "judge")?;
385        Ok(JudgeVerdict {
386            value: payload.value,
387            reason: payload.reason,
388            usage: payload.usage,
389        })
390    }
391}
392
393// ---------------------------------------------------------------------------
394// OneharnessProvider
395// ---------------------------------------------------------------------------
396
/// The default [`Provider`]: runs each prompt on a harness through the
/// `oneharness` CLI.
///
/// Wires four real oneharness features that ship in v0.2.0:
///
/// * `--system <skill instructions>` — the skill becomes a *real* system prompt
///   on the underlying harness (e.g. `--append-system-prompt` for claude-code),
///   instead of being inlined into the user message.
/// * `--resume <session>` — multi-turn `respond` calls thread the previous
///   `session_id` so the harness sees a continuing conversation (and keeps its
///   tool state, files, etc.) instead of being re-prompted with a stringified
///   transcript. Used only for harnesses that report `supports_resume` in the
///   registry (claude-code, opencode, cursor today); other harnesses fall back
///   to the inline-transcript path.
/// * Normalized `usage` (`input_tokens`, `output_tokens`, `cost_usd`) — surfaced
///   on every turn so cross-model cost reporting is portable.
/// * Normalized `failure_kind` (`auth`, `rate_limit`, `model_not_found`, …) —
///   classified provider errors so the CLI can distinguish a broken environment
///   from a broken skill.
///
/// Evals and the simulated user always run on the configured `judge_harness`,
/// independent of the harness under test, so the evaluator does not drift with
/// the matrix.
pub struct OneharnessProvider {
    /// The `oneharness` program to spawn for every run.
    bin: String,
    /// The harness that `simulate_user` and `judge` always run on.
    judge_harness: String,
    /// Forwarded to every run as `--timeout <secs>`.
    timeout_secs: u64,
}
425
/// The subset of the `oneharness run` JSON envelope we consume.
#[derive(Deserialize)]
struct OhEnvelope {
    /// Per-run results; only the first entry is read, and an empty list is
    /// reported as an error.
    results: Vec<OhResult>,
}
431
/// One entry of the envelope's `results`. Only `status` is required; every
/// other field defaults to empty / `None` when absent.
#[derive(Deserialize)]
struct OhResult {
    /// `"ok"` marks a successful run; any other value is treated as a failure.
    status: String,
    /// The reply text oneharness extracted, if any.
    #[serde(default)]
    text: Option<String>,
    /// Raw harness stdout. oneharness's `text` extraction is best-effort and may
    /// be null when a harness's output shape defeats it, with stdout as the
    /// documented fallback; we honor that rather than hard-failing. No harness in
    /// the live matrix relies on it today (OpenCode's JSONL — the case that
    /// motivated this — is extracted natively as of oneharness v0.2.37), but the
    /// contract holds for any harness, so the fallback stays as defense-in-depth.
    #[serde(default)]
    stdout: String,
    /// Raw harness stderr; used as the failure detail when `error` is empty.
    #[serde(default)]
    stderr: String,
    /// The failure description; preferred over `stderr` when non-empty.
    #[serde(default)]
    error: Option<String>,
    /// Session handle reported for the run, passed back to the caller as-is.
    #[serde(default)]
    session_id: Option<String>,
    /// Token / cost usage reported for the run, passed back as-is.
    #[serde(default)]
    usage: Option<Usage>,
    /// Failure classification; when non-empty, a failed run becomes a
    /// classified provider error.
    #[serde(default)]
    failure_kind: Option<String>,
}
456
/// Parameters for one `oneharness run` invocation.
struct RunArgs<'a> {
    /// Becomes `--harness <name>`.
    harness: &'a str,
    /// Becomes `--model <name>`; an empty string omits the flag entirely.
    model: &'a str,
    /// Written to the child's stdin (the run uses `--prompt-file -`).
    prompt: &'a str,
    /// Becomes `--system <text>`; only set on `respond` so the skill is the
    /// system prompt rather than inlined into the user turn.
    system: Option<&'a str>,
    /// Becomes `--resume <id>`; only set when the runner wants to continue a
    /// prior harness session.
    resume: Option<&'a str>,
}
469
/// What we get back from one `oneharness run`.
struct RunOutcome {
    /// The reply: oneharness's extracted text, or raw stdout as the fallback.
    /// Not trimmed here — callers trim as needed.
    text: String,
    /// Session handle reported for the run, if any.
    session_id: Option<String>,
    /// Token / cost usage reported for the run, if any.
    usage: Option<Usage>,
}
476
/// Pick the reply text for a run: the extracted `text` if it has any
/// non-whitespace content, else the raw `stdout` if *it* has any, else `None`.
///
/// oneharness extracts `text` best-effort and may leave it null when a
/// harness's output shape defeats extraction, in which case the reply still
/// survives in stdout. (OpenCode's JSONL once hit this; oneharness v0.2.37
/// extracts it natively, so the fallback is now defense-in-depth.) `None`
/// therefore means the harness genuinely said nothing. Whichever string is
/// chosen is returned untrimmed.
fn select_reply_text(text: Option<String>, stdout: &str) -> Option<String> {
    match text {
        Some(extracted) if !extracted.trim().is_empty() => Some(extracted),
        _ if stdout.trim().is_empty() => None,
        _ => Some(stdout.to_string()),
    }
}
488
489impl OneharnessProvider {
490    /// Build a provider from its configuration.
491    #[must_use]
492    pub fn new(config: &OneharnessConfig) -> Self {
493        Self {
494            bin: config.bin.clone(),
495            judge_harness: config.judge_harness.clone(),
496            timeout_secs: config.timeout_secs,
497        }
498    }
499
500    /// Run one prompt on `harness` and return the normalized text plus the
501    /// session id and usage (when oneharness lifted them from the harness's
502    /// output).
503    fn run(&self, args: &RunArgs<'_>) -> Result<RunOutcome> {
504        let timeout = self.timeout_secs.to_string();
505        let mut cmd = Command::new(&self.bin);
506        // Intentionally no `--output-format` override: oneharness already requests
507        // each harness's *default* format (json for claude-code/opencode,
508        // stream-json for cursor, text for codex/goose/qwen/crush/copilot) and
509        // extracts the reply accordingly. Forcing `json` everywhere broke the
510        // text-native harnesses — oneharness would json-extract their plain-text
511        // reply and find nothing ("harness produced no extractable text").
512        cmd.args([
513            "run",
514            "--harness",
515            args.harness,
516            "--compact",
517            "--timeout",
518            &timeout,
519            "--prompt-file",
520            "-",
521        ]);
522        // An empty model means "unspecified" — omit `--model` so the harness uses
523        // its own default (cursor/crush/copilot) or an env-selected model (qwen
524        // via OPENAI_MODEL, goose via GOOSE_MODEL), exactly as oneharness's own
525        // smoke scripts do. Forwarding `--model ""` would push a broken empty
526        // model flag to the harness CLI.
527        if !args.model.is_empty() {
528            cmd.args(["--model", args.model]);
529        }
530        if let Some(system) = args.system {
531            cmd.args(["--system", system]);
532        }
533        if let Some(resume) = args.resume {
534            cmd.args(["--resume", resume]);
535        }
536
537        let mut child = cmd
538            .stdin(Stdio::piped())
539            .stdout(Stdio::piped())
540            .stderr(Stdio::piped())
541            .spawn()
542            .map_err(|e| {
543                Error::provider(
544                    "oneharness",
545                    format!(
546                        "could not run `{}`: {e}. Is oneharness installed and on PATH?",
547                        self.bin
548                    ),
549                )
550            })?;
551
552        child
553            .stdin
554            .as_mut()
555            .ok_or_else(|| Error::provider("oneharness", "could not open oneharness stdin"))?
556            .write_all(args.prompt.as_bytes())
557            .map_err(|e| Error::provider("oneharness", format!("could not write prompt: {e}")))?;
558
559        let output = child.wait_with_output().map_err(|e| {
560            Error::provider("oneharness", format!("oneharness did not complete: {e}"))
561        })?;
562
563        let stdout = String::from_utf8_lossy(&output.stdout);
564        let envelope: OhEnvelope = serde_json::from_str(stdout.trim()).map_err(|e| {
565            Error::provider(
566                "oneharness",
567                format!(
568                    "could not parse oneharness output: {e}; stderr: {}",
569                    String::from_utf8_lossy(&output.stderr).trim()
570                ),
571            )
572        })?;
573
574        let result = envelope
575            .results
576            .into_iter()
577            .next()
578            .ok_or_else(|| Error::provider("oneharness", "oneharness returned no results"))?;
579
580        if result.status != "ok" {
581            let detail = result
582                .error
583                .filter(|e| !e.is_empty())
584                .or_else(|| Some(result.stderr.clone()).filter(|s| !s.is_empty()))
585                .unwrap_or_else(|| format!("status `{}`", result.status));
586            let context = format!("oneharness:{}", args.harness);
587            let message = format!("harness run failed: {detail}");
588            return Err(match result.failure_kind {
589                Some(kind) if !kind.is_empty() => {
590                    Error::provider_classified(context, message, kind)
591                }
592                _ => Error::provider(context, message),
593            });
594        }
595
596        // Prefer oneharness's extracted `text`; fall back to raw stdout when a
597        // harness's output shape defeats extraction (oneharness's documented
598        // contract — see OhResult::stdout). Only a run that produced *neither* is
599        // a real error.
600        let text = select_reply_text(result.text, &result.stdout).ok_or_else(|| {
601            Error::provider(
602                format!("oneharness:{}", args.harness),
603                "harness produced neither extractable text nor stdout",
604            )
605        })?;
606        Ok(RunOutcome {
607            text,
608            session_id: result.session_id,
609            usage: result.usage,
610        })
611    }
612}
613
614impl Provider for OneharnessProvider {
615    fn respond(
616        &self,
617        platform: &str,
618        model: &str,
619        skill: &SkillRef<'_>,
620        messages: &[Message],
621        session: Option<&str>,
622    ) -> Result<AssistantTurn> {
623        // If we have a real session to continue on a supporting harness, only
624        // send the last user message — the harness still has its prior state.
625        // Otherwise inline the whole transcript so harnesses without resume
626        // still see the conversation.
627        let prompt = if session.is_some() {
628            latest_user_message(messages).unwrap_or_default()
629        } else {
630            render_transcript_for_respond(messages)
631        };
632        let outcome = self.run(&RunArgs {
633            harness: platform,
634            model,
635            prompt: &prompt,
636            system: Some(skill.instructions),
637            resume: session,
638        })?;
639        Ok(AssistantTurn {
640            message: outcome.text.trim().to_string(),
641            done: false,
642            usage: outcome.usage,
643            session_id: outcome.session_id,
644        })
645    }
646
647    fn simulate_user(&self, model: &str, persona: &str, messages: &[Message]) -> Result<UserTurn> {
648        let prompt = build_user_prompt(persona, messages);
649        let outcome = self.run(&RunArgs {
650            harness: &self.judge_harness,
651            model,
652            prompt: &prompt,
653            system: None,
654            resume: None,
655        })?;
656        Ok(UserTurn {
657            message: outcome.text.trim().to_string(),
658            stop: false,
659            usage: outcome.usage,
660        })
661    }
662
663    fn judge(
664        &self,
665        model: &str,
666        query: &JudgeQuery<'_>,
667        messages: &[Message],
668    ) -> Result<JudgeVerdict> {
669        let prompt = build_judge_prompt(query, messages);
670        let outcome = self.run(&RunArgs {
671            harness: &self.judge_harness,
672            model,
673            prompt: &prompt,
674            system: None,
675            resume: None,
676        })?;
677        let mut verdict = parse_verdict(query.kind, &outcome.text)?;
678        verdict.usage = outcome.usage;
679        Ok(verdict)
680    }
681
682    fn supports_resume(&self, platform: &str) -> bool {
683        supports_resume(platform)
684    }
685}
686
/// Whether `harness` is one that oneharness's adapter table marks
/// `supports_resume = true` (claude-code's `--resume`, opencode's `--session`,
/// cursor's `--resume`).
///
/// The list mirrors the `oneharness list` registry and must be kept in sync
/// by hand: when a new harness ships session continuation, add it here so the
/// runner threads `session_id`. Matching is exact and case-sensitive.
#[must_use]
pub fn supports_resume(harness: &str) -> bool {
    const RESUMABLE: [&str; 3] = ["claude-code", "opencode", "cursor"];
    RESUMABLE.iter().any(|name| *name == harness)
}
695
696// ---------------------------------------------------------------------------
697// ApiJudgeProvider + SplitProvider
698// ---------------------------------------------------------------------------
699
/// A judge-only [`Provider`] that scores evals and plays the simulated user with
/// a *direct* model API call (Anthropic or OpenAI), rather than running them
/// through a harness.
///
/// Why this exists: routing the judge through a full agentic harness pays an
/// agent-loop cold start on every short verdict. A direct API call is one HTTP
/// round trip — faster and cheaper on API-key auth — and still reuses the exact
/// same judge/user prompts and tolerant verdict parsing as
/// [`OneharnessProvider`], so the two are directly comparable.
///
/// It does not run skills: `respond` returns an error. Compose it with a
/// skill-running provider via [`SplitProvider`] so the harness under test still
/// drives `respond`, while the judge runs on the API.
///
/// The request is sent with `curl` (Rust has no official vendor SDK). The API
/// key is read from an env var and passed through a private (`0600`) `curl`
/// config file, so it never appears in `argv` / `ps`.
pub struct ApiJudgeProvider {
    /// Which vendor API is called; selects the request headers and the
    /// default env var / endpoint.
    vendor: ApiVendor,
    /// Name of the environment variable holding the API key (read on every
    /// chat call, not at construction).
    api_key_env: String,
    /// The URL requests are POSTed to.
    endpoint: String,
    /// Timeout handed to the `curl` config writer.
    timeout_secs: u64,
    /// The `curl` program to spawn.
    curl_bin: String,
    /// Copied from the configuration; presumably enables schema-constrained
    /// replies — its use is outside the code shown here, confirm at the
    /// call site.
    strict_json: bool,
}
725
/// How many times a transient API failure (rate limit / overload) is retried
/// before giving up, with exponential backoff between attempts.
///
/// This counts *retries*, so a call makes at most `MAX_RETRIES + 1` attempts;
/// with the current value the waits are 1000 ms and then 2000 ms.
const MAX_RETRIES: u32 = 2;
729
/// One model reply plus the usage the API reported for it.
#[derive(Debug)]
struct ChatOutcome {
    /// The reply text parsed out of the API response.
    text: String,
    /// Token / cost usage for the call, when the response carried any.
    usage: Option<Usage>,
}
736
/// A minimal system prompt; the full judge / user-simulation instructions live
/// in the shared prompt builders, so this stays identical across vendors.
///
/// It is deliberately generic: the task-specific instructions arrive in the
/// user message, and this prompt only tells the model to follow them.
const JUDGE_SYSTEM: &str =
    "Follow the user's instructions exactly and respond with only what they ask for.";
741
742impl ApiJudgeProvider {
743    /// Build a provider from its configuration, resolving per-vendor defaults
744    /// for the API-key env var and endpoint.
745    #[must_use]
746    pub fn new(config: &ApiJudgeConfig) -> Self {
747        let api_key_env = config
748            .api_key_env
749            .clone()
750            .unwrap_or_else(|| match config.vendor {
751                ApiVendor::Anthropic => "ANTHROPIC_API_KEY".to_string(),
752                ApiVendor::Openai => "OPENAI_API_KEY".to_string(),
753            });
754        let endpoint = config
755            .base_url
756            .clone()
757            .unwrap_or_else(|| match config.vendor {
758                ApiVendor::Anthropic => "https://api.anthropic.com/v1/messages".to_string(),
759                ApiVendor::Openai => "https://api.openai.com/v1/chat/completions".to_string(),
760            });
761        Self {
762            vendor: config.vendor,
763            api_key_env,
764            endpoint,
765            timeout_secs: config.timeout_secs,
766            curl_bin: config.curl_bin.clone(),
767            strict_json: config.strict_json,
768        }
769    }
770
771    /// One chat round trip: build the vendor request, POST it, parse the reply.
772    /// `schema`, when set, constrains the reply to that JSON schema via the
773    /// vendor's structured-outputs feature. Transient failures (rate limit /
774    /// overload) are retried with exponential backoff.
775    fn chat(
776        &self,
777        model: &str,
778        system: &str,
779        user: &str,
780        schema: Option<serde_json::Value>,
781    ) -> Result<ChatOutcome> {
782        let key = std::env::var(&self.api_key_env).map_err(|_| {
783            Error::provider_classified(
784                "api-judge",
785                format!("API key env var `{}` is not set", self.api_key_env),
786                "auth",
787            )
788        })?;
789        let body = build_chat_body(self.vendor, model, system, user, schema);
790        let payload = serde_json::to_vec(&body)
791            .map_err(|e| Error::provider("api-judge", format!("could not encode request: {e}")))?;
792
793        let mut attempt = 0;
794        loop {
795            let result = self
796                .run_curl(&key, &payload)
797                .and_then(|raw| parse_chat_response(self.vendor, &raw));
798            match result {
799                Ok(outcome) => return Ok(outcome),
800                Err(err) if attempt < MAX_RETRIES && is_retryable(&err) => {
801                    attempt += 1;
802                    std::thread::sleep(std::time::Duration::from_millis(500 * (1 << attempt)));
803                }
804                Err(err) => return Err(err),
805            }
806        }
807    }
808
809    /// Per-vendor request headers.
810    fn headers(&self, key: &str) -> Vec<(String, String)> {
811        match self.vendor {
812            ApiVendor::Anthropic => vec![
813                ("x-api-key".to_string(), key.to_string()),
814                ("anthropic-version".to_string(), "2023-06-01".to_string()),
815                ("content-type".to_string(), "application/json".to_string()),
816            ],
817            ApiVendor::Openai => vec![
818                ("authorization".to_string(), format!("Bearer {key}")),
819                ("content-type".to_string(), "application/json".to_string()),
820            ],
821        }
822    }
823
824    /// POST `body` via `curl`, with the URL + headers (including the API key) in
825    /// a private config file so the key stays out of `argv`. Returns stdout.
826    fn run_curl(&self, key: &str, body: &[u8]) -> Result<String> {
827        let path = std::env::temp_dir().join(format!(
828            "skilltest-judge-{}-{}.cfg",
829            std::process::id(),
830            curl_config_nonce()
831        ));
832        write_curl_config(&path, &self.endpoint, &self.headers(key), self.timeout_secs)?;
833        let outcome = self.exec_curl(&path, body);
834        // The key-bearing config is needed only for this one invocation.
835        let _ = std::fs::remove_file(&path);
836        outcome
837    }
838
839    fn exec_curl(&self, config_path: &std::path::Path, body: &[u8]) -> Result<String> {
840        let mut child = Command::new(&self.curl_bin)
841            .arg("--config")
842            .arg(config_path)
843            .arg("--data-binary")
844            .arg("@-")
845            .stdin(Stdio::piped())
846            .stdout(Stdio::piped())
847            .stderr(Stdio::piped())
848            .spawn()
849            .map_err(|e| {
850                Error::provider(
851                    "api-judge",
852                    format!(
853                        "could not run `{}`: {e}. Is curl installed and on PATH?",
854                        self.curl_bin
855                    ),
856                )
857            })?;
858
859        child
860            .stdin
861            .as_mut()
862            .ok_or_else(|| Error::provider("api-judge", "could not open curl stdin"))?
863            .write_all(body)
864            .map_err(|e| Error::provider("api-judge", format!("could not write request: {e}")))?;
865
866        let output = child
867            .wait_with_output()
868            .map_err(|e| Error::provider("api-judge", format!("curl did not complete: {e}")))?;
869
870        if !output.status.success() {
871            let stderr = String::from_utf8_lossy(&output.stderr);
872            return Err(Error::provider(
873                "api-judge",
874                format!("curl failed ({}): {}", output.status, stderr.trim()),
875            ));
876        }
877        Ok(String::from_utf8_lossy(&output.stdout).into_owned())
878    }
879}
880
881impl Provider for ApiJudgeProvider {
882    fn respond(
883        &self,
884        _platform: &str,
885        _model: &str,
886        _skill: &SkillRef<'_>,
887        _messages: &[Message],
888        _session: Option<&str>,
889    ) -> Result<AssistantTurn> {
890        Err(Error::provider(
891            "api-judge",
892            "the API judge does not run skills; use it as the judge in a SplitProvider",
893        ))
894    }
895
896    fn simulate_user(&self, model: &str, persona: &str, messages: &[Message]) -> Result<UserTurn> {
897        let prompt = build_user_prompt(persona, messages);
898        // Free-form text reply — never schema-constrained.
899        let outcome = self.chat(model, JUDGE_SYSTEM, &prompt, None)?;
900        Ok(UserTurn {
901            message: outcome.text.trim().to_string(),
902            stop: false,
903            usage: outcome.usage,
904        })
905    }
906
907    fn judge(
908        &self,
909        model: &str,
910        query: &JudgeQuery<'_>,
911        messages: &[Message],
912    ) -> Result<JudgeVerdict> {
913        let prompt = build_judge_prompt(query, messages);
914        // Constrain the verdict to the `{value, reason}` schema when strict JSON
915        // is on, so the reply is guaranteed parseable rather than scraped.
916        let schema = self.strict_json.then(|| verdict_schema(query.kind));
917        let outcome = self.chat(model, JUDGE_SYSTEM, &prompt, schema)?;
918        let mut verdict = parse_verdict(query.kind, &outcome.text)?;
919        verdict.usage = outcome.usage;
920        Ok(verdict)
921    }
922}
923
/// A [`Provider`] that runs skills with one provider and judges with another:
/// `respond` (and `supports_resume`) go to the skill-running provider; `judge`
/// and `simulate_user` go to the judge. This keeps harness fidelity for the
/// thing under test while letting the judge run on a fast, cheap, deterministic
/// backend (typically [`ApiJudgeProvider`]).
pub struct SplitProvider {
    /// Provider that handles `respond` and answers `supports_resume`.
    responder: Box<dyn Provider>,
    /// Provider that handles `judge` and `simulate_user`.
    judge: ApiJudgeProvider,
}
933
934impl SplitProvider {
935    /// Compose a skill-running `responder` with an API `judge`.
936    #[must_use]
937    pub fn new(responder: Box<dyn Provider>, judge: ApiJudgeProvider) -> Self {
938        Self { responder, judge }
939    }
940}
941
942impl Provider for SplitProvider {
943    fn respond(
944        &self,
945        platform: &str,
946        model: &str,
947        skill: &SkillRef<'_>,
948        messages: &[Message],
949        session: Option<&str>,
950    ) -> Result<AssistantTurn> {
951        self.responder
952            .respond(platform, model, skill, messages, session)
953    }
954
955    fn simulate_user(&self, model: &str, persona: &str, messages: &[Message]) -> Result<UserTurn> {
956        self.judge.simulate_user(model, persona, messages)
957    }
958
959    fn judge(
960        &self,
961        model: &str,
962        query: &JudgeQuery<'_>,
963        messages: &[Message],
964    ) -> Result<JudgeVerdict> {
965        self.judge.judge(model, query, messages)
966    }
967
968    fn supports_resume(&self, platform: &str) -> bool {
969        self.responder.supports_resume(platform)
970    }
971}
972
/// Hand out the next value of a process-wide counter (starting at 0).
///
/// Callers combine it with the process id so that every `curl` config file
/// written by this process gets its own name, even under concurrency.
fn curl_config_nonce() -> u64 {
    static NEXT_NONCE: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(0);
    // Only uniqueness matters here, not ordering relative to other memory.
    NEXT_NONCE.fetch_add(1, std::sync::atomic::Ordering::Relaxed)
}
980
/// Escape a value for a double-quoted `curl` config entry.
///
/// Backslashes and double quotes are backslash-escaped so the value cannot
/// close its own quotes. Line feeds, carriage returns, and tabs are written as
/// the `\n`, `\r`, and `\t` sequences curl understands inside quotes: a raw
/// line break would otherwise end the config line and let the rest of the
/// value be parsed as further curl options.
fn curl_escape(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '\\' => escaped.push_str("\\\\"),
            '"' => escaped.push_str("\\\""),
            '\n' => escaped.push_str("\\n"),
            '\r' => escaped.push_str("\\r"),
            '\t' => escaped.push_str("\\t"),
            other => escaped.push(other),
        }
    }
    escaped
}
985
986/// Write a `curl` config file (`0600` on Unix) carrying the URL, headers, and
987/// timeout. The request body is streamed separately on stdin (`--data-binary
988/// @-`), so it never needs escaping into this file.
989fn write_curl_config(
990    path: &std::path::Path,
991    url: &str,
992    headers: &[(String, String)],
993    timeout_secs: u64,
994) -> Result<()> {
995    let mut config = String::new();
996    config.push_str(&format!("url = \"{}\"\n", curl_escape(url)));
997    config.push_str("request = \"POST\"\n");
998    for (name, value) in headers {
999        config.push_str(&format!("header = \"{}: {}\"\n", name, curl_escape(value)));
1000    }
1001    config.push_str(&format!("max-time = {timeout_secs}\n"));
1002    config.push_str("silent\nshow-error\n");
1003
1004    let mut options = std::fs::OpenOptions::new();
1005    options.write(true).create(true).truncate(true);
1006    #[cfg(unix)]
1007    {
1008        use std::os::unix::fs::OpenOptionsExt as _;
1009        options.mode(0o600);
1010    }
1011    let mut file = options
1012        .open(path)
1013        .map_err(|e| Error::provider("api-judge", format!("could not write curl config: {e}")))?;
1014    file.write_all(config.as_bytes())
1015        .map_err(|e| Error::provider("api-judge", format!("could not write curl config: {e}")))?;
1016    Ok(())
1017}
1018
1019/// The JSON schema a judge verdict must match: `{value, reason}` with `value`
1020/// typed by the eval kind. Numeric bounds are intentionally omitted — vendor
1021/// structured outputs don't enforce `minimum`/`maximum`, and the runner already
1022/// range-checks the parsed value.
1023fn verdict_schema(kind: JudgeKind) -> serde_json::Value {
1024    let value_type = match kind {
1025        JudgeKind::Boolean => "boolean",
1026        JudgeKind::Numeric => "number",
1027    };
1028    serde_json::json!({
1029        "type": "object",
1030        "properties": {
1031            "value": { "type": value_type },
1032            "reason": { "type": "string" },
1033        },
1034        "required": ["value", "reason"],
1035        "additionalProperties": false,
1036    })
1037}
1038
1039/// Build the JSON request body for one chat completion. Outgoing data, so it is
1040/// constructed directly; responses are parsed into typed models below. When
1041/// `schema` is set, the vendor's structured-outputs field is added so the reply
1042/// is guaranteed to match it.
1043fn build_chat_body(
1044    vendor: ApiVendor,
1045    model: &str,
1046    system: &str,
1047    user: &str,
1048    schema: Option<serde_json::Value>,
1049) -> serde_json::Value {
1050    match vendor {
1051        ApiVendor::Anthropic => {
1052            let mut body = serde_json::json!({
1053                "model": model,
1054                "max_tokens": 1024,
1055                "system": system,
1056                "messages": [{ "role": "user", "content": user }],
1057            });
1058            if let Some(schema) = schema {
1059                body["output_config"] =
1060                    serde_json::json!({ "format": { "type": "json_schema", "schema": schema } });
1061            }
1062            body
1063        }
1064        ApiVendor::Openai => {
1065            let mut body = serde_json::json!({
1066                "model": model,
1067                "max_tokens": 1024,
1068                "messages": [
1069                    { "role": "system", "content": system },
1070                    { "role": "user", "content": user },
1071                ],
1072            });
1073            if let Some(schema) = schema {
1074                body["response_format"] = serde_json::json!({
1075                    "type": "json_schema",
1076                    "json_schema": { "name": "verdict", "strict": true, "schema": schema },
1077                });
1078            }
1079            body
1080        }
1081    }
1082}
1083
1084/// True iff the error is a transient API condition worth retrying.
1085fn is_retryable(err: &Error) -> bool {
1086    matches!(
1087        err,
1088        Error::Provider { kind: Some(k), .. } if k == "rate_limit" || k == "overloaded"
1089    )
1090}
1091
1092// Typed views of the vendor responses (trust-boundary input — always parsed,
1093// never string-matched).
1094
/// The `error` object found in a vendor response body. Both fields are
/// optional, so an error object missing either one still deserializes.
#[derive(Deserialize)]
struct ApiErrorBody {
    /// The error's `type` string (JSON key `type`); `api_error` passes it to
    /// `classify_api_error`.
    #[serde(rename = "type", default)]
    kind: Option<String>,
    /// Human-readable error text; `api_error` substitutes a generic message
    /// when it is absent.
    #[serde(default)]
    message: Option<String>,
}
1102
/// One element of an Anthropic response's `content` array.
#[derive(Deserialize)]
struct AnthropicBlock {
    /// The block's `type` (JSON key `type`, required). Only `"text"` blocks
    /// contribute to the reply assembled by `parse_chat_response`.
    #[serde(rename = "type")]
    kind: String,
    /// The block's text, when present.
    #[serde(default)]
    text: Option<String>,
}
1110
/// The `usage` object of an Anthropic response; each count may be absent.
#[derive(Deserialize)]
struct AnthropicUsage {
    /// Copied into `Usage::input_tokens` by `parse_chat_response`.
    #[serde(default)]
    input_tokens: Option<u64>,
    /// Copied into `Usage::output_tokens` by `parse_chat_response`.
    #[serde(default)]
    output_tokens: Option<u64>,
}
1118
/// Top-level shape of an Anthropic response. Every field defaults when
/// missing, so success and error bodies deserialize into the same type.
#[derive(Deserialize)]
struct AnthropicResponse {
    /// Content blocks; empty when the key is missing.
    #[serde(default)]
    content: Vec<AnthropicBlock>,
    /// Token usage, when reported.
    #[serde(default)]
    usage: Option<AnthropicUsage>,
    /// Quoted in the error raised when the reply contains no text.
    #[serde(default)]
    stop_reason: Option<String>,
    /// Present on failures; checked before the content is read.
    #[serde(default)]
    error: Option<ApiErrorBody>,
}
1130
/// The `message` object inside an OpenAI choice.
#[derive(Deserialize)]
struct OpenAiMessage {
    /// The reply text; a missing value is treated as an empty reply by
    /// `parse_chat_response`.
    #[serde(default)]
    content: Option<String>,
}
1136
/// One element of an OpenAI response's `choices` array; only the first choice
/// is read by `parse_chat_response`.
#[derive(Deserialize)]
struct OpenAiChoice {
    /// The assistant message for this choice, when present.
    #[serde(default)]
    message: Option<OpenAiMessage>,
}
1142
/// The `usage` object of an OpenAI response; each count may be absent.
#[derive(Deserialize)]
struct OpenAiUsage {
    /// Copied into `Usage::input_tokens` by `parse_chat_response`.
    #[serde(default)]
    prompt_tokens: Option<u64>,
    /// Copied into `Usage::output_tokens` by `parse_chat_response`.
    #[serde(default)]
    completion_tokens: Option<u64>,
}
1150
/// Top-level shape of an OpenAI response. Every field defaults when missing,
/// so success and error bodies deserialize into the same type.
#[derive(Deserialize)]
struct OpenAiResponse {
    /// Completion choices; empty when the key is missing.
    #[serde(default)]
    choices: Vec<OpenAiChoice>,
    /// Token usage, when reported.
    #[serde(default)]
    usage: Option<OpenAiUsage>,
    /// Present on failures; checked before the choices are read.
    #[serde(default)]
    error: Option<ApiErrorBody>,
}
1160
/// Translate a vendor error `type` into one of skilltest's classified
/// provider-error kinds (`auth`, `rate_limit`, `quota`, `model_not_found`,
/// `overloaded`), so the CLI can give the same pointed hints it gives for
/// harness failures. Returns `None` for a missing or unrecognized type.
fn classify_api_error(kind: Option<&str>) -> Option<String> {
    let classified = match kind? {
        "authentication_error" | "invalid_api_key" | "permission_error" => "auth",
        "rate_limit_error" | "rate_limit_exceeded" => "rate_limit",
        "insufficient_quota" | "billing_error" => "quota",
        "not_found_error" => "model_not_found",
        // Transient server-side trouble is reported as `overloaded`, which
        // `is_retryable` treats as worth another attempt.
        "overloaded_error" | "api_error" | "server_error" | "service_unavailable" => "overloaded",
        _ => return None,
    };
    Some(classified.to_string())
}
1177
1178fn api_error(err: ApiErrorBody) -> Error {
1179    let message = err
1180        .message
1181        .unwrap_or_else(|| "API returned an error".to_string());
1182    match classify_api_error(err.kind.as_deref()) {
1183        Some(kind) => Error::provider_classified("api-judge", message, kind),
1184        None => Error::provider("api-judge", message),
1185    }
1186}
1187
/// Return at most the first 500 characters of `raw`, for embedding in an error
/// message. The cut is made between characters, never inside one.
fn truncate_for_error(raw: &str) -> String {
    const MAX_CHARS: usize = 500;
    // Byte offset where the 501st character starts, or the full length when
    // the input is short enough to keep whole.
    let cut = raw
        .char_indices()
        .nth(MAX_CHARS)
        .map_or(raw.len(), |(offset, _)| offset);
    raw[..cut].to_owned()
}
1192
1193/// Parse a vendor chat response into the reply text plus normalized usage.
1194fn parse_chat_response(vendor: ApiVendor, raw: &str) -> Result<ChatOutcome> {
1195    match vendor {
1196        ApiVendor::Anthropic => {
1197            let resp: AnthropicResponse = serde_json::from_str(raw.trim()).map_err(|e| {
1198                Error::provider(
1199                    "api-judge",
1200                    format!(
1201                        "could not parse API response: {e}; got: {}",
1202                        truncate_for_error(raw)
1203                    ),
1204                )
1205            })?;
1206            if let Some(err) = resp.error {
1207                return Err(api_error(err));
1208            }
1209            let text = resp
1210                .content
1211                .iter()
1212                .filter(|b| b.kind == "text")
1213                .filter_map(|b| b.text.as_deref())
1214                .collect::<String>();
1215            if text.trim().is_empty() {
1216                return Err(Error::provider(
1217                    "api-judge",
1218                    format!(
1219                        "judge returned no text (stop_reason: {:?})",
1220                        resp.stop_reason
1221                    ),
1222                ));
1223            }
1224            let usage = resp.usage.map(|u| Usage {
1225                input_tokens: u.input_tokens,
1226                output_tokens: u.output_tokens,
1227                cost_usd: None,
1228            });
1229            Ok(ChatOutcome { text, usage })
1230        }
1231        ApiVendor::Openai => {
1232            let resp: OpenAiResponse = serde_json::from_str(raw.trim()).map_err(|e| {
1233                Error::provider(
1234                    "api-judge",
1235                    format!(
1236                        "could not parse API response: {e}; got: {}",
1237                        truncate_for_error(raw)
1238                    ),
1239                )
1240            })?;
1241            if let Some(err) = resp.error {
1242                return Err(api_error(err));
1243            }
1244            let text = resp
1245                .choices
1246                .into_iter()
1247                .next()
1248                .and_then(|c| c.message)
1249                .and_then(|m| m.content)
1250                .unwrap_or_default();
1251            if text.trim().is_empty() {
1252                return Err(Error::provider("api-judge", "judge returned no text"));
1253            }
1254            let usage = resp.usage.map(|u| Usage {
1255                input_tokens: u.prompt_tokens,
1256                output_tokens: u.completion_tokens,
1257                cost_usd: None,
1258            });
1259            Ok(ChatOutcome { text, usage })
1260        }
1261    }
1262}
1263
1264/// Render the conversation as `Role: content` lines for inlining in a prompt.
1265/// Used by the judge, the simulated user, and the no-resume fallback path of
1266/// `respond`.
1267fn render_transcript(messages: &[Message]) -> String {
1268    messages
1269        .iter()
1270        .map(|m| {
1271            let role = match m.role {
1272                Role::User => "User",
1273                Role::Assistant => "Assistant",
1274                Role::System => "System",
1275            };
1276            format!("{role}: {}", m.content)
1277        })
1278        .collect::<Vec<_>>()
1279        .join("\n")
1280}
1281
1282/// The prompt for `respond` when we cannot resume a harness session: inline the
1283/// whole conversation so the stateless harness call sees it. The skill is
1284/// passed separately as `--system`, so it does *not* appear here.
1285fn render_transcript_for_respond(messages: &[Message]) -> String {
1286    format!(
1287        "Conversation so far (most recent last):\n{}\n\n\
1288         Write only the assistant's next reply, following your system \
1289         instructions. Output the reply text and nothing else.",
1290        render_transcript(messages),
1291    )
1292}
1293
1294/// The most recent user message in the transcript — used as the next-turn
1295/// prompt when resuming a real harness session.
1296fn latest_user_message(messages: &[Message]) -> Option<String> {
1297    messages
1298        .iter()
1299        .rev()
1300        .find(|m| m.role == Role::User)
1301        .map(|m| m.content.clone())
1302}
1303
1304fn build_user_prompt(persona: &str, messages: &[Message]) -> String {
1305    format!(
1306        "You are role-playing the USER in a conversation with an AI assistant. \
1307         Stay in character:\n\n{persona}\n\n\
1308         Conversation so far (most recent last):\n{transcript}\n\n\
1309         Write only the user's next message. Output the message text and nothing \
1310         else.",
1311        transcript = render_transcript(messages),
1312    )
1313}
1314
1315fn build_judge_prompt(query: &JudgeQuery<'_>, messages: &[Message]) -> String {
1316    let transcript = render_transcript(messages);
1317    match query.kind {
1318        JudgeKind::Boolean => format!(
1319            "You are a strict, careful evaluator of an AI assistant's behavior.\n\n\
1320             Criterion: {criterion}\n\n\
1321             Transcript:\n{transcript}\n\n\
1322             Decide whether the criterion is satisfied. Respond with ONLY a \
1323             single-line JSON object and nothing else:\n\
1324             {{\"value\": true or false, \"reason\": \"<one short sentence>\"}}",
1325            criterion = query.criterion,
1326        ),
1327        JudgeKind::Numeric => {
1328            let (min, max) = query.scale.unwrap_or((0.0, 10.0));
1329            format!(
1330                "You are a strict, careful evaluator of an AI assistant's behavior.\n\n\
1331                 Criterion: {criterion}\n\n\
1332                 Transcript:\n{transcript}\n\n\
1333                 Score how well the criterion is satisfied on a scale from {min} to \
1334                 {max} (inclusive). Respond with ONLY a single-line JSON object and \
1335                 nothing else:\n\
1336                 {{\"value\": <number between {min} and {max}>, \"reason\": \"<one short sentence>\"}}",
1337                criterion = query.criterion,
1338            )
1339        }
1340    }
1341}
1342
/// Extract the first JSON object from `text`, tolerating code fences and prose
/// around it (real models do not always emit bare JSON).
///
/// Starting at the first `{`, braces are matched until that object closes.
/// Braces inside JSON string literals (including after escaped quotes) are
/// ignored, so a `}` in a `reason` string or in trailing prose does not
/// distort the result. If the braces never balance, the span from the first
/// `{` to the last `}` is returned instead, leaving rejection to the JSON
/// parser. Returns `None` when there is no `{` or no `}` after it.
fn extract_json_object(text: &str) -> Option<&str> {
    let start = text.find('{')?;

    // Scanning bytes is safe: every delimiter of interest is ASCII, and ASCII
    // bytes never occur inside a multi-byte UTF-8 sequence.
    let mut depth = 0usize;
    let mut in_string = false;
    let mut escaped = false;
    for (offset, &byte) in text.as_bytes()[start..].iter().enumerate() {
        if in_string {
            if escaped {
                escaped = false;
            } else if byte == b'\\' {
                escaped = true;
            } else if byte == b'"' {
                in_string = false;
            }
            continue;
        }
        match byte {
            b'"' => in_string = true,
            b'{' => depth += 1,
            b'}' => {
                // `depth` is at least 1 here: the scan starts on a `{` and
                // returns as soon as the count drops back to zero.
                depth -= 1;
                if depth == 0 {
                    return Some(&text[start..=start + offset]);
                }
            }
            _ => {}
        }
    }

    // Unbalanced input: fall back to the widest brace-delimited span.
    let end = text.rfind('}')?;
    if end > start {
        Some(&text[start..=end])
    } else {
        None
    }
}
1354
1355fn parse_verdict(kind: JudgeKind, text: &str) -> Result<JudgeVerdict> {
1356    let json = extract_json_object(text).ok_or_else(|| {
1357        Error::provider(
1358            "oneharness:judge",
1359            format!("judge did not return a JSON object; got: {text}"),
1360        )
1361    })?;
1362    let value: serde_json::Value = serde_json::from_str(json).map_err(|e| {
1363        Error::provider(
1364            "oneharness:judge",
1365            format!("judge verdict was not valid JSON: {e}; got: {json}"),
1366        )
1367    })?;
1368    let reason = value
1369        .get("reason")
1370        .and_then(serde_json::Value::as_str)
1371        .unwrap_or("")
1372        .to_string();
1373    let raw = value
1374        .get("value")
1375        .ok_or_else(|| Error::provider("oneharness:judge", "judge verdict has no `value` field"))?;
1376
1377    let verdict_value = match kind {
1378        JudgeKind::Boolean => JudgeValue::Bool(raw.as_bool().ok_or_else(|| {
1379            Error::provider(
1380                "oneharness:judge",
1381                format!("boolean judge `value` was not a bool: {raw}"),
1382            )
1383        })?),
1384        JudgeKind::Numeric => JudgeValue::Number(raw.as_f64().ok_or_else(|| {
1385            Error::provider(
1386                "oneharness:judge",
1387                format!("numeric judge `value` was not a number: {raw}"),
1388            )
1389        })?),
1390    };
1391
1392    Ok(JudgeVerdict {
1393        value: verdict_value,
1394        reason,
1395        usage: None,
1396    })
1397}
1398
1399#[cfg(test)]
1400mod tests {
1401    use super::*;
1402
1403    #[test]
1404    fn empty_argv_is_rejected() {
1405        assert!(CommandProvider::new(vec![]).is_err());
1406    }
1407
1408    #[test]
1409    fn request_serializes_with_op_tag() {
1410        let req = Request::Judge {
1411            model: "m",
1412            kind: "numeric",
1413            criterion: "polite",
1414            min: Some(0.0),
1415            max: Some(10.0),
1416            messages: &[],
1417        };
1418        let json = serde_json::to_string(&req).unwrap();
1419        assert!(json.contains("\"op\":\"judge\""));
1420        assert!(json.contains("\"kind\":\"numeric\""));
1421    }
1422
1423    #[test]
1424    fn respond_no_session_inlines_transcript_but_not_skill() {
1425        // The skill is passed via --system now, so the prompt the harness sees
1426        // for respond carries only the transcript.
1427        let messages = [
1428            Message::user("Hi"),
1429            Message::assistant("Hello"),
1430            Message::user("Again?"),
1431        ];
1432        let prompt = render_transcript_for_respond(&messages);
1433        assert!(prompt.contains("User: Hi"));
1434        assert!(prompt.contains("Assistant: Hello"));
1435        assert!(prompt.contains("User: Again?"));
1436        // The skill body must not leak here — it belongs in --system.
1437        assert!(!prompt.contains("SKILL"));
1438    }
1439
1440    #[test]
1441    fn respond_with_session_sends_only_latest_user_message() {
1442        let messages = [
1443            Message::user("Hi"),
1444            Message::assistant("Hello"),
1445            Message::user("Again?"),
1446        ];
1447        assert_eq!(latest_user_message(&messages).as_deref(), Some("Again?"));
1448    }
1449
1450    #[test]
1451    fn extracts_json_from_fenced_or_prose_text() {
1452        assert_eq!(
1453            extract_json_object("```json\n{\"value\": true}\n```"),
1454            Some("{\"value\": true}")
1455        );
1456        assert_eq!(
1457            extract_json_object("Sure! {\"value\": 8, \"reason\": \"x\"} done"),
1458            Some("{\"value\": 8, \"reason\": \"x\"}")
1459        );
1460        assert_eq!(extract_json_object("no json here"), None);
1461    }
1462
1463    #[test]
1464    fn parses_boolean_and_numeric_verdicts() {
1465        let b = parse_verdict(JudgeKind::Boolean, "{\"value\": true, \"reason\": \"ok\"}").unwrap();
1466        assert!(matches!(b.value, JudgeValue::Bool(true)));
1467        assert_eq!(b.reason, "ok");
1468
1469        let n =
1470            parse_verdict(JudgeKind::Numeric, "{\"value\": 8.5, \"reason\": \"good\"}").unwrap();
1471        assert!(matches!(n.value, JudgeValue::Number(v) if (v - 8.5).abs() < f64::EPSILON));
1472    }
1473
1474    #[test]
1475    fn verdict_with_wrong_value_type_errors() {
1476        assert!(parse_verdict(JudgeKind::Boolean, "{\"value\": 3}").is_err());
1477        assert!(parse_verdict(JudgeKind::Numeric, "{\"value\": true}").is_err());
1478        assert!(parse_verdict(JudgeKind::Boolean, "no json").is_err());
1479    }
1480
1481    #[test]
1482    fn usage_accumulates_independently_per_field() {
1483        let mut total = Usage::default();
1484        total.add(&Usage {
1485            input_tokens: Some(10),
1486            output_tokens: None,
1487            cost_usd: Some(0.01),
1488        });
1489        total.add(&Usage {
1490            input_tokens: Some(5),
1491            output_tokens: Some(3),
1492            cost_usd: None,
1493        });
1494        assert_eq!(total.input_tokens, Some(15));
1495        assert_eq!(total.output_tokens, Some(3));
1496        assert!((total.cost_usd.unwrap() - 0.01).abs() < f64::EPSILON);
1497        assert!(!total.is_empty());
1498    }
1499
1500    #[test]
1501    fn reply_text_prefers_extracted_then_falls_back_to_stdout() {
1502        // Extracted text wins when present.
1503        assert_eq!(
1504            select_reply_text(Some("clean reply".into()), "raw noise"),
1505            Some("clean reply".into())
1506        );
1507        // Null/blank extracted text falls back to raw stdout (the contract's
1508        // escape hatch when oneharness can't extract but the reply is in stdout).
1509        assert_eq!(
1510            select_reply_text(None, "{\"type\":\"text\",\"part\":{\"text\":\"pong\"}}"),
1511            Some("{\"type\":\"text\",\"part\":{\"text\":\"pong\"}}".into())
1512        );
1513        assert_eq!(
1514            select_reply_text(Some("   ".into()), "fallback"),
1515            Some("fallback".into())
1516        );
1517        // Neither present is the only real error.
1518        assert_eq!(select_reply_text(None, "   \n"), None);
1519        assert_eq!(select_reply_text(Some(String::new()), ""), None);
1520    }
1521
1522    #[test]
1523    fn supports_resume_covers_known_harnesses() {
1524        assert!(supports_resume("claude-code"));
1525        assert!(supports_resume("opencode"));
1526        assert!(supports_resume("cursor"));
1527        assert!(!supports_resume("codex"));
1528        assert!(!supports_resume("goose"));
1529    }
1530
1531    fn api_config(vendor: ApiVendor) -> ApiJudgeConfig {
1532        ApiJudgeConfig {
1533            vendor,
1534            api_key_env: None,
1535            base_url: None,
1536            timeout_secs: 60,
1537            curl_bin: "curl".to_string(),
1538            strict_json: true,
1539        }
1540    }
1541
1542    #[test]
1543    fn api_judge_resolves_vendor_defaults() {
1544        let anthropic = ApiJudgeProvider::new(&api_config(ApiVendor::Anthropic));
1545        assert_eq!(anthropic.api_key_env, "ANTHROPIC_API_KEY");
1546        assert_eq!(anthropic.endpoint, "https://api.anthropic.com/v1/messages");
1547
1548        let openai = ApiJudgeProvider::new(&api_config(ApiVendor::Openai));
1549        assert_eq!(openai.api_key_env, "OPENAI_API_KEY");
1550        assert_eq!(
1551            openai.endpoint,
1552            "https://api.openai.com/v1/chat/completions"
1553        );
1554    }
1555
1556    #[test]
1557    fn api_judge_honors_overrides() {
1558        let provider = ApiJudgeProvider::new(&ApiJudgeConfig {
1559            vendor: ApiVendor::Openai,
1560            api_key_env: Some("MY_KEY".to_string()),
1561            base_url: Some("https://proxy.example/v1/chat/completions".to_string()),
1562            timeout_secs: 5,
1563            curl_bin: "curl".to_string(),
1564            strict_json: true,
1565        });
1566        assert_eq!(provider.api_key_env, "MY_KEY");
1567        assert_eq!(
1568            provider.endpoint,
1569            "https://proxy.example/v1/chat/completions"
1570        );
1571    }
1572
1573    #[test]
1574    fn build_chat_body_shapes_per_vendor() {
1575        let anthropic = build_chat_body(ApiVendor::Anthropic, "claude-x", "sys", "hi", None);
1576        assert_eq!(anthropic["model"], "claude-x");
1577        assert_eq!(anthropic["system"], "sys");
1578        assert_eq!(anthropic["messages"][0]["role"], "user");
1579        // Anthropic carries the system prompt in its own top-level field.
1580        assert_eq!(anthropic["messages"].as_array().unwrap().len(), 1);
1581        // No schema requested → no structured-outputs field.
1582        assert!(anthropic.get("output_config").is_none());
1583
1584        let openai = build_chat_body(ApiVendor::Openai, "gpt-x", "sys", "hi", None);
1585        assert_eq!(openai["messages"][0]["role"], "system");
1586        assert_eq!(openai["messages"][1]["role"], "user");
1587        assert!(openai.get("system").is_none());
1588        assert!(openai.get("response_format").is_none());
1589    }
1590
1591    #[test]
1592    fn build_chat_body_attaches_strict_schema_per_vendor() {
1593        let schema = verdict_schema(JudgeKind::Boolean);
1594        let anthropic = build_chat_body(
1595            ApiVendor::Anthropic,
1596            "claude-x",
1597            "sys",
1598            "hi",
1599            Some(schema.clone()),
1600        );
1601        // Anthropic uses output_config.format.
1602        assert_eq!(anthropic["output_config"]["format"]["type"], "json_schema");
1603        assert_eq!(
1604            anthropic["output_config"]["format"]["schema"]["properties"]["value"]["type"],
1605            "boolean"
1606        );
1607
1608        let numeric = verdict_schema(JudgeKind::Numeric);
1609        let openai = build_chat_body(ApiVendor::Openai, "gpt-x", "sys", "hi", Some(numeric));
1610        // OpenAI uses response_format.json_schema with strict: true.
1611        assert_eq!(openai["response_format"]["type"], "json_schema");
1612        assert_eq!(openai["response_format"]["json_schema"]["strict"], true);
1613        assert_eq!(
1614            openai["response_format"]["json_schema"]["schema"]["properties"]["value"]["type"],
1615            "number"
1616        );
1617    }
1618
1619    #[test]
1620    fn verdict_schema_requires_value_and_reason_with_no_extras() {
1621        let schema = verdict_schema(JudgeKind::Numeric);
1622        assert_eq!(schema["additionalProperties"], false);
1623        let required: Vec<&str> = schema["required"]
1624            .as_array()
1625            .unwrap()
1626            .iter()
1627            .map(|v| v.as_str().unwrap())
1628            .collect();
1629        assert_eq!(required, ["value", "reason"]);
1630    }
1631
1632    #[test]
1633    fn parses_anthropic_success_with_usage() {
1634        let raw = r#"{"content":[{"type":"text","text":"{\"value\": true}"}],
1635            "stop_reason":"end_turn","usage":{"input_tokens":12,"output_tokens":3}}"#;
1636        let outcome = parse_chat_response(ApiVendor::Anthropic, raw).unwrap();
1637        assert_eq!(outcome.text, "{\"value\": true}");
1638        let usage = outcome.usage.unwrap();
1639        assert_eq!(usage.input_tokens, Some(12));
1640        assert_eq!(usage.output_tokens, Some(3));
1641        assert!(usage.cost_usd.is_none());
1642    }
1643
1644    #[test]
1645    fn parses_openai_success_with_usage() {
1646        let raw = r#"{"choices":[{"message":{"content":"{\"value\": 8}"}}],
1647            "usage":{"prompt_tokens":20,"completion_tokens":4}}"#;
1648        let outcome = parse_chat_response(ApiVendor::Openai, raw).unwrap();
1649        assert_eq!(outcome.text, "{\"value\": 8}");
1650        let usage = outcome.usage.unwrap();
1651        assert_eq!(usage.input_tokens, Some(20));
1652        assert_eq!(usage.output_tokens, Some(4));
1653    }
1654
1655    #[test]
1656    fn parses_and_classifies_api_errors() {
1657        let auth = r#"{"error":{"type":"authentication_error","message":"bad key"}}"#;
1658        let err = parse_chat_response(ApiVendor::Anthropic, auth).unwrap_err();
1659        assert!(matches!(err, Error::Provider { kind: Some(k), .. } if k == "auth"));
1660
1661        let rate = r#"{"error":{"type":"rate_limit_exceeded","message":"slow down"}}"#;
1662        let err = parse_chat_response(ApiVendor::Openai, rate).unwrap_err();
1663        assert!(matches!(err, Error::Provider { kind: Some(k), .. } if k == "rate_limit"));
1664    }
1665
1666    #[test]
1667    fn empty_reply_is_an_error() {
1668        let raw = r#"{"content":[],"stop_reason":"refusal"}"#;
1669        assert!(parse_chat_response(ApiVendor::Anthropic, raw).is_err());
1670    }
1671
1672    #[test]
1673    fn classify_api_error_maps_known_kinds() {
1674        assert_eq!(
1675            classify_api_error(Some("invalid_api_key")).as_deref(),
1676            Some("auth")
1677        );
1678        assert_eq!(
1679            classify_api_error(Some("insufficient_quota")).as_deref(),
1680            Some("quota")
1681        );
1682        assert_eq!(
1683            classify_api_error(Some("not_found_error")).as_deref(),
1684            Some("model_not_found")
1685        );
1686        assert_eq!(
1687            classify_api_error(Some("overloaded_error")).as_deref(),
1688            Some("overloaded")
1689        );
1690        assert_eq!(classify_api_error(Some("something_else")), None);
1691        assert_eq!(classify_api_error(None), None);
1692    }
1693
1694    #[test]
1695    fn retryable_covers_transient_errors_only() {
1696        let overloaded = r#"{"error":{"type":"overloaded_error","message":"busy"}}"#;
1697        let err = parse_chat_response(ApiVendor::Anthropic, overloaded).unwrap_err();
1698        assert!(is_retryable(&err), "overload should retry");
1699
1700        let rate = r#"{"error":{"type":"rate_limit_error","message":"slow"}}"#;
1701        let err = parse_chat_response(ApiVendor::Anthropic, rate).unwrap_err();
1702        assert!(is_retryable(&err), "rate limit should retry");
1703
1704        let auth = r#"{"error":{"type":"authentication_error","message":"bad key"}}"#;
1705        let err = parse_chat_response(ApiVendor::Anthropic, auth).unwrap_err();
1706        assert!(!is_retryable(&err), "auth must not retry");
1707    }
1708
1709    #[test]
1710    fn curl_escape_handles_quotes_and_backslashes() {
1711        assert_eq!(curl_escape(r#"a"b\c"#), r#"a\"b\\c"#);
1712    }
1713
1714    /// A skill-running provider stub so the SplitProvider's delegation can be
1715    /// checked without touching the network.
1716    struct StubResponder;
1717
1718    impl Provider for StubResponder {
1719        fn respond(
1720            &self,
1721            _platform: &str,
1722            _model: &str,
1723            _skill: &SkillRef<'_>,
1724            _messages: &[Message],
1725            _session: Option<&str>,
1726        ) -> Result<AssistantTurn> {
1727            Ok(AssistantTurn {
1728                message: "stub reply".to_string(),
1729                ..Default::default()
1730            })
1731        }
1732
1733        fn simulate_user(
1734            &self,
1735            _model: &str,
1736            _persona: &str,
1737            _messages: &[Message],
1738        ) -> Result<UserTurn> {
1739            unreachable!("split provider routes user simulation to the judge")
1740        }
1741
1742        fn judge(
1743            &self,
1744            _model: &str,
1745            _query: &JudgeQuery<'_>,
1746            _messages: &[Message],
1747        ) -> Result<JudgeVerdict> {
1748            unreachable!("split provider routes judging to the judge")
1749        }
1750
1751        fn supports_resume(&self, platform: &str) -> bool {
1752            platform == "claude-code"
1753        }
1754    }
1755
1756    #[test]
1757    fn split_provider_delegates_respond_and_resume() {
1758        let split = SplitProvider::new(
1759            Box::new(StubResponder),
1760            ApiJudgeProvider::new(&api_config(ApiVendor::Anthropic)),
1761        );
1762        // respond + supports_resume go to the responder...
1763        assert!(split.supports_resume("claude-code"));
1764        assert!(!split.supports_resume("codex"));
1765        let skill = SkillRef {
1766            name: "s",
1767            dir: "/tmp/s",
1768            instructions: "do things",
1769        };
1770        let turn = split
1771            .respond("claude-code", "m", &skill, &[], None)
1772            .unwrap();
1773        assert_eq!(turn.message, "stub reply");
1774    }
1775
1776    #[test]
1777    fn api_judge_does_not_run_skills() {
1778        let provider = ApiJudgeProvider::new(&api_config(ApiVendor::Anthropic));
1779        let skill = SkillRef {
1780            name: "s",
1781            dir: "/tmp/s",
1782            instructions: "x",
1783        };
1784        assert!(provider.respond("p", "m", &skill, &[], None).is_err());
1785    }
1786}