skilltest_core/
provider.rs

1//! The provider boundary. `skilltest` never talks to a model directly; a
2//! [`Provider`] runs the skill, plays the simulated user, and judges the
3//! transcript.
4//!
5//! There are two real implementations. [`OneharnessProvider`] (the default) runs
6//! each prompt on a harness through the
7//! [`oneharness`](https://github.com/nickderobertis/oneharness) CLI and parses
8//! its JSON. [`CommandProvider`] speaks a small JSON-lines protocol (see
9//! `docs/protocol.md`) and backs both the deterministic `skilltest-fake-provider`
10//! used by the gate and any custom provider you write. The [`Provider`] trait
11//! also lets the runner be unit-tested against an in-memory fake.
12
13use std::io::Write as _;
14use std::process::{Command, Stdio};
15
16use serde::{Deserialize, Serialize};
17
18use crate::config::OneharnessConfig;
19use crate::conversation::{Message, Role};
20use crate::error::{Error, Result};
21use crate::eval::JudgeValue;
22
/// The skill under test, borrowed for the duration of one provider call.
///
/// Every field is a string slice owned by the caller; nothing is copied.
pub struct SkillRef<'a> {
    /// Name of the skill.
    pub name: &'a str,
    /// Directory of the skill ([`CommandProvider`] sends it on the wire as `path`).
    pub dir: &'a str,
    /// Instruction text of the skill.
    pub instructions: &'a str,
}
29
/// Which flavour of judgement is being requested from the judge.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum JudgeKind {
    /// A true/false verdict.
    Boolean,
    /// A numeric score.
    Numeric,
}

impl JudgeKind {
    /// The lowercase name used for this kind on the wire (`"boolean"` or
    /// `"numeric"`).
    fn as_str(self) -> &'static str {
        match self {
            Self::Boolean => "boolean",
            Self::Numeric => "numeric",
        }
    }
}
45
46/// A judge query: the criterion, its kind, and (for numeric) the scale.
47pub struct JudgeQuery<'a> {
48    pub kind: JudgeKind,
49    pub criterion: &'a str,
50    pub scale: Option<(f64, f64)>,
51}
52
53/// Token / cost usage for one provider call.
54///
55/// Each field is independently optional because not every harness reports every
56/// signal (cost is commonly absent on subscription auth; some harnesses report
57/// no usage at all). The whole struct is `Option<Usage>` on a turn — `None`
58/// means "no signal," not "zero."
59#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize, schemars::JsonSchema)]
60pub struct Usage {
61    #[serde(default, skip_serializing_if = "Option::is_none")]
62    pub input_tokens: Option<u64>,
63    #[serde(default, skip_serializing_if = "Option::is_none")]
64    pub output_tokens: Option<u64>,
65    #[serde(default, skip_serializing_if = "Option::is_none")]
66    pub cost_usd: Option<f64>,
67}
68
69impl Usage {
70    /// True iff every field is `None`.
71    #[must_use]
72    pub fn is_empty(&self) -> bool {
73        self.input_tokens.is_none() && self.output_tokens.is_none() && self.cost_usd.is_none()
74    }
75
76    /// Add another sample into this total. `None` values stay `None` until
77    /// something reports a real number, at which point they accumulate.
78    pub fn add(&mut self, other: &Usage) {
79        if let Some(v) = other.input_tokens {
80            self.input_tokens = Some(self.input_tokens.unwrap_or(0) + v);
81        }
82        if let Some(v) = other.output_tokens {
83            self.output_tokens = Some(self.output_tokens.unwrap_or(0) + v);
84        }
85        if let Some(v) = other.cost_usd {
86            self.cost_usd = Some(self.cost_usd.unwrap_or(0.0) + v);
87        }
88    }
89}
90
91/// An assistant/skill turn produced by the provider.
92#[derive(Debug, Clone, Default)]
93pub struct AssistantTurn {
94    pub message: String,
95    /// The skill signalled it considers the task complete.
96    pub done: bool,
97    /// Cost/token usage for this call, if the provider reported it.
98    pub usage: Option<Usage>,
99    /// A session handle the runner can pass back on the next `respond` call to
100    /// continue the same conversation against the real harness (only some
101    /// harnesses expose this — see `OneharnessProvider::supports_resume`).
102    pub session_id: Option<String>,
103}
104
105/// A simulated-user turn produced by the provider.
106#[derive(Debug, Clone, Default)]
107pub struct UserTurn {
108    pub message: String,
109    /// The simulated user chose to end the conversation.
110    pub stop: bool,
111    pub usage: Option<Usage>,
112}
113
114/// A judge verdict: the raw value (bool or number) plus the stated reason.
115#[derive(Debug, Clone)]
116pub struct JudgeVerdict {
117    pub value: JudgeValue,
118    pub reason: String,
119    pub usage: Option<Usage>,
120}
121
122/// The provider boundary.
123pub trait Provider {
124    /// Run one assistant/skill turn given the conversation so far. `session`,
125    /// when `Some`, is a handle returned by a previous `respond` call on this
126    /// run that the provider may use to continue the same harness session
127    /// (e.g. via `oneharness run --resume`); providers that don't support
128    /// continuation should ignore it.
129    ///
130    /// # Errors
131    /// [`Error::Provider`] if the command fails or returns malformed output.
132    fn respond(
133        &self,
134        platform: &str,
135        model: &str,
136        skill: &SkillRef<'_>,
137        messages: &[Message],
138        session: Option<&str>,
139    ) -> Result<AssistantTurn>;
140
141    /// Produce one simulated-user turn.
142    ///
143    /// # Errors
144    /// [`Error::Provider`] if the command fails or returns malformed output.
145    fn simulate_user(&self, model: &str, persona: &str, messages: &[Message]) -> Result<UserTurn>;
146
147    /// Score a criterion against the conversation.
148    ///
149    /// # Errors
150    /// [`Error::Provider`] if the command fails or returns malformed output.
151    fn judge(
152        &self,
153        model: &str,
154        query: &JudgeQuery<'_>,
155        messages: &[Message],
156    ) -> Result<JudgeVerdict>;
157
158    /// True iff `respond` on `platform` will faithfully continue a prior
159    /// session when given its `session_id`. The default is `false`; providers
160    /// that support resume override this so the runner knows to thread the
161    /// session id through.
162    fn supports_resume(&self, _platform: &str) -> bool {
163        false
164    }
165}
166
167// ---------------------------------------------------------------------------
168// Wire types (CommandProvider JSON-lines protocol)
169// ---------------------------------------------------------------------------
170
171#[derive(Serialize)]
172struct SkillPayload<'a> {
173    name: &'a str,
174    path: &'a str,
175    instructions: &'a str,
176}
177
178#[derive(Serialize)]
179#[serde(tag = "op", rename_all = "lowercase")]
180enum Request<'a> {
181    Respond {
182        platform: &'a str,
183        model: &'a str,
184        skill: SkillPayload<'a>,
185        messages: &'a [Message],
186        #[serde(skip_serializing_if = "Option::is_none")]
187        session: Option<&'a str>,
188    },
189    User {
190        model: &'a str,
191        persona: &'a str,
192        messages: &'a [Message],
193    },
194    Judge {
195        model: &'a str,
196        kind: &'a str,
197        criterion: &'a str,
198        #[serde(skip_serializing_if = "Option::is_none")]
199        min: Option<f64>,
200        #[serde(skip_serializing_if = "Option::is_none")]
201        max: Option<f64>,
202        messages: &'a [Message],
203    },
204}
205
206#[derive(Deserialize)]
207struct RespondPayload {
208    message: String,
209    #[serde(default)]
210    done: bool,
211    #[serde(default)]
212    usage: Option<Usage>,
213    #[serde(default)]
214    session_id: Option<String>,
215}
216
217#[derive(Deserialize)]
218struct UserPayload {
219    message: String,
220    #[serde(default)]
221    stop: bool,
222    #[serde(default)]
223    usage: Option<Usage>,
224}
225
226#[derive(Deserialize)]
227struct JudgePayload {
228    value: JudgeValue,
229    #[serde(default)]
230    reason: String,
231    #[serde(default)]
232    usage: Option<Usage>,
233}
234
235// ---------------------------------------------------------------------------
236// CommandProvider
237// ---------------------------------------------------------------------------
238
/// A [`Provider`] that delegates every call to an external command speaking
/// the JSON protocol.
pub struct CommandProvider {
    /// Program followed by its arguments; `CommandProvider::new` rejects an
    /// empty vector.
    argv: Vec<String>,
}
243
244impl CommandProvider {
245    /// Build a provider from an argv vector (program + args). The program is
246    /// resolved on `PATH`.
247    ///
248    /// # Errors
249    /// [`Error::Invalid`] if `argv` is empty.
250    pub fn new(argv: Vec<String>) -> Result<Self> {
251        if argv.is_empty() {
252            return Err(Error::Invalid("provider command is empty".into()));
253        }
254        Ok(Self { argv })
255    }
256
257    /// Send one request and parse the single response object from stdout.
258    fn call<T: for<'de> Deserialize<'de>>(&self, request: &Request<'_>, op: &str) -> Result<T> {
259        let payload = serde_json::to_vec(request).map_err(|e| {
260            Error::provider(op.to_string(), format!("could not encode request: {e}"))
261        })?;
262
263        let mut child = Command::new(&self.argv[0])
264            .args(&self.argv[1..])
265            .stdin(Stdio::piped())
266            .stdout(Stdio::piped())
267            .stderr(Stdio::piped())
268            .spawn()
269            .map_err(|e| {
270                Error::provider(
271                    op.to_string(),
272                    format!(
273                        "could not run provider `{}`: {e}. Is it installed and on PATH?",
274                        self.argv[0]
275                    ),
276                )
277            })?;
278
279        // Write the request, then close stdin so the child can finish. Writing
280        // before reading stdout is safe here because responses are small.
281        {
282            let stdin = child
283                .stdin
284                .as_mut()
285                .ok_or_else(|| Error::provider(op.to_string(), "could not open provider stdin"))?;
286            stdin
287                .write_all(&payload)
288                .and_then(|()| stdin.write_all(b"\n"))
289                .map_err(|e| {
290                    Error::provider(op.to_string(), format!("could not write request: {e}"))
291                })?;
292        }
293
294        let output = child.wait_with_output().map_err(|e| {
295            Error::provider(op.to_string(), format!("provider did not complete: {e}"))
296        })?;
297
298        if !output.status.success() {
299            let stderr = String::from_utf8_lossy(&output.stderr);
300            return Err(Error::provider(
301                op.to_string(),
302                format!("provider exited with {}: {}", output.status, stderr.trim()),
303            ));
304        }
305
306        let stdout = String::from_utf8_lossy(&output.stdout);
307        let line = stdout.trim();
308        if line.is_empty() {
309            return Err(Error::provider(
310                op.to_string(),
311                "provider produced no output (expected one JSON response object)",
312            ));
313        }
314        serde_json::from_str(line).map_err(|e| {
315            Error::provider(
316                op.to_string(),
317                format!("provider response was not valid JSON for `{op}`: {e}; got: {line}"),
318            )
319        })
320    }
321}
322
323impl Provider for CommandProvider {
324    fn respond(
325        &self,
326        platform: &str,
327        model: &str,
328        skill: &SkillRef<'_>,
329        messages: &[Message],
330        session: Option<&str>,
331    ) -> Result<AssistantTurn> {
332        let request = Request::Respond {
333            platform,
334            model,
335            skill: SkillPayload {
336                name: skill.name,
337                path: skill.dir,
338                instructions: skill.instructions,
339            },
340            messages,
341            session,
342        };
343        let payload: RespondPayload = self.call(&request, "respond")?;
344        Ok(AssistantTurn {
345            message: payload.message,
346            done: payload.done,
347            usage: payload.usage,
348            session_id: payload.session_id,
349        })
350    }
351
352    fn simulate_user(&self, model: &str, persona: &str, messages: &[Message]) -> Result<UserTurn> {
353        let request = Request::User {
354            model,
355            persona,
356            messages,
357        };
358        let payload: UserPayload = self.call(&request, "user")?;
359        Ok(UserTurn {
360            message: payload.message,
361            stop: payload.stop,
362            usage: payload.usage,
363        })
364    }
365
366    fn judge(
367        &self,
368        model: &str,
369        query: &JudgeQuery<'_>,
370        messages: &[Message],
371    ) -> Result<JudgeVerdict> {
372        let (min, max) = match query.scale {
373            Some((lo, hi)) => (Some(lo), Some(hi)),
374            None => (None, None),
375        };
376        let request = Request::Judge {
377            model,
378            kind: query.kind.as_str(),
379            criterion: query.criterion,
380            min,
381            max,
382            messages,
383        };
384        let payload: JudgePayload = self.call(&request, "judge")?;
385        Ok(JudgeVerdict {
386            value: payload.value,
387            reason: payload.reason,
388            usage: payload.usage,
389        })
390    }
391}
392
393// ---------------------------------------------------------------------------
394// OneharnessProvider
395// ---------------------------------------------------------------------------
396
/// The default [`Provider`]: every prompt is run on a harness through the
/// `oneharness` CLI.
///
/// It wires four real oneharness features that ship in v0.2.0:
///
/// * `--system <skill instructions>` — the skill becomes a *real* system prompt
///   on the underlying harness (e.g. `--append-system-prompt` for claude-code)
///   rather than being inlined into the user message.
/// * `--resume <session>` — multi-turn `respond` calls thread the previous
///   `session_id`, so the harness sees a continuing conversation (and keeps its
///   tool state, files, etc.) instead of being re-prompted with a stringified
///   transcript. Used only for harnesses that report `supports_resume` in the
///   registry (claude-code, opencode, cursor today); other harnesses fall back
///   to the inline-transcript path.
/// * Normalized `usage` (`input_tokens`, `output_tokens`, `cost_usd`) — surfaced
///   on every turn so cross-model cost reporting is portable.
/// * Normalized `failure_kind` (`auth`, `rate_limit`, `model_not_found`, …) —
///   classified provider errors, so the CLI can tell a broken environment from
///   a broken skill.
///
/// Evals and the simulated user always run on the configured `judge_harness`,
/// independent of the harness under test, so the evaluator does not drift with
/// the matrix.
pub struct OneharnessProvider {
    /// The `oneharness` executable to spawn.
    bin: String,
    /// Harness used for the judge and the simulated user.
    judge_harness: String,
    /// Value passed to `oneharness run --timeout`.
    timeout_secs: u64,
}
425
426/// The subset of the `oneharness run` JSON envelope we consume.
427#[derive(Deserialize)]
428struct OhEnvelope {
429    results: Vec<OhResult>,
430}
431
432#[derive(Deserialize)]
433struct OhResult {
434    status: String,
435    #[serde(default)]
436    text: Option<String>,
437    /// Raw harness stdout. oneharness's `text` extraction is best-effort and may
438    /// be null when a harness's output shape defeats it, with stdout as the
439    /// documented fallback; we honor that rather than hard-failing. No harness in
440    /// the live matrix relies on it today (OpenCode's JSONL — the case that
441    /// motivated this — is extracted natively as of oneharness v0.2.37), but the
442    /// contract holds for any harness, so the fallback stays as defense-in-depth.
443    #[serde(default)]
444    stdout: String,
445    #[serde(default)]
446    stderr: String,
447    #[serde(default)]
448    error: Option<String>,
449    #[serde(default)]
450    session_id: Option<String>,
451    #[serde(default)]
452    usage: Option<Usage>,
453    #[serde(default)]
454    failure_kind: Option<String>,
455}
456
/// Everything that varies between two `oneharness run` invocations.
struct RunArgs<'a> {
    /// Harness to run on (`--harness`).
    harness: &'a str,
    /// Model to request (`--model`); the flag is omitted when this is empty.
    model: &'a str,
    /// Prompt text, fed to oneharness on stdin.
    prompt: &'a str,
    /// Turned into `--system <text>`. Set only by `respond`, so the skill acts
    /// as the system prompt instead of being inlined into the user turn.
    system: Option<&'a str>,
    /// Turned into `--resume <id>`. Set only when the runner wants to continue
    /// a prior harness session.
    resume: Option<&'a str>,
}
469
470/// What we get back from one `oneharness run`.
471struct RunOutcome {
472    text: String,
473    session_id: Option<String>,
474    usage: Option<Usage>,
475}
476
/// Pick the text to treat as the harness's reply.
///
/// oneharness's extracted `text` wins when it has any non-whitespace content;
/// otherwise the raw `stdout` is used. Extraction is best-effort and, per
/// oneharness's contract, may leave `text` null when a harness's output shape
/// defeats it — the reply still survives in stdout. (OpenCode's JSONL once hit
/// this; oneharness v0.2.37 extracts it natively, so the fallback is now
/// defense-in-depth.) The chosen string is returned untrimmed. `None` comes back
/// only when both sources are blank — the one case that is a genuine "the
/// harness said nothing" error.
fn select_reply_text(text: Option<String>, stdout: &str) -> Option<String> {
    match text {
        Some(extracted) if !extracted.trim().is_empty() => Some(extracted),
        _ if stdout.trim().is_empty() => None,
        _ => Some(stdout.to_owned()),
    }
}
488
489impl OneharnessProvider {
490    /// Build a provider from its configuration.
491    #[must_use]
492    pub fn new(config: &OneharnessConfig) -> Self {
493        Self {
494            bin: config.bin.clone(),
495            judge_harness: config.judge_harness.clone(),
496            timeout_secs: config.timeout_secs,
497        }
498    }
499
500    /// Run one prompt on `harness` and return the normalized text plus the
501    /// session id and usage (when oneharness lifted them from the harness's
502    /// output).
503    fn run(&self, args: &RunArgs<'_>) -> Result<RunOutcome> {
504        let timeout = self.timeout_secs.to_string();
505        let mut cmd = Command::new(&self.bin);
506        // Intentionally no `--output-format` override: oneharness already requests
507        // each harness's *default* format (json for claude-code/opencode,
508        // stream-json for cursor, text for codex/goose/qwen/crush/copilot) and
509        // extracts the reply accordingly. Forcing `json` everywhere broke the
510        // text-native harnesses — oneharness would json-extract their plain-text
511        // reply and find nothing ("harness produced no extractable text").
512        cmd.args([
513            "run",
514            "--harness",
515            args.harness,
516            "--compact",
517            "--timeout",
518            &timeout,
519            "--prompt-file",
520            "-",
521        ]);
522        // An empty model means "unspecified" — omit `--model` so the harness uses
523        // its own default (cursor/crush/copilot) or an env-selected model (qwen
524        // via OPENAI_MODEL, goose via GOOSE_MODEL), exactly as oneharness's own
525        // smoke scripts do. Forwarding `--model ""` would push a broken empty
526        // model flag to the harness CLI.
527        if !args.model.is_empty() {
528            cmd.args(["--model", args.model]);
529        }
530        if let Some(system) = args.system {
531            cmd.args(["--system", system]);
532        }
533        if let Some(resume) = args.resume {
534            cmd.args(["--resume", resume]);
535        }
536
537        let mut child = cmd
538            .stdin(Stdio::piped())
539            .stdout(Stdio::piped())
540            .stderr(Stdio::piped())
541            .spawn()
542            .map_err(|e| {
543                Error::provider(
544                    "oneharness",
545                    format!(
546                        "could not run `{}`: {e}. Is oneharness installed and on PATH?",
547                        self.bin
548                    ),
549                )
550            })?;
551
552        child
553            .stdin
554            .as_mut()
555            .ok_or_else(|| Error::provider("oneharness", "could not open oneharness stdin"))?
556            .write_all(args.prompt.as_bytes())
557            .map_err(|e| Error::provider("oneharness", format!("could not write prompt: {e}")))?;
558
559        let output = child.wait_with_output().map_err(|e| {
560            Error::provider("oneharness", format!("oneharness did not complete: {e}"))
561        })?;
562
563        let stdout = String::from_utf8_lossy(&output.stdout);
564        let envelope: OhEnvelope = serde_json::from_str(stdout.trim()).map_err(|e| {
565            Error::provider(
566                "oneharness",
567                format!(
568                    "could not parse oneharness output: {e}; stderr: {}",
569                    String::from_utf8_lossy(&output.stderr).trim()
570                ),
571            )
572        })?;
573
574        let result = envelope
575            .results
576            .into_iter()
577            .next()
578            .ok_or_else(|| Error::provider("oneharness", "oneharness returned no results"))?;
579
580        if result.status != "ok" {
581            let detail = result
582                .error
583                .filter(|e| !e.is_empty())
584                .or_else(|| Some(result.stderr.clone()).filter(|s| !s.is_empty()))
585                .unwrap_or_else(|| format!("status `{}`", result.status));
586            let context = format!("oneharness:{}", args.harness);
587            let message = format!("harness run failed: {detail}");
588            return Err(match result.failure_kind {
589                Some(kind) if !kind.is_empty() => {
590                    Error::provider_classified(context, message, kind)
591                }
592                _ => Error::provider(context, message),
593            });
594        }
595
596        // Prefer oneharness's extracted `text`; fall back to raw stdout when a
597        // harness's output shape defeats extraction (oneharness's documented
598        // contract — see OhResult::stdout). Only a run that produced *neither* is
599        // a real error.
600        let text = select_reply_text(result.text, &result.stdout).ok_or_else(|| {
601            Error::provider(
602                format!("oneharness:{}", args.harness),
603                "harness produced neither extractable text nor stdout",
604            )
605        })?;
606        Ok(RunOutcome {
607            text,
608            session_id: result.session_id,
609            usage: result.usage,
610        })
611    }
612}
613
614impl Provider for OneharnessProvider {
615    fn respond(
616        &self,
617        platform: &str,
618        model: &str,
619        skill: &SkillRef<'_>,
620        messages: &[Message],
621        session: Option<&str>,
622    ) -> Result<AssistantTurn> {
623        // If we have a real session to continue on a supporting harness, only
624        // send the last user message — the harness still has its prior state.
625        // Otherwise inline the whole transcript so harnesses without resume
626        // still see the conversation.
627        let prompt = if session.is_some() {
628            latest_user_message(messages).unwrap_or_default()
629        } else {
630            render_transcript_for_respond(messages)
631        };
632        let outcome = self.run(&RunArgs {
633            harness: platform,
634            model,
635            prompt: &prompt,
636            system: Some(skill.instructions),
637            resume: session,
638        })?;
639        Ok(AssistantTurn {
640            message: outcome.text.trim().to_string(),
641            done: false,
642            usage: outcome.usage,
643            session_id: outcome.session_id,
644        })
645    }
646
647    fn simulate_user(&self, model: &str, persona: &str, messages: &[Message]) -> Result<UserTurn> {
648        let prompt = build_user_prompt(persona, messages);
649        let outcome = self.run(&RunArgs {
650            harness: &self.judge_harness,
651            model,
652            prompt: &prompt,
653            system: None,
654            resume: None,
655        })?;
656        Ok(UserTurn {
657            message: outcome.text.trim().to_string(),
658            stop: false,
659            usage: outcome.usage,
660        })
661    }
662
663    fn judge(
664        &self,
665        model: &str,
666        query: &JudgeQuery<'_>,
667        messages: &[Message],
668    ) -> Result<JudgeVerdict> {
669        let prompt = build_judge_prompt(query, messages);
670        let outcome = self.run(&RunArgs {
671            harness: &self.judge_harness,
672            model,
673            prompt: &prompt,
674            system: None,
675            resume: None,
676        })?;
677        let mut verdict = parse_verdict(query.kind, &outcome.text)?;
678        verdict.usage = outcome.usage;
679        Ok(verdict)
680    }
681
682    fn supports_resume(&self, platform: &str) -> bool {
683        supports_resume(platform)
684    }
685}
686
/// Whether `harness` is one that oneharness's adapter table marks
/// `supports_resume = true` (claude-code's `--resume`, opencode's `--session`,
/// cursor's `--resume`).
///
/// Kept in sync with the `oneharness list` registry by hand: when a new harness
/// ships session continuation, add it here so the runner threads `session_id`.
#[must_use]
pub fn supports_resume(harness: &str) -> bool {
    const RESUMABLE: [&str; 3] = ["claude-code", "opencode", "cursor"];
    RESUMABLE.contains(&harness)
}
695
696/// Render the conversation as `Role: content` lines for inlining in a prompt.
697/// Used by the judge, the simulated user, and the no-resume fallback path of
698/// `respond`.
699fn render_transcript(messages: &[Message]) -> String {
700    messages
701        .iter()
702        .map(|m| {
703            let role = match m.role {
704                Role::User => "User",
705                Role::Assistant => "Assistant",
706                Role::System => "System",
707            };
708            format!("{role}: {}", m.content)
709        })
710        .collect::<Vec<_>>()
711        .join("\n")
712}
713
714/// The prompt for `respond` when we cannot resume a harness session: inline the
715/// whole conversation so the stateless harness call sees it. The skill is
716/// passed separately as `--system`, so it does *not* appear here.
717fn render_transcript_for_respond(messages: &[Message]) -> String {
718    format!(
719        "Conversation so far (most recent last):\n{}\n\n\
720         Write only the assistant's next reply, following your system \
721         instructions. Output the reply text and nothing else.",
722        render_transcript(messages),
723    )
724}
725
726/// The most recent user message in the transcript — used as the next-turn
727/// prompt when resuming a real harness session.
728fn latest_user_message(messages: &[Message]) -> Option<String> {
729    messages
730        .iter()
731        .rev()
732        .find(|m| m.role == Role::User)
733        .map(|m| m.content.clone())
734}
735
736fn build_user_prompt(persona: &str, messages: &[Message]) -> String {
737    format!(
738        "You are role-playing the USER in a conversation with an AI assistant. \
739         Stay in character:\n\n{persona}\n\n\
740         Conversation so far (most recent last):\n{transcript}\n\n\
741         Write only the user's next message. Output the message text and nothing \
742         else.",
743        transcript = render_transcript(messages),
744    )
745}
746
747fn build_judge_prompt(query: &JudgeQuery<'_>, messages: &[Message]) -> String {
748    let transcript = render_transcript(messages);
749    match query.kind {
750        JudgeKind::Boolean => format!(
751            "You are a strict, careful evaluator of an AI assistant's behavior.\n\n\
752             Criterion: {criterion}\n\n\
753             Transcript:\n{transcript}\n\n\
754             Decide whether the criterion is satisfied. Respond with ONLY a \
755             single-line JSON object and nothing else:\n\
756             {{\"value\": true or false, \"reason\": \"<one short sentence>\"}}",
757            criterion = query.criterion,
758        ),
759        JudgeKind::Numeric => {
760            let (min, max) = query.scale.unwrap_or((0.0, 10.0));
761            format!(
762                "You are a strict, careful evaluator of an AI assistant's behavior.\n\n\
763                 Criterion: {criterion}\n\n\
764                 Transcript:\n{transcript}\n\n\
765                 Score how well the criterion is satisfied on a scale from {min} to \
766                 {max} (inclusive). Respond with ONLY a single-line JSON object and \
767                 nothing else:\n\
768                 {{\"value\": <number between {min} and {max}>, \"reason\": \"<one short sentence>\"}}",
769                criterion = query.criterion,
770            )
771        }
772    }
773}
774
/// Extract the first brace-balanced JSON object from `text`, tolerating code
/// fences and prose around it (real models do not always emit bare JSON).
///
/// Scanning starts at each `{` in turn and returns the span up to its matching
/// `}`. Braces inside JSON string literals are ignored, and trailing prose or
/// further objects are left out. Returns `None` when no `{` has a matching
/// `}`. The returned span is only brace-balanced; whether it is valid JSON is
/// left to the caller's parser.
fn extract_json_object(text: &str) -> Option<&str> {
    let bytes = text.as_bytes();
    let mut search_from = 0;
    // `{` is ASCII, so `start + 1` is always a char boundary and safe to slice.
    while let Some(offset) = text[search_from..].find('{') {
        let start = search_from + offset;
        if let Some(end) = matching_brace(bytes, start) {
            return Some(&text[start..=end]);
        }
        // This `{` never closes (e.g. a stray brace in prose); try the next.
        search_from = start + 1;
    }
    None
}

/// Index of the `}` that closes the `{` at `bytes[start]`, or `None` if the
/// input ends first. Tracks double-quoted strings and backslash escapes so
/// braces inside strings do not count.
fn matching_brace(bytes: &[u8], start: usize) -> Option<usize> {
    let mut depth = 0usize;
    let mut in_string = false;
    let mut escaped = false;
    // All structural bytes are ASCII and never occur inside a multi-byte UTF-8
    // sequence, so scanning bytes is safe.
    for (index, &byte) in bytes.iter().enumerate().skip(start) {
        if in_string {
            if escaped {
                escaped = false;
            } else if byte == b'\\' {
                escaped = true;
            } else if byte == b'"' {
                in_string = false;
            }
            continue;
        }
        match byte {
            b'"' => in_string = true,
            b'{' => depth += 1,
            b'}' => {
                // `depth` is at least 1 here: the scan starts on a `{`.
                depth -= 1;
                if depth == 0 {
                    return Some(index);
                }
            }
            _ => {}
        }
    }
    None
}
786
787fn parse_verdict(kind: JudgeKind, text: &str) -> Result<JudgeVerdict> {
788    let json = extract_json_object(text).ok_or_else(|| {
789        Error::provider(
790            "oneharness:judge",
791            format!("judge did not return a JSON object; got: {text}"),
792        )
793    })?;
794    let value: serde_json::Value = serde_json::from_str(json).map_err(|e| {
795        Error::provider(
796            "oneharness:judge",
797            format!("judge verdict was not valid JSON: {e}; got: {json}"),
798        )
799    })?;
800    let reason = value
801        .get("reason")
802        .and_then(serde_json::Value::as_str)
803        .unwrap_or("")
804        .to_string();
805    let raw = value
806        .get("value")
807        .ok_or_else(|| Error::provider("oneharness:judge", "judge verdict has no `value` field"))?;
808
809    let verdict_value = match kind {
810        JudgeKind::Boolean => JudgeValue::Bool(raw.as_bool().ok_or_else(|| {
811            Error::provider(
812                "oneharness:judge",
813                format!("boolean judge `value` was not a bool: {raw}"),
814            )
815        })?),
816        JudgeKind::Numeric => JudgeValue::Number(raw.as_f64().ok_or_else(|| {
817            Error::provider(
818                "oneharness:judge",
819                format!("numeric judge `value` was not a number: {raw}"),
820            )
821        })?),
822    };
823
824    Ok(JudgeVerdict {
825        value: verdict_value,
826        reason,
827        usage: None,
828    })
829}
830
#[cfg(test)]
mod tests {
    // Unit tests for the helpers in this module: request serialization, prompt
    // rendering for `respond`, JSON extraction, verdict parsing, usage
    // accumulation, reply-text selection, and the resume-capability lookup.
    use super::*;

    /// An empty argv leaves nothing to execute, so construction must fail.
    #[test]
    fn empty_argv_is_rejected() {
        assert!(CommandProvider::new(vec![]).is_err());
    }

    /// The serialized request carries its operation as an `op` tag next to the
    /// payload fields.
    #[test]
    fn request_serializes_with_op_tag() {
        let req = Request::Judge {
            model: "m",
            kind: "numeric",
            criterion: "polite",
            min: Some(0.0),
            max: Some(10.0),
            messages: &[],
        };
        let json = serde_json::to_string(&req).unwrap();
        assert!(json.contains(r#""op":"judge""#));
        assert!(json.contains(r#""kind":"numeric""#));
    }

    #[test]
    fn respond_no_session_inlines_transcript_but_not_skill() {
        // The skill is passed via --system now, so the prompt the harness sees
        // for respond carries only the transcript.
        let messages = [
            Message::user("Hi"),
            Message::assistant("Hello"),
            Message::user("Again?"),
        ];
        let prompt = render_transcript_for_respond(&messages);
        for expected in ["User: Hi", "Assistant: Hello", "User: Again?"] {
            assert!(prompt.contains(expected), "prompt is missing {expected:?}");
        }
        // The skill body must not leak here — it belongs in --system.
        assert!(!prompt.contains("SKILL"));
    }

    #[test]
    fn respond_with_session_sends_only_latest_user_message() {
        let messages = [
            Message::user("Hi"),
            Message::assistant("Hello"),
            Message::user("Again?"),
        ];
        assert_eq!(latest_user_message(&messages).as_deref(), Some("Again?"));
    }

    /// The JSON object is recovered from a fenced block or from surrounding
    /// prose; text without an object yields `None`.
    #[test]
    fn extracts_json_from_fenced_or_prose_text() {
        assert_eq!(
            extract_json_object("```json\n{\"value\": true}\n```"),
            Some(r#"{"value": true}"#)
        );
        assert_eq!(
            extract_json_object(r#"Sure! {"value": 8, "reason": "x"} done"#),
            Some(r#"{"value": 8, "reason": "x"}"#)
        );
        assert_eq!(extract_json_object("no json here"), None);
    }

    #[test]
    fn parses_boolean_and_numeric_verdicts() {
        let b = parse_verdict(JudgeKind::Boolean, r#"{"value": true, "reason": "ok"}"#).unwrap();
        assert!(matches!(b.value, JudgeValue::Bool(true)));
        assert_eq!(b.reason, "ok");

        let n = parse_verdict(JudgeKind::Numeric, r#"{"value": 8.5, "reason": "good"}"#).unwrap();
        assert!(matches!(n.value, JudgeValue::Number(v) if (v - 8.5).abs() < f64::EPSILON));
        // The reason is carried through for numeric verdicts as well.
        assert_eq!(n.reason, "good");
    }

    /// A JSON integer is a valid numeric verdict; it is read as an `f64`.
    #[test]
    fn numeric_verdict_accepts_integer_value() {
        let n = parse_verdict(JudgeKind::Numeric, r#"{"value": 8, "reason": "x"}"#).unwrap();
        assert!(matches!(n.value, JudgeValue::Number(v) if (v - 8.0).abs() < f64::EPSILON));
    }

    #[test]
    fn verdict_with_wrong_value_type_errors() {
        assert!(parse_verdict(JudgeKind::Boolean, r#"{"value": 3}"#).is_err());
        assert!(parse_verdict(JudgeKind::Numeric, r#"{"value": true}"#).is_err());
        assert!(parse_verdict(JudgeKind::Boolean, "no json").is_err());
    }

    /// A verdict object that omits `value` is an error for either judge kind.
    #[test]
    fn verdict_without_value_field_errors() {
        assert!(parse_verdict(JudgeKind::Boolean, r#"{"reason": "x"}"#).is_err());
        assert!(parse_verdict(JudgeKind::Numeric, r#"{"reason": "x"}"#).is_err());
    }

    /// `reason` is optional: when absent the verdict still parses and the
    /// reason is the empty string.
    #[test]
    fn verdict_without_reason_defaults_to_empty_string() {
        let v = parse_verdict(JudgeKind::Boolean, r#"{"value": false}"#).unwrap();
        assert!(matches!(v.value, JudgeValue::Bool(false)));
        assert_eq!(v.reason, "");
    }

    /// Each usage field sums on its own; a `None` on one side does not wipe
    /// out the value accumulated from the other.
    #[test]
    fn usage_accumulates_independently_per_field() {
        let mut total = Usage::default();
        total.add(&Usage {
            input_tokens: Some(10),
            output_tokens: None,
            cost_usd: Some(0.01),
        });
        total.add(&Usage {
            input_tokens: Some(5),
            output_tokens: Some(3),
            cost_usd: None,
        });
        assert_eq!(total.input_tokens, Some(15));
        assert_eq!(total.output_tokens, Some(3));
        assert!((total.cost_usd.unwrap() - 0.01).abs() < f64::EPSILON);
        assert!(!total.is_empty());
    }

    #[test]
    fn reply_text_prefers_extracted_then_falls_back_to_stdout() {
        // Extracted text wins when present.
        assert_eq!(
            select_reply_text(Some(String::from("clean reply")), "raw noise"),
            Some(String::from("clean reply"))
        );
        // Null/blank extracted text falls back to raw stdout (the contract's
        // escape hatch when oneharness can't extract but the reply is in stdout).
        let raw_stdout = r#"{"type":"text","part":{"text":"pong"}}"#;
        assert_eq!(
            select_reply_text(None, raw_stdout),
            Some(String::from(raw_stdout))
        );
        assert_eq!(
            select_reply_text(Some(String::from("   ")), "fallback"),
            Some(String::from("fallback"))
        );
        // Neither present is the only real error.
        assert_eq!(select_reply_text(None, "   \n"), None);
        assert_eq!(select_reply_text(Some(String::new()), ""), None);
    }

    #[test]
    fn supports_resume_covers_known_harnesses() {
        for harness in ["claude-code", "opencode", "cursor"] {
            assert!(supports_resume(harness), "{harness} should support resume");
        }
        for harness in ["codex", "goose"] {
            assert!(!supports_resume(harness), "{harness} should not support resume");
        }
    }
}
skilltest_core/provider.rs

skilltest_core/
provider.rs