use std::io::Write as _;
use std::process::{Command, Stdio};
use serde::{Deserialize, Serialize};
use crate::config::{ApiJudgeConfig, ApiVendor, OneharnessConfig};
use crate::conversation::{Message, Role};
use crate::error::{Error, Result};
use crate::eval::JudgeValue;
/// Borrowed description of the skill under test, passed to [`Provider::respond`].
pub struct SkillRef<'a> {
/// Skill name; forwarded as `name` in the command-provider request.
pub name: &'a str,
/// Skill directory; forwarded as `path` in the command-provider request.
pub dir: &'a str,
/// Skill instruction text; forwarded as `instructions` to command providers
/// and as the `--system` argument to oneharness.
pub instructions: &'a str,
}
/// The type of value a judge is asked to return.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum JudgeKind {
/// The verdict `value` must be a JSON boolean.
Boolean,
/// The verdict `value` must be a JSON number.
Numeric,
}
impl JudgeKind {
fn as_str(self) -> &'static str {
match self {
JudgeKind::Boolean => "boolean",
JudgeKind::Numeric => "numeric",
}
}
}
/// A single question put to a judge about a transcript.
pub struct JudgeQuery<'a> {
/// Whether a boolean or a numeric verdict is expected.
pub kind: JudgeKind,
/// The criterion text the judge evaluates the transcript against.
pub criterion: &'a str,
/// Optional `(min, max)` scale for numeric verdicts; the built-in numeric
/// prompt falls back to `(0.0, 10.0)` when this is `None`.
pub scale: Option<(f64, f64)>,
}
/// Token and cost accounting reported by a provider call.
///
/// Every field is optional; absent fields are omitted when serializing and
/// default to `None` when deserializing.
#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize, schemars::JsonSchema)]
pub struct Usage {
/// Number of input (prompt) tokens, when reported.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub input_tokens: Option<u64>,
/// Number of output (completion) tokens, when reported.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub output_tokens: Option<u64>,
/// Cost in US dollars, when reported.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub cost_usd: Option<f64>,
}
impl Usage {
    /// Returns `true` when no field carries a value.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        !(self.input_tokens.is_some() || self.output_tokens.is_some() || self.cost_usd.is_some())
    }

    /// Accumulates `other` into `self`, field by field.
    ///
    /// A field that is `None` in `other` leaves the matching field of `self`
    /// untouched; a field that is `Some` is added to the current value, which
    /// is treated as zero when it was `None`.
    pub fn add(&mut self, other: &Usage) {
        if let Some(extra) = other.input_tokens {
            *self.input_tokens.get_or_insert(0) += extra;
        }
        if let Some(extra) = other.output_tokens {
            *self.output_tokens.get_or_insert(0) += extra;
        }
        if let Some(extra) = other.cost_usd {
            *self.cost_usd.get_or_insert(0.0) += extra;
        }
    }
}
/// One assistant reply produced by [`Provider::respond`].
#[derive(Debug, Clone, Default)]
pub struct AssistantTurn {
/// The assistant's reply text.
pub message: String,
/// Flag reported by the provider alongside the reply; the oneharness
/// provider always sets it to `false`.
pub done: bool,
/// Usage reported for this call, if any.
pub usage: Option<Usage>,
/// Session identifier returned by the provider, if any.
pub session_id: Option<String>,
}
/// One simulated user message produced by [`Provider::simulate_user`].
#[derive(Debug, Clone, Default)]
pub struct UserTurn {
/// The simulated user's message text.
pub message: String,
/// Flag reported by the provider alongside the message; the oneharness and
/// API providers always set it to `false`.
pub stop: bool,
/// Usage reported for this call, if any.
pub usage: Option<Usage>,
}
/// The outcome of a [`Provider::judge`] call.
#[derive(Debug, Clone)]
pub struct JudgeVerdict {
/// The boolean or numeric verdict value.
pub value: JudgeValue,
/// The judge's explanation; may be empty when none was given.
pub reason: String,
/// Usage reported for this call, if any.
pub usage: Option<Usage>,
}
/// Backend that can answer as the assistant, simulate a user, and judge a
/// transcript.
pub trait Provider {
/// Produces the assistant's next reply for `messages` on `platform` using
/// `model` and the given `skill`. `session` optionally names an existing
/// session to continue.
fn respond(
&self,
platform: &str,
model: &str,
skill: &SkillRef<'_>,
messages: &[Message],
session: Option<&str>,
) -> Result<AssistantTurn>;
/// Produces the next user message for `messages`, role-playing `persona`.
fn simulate_user(&self, model: &str, persona: &str, messages: &[Message]) -> Result<UserTurn>;
/// Evaluates `messages` against `query` and returns the verdict.
fn judge(
&self,
model: &str,
query: &JudgeQuery<'_>,
messages: &[Message],
) -> Result<JudgeVerdict>;
/// Whether sessions can be resumed on `platform`; defaults to `false`.
fn supports_resume(&self, _platform: &str) -> bool {
false
}
}
/// Serialized form of a [`SkillRef`] inside a `respond` request.
#[derive(Serialize)]
struct SkillPayload<'a> {
/// Copied from `SkillRef::name`.
name: &'a str,
/// Copied from `SkillRef::dir`.
path: &'a str,
/// Copied from `SkillRef::instructions`.
instructions: &'a str,
}
/// JSON request written to a command provider's stdin.
///
/// Serialized as an object whose `op` field is the lowercase variant name
/// (`"respond"`, `"user"`, `"judge"`).
#[derive(Serialize)]
#[serde(tag = "op", rename_all = "lowercase")]
enum Request<'a> {
/// Ask for the assistant's next reply.
Respond {
platform: &'a str,
model: &'a str,
skill: SkillPayload<'a>,
messages: &'a [Message],
// Omitted from the JSON when no session is being continued.
#[serde(skip_serializing_if = "Option::is_none")]
session: Option<&'a str>,
},
/// Ask for the simulated user's next message.
User {
model: &'a str,
persona: &'a str,
messages: &'a [Message],
},
/// Ask for a verdict on the transcript.
Judge {
model: &'a str,
kind: &'a str,
criterion: &'a str,
// `min`/`max` are omitted from the JSON when no scale was supplied.
#[serde(skip_serializing_if = "Option::is_none")]
min: Option<f64>,
#[serde(skip_serializing_if = "Option::is_none")]
max: Option<f64>,
messages: &'a [Message],
},
}
/// JSON response a command provider returns for a `respond` request.
/// Only `message` is required; the other fields default when absent.
#[derive(Deserialize)]
struct RespondPayload {
message: String,
#[serde(default)]
done: bool,
#[serde(default)]
usage: Option<Usage>,
#[serde(default)]
session_id: Option<String>,
}
/// JSON response a command provider returns for a `user` request.
/// Only `message` is required; the other fields default when absent.
#[derive(Deserialize)]
struct UserPayload {
message: String,
#[serde(default)]
stop: bool,
#[serde(default)]
usage: Option<Usage>,
}
/// JSON response a command provider returns for a `judge` request.
/// Only `value` is required; `reason` defaults to an empty string and `usage`
/// to `None`.
#[derive(Deserialize)]
struct JudgePayload {
value: JudgeValue,
#[serde(default)]
reason: String,
#[serde(default)]
usage: Option<Usage>,
}
/// Provider backed by an external command that reads one JSON request on
/// stdin and writes one JSON response on stdout.
pub struct CommandProvider {
/// Program (`argv[0]`) followed by its arguments; never empty.
argv: Vec<String>,
}
impl CommandProvider {
pub fn new(argv: Vec<String>) -> Result<Self> {
if argv.is_empty() {
return Err(Error::Invalid("provider command is empty".into()));
}
Ok(Self { argv })
}
fn call<T: for<'de> Deserialize<'de>>(&self, request: &Request<'_>, op: &str) -> Result<T> {
let payload = serde_json::to_vec(request).map_err(|e| {
Error::provider(op.to_string(), format!("could not encode request: {e}"))
})?;
let mut child = Command::new(&self.argv[0])
.args(&self.argv[1..])
.stdin(Stdio::piped())
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.spawn()
.map_err(|e| {
Error::provider(
op.to_string(),
format!(
"could not run provider `{}`: {e}. Is it installed and on PATH?",
self.argv[0]
),
)
})?;
{
let stdin = child
.stdin
.as_mut()
.ok_or_else(|| Error::provider(op.to_string(), "could not open provider stdin"))?;
stdin
.write_all(&payload)
.and_then(|()| stdin.write_all(b"\n"))
.map_err(|e| {
Error::provider(op.to_string(), format!("could not write request: {e}"))
})?;
}
let output = child.wait_with_output().map_err(|e| {
Error::provider(op.to_string(), format!("provider did not complete: {e}"))
})?;
if !output.status.success() {
let stderr = String::from_utf8_lossy(&output.stderr);
return Err(Error::provider(
op.to_string(),
format!("provider exited with {}: {}", output.status, stderr.trim()),
));
}
let stdout = String::from_utf8_lossy(&output.stdout);
let line = stdout.trim();
if line.is_empty() {
return Err(Error::provider(
op.to_string(),
"provider produced no output (expected one JSON response object)",
));
}
serde_json::from_str(line).map_err(|e| {
Error::provider(
op.to_string(),
format!("provider response was not valid JSON for `{op}`: {e}; got: {line}"),
)
})
}
}
impl Provider for CommandProvider {
    /// Sends a `respond` request to the command and maps its reply onto an
    /// [`AssistantTurn`].
    fn respond(
        &self,
        platform: &str,
        model: &str,
        skill: &SkillRef<'_>,
        messages: &[Message],
        session: Option<&str>,
    ) -> Result<AssistantTurn> {
        let skill = SkillPayload {
            name: skill.name,
            path: skill.dir,
            instructions: skill.instructions,
        };
        let reply: RespondPayload = self.call(
            &Request::Respond {
                platform,
                model,
                skill,
                messages,
                session,
            },
            "respond",
        )?;
        Ok(AssistantTurn {
            message: reply.message,
            done: reply.done,
            usage: reply.usage,
            session_id: reply.session_id,
        })
    }

    /// Sends a `user` request to the command and maps its reply onto a
    /// [`UserTurn`].
    fn simulate_user(&self, model: &str, persona: &str, messages: &[Message]) -> Result<UserTurn> {
        let reply: UserPayload = self.call(
            &Request::User {
                model,
                persona,
                messages,
            },
            "user",
        )?;
        Ok(UserTurn {
            message: reply.message,
            stop: reply.stop,
            usage: reply.usage,
        })
    }

    /// Sends a `judge` request to the command and maps its reply onto a
    /// [`JudgeVerdict`]. The query's scale, when present, is split into the
    /// request's `min` and `max` fields.
    fn judge(
        &self,
        model: &str,
        query: &JudgeQuery<'_>,
        messages: &[Message],
    ) -> Result<JudgeVerdict> {
        let min = query.scale.map(|(lo, _)| lo);
        let max = query.scale.map(|(_, hi)| hi);
        let reply: JudgePayload = self.call(
            &Request::Judge {
                model,
                kind: query.kind.as_str(),
                criterion: query.criterion,
                min,
                max,
                messages,
            },
            "judge",
        )?;
        Ok(JudgeVerdict {
            value: reply.value,
            reason: reply.reason,
            usage: reply.usage,
        })
    }
}
/// Provider that shells out to the `oneharness` CLI.
pub struct OneharnessProvider {
/// Path or name of the oneharness executable.
bin: String,
/// Harness used for user simulation and judging.
judge_harness: String,
/// Value passed to oneharness as `--timeout`.
timeout_secs: u64,
}
/// Top-level JSON object printed by `oneharness run`; only the first entry of
/// `results` is consumed.
#[derive(Deserialize)]
struct OhEnvelope {
results: Vec<OhResult>,
}
/// One harness result inside an [`OhEnvelope`]. Every field except `status`
/// defaults when absent.
#[derive(Deserialize)]
struct OhResult {
/// `"ok"` marks success; any other value is treated as a failure.
status: String,
/// Extracted reply text; preferred over `stdout` when non-blank.
#[serde(default)]
text: Option<String>,
/// Raw harness stdout; used as the reply when `text` is missing or blank.
#[serde(default)]
stdout: String,
/// Raw harness stderr; used as failure detail when `error` is missing or empty.
#[serde(default)]
stderr: String,
/// Failure description; preferred failure detail when non-empty.
#[serde(default)]
error: Option<String>,
/// Session identifier reported by the harness, if any.
#[serde(default)]
session_id: Option<String>,
/// Usage reported by the harness, if any.
#[serde(default)]
usage: Option<Usage>,
/// Failure classification; when non-empty it is attached to the returned error.
#[serde(default)]
failure_kind: Option<String>,
}
/// Arguments for a single `oneharness run` invocation.
struct RunArgs<'a> {
/// Passed as `--harness`.
harness: &'a str,
/// Passed as `--model` unless empty.
model: &'a str,
/// Written to the child's stdin (the command is run with `--prompt-file -`).
prompt: &'a str,
/// Passed as `--system` when present.
system: Option<&'a str>,
/// Passed as `--resume` when present.
resume: Option<&'a str>,
}
/// Successful result of a oneharness run.
struct RunOutcome {
/// Reply text (extracted text, or raw stdout as a fallback); not trimmed.
text: String,
/// Session identifier reported by the harness, if any.
session_id: Option<String>,
/// Usage reported by the harness, if any.
usage: Option<Usage>,
}
/// Picks the reply text of a harness run.
///
/// Returns `text` when it contains something other than whitespace, otherwise
/// a copy of `stdout` under the same condition, otherwise `None`. Neither value
/// is trimmed.
fn select_reply_text(text: Option<String>, stdout: &str) -> Option<String> {
    match text {
        Some(extracted) if !extracted.trim().is_empty() => Some(extracted),
        _ if stdout.trim().is_empty() => None,
        _ => Some(stdout.to_owned()),
    }
}
impl OneharnessProvider {
#[must_use]
pub fn new(config: &OneharnessConfig) -> Self {
Self {
bin: config.bin.clone(),
judge_harness: config.judge_harness.clone(),
timeout_secs: config.timeout_secs,
}
}
fn run(&self, args: &RunArgs<'_>) -> Result<RunOutcome> {
let timeout = self.timeout_secs.to_string();
let mut cmd = Command::new(&self.bin);
cmd.args([
"run",
"--harness",
args.harness,
"--compact",
"--timeout",
&timeout,
"--prompt-file",
"-",
]);
if !args.model.is_empty() {
cmd.args(["--model", args.model]);
}
if let Some(system) = args.system {
cmd.args(["--system", system]);
}
if let Some(resume) = args.resume {
cmd.args(["--resume", resume]);
}
let mut child = cmd
.stdin(Stdio::piped())
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.spawn()
.map_err(|e| {
Error::provider(
"oneharness",
format!(
"could not run `{}`: {e}. Is oneharness installed and on PATH?",
self.bin
),
)
})?;
child
.stdin
.as_mut()
.ok_or_else(|| Error::provider("oneharness", "could not open oneharness stdin"))?
.write_all(args.prompt.as_bytes())
.map_err(|e| Error::provider("oneharness", format!("could not write prompt: {e}")))?;
let output = child.wait_with_output().map_err(|e| {
Error::provider("oneharness", format!("oneharness did not complete: {e}"))
})?;
let stdout = String::from_utf8_lossy(&output.stdout);
let envelope: OhEnvelope = serde_json::from_str(stdout.trim()).map_err(|e| {
Error::provider(
"oneharness",
format!(
"could not parse oneharness output: {e}; stderr: {}",
String::from_utf8_lossy(&output.stderr).trim()
),
)
})?;
let result = envelope
.results
.into_iter()
.next()
.ok_or_else(|| Error::provider("oneharness", "oneharness returned no results"))?;
if result.status != "ok" {
let detail = result
.error
.filter(|e| !e.is_empty())
.or_else(|| Some(result.stderr.clone()).filter(|s| !s.is_empty()))
.unwrap_or_else(|| format!("status `{}`", result.status));
let context = format!("oneharness:{}", args.harness);
let message = format!("harness run failed: {detail}");
return Err(match result.failure_kind {
Some(kind) if !kind.is_empty() => {
Error::provider_classified(context, message, kind)
}
_ => Error::provider(context, message),
});
}
let text = select_reply_text(result.text, &result.stdout).ok_or_else(|| {
Error::provider(
format!("oneharness:{}", args.harness),
"harness produced neither extractable text nor stdout",
)
})?;
Ok(RunOutcome {
text,
session_id: result.session_id,
usage: result.usage,
})
}
}
impl Provider for OneharnessProvider {
fn respond(
&self,
platform: &str,
model: &str,
skill: &SkillRef<'_>,
messages: &[Message],
session: Option<&str>,
) -> Result<AssistantTurn> {
let prompt = if session.is_some() {
latest_user_message(messages).unwrap_or_default()
} else {
render_transcript_for_respond(messages)
};
let outcome = self.run(&RunArgs {
harness: platform,
model,
prompt: &prompt,
system: Some(skill.instructions),
resume: session,
})?;
Ok(AssistantTurn {
message: outcome.text.trim().to_string(),
done: false,
usage: outcome.usage,
session_id: outcome.session_id,
})
}
fn simulate_user(&self, model: &str, persona: &str, messages: &[Message]) -> Result<UserTurn> {
let prompt = build_user_prompt(persona, messages);
let outcome = self.run(&RunArgs {
harness: &self.judge_harness,
model,
prompt: &prompt,
system: None,
resume: None,
})?;
Ok(UserTurn {
message: outcome.text.trim().to_string(),
stop: false,
usage: outcome.usage,
})
}
fn judge(
&self,
model: &str,
query: &JudgeQuery<'_>,
messages: &[Message],
) -> Result<JudgeVerdict> {
let prompt = build_judge_prompt(query, messages);
let outcome = self.run(&RunArgs {
harness: &self.judge_harness,
model,
prompt: &prompt,
system: None,
resume: None,
})?;
let mut verdict = parse_verdict(query.kind, &outcome.text)?;
verdict.usage = outcome.usage;
Ok(verdict)
}
fn supports_resume(&self, platform: &str) -> bool {
supports_resume(platform)
}
}
/// Returns `true` for the harness names listed here as resumable
/// (`claude-code`, `opencode`, `cursor`); the comparison is exact.
#[must_use]
pub fn supports_resume(harness: &str) -> bool {
    const RESUMABLE: [&str; 3] = ["claude-code", "opencode", "cursor"];
    RESUMABLE.contains(&harness)
}
/// Judge/user-simulation provider that calls a vendor chat API through `curl`.
pub struct ApiJudgeProvider {
/// Which vendor's request and response shapes to use.
vendor: ApiVendor,
/// Name of the environment variable holding the API key.
api_key_env: String,
/// Full URL the request is POSTed to.
endpoint: String,
/// Written to the curl config as `max-time`.
timeout_secs: u64,
/// Path or name of the curl executable.
curl_bin: String,
/// When set, judge requests carry a JSON schema for the verdict.
strict_json: bool,
}
/// Number of additional attempts made after a retryable API error.
const MAX_RETRIES: u32 = 2;
/// Text and usage extracted from a successful chat API response.
#[derive(Debug)]
struct ChatOutcome {
/// The model's reply text (never blank).
text: String,
/// Token usage reported by the API, if any; cost is never filled in.
usage: Option<Usage>,
}
/// System prompt sent with every API judge and user-simulation request.
const JUDGE_SYSTEM: &str =
"Follow the user's instructions exactly and respond with only what they ask for.";
impl ApiJudgeProvider {
#[must_use]
pub fn new(config: &ApiJudgeConfig) -> Self {
let api_key_env = config
.api_key_env
.clone()
.unwrap_or_else(|| match config.vendor {
ApiVendor::Anthropic => "ANTHROPIC_API_KEY".to_string(),
ApiVendor::Openai => "OPENAI_API_KEY".to_string(),
});
let endpoint = config
.base_url
.clone()
.unwrap_or_else(|| match config.vendor {
ApiVendor::Anthropic => "https://api.anthropic.com/v1/messages".to_string(),
ApiVendor::Openai => "https://api.openai.com/v1/chat/completions".to_string(),
});
Self {
vendor: config.vendor,
api_key_env,
endpoint,
timeout_secs: config.timeout_secs,
curl_bin: config.curl_bin.clone(),
strict_json: config.strict_json,
}
}
fn chat(
&self,
model: &str,
system: &str,
user: &str,
schema: Option<serde_json::Value>,
) -> Result<ChatOutcome> {
let key = std::env::var(&self.api_key_env).map_err(|_| {
Error::provider_classified(
"api-judge",
format!("API key env var `{}` is not set", self.api_key_env),
"auth",
)
})?;
let body = build_chat_body(self.vendor, model, system, user, schema);
let payload = serde_json::to_vec(&body)
.map_err(|e| Error::provider("api-judge", format!("could not encode request: {e}")))?;
let mut attempt = 0;
loop {
let result = self
.run_curl(&key, &payload)
.and_then(|raw| parse_chat_response(self.vendor, &raw));
match result {
Ok(outcome) => return Ok(outcome),
Err(err) if attempt < MAX_RETRIES && is_retryable(&err) => {
attempt += 1;
std::thread::sleep(std::time::Duration::from_millis(500 * (1 << attempt)));
}
Err(err) => return Err(err),
}
}
}
fn headers(&self, key: &str) -> Vec<(String, String)> {
match self.vendor {
ApiVendor::Anthropic => vec![
("x-api-key".to_string(), key.to_string()),
("anthropic-version".to_string(), "2023-06-01".to_string()),
("content-type".to_string(), "application/json".to_string()),
],
ApiVendor::Openai => vec![
("authorization".to_string(), format!("Bearer {key}")),
("content-type".to_string(), "application/json".to_string()),
],
}
}
fn run_curl(&self, key: &str, body: &[u8]) -> Result<String> {
let path = std::env::temp_dir().join(format!(
"skilltest-judge-{}-{}.cfg",
std::process::id(),
curl_config_nonce()
));
write_curl_config(&path, &self.endpoint, &self.headers(key), self.timeout_secs)?;
let outcome = self.exec_curl(&path, body);
let _ = std::fs::remove_file(&path);
outcome
}
fn exec_curl(&self, config_path: &std::path::Path, body: &[u8]) -> Result<String> {
let mut child = Command::new(&self.curl_bin)
.arg("--config")
.arg(config_path)
.arg("--data-binary")
.arg("@-")
.stdin(Stdio::piped())
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.spawn()
.map_err(|e| {
Error::provider(
"api-judge",
format!(
"could not run `{}`: {e}. Is curl installed and on PATH?",
self.curl_bin
),
)
})?;
child
.stdin
.as_mut()
.ok_or_else(|| Error::provider("api-judge", "could not open curl stdin"))?
.write_all(body)
.map_err(|e| Error::provider("api-judge", format!("could not write request: {e}")))?;
let output = child
.wait_with_output()
.map_err(|e| Error::provider("api-judge", format!("curl did not complete: {e}")))?;
if !output.status.success() {
let stderr = String::from_utf8_lossy(&output.stderr);
return Err(Error::provider(
"api-judge",
format!("curl failed ({}): {}", output.status, stderr.trim()),
));
}
Ok(String::from_utf8_lossy(&output.stdout).into_owned())
}
}
impl Provider for ApiJudgeProvider {
fn respond(
&self,
_platform: &str,
_model: &str,
_skill: &SkillRef<'_>,
_messages: &[Message],
_session: Option<&str>,
) -> Result<AssistantTurn> {
Err(Error::provider(
"api-judge",
"the API judge does not run skills; use it as the judge in a SplitProvider",
))
}
fn simulate_user(&self, model: &str, persona: &str, messages: &[Message]) -> Result<UserTurn> {
let prompt = build_user_prompt(persona, messages);
let outcome = self.chat(model, JUDGE_SYSTEM, &prompt, None)?;
Ok(UserTurn {
message: outcome.text.trim().to_string(),
stop: false,
usage: outcome.usage,
})
}
fn judge(
&self,
model: &str,
query: &JudgeQuery<'_>,
messages: &[Message],
) -> Result<JudgeVerdict> {
let prompt = build_judge_prompt(query, messages);
let schema = self.strict_json.then(|| verdict_schema(query.kind));
let outcome = self.chat(model, JUDGE_SYSTEM, &prompt, schema)?;
let mut verdict = parse_verdict(query.kind, &outcome.text)?;
verdict.usage = outcome.usage;
Ok(verdict)
}
}
/// Provider that routes assistant replies to one backend and user simulation
/// plus judging to an [`ApiJudgeProvider`].
pub struct SplitProvider {
/// Backend used for `respond` and `supports_resume`.
responder: Box<dyn Provider>,
/// Backend used for `simulate_user` and `judge`.
judge: ApiJudgeProvider,
}
impl SplitProvider {
#[must_use]
pub fn new(responder: Box<dyn Provider>, judge: ApiJudgeProvider) -> Self {
Self { responder, judge }
}
}
impl Provider for SplitProvider {
    /// Forwards to the responder backend.
    fn respond(
        &self,
        platform: &str,
        model: &str,
        skill: &SkillRef<'_>,
        messages: &[Message],
        session: Option<&str>,
    ) -> Result<AssistantTurn> {
        let Self { responder, .. } = self;
        responder.respond(platform, model, skill, messages, session)
    }

    /// Forwards to the API judge backend.
    fn simulate_user(&self, model: &str, persona: &str, messages: &[Message]) -> Result<UserTurn> {
        let Self { judge, .. } = self;
        judge.simulate_user(model, persona, messages)
    }

    /// Forwards to the API judge backend.
    fn judge(
        &self,
        model: &str,
        query: &JudgeQuery<'_>,
        messages: &[Message],
    ) -> Result<JudgeVerdict> {
        let Self { judge, .. } = self;
        judge.judge(model, query, messages)
    }

    /// Resume support is whatever the responder backend reports.
    fn supports_resume(&self, platform: &str) -> bool {
        let Self { responder, .. } = self;
        responder.supports_resume(platform)
    }
}
/// Returns a process-wide counter value, starting at 0 and increasing by one
/// per call; used to keep temporary curl config file names distinct.
fn curl_config_nonce() -> u64 {
    use std::sync::atomic::{AtomicU64, Ordering};

    static NEXT: AtomicU64 = AtomicU64::new(0);
    let issued = NEXT.fetch_add(1, Ordering::Relaxed);
    issued
}
/// Escapes `value` for use inside a double-quoted parameter of a curl config
/// file.
///
/// Backslashes and double quotes are backslash-escaped. Line feeds, carriage
/// returns and tabs are written as the `\n`, `\r` and `\t` escape sequences so
/// that a value containing a line break (for example an API key with a
/// trailing newline) cannot end the config line early and have the remainder
/// read as further curl options.
fn curl_escape(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '\\' => escaped.push_str("\\\\"),
            '"' => escaped.push_str("\\\""),
            '\n' => escaped.push_str("\\n"),
            '\r' => escaped.push_str("\\r"),
            '\t' => escaped.push_str("\\t"),
            other => escaped.push(other),
        }
    }
    escaped
}
/// Writes a curl config file at `path` describing a silent POST to `url` with
/// the given `headers` and a `max-time` of `timeout_secs`.
///
/// The headers may carry an API key, so the file is created exclusively: an
/// existing entry at `path` (a stale file, or a symlink planted in a shared
/// temp directory) is never written through. If one is found it is removed and
/// exclusive creation is attempted once more. On Unix the file is created with
/// mode `0o600`.
///
/// # Errors
/// Returns a provider error when the file cannot be created or written.
fn write_curl_config(
    path: &std::path::Path,
    url: &str,
    headers: &[(String, String)],
    timeout_secs: u64,
) -> Result<()> {
    let mut config = format!("url = \"{}\"\n", curl_escape(url));
    config.push_str("request = \"POST\"\n");
    for (name, value) in headers {
        config.push_str(&format!(
            "header = \"{}: {}\"\n",
            curl_escape(name),
            curl_escape(value)
        ));
    }
    config.push_str(&format!("max-time = {timeout_secs}\n"));
    config.push_str("silent\nshow-error\n");

    let mut options = std::fs::OpenOptions::new();
    // `create_new` fails on any existing entry and does not follow symlinks, so
    // the requested permissions always apply to a file created here.
    options.write(true).create_new(true);
    #[cfg(unix)]
    {
        use std::os::unix::fs::OpenOptionsExt as _;
        options.mode(0o600);
    }
    let opened = match options.open(path) {
        Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => {
            // Removing a symlink removes the link itself, not its target.
            std::fs::remove_file(path).and_then(|()| options.open(path))
        }
        other => other,
    };
    let mut file = opened
        .map_err(|e| Error::provider("api-judge", format!("could not write curl config: {e}")))?;
    file.write_all(config.as_bytes())
        .map_err(|e| Error::provider("api-judge", format!("could not write curl config: {e}")))?;
    Ok(())
}
/// JSON schema for a verdict object: required `value` (boolean or number,
/// depending on `kind`) and `reason` (string), with no additional properties.
fn verdict_schema(kind: JudgeKind) -> serde_json::Value {
    let value_type = match kind {
        JudgeKind::Numeric => "number",
        JudgeKind::Boolean => "boolean",
    };
    let properties = serde_json::json!({
        "value": { "type": value_type },
        "reason": { "type": "string" },
    });
    serde_json::json!({
        "type": "object",
        "properties": properties,
        "required": ["value", "reason"],
        "additionalProperties": false,
    })
}
/// Builds the JSON request body for `vendor`.
///
/// Anthropic gets a top-level `system` field and a single user message; OpenAI
/// gets a system message followed by a user message. Both cap the reply at
/// 1024 tokens. When `schema` is supplied it is attached as
/// `output_config.format` (Anthropic) or `response_format` (OpenAI, strict,
/// named `verdict`).
fn build_chat_body(
    vendor: ApiVendor,
    model: &str,
    system: &str,
    user: &str,
    schema: Option<serde_json::Value>,
) -> serde_json::Value {
    match vendor {
        ApiVendor::Anthropic => {
            let output_config = schema.map(|schema| {
                serde_json::json!({ "format": { "type": "json_schema", "schema": schema } })
            });
            let mut body = serde_json::json!({
                "model": model,
                "max_tokens": 1024,
                "system": system,
                "messages": [{ "role": "user", "content": user }],
            });
            if let Some(output_config) = output_config {
                body["output_config"] = output_config;
            }
            body
        }
        ApiVendor::Openai => {
            let response_format = schema.map(|schema| {
                serde_json::json!({
                    "type": "json_schema",
                    "json_schema": { "name": "verdict", "strict": true, "schema": schema },
                })
            });
            let mut body = serde_json::json!({
                "model": model,
                "max_tokens": 1024,
                "messages": [
                    { "role": "system", "content": system },
                    { "role": "user", "content": user },
                ],
            });
            if let Some(response_format) = response_format {
                body["response_format"] = response_format;
            }
            body
        }
    }
}
fn is_retryable(err: &Error) -> bool {
matches!(
err,
Error::Provider { kind: Some(k), .. } if k == "rate_limit" || k == "overloaded"
)
}
#[derive(Deserialize)]
struct ApiErrorBody {
#[serde(rename = "type", default)]
kind: Option<String>,
#[serde(default)]
message: Option<String>,
}
#[derive(Deserialize)]
struct AnthropicBlock {
#[serde(rename = "type")]
kind: String,
#[serde(default)]
text: Option<String>,
}
#[derive(Deserialize)]
struct AnthropicUsage {
#[serde(default)]
input_tokens: Option<u64>,
#[serde(default)]
output_tokens: Option<u64>,
}
#[derive(Deserialize)]
struct AnthropicResponse {
#[serde(default)]
content: Vec<AnthropicBlock>,
#[serde(default)]
usage: Option<AnthropicUsage>,
#[serde(default)]
stop_reason: Option<String>,
#[serde(default)]
error: Option<ApiErrorBody>,
}
#[derive(Deserialize)]
struct OpenAiMessage {
#[serde(default)]
content: Option<String>,
}
#[derive(Deserialize)]
struct OpenAiChoice {
#[serde(default)]
message: Option<OpenAiMessage>,
}
#[derive(Deserialize)]
struct OpenAiUsage {
#[serde(default)]
prompt_tokens: Option<u64>,
#[serde(default)]
completion_tokens: Option<u64>,
}
#[derive(Deserialize)]
struct OpenAiResponse {
#[serde(default)]
choices: Vec<OpenAiChoice>,
#[serde(default)]
usage: Option<OpenAiUsage>,
#[serde(default)]
error: Option<ApiErrorBody>,
}
fn classify_api_error(kind: Option<&str>) -> Option<String> {
match kind? {
"authentication_error" | "invalid_api_key" | "permission_error" => Some("auth".to_string()),
"rate_limit_error" | "rate_limit_exceeded" => Some("rate_limit".to_string()),
"insufficient_quota" | "billing_error" => Some("quota".to_string()),
"not_found_error" => Some("model_not_found".to_string()),
"overloaded_error" | "api_error" | "server_error" | "service_unavailable" => {
Some("overloaded".to_string())
}
_ => None,
}
}
fn api_error(err: ApiErrorBody) -> Error {
let message = err
.message
.unwrap_or_else(|| "API returned an error".to_string());
match classify_api_error(err.kind.as_deref()) {
Some(kind) => Error::provider_classified("api-judge", message, kind),
None => Error::provider("api-judge", message),
}
}
/// Returns at most the first 500 characters (not bytes) of `raw`, for
/// embedding in error messages.
fn truncate_for_error(raw: &str) -> String {
    match raw.char_indices().nth(500) {
        Some((cut, _)) => raw[..cut].to_owned(),
        None => raw.to_owned(),
    }
}
fn parse_chat_response(vendor: ApiVendor, raw: &str) -> Result<ChatOutcome> {
match vendor {
ApiVendor::Anthropic => {
let resp: AnthropicResponse = serde_json::from_str(raw.trim()).map_err(|e| {
Error::provider(
"api-judge",
format!(
"could not parse API response: {e}; got: {}",
truncate_for_error(raw)
),
)
})?;
if let Some(err) = resp.error {
return Err(api_error(err));
}
let text = resp
.content
.iter()
.filter(|b| b.kind == "text")
.filter_map(|b| b.text.as_deref())
.collect::<String>();
if text.trim().is_empty() {
return Err(Error::provider(
"api-judge",
format!(
"judge returned no text (stop_reason: {:?})",
resp.stop_reason
),
));
}
let usage = resp.usage.map(|u| Usage {
input_tokens: u.input_tokens,
output_tokens: u.output_tokens,
cost_usd: None,
});
Ok(ChatOutcome { text, usage })
}
ApiVendor::Openai => {
let resp: OpenAiResponse = serde_json::from_str(raw.trim()).map_err(|e| {
Error::provider(
"api-judge",
format!(
"could not parse API response: {e}; got: {}",
truncate_for_error(raw)
),
)
})?;
if let Some(err) = resp.error {
return Err(api_error(err));
}
let text = resp
.choices
.into_iter()
.next()
.and_then(|c| c.message)
.and_then(|m| m.content)
.unwrap_or_default();
if text.trim().is_empty() {
return Err(Error::provider("api-judge", "judge returned no text"));
}
let usage = resp.usage.map(|u| Usage {
input_tokens: u.prompt_tokens,
output_tokens: u.completion_tokens,
cost_usd: None,
});
Ok(ChatOutcome { text, usage })
}
}
}
/// Renders `messages` as one `Role: content` line per message, joined with
/// newlines and with no trailing newline.
fn render_transcript(messages: &[Message]) -> String {
    let mut transcript = String::new();
    for (index, message) in messages.iter().enumerate() {
        if index > 0 {
            transcript.push('\n');
        }
        let label = match message.role {
            Role::User => "User",
            Role::Assistant => "Assistant",
            Role::System => "System",
        };
        transcript.push_str(&format!("{label}: {}", message.content));
    }
    transcript
}
/// Builds the prompt used when no session is resumed: the full transcript
/// followed by an instruction to write only the assistant's next reply.
fn render_transcript_for_respond(messages: &[Message]) -> String {
    let transcript = render_transcript(messages);
    let mut prompt = String::from("Conversation so far (most recent last):\n");
    prompt.push_str(&transcript);
    prompt.push_str(
        "\n\nWrite only the assistant's next reply, following your system \
         instructions. Output the reply text and nothing else.",
    );
    prompt
}
/// Returns a copy of the content of the last message whose role is `User`, or
/// `None` when there is no user message.
fn latest_user_message(messages: &[Message]) -> Option<String> {
    let last_user = messages
        .iter()
        .rfind(|message| message.role == Role::User)?;
    Some(last_user.content.clone())
}
/// Builds the role-play prompt for user simulation: the persona, the
/// transcript so far, and an instruction to write only the user's next message.
fn build_user_prompt(persona: &str, messages: &[Message]) -> String {
    let transcript = render_transcript(messages);
    let mut prompt = String::from(
        "You are role-playing the USER in a conversation with an AI assistant. \
         Stay in character:\n\n",
    );
    prompt.push_str(persona);
    prompt.push_str("\n\nConversation so far (most recent last):\n");
    prompt.push_str(&transcript);
    prompt.push_str(
        "\n\nWrite only the user's next message. Output the message text and nothing \
         else.",
    );
    prompt
}
/// Builds the evaluation prompt for `query` over `messages`.
///
/// Both kinds share the same preamble (criterion and transcript) and differ
/// only in the closing instruction: a true/false decision for boolean
/// queries, or a score on the query's scale (default 0 to 10) for numeric ones.
fn build_judge_prompt(query: &JudgeQuery<'_>, messages: &[Message]) -> String {
    let preamble = format!(
        "You are a strict, careful evaluator of an AI assistant's behavior.\n\n\
         Criterion: {}\n\n\
         Transcript:\n{}\n\n",
        query.criterion,
        render_transcript(messages),
    );
    let instruction = match query.kind {
        JudgeKind::Boolean => String::from(
            "Decide whether the criterion is satisfied. Respond with ONLY a \
             single-line JSON object and nothing else:\n\
             {\"value\": true or false, \"reason\": \"<one short sentence>\"}",
        ),
        JudgeKind::Numeric => {
            let (min, max) = query.scale.unwrap_or((0.0, 10.0));
            format!(
                "Score how well the criterion is satisfied on a scale from {min} to \
                 {max} (inclusive). Respond with ONLY a single-line JSON object and \
                 nothing else:\n\
                 {{\"value\": <number between {min} and {max}>, \"reason\": \"<one short sentence>\"}}"
            )
        }
    };
    preamble + &instruction
}
/// Returns the slice of `text` from its first `{` to its last `}` (inclusive),
/// or `None` when either brace is missing or the last `}` comes before the
/// first `{`. The slice is not validated as JSON.
fn extract_json_object(text: &str) -> Option<&str> {
    let open = text.find('{')?;
    let close = text.rfind('}')?;
    (close > open).then(|| &text[open..=close])
}
fn parse_verdict(kind: JudgeKind, text: &str) -> Result<JudgeVerdict> {
let json = extract_json_object(text).ok_or_else(|| {
Error::provider(
"oneharness:judge",
format!("judge did not return a JSON object; got: {text}"),
)
})?;
let value: serde_json::Value = serde_json::from_str(json).map_err(|e| {
Error::provider(
"oneharness:judge",
format!("judge verdict was not valid JSON: {e}; got: {json}"),
)
})?;
let reason = value
.get("reason")
.and_then(serde_json::Value::as_str)
.unwrap_or("")
.to_string();
let raw = value
.get("value")
.ok_or_else(|| Error::provider("oneharness:judge", "judge verdict has no `value` field"))?;
let verdict_value = match kind {
JudgeKind::Boolean => JudgeValue::Bool(raw.as_bool().ok_or_else(|| {
Error::provider(
"oneharness:judge",
format!("boolean judge `value` was not a bool: {raw}"),
)
})?),
JudgeKind::Numeric => JudgeValue::Number(raw.as_f64().ok_or_else(|| {
Error::provider(
"oneharness:judge",
format!("numeric judge `value` was not a number: {raw}"),
)
})?),
};
Ok(JudgeVerdict {
value: verdict_value,
reason,
usage: None,
})
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn empty_argv_is_rejected() {
assert!(CommandProvider::new(vec![]).is_err());
}
#[test]
fn request_serializes_with_op_tag() {
let req = Request::Judge {
model: "m",
kind: "numeric",
criterion: "polite",
min: Some(0.0),
max: Some(10.0),
messages: &[],
};
let json = serde_json::to_string(&req).unwrap();
assert!(json.contains("\"op\":\"judge\""));
assert!(json.contains("\"kind\":\"numeric\""));
}
#[test]
fn respond_no_session_inlines_transcript_but_not_skill() {
let messages = [
Message::user("Hi"),
Message::assistant("Hello"),
Message::user("Again?"),
];
let prompt = render_transcript_for_respond(&messages);
assert!(prompt.contains("User: Hi"));
assert!(prompt.contains("Assistant: Hello"));
assert!(prompt.contains("User: Again?"));
assert!(!prompt.contains("SKILL"));
}
#[test]
fn respond_with_session_sends_only_latest_user_message() {
let messages = [
Message::user("Hi"),
Message::assistant("Hello"),
Message::user("Again?"),
];
assert_eq!(latest_user_message(&messages).as_deref(), Some("Again?"));
}
#[test]
fn extracts_json_from_fenced_or_prose_text() {
assert_eq!(
extract_json_object("```json\n{\"value\": true}\n```"),
Some("{\"value\": true}")
);
assert_eq!(
extract_json_object("Sure! {\"value\": 8, \"reason\": \"x\"} done"),
Some("{\"value\": 8, \"reason\": \"x\"}")
);
assert_eq!(extract_json_object("no json here"), None);
}
#[test]
fn parses_boolean_and_numeric_verdicts() {
let b = parse_verdict(JudgeKind::Boolean, "{\"value\": true, \"reason\": \"ok\"}").unwrap();
assert!(matches!(b.value, JudgeValue::Bool(true)));
assert_eq!(b.reason, "ok");
let n =
parse_verdict(JudgeKind::Numeric, "{\"value\": 8.5, \"reason\": \"good\"}").unwrap();
assert!(matches!(n.value, JudgeValue::Number(v) if (v - 8.5).abs() < f64::EPSILON));
}
#[test]
fn verdict_with_wrong_value_type_errors() {
assert!(parse_verdict(JudgeKind::Boolean, "{\"value\": 3}").is_err());
assert!(parse_verdict(JudgeKind::Numeric, "{\"value\": true}").is_err());
assert!(parse_verdict(JudgeKind::Boolean, "no json").is_err());
}
#[test]
fn usage_accumulates_independently_per_field() {
let mut total = Usage::default();
total.add(&Usage {
input_tokens: Some(10),
output_tokens: None,
cost_usd: Some(0.01),
});
total.add(&Usage {
input_tokens: Some(5),
output_tokens: Some(3),
cost_usd: None,
});
assert_eq!(total.input_tokens, Some(15));
assert_eq!(total.output_tokens, Some(3));
assert!((total.cost_usd.unwrap() - 0.01).abs() < f64::EPSILON);
assert!(!total.is_empty());
}
#[test]
fn reply_text_prefers_extracted_then_falls_back_to_stdout() {
assert_eq!(
select_reply_text(Some("clean reply".into()), "raw noise"),
Some("clean reply".into())
);
assert_eq!(
select_reply_text(None, "{\"type\":\"text\",\"part\":{\"text\":\"pong\"}}"),
Some("{\"type\":\"text\",\"part\":{\"text\":\"pong\"}}".into())
);
assert_eq!(
select_reply_text(Some(" ".into()), "fallback"),
Some("fallback".into())
);
assert_eq!(select_reply_text(None, " \n"), None);
assert_eq!(select_reply_text(Some(String::new()), ""), None);
}
#[test]
fn supports_resume_covers_known_harnesses() {
assert!(supports_resume("claude-code"));
assert!(supports_resume("opencode"));
assert!(supports_resume("cursor"));
assert!(!supports_resume("codex"));
assert!(!supports_resume("goose"));
}
fn api_config(vendor: ApiVendor) -> ApiJudgeConfig {
ApiJudgeConfig {
vendor,
api_key_env: None,
base_url: None,
timeout_secs: 60,
curl_bin: "curl".to_string(),
strict_json: true,
}
}
#[test]
fn api_judge_resolves_vendor_defaults() {
let anthropic = ApiJudgeProvider::new(&api_config(ApiVendor::Anthropic));
assert_eq!(anthropic.api_key_env, "ANTHROPIC_API_KEY");
assert_eq!(anthropic.endpoint, "https://api.anthropic.com/v1/messages");
let openai = ApiJudgeProvider::new(&api_config(ApiVendor::Openai));
assert_eq!(openai.api_key_env, "OPENAI_API_KEY");
assert_eq!(
openai.endpoint,
"https://api.openai.com/v1/chat/completions"
);
}
#[test]
fn api_judge_honors_overrides() {
let provider = ApiJudgeProvider::new(&ApiJudgeConfig {
vendor: ApiVendor::Openai,
api_key_env: Some("MY_KEY".to_string()),
base_url: Some("https://proxy.example/v1/chat/completions".to_string()),
timeout_secs: 5,
curl_bin: "curl".to_string(),
strict_json: true,
});
assert_eq!(provider.api_key_env, "MY_KEY");
assert_eq!(
provider.endpoint,
"https://proxy.example/v1/chat/completions"
);
}
#[test]
fn build_chat_body_shapes_per_vendor() {
let anthropic = build_chat_body(ApiVendor::Anthropic, "claude-x", "sys", "hi", None);
assert_eq!(anthropic["model"], "claude-x");
assert_eq!(anthropic["system"], "sys");
assert_eq!(anthropic["messages"][0]["role"], "user");
assert_eq!(anthropic["messages"].as_array().unwrap().len(), 1);
assert!(anthropic.get("output_config").is_none());
let openai = build_chat_body(ApiVendor::Openai, "gpt-x", "sys", "hi", None);
assert_eq!(openai["messages"][0]["role"], "system");
assert_eq!(openai["messages"][1]["role"], "user");
assert!(openai.get("system").is_none());
assert!(openai.get("response_format").is_none());
}
#[test]
fn build_chat_body_attaches_strict_schema_per_vendor() {
let schema = verdict_schema(JudgeKind::Boolean);
let anthropic = build_chat_body(
ApiVendor::Anthropic,
"claude-x",
"sys",
"hi",
Some(schema.clone()),
);
assert_eq!(anthropic["output_config"]["format"]["type"], "json_schema");
assert_eq!(
anthropic["output_config"]["format"]["schema"]["properties"]["value"]["type"],
"boolean"
);
let numeric = verdict_schema(JudgeKind::Numeric);
let openai = build_chat_body(ApiVendor::Openai, "gpt-x", "sys", "hi", Some(numeric));
assert_eq!(openai["response_format"]["type"], "json_schema");
assert_eq!(openai["response_format"]["json_schema"]["strict"], true);
assert_eq!(
openai["response_format"]["json_schema"]["schema"]["properties"]["value"]["type"],
"number"
);
}
/// The numeric verdict schema must set `additionalProperties` to `false` and list exactly
/// `value` then `reason` under `required`.
#[test]
fn verdict_schema_requires_value_and_reason_with_no_extras() {
    let schema = verdict_schema(JudgeKind::Numeric);
    assert_eq!(schema["additionalProperties"], false);

    let mut required_names: Vec<&str> = Vec::new();
    for entry in schema["required"].as_array().unwrap() {
        required_names.push(entry.as_str().unwrap());
    }
    assert_eq!(required_names, ["value", "reason"]);
}
/// A successful Anthropic payload yields the text block verbatim, the reported token counts,
/// and no cost figure.
#[test]
fn parses_anthropic_success_with_usage() {
    // The raw string spans two lines; its second line is part of the literal and stays flush left.
    let payload = r#"{"content":[{"type":"text","text":"{\"value\": true}"}],
"stop_reason":"end_turn","usage":{"input_tokens":12,"output_tokens":3}}"#;

    let parsed = parse_chat_response(ApiVendor::Anthropic, payload).unwrap();
    assert_eq!(parsed.text, r#"{"value": true}"#);

    let tokens = parsed.usage.unwrap();
    assert_eq!(tokens.input_tokens, Some(12));
    assert_eq!(tokens.output_tokens, Some(3));
    assert!(tokens.cost_usd.is_none());
}
/// A successful OpenAI payload yields the message content verbatim, with `prompt_tokens` and
/// `completion_tokens` surfaced as input and output token counts.
#[test]
fn parses_openai_success_with_usage() {
    // The raw string spans two lines; its second line is part of the literal and stays flush left.
    let payload = r#"{"choices":[{"message":{"content":"{\"value\": 8}"}}],
"usage":{"prompt_tokens":20,"completion_tokens":4}}"#;

    let parsed = parse_chat_response(ApiVendor::Openai, payload).unwrap();
    assert_eq!(parsed.text, r#"{"value": 8}"#);

    let tokens = parsed.usage.unwrap();
    assert_eq!(tokens.input_tokens, Some(20));
    assert_eq!(tokens.output_tokens, Some(4));
}
/// Error payloads become `Error::Provider` values whose `kind` reflects the vendor error type.
#[test]
fn parses_and_classifies_api_errors() {
    // Anthropic authentication failure -> "auth".
    let auth_failure = parse_chat_response(
        ApiVendor::Anthropic,
        r#"{"error":{"type":"authentication_error","message":"bad key"}}"#,
    )
    .unwrap_err();
    let is_auth = matches!(auth_failure, Error::Provider { kind: Some(k), .. } if k == "auth");
    assert!(is_auth);

    // OpenAI rate limiting -> "rate_limit".
    let rate_failure = parse_chat_response(
        ApiVendor::Openai,
        r#"{"error":{"type":"rate_limit_exceeded","message":"slow down"}}"#,
    )
    .unwrap_err();
    let is_rate_limit =
        matches!(rate_failure, Error::Provider { kind: Some(k), .. } if k == "rate_limit");
    assert!(is_rate_limit);
}
/// An Anthropic response with an empty `content` array must be rejected rather than parsed.
#[test]
fn empty_reply_is_an_error() {
    let outcome = parse_chat_response(
        ApiVendor::Anthropic,
        r#"{"content":[],"stop_reason":"refusal"}"#,
    );
    assert!(outcome.is_err());
}
/// `classify_api_error` maps recognised vendor error types to a kind and returns `None` for
/// unrecognised or absent types.
#[test]
fn classify_api_error_maps_known_kinds() {
    // (vendor error type, expected kind)
    let recognised = [
        ("invalid_api_key", "auth"),
        ("insufficient_quota", "quota"),
        ("not_found_error", "model_not_found"),
        ("overloaded_error", "overloaded"),
    ];
    for (vendor_type, expected_kind) in recognised.iter().copied() {
        assert_eq!(
            classify_api_error(Some(vendor_type)).as_deref(),
            Some(expected_kind)
        );
    }

    let unclassified = [Some("something_else"), None];
    for vendor_type in unclassified.iter().copied() {
        assert_eq!(classify_api_error(vendor_type), None);
    }
}
/// Overload and rate-limit errors are retryable; authentication errors are not.
#[test]
fn retryable_covers_transient_errors_only() {
    // Every case parses an Anthropic error payload and inspects the resulting error.
    let anthropic_error =
        |payload: &str| parse_chat_response(ApiVendor::Anthropic, payload).unwrap_err();

    let overloaded = anthropic_error(r#"{"error":{"type":"overloaded_error","message":"busy"}}"#);
    assert!(is_retryable(&overloaded), "overload should retry");

    let rate_limited =
        anthropic_error(r#"{"error":{"type":"rate_limit_error","message":"slow"}}"#);
    assert!(is_retryable(&rate_limited), "rate limit should retry");

    let unauthorized =
        anthropic_error(r#"{"error":{"type":"authentication_error","message":"bad key"}}"#);
    assert!(!is_retryable(&unauthorized), "auth must not retry");
}
/// Double quotes and backslashes each gain a preceding backslash.
#[test]
fn curl_escape_handles_quotes_and_backslashes() {
    let raw = r#"a"b\c"#;
    let escaped = curl_escape(raw);
    assert_eq!(escaped, r#"a\"b\\c"#);
}
struct StubResponder;
impl Provider for StubResponder {
fn respond(
&self,
_platform: &str,
_model: &str,
_skill: &SkillRef<'_>,
_messages: &[Message],
_session: Option<&str>,
) -> Result<AssistantTurn> {
Ok(AssistantTurn {
message: "stub reply".to_string(),
..Default::default()
})
}
fn simulate_user(
&self,
_model: &str,
_persona: &str,
_messages: &[Message],
) -> Result<UserTurn> {
unreachable!("split provider routes user simulation to the judge")
}
fn judge(
&self,
_model: &str,
_query: &JudgeQuery<'_>,
_messages: &[Message],
) -> Result<JudgeVerdict> {
unreachable!("split provider routes judging to the judge")
}
fn supports_resume(&self, platform: &str) -> bool {
platform == "claude-code"
}
}
/// A `SplitProvider` must answer `supports_resume` and `respond` the same way its wrapped
/// responder (here `StubResponder`) does.
#[test]
fn split_provider_delegates_respond_and_resume() {
    let responder = Box::new(StubResponder);
    let judge = ApiJudgeProvider::new(&api_config(ApiVendor::Anthropic));
    let split = SplitProvider::new(responder, judge);

    // Resume support mirrors the stub: only "claude-code" is accepted.
    assert!(split.supports_resume("claude-code"));
    assert!(!split.supports_resume("codex"));

    let skill = SkillRef {
        name: "s",
        dir: "/tmp/s",
        instructions: "do things",
    };
    let reply = split.respond("claude-code", "m", &skill, &[], None).unwrap();
    assert_eq!(reply.message, "stub reply");
}
/// Calling `respond` on an `ApiJudgeProvider` must return an error.
#[test]
fn api_judge_does_not_run_skills() {
    let judge = ApiJudgeProvider::new(&api_config(ApiVendor::Anthropic));
    let skill = SkillRef {
        name: "s",
        dir: "/tmp/s",
        instructions: "x",
    };

    let outcome = judge.respond("p", "m", &skill, &[], None);
    assert!(outcome.is_err());
}
}