1use std::io::Write as _;
14use std::process::{Command, Stdio};
15
16use serde::{Deserialize, Serialize};
17
18use crate::config::{ApiJudgeConfig, ApiVendor, OneharnessConfig};
19use crate::conversation::{Message, Role};
20use crate::error::{Error, Result};
21use crate::eval::JudgeValue;
22
/// Borrowed view of the skill under test.
///
/// All three fields are string slices owned by the caller; nothing is copied.
pub struct SkillRef<'a> {
    /// Name of the skill.
    pub name: &'a str,
    /// Directory of the skill, as a string (sent to command providers as `path`).
    pub dir: &'a str,
    /// Instruction text of the skill.
    pub instructions: &'a str,
}
29
/// The type of value a judge is asked to return.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum JudgeKind {
    /// A true/false verdict.
    Boolean,
    /// A numeric score.
    Numeric,
}

impl JudgeKind {
    /// Lower-case name of the kind: `"boolean"` or `"numeric"`.
    fn as_str(self) -> &'static str {
        match self {
            Self::Numeric => "numeric",
            Self::Boolean => "boolean",
        }
    }
}
45
46pub struct JudgeQuery<'a> {
48 pub kind: JudgeKind,
49 pub criterion: &'a str,
50 pub scale: Option<(f64, f64)>,
51}
52
53#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize, schemars::JsonSchema)]
60pub struct Usage {
61 #[serde(default, skip_serializing_if = "Option::is_none")]
62 pub input_tokens: Option<u64>,
63 #[serde(default, skip_serializing_if = "Option::is_none")]
64 pub output_tokens: Option<u64>,
65 #[serde(default, skip_serializing_if = "Option::is_none")]
66 pub cost_usd: Option<f64>,
67}
68
69impl Usage {
70 #[must_use]
72 pub fn is_empty(&self) -> bool {
73 self.input_tokens.is_none() && self.output_tokens.is_none() && self.cost_usd.is_none()
74 }
75
76 pub fn add(&mut self, other: &Usage) {
79 if let Some(v) = other.input_tokens {
80 self.input_tokens = Some(self.input_tokens.unwrap_or(0) + v);
81 }
82 if let Some(v) = other.output_tokens {
83 self.output_tokens = Some(self.output_tokens.unwrap_or(0) + v);
84 }
85 if let Some(v) = other.cost_usd {
86 self.cost_usd = Some(self.cost_usd.unwrap_or(0.0) + v);
87 }
88 }
89}
90
91#[derive(Debug, Clone, Default)]
93pub struct AssistantTurn {
94 pub message: String,
95 pub done: bool,
97 pub usage: Option<Usage>,
99 pub session_id: Option<String>,
103}
104
105#[derive(Debug, Clone, Default)]
107pub struct UserTurn {
108 pub message: String,
109 pub stop: bool,
111 pub usage: Option<Usage>,
112}
113
114#[derive(Debug, Clone)]
116pub struct JudgeVerdict {
117 pub value: JudgeValue,
118 pub reason: String,
119 pub usage: Option<Usage>,
120}
121
122pub trait Provider {
124 fn respond(
133 &self,
134 platform: &str,
135 model: &str,
136 skill: &SkillRef<'_>,
137 messages: &[Message],
138 session: Option<&str>,
139 ) -> Result<AssistantTurn>;
140
141 fn simulate_user(&self, model: &str, persona: &str, messages: &[Message]) -> Result<UserTurn>;
146
147 fn judge(
152 &self,
153 model: &str,
154 query: &JudgeQuery<'_>,
155 messages: &[Message],
156 ) -> Result<JudgeVerdict>;
157
158 fn supports_resume(&self, _platform: &str) -> bool {
163 false
164 }
165}
166
167#[derive(Serialize)]
172struct SkillPayload<'a> {
173 name: &'a str,
174 path: &'a str,
175 instructions: &'a str,
176}
177
178#[derive(Serialize)]
179#[serde(tag = "op", rename_all = "lowercase")]
180enum Request<'a> {
181 Respond {
182 platform: &'a str,
183 model: &'a str,
184 skill: SkillPayload<'a>,
185 messages: &'a [Message],
186 #[serde(skip_serializing_if = "Option::is_none")]
187 session: Option<&'a str>,
188 },
189 User {
190 model: &'a str,
191 persona: &'a str,
192 messages: &'a [Message],
193 },
194 Judge {
195 model: &'a str,
196 kind: &'a str,
197 criterion: &'a str,
198 #[serde(skip_serializing_if = "Option::is_none")]
199 min: Option<f64>,
200 #[serde(skip_serializing_if = "Option::is_none")]
201 max: Option<f64>,
202 messages: &'a [Message],
203 },
204}
205
206#[derive(Deserialize)]
207struct RespondPayload {
208 message: String,
209 #[serde(default)]
210 done: bool,
211 #[serde(default)]
212 usage: Option<Usage>,
213 #[serde(default)]
214 session_id: Option<String>,
215}
216
217#[derive(Deserialize)]
218struct UserPayload {
219 message: String,
220 #[serde(default)]
221 stop: bool,
222 #[serde(default)]
223 usage: Option<Usage>,
224}
225
226#[derive(Deserialize)]
227struct JudgePayload {
228 value: JudgeValue,
229 #[serde(default)]
230 reason: String,
231 #[serde(default)]
232 usage: Option<Usage>,
233}
234
/// Provider backed by an external command.
///
/// Each call spawns the command, writes one JSON request to its stdin, and
/// reads one JSON response from its stdout.
pub struct CommandProvider {
    /// Program (`argv[0]`) followed by its arguments; never empty once built
    /// through `CommandProvider::new`.
    argv: Vec<String>,
}
243
244impl CommandProvider {
245 pub fn new(argv: Vec<String>) -> Result<Self> {
251 if argv.is_empty() {
252 return Err(Error::Invalid("provider command is empty".into()));
253 }
254 Ok(Self { argv })
255 }
256
257 fn call<T: for<'de> Deserialize<'de>>(&self, request: &Request<'_>, op: &str) -> Result<T> {
259 let payload = serde_json::to_vec(request).map_err(|e| {
260 Error::provider(op.to_string(), format!("could not encode request: {e}"))
261 })?;
262
263 let mut child = Command::new(&self.argv[0])
264 .args(&self.argv[1..])
265 .stdin(Stdio::piped())
266 .stdout(Stdio::piped())
267 .stderr(Stdio::piped())
268 .spawn()
269 .map_err(|e| {
270 Error::provider(
271 op.to_string(),
272 format!(
273 "could not run provider `{}`: {e}. Is it installed and on PATH?",
274 self.argv[0]
275 ),
276 )
277 })?;
278
279 {
282 let stdin = child
283 .stdin
284 .as_mut()
285 .ok_or_else(|| Error::provider(op.to_string(), "could not open provider stdin"))?;
286 stdin
287 .write_all(&payload)
288 .and_then(|()| stdin.write_all(b"\n"))
289 .map_err(|e| {
290 Error::provider(op.to_string(), format!("could not write request: {e}"))
291 })?;
292 }
293
294 let output = child.wait_with_output().map_err(|e| {
295 Error::provider(op.to_string(), format!("provider did not complete: {e}"))
296 })?;
297
298 if !output.status.success() {
299 let stderr = String::from_utf8_lossy(&output.stderr);
300 return Err(Error::provider(
301 op.to_string(),
302 format!("provider exited with {}: {}", output.status, stderr.trim()),
303 ));
304 }
305
306 let stdout = String::from_utf8_lossy(&output.stdout);
307 let line = stdout.trim();
308 if line.is_empty() {
309 return Err(Error::provider(
310 op.to_string(),
311 "provider produced no output (expected one JSON response object)",
312 ));
313 }
314 serde_json::from_str(line).map_err(|e| {
315 Error::provider(
316 op.to_string(),
317 format!("provider response was not valid JSON for `{op}`: {e}; got: {line}"),
318 )
319 })
320 }
321}
322
323impl Provider for CommandProvider {
324 fn respond(
325 &self,
326 platform: &str,
327 model: &str,
328 skill: &SkillRef<'_>,
329 messages: &[Message],
330 session: Option<&str>,
331 ) -> Result<AssistantTurn> {
332 let request = Request::Respond {
333 platform,
334 model,
335 skill: SkillPayload {
336 name: skill.name,
337 path: skill.dir,
338 instructions: skill.instructions,
339 },
340 messages,
341 session,
342 };
343 let payload: RespondPayload = self.call(&request, "respond")?;
344 Ok(AssistantTurn {
345 message: payload.message,
346 done: payload.done,
347 usage: payload.usage,
348 session_id: payload.session_id,
349 })
350 }
351
352 fn simulate_user(&self, model: &str, persona: &str, messages: &[Message]) -> Result<UserTurn> {
353 let request = Request::User {
354 model,
355 persona,
356 messages,
357 };
358 let payload: UserPayload = self.call(&request, "user")?;
359 Ok(UserTurn {
360 message: payload.message,
361 stop: payload.stop,
362 usage: payload.usage,
363 })
364 }
365
366 fn judge(
367 &self,
368 model: &str,
369 query: &JudgeQuery<'_>,
370 messages: &[Message],
371 ) -> Result<JudgeVerdict> {
372 let (min, max) = match query.scale {
373 Some((lo, hi)) => (Some(lo), Some(hi)),
374 None => (None, None),
375 };
376 let request = Request::Judge {
377 model,
378 kind: query.kind.as_str(),
379 criterion: query.criterion,
380 min,
381 max,
382 messages,
383 };
384 let payload: JudgePayload = self.call(&request, "judge")?;
385 Ok(JudgeVerdict {
386 value: payload.value,
387 reason: payload.reason,
388 usage: payload.usage,
389 })
390 }
391}
392
/// Provider that shells out to the `oneharness` binary.
pub struct OneharnessProvider {
    /// Program to execute.
    bin: String,
    /// Harness name used for user simulation and judging.
    judge_harness: String,
    /// Value passed to the binary's `--timeout` option.
    timeout_secs: u64,
}
425
426#[derive(Deserialize)]
428struct OhEnvelope {
429 results: Vec<OhResult>,
430}
431
432#[derive(Deserialize)]
433struct OhResult {
434 status: String,
435 #[serde(default)]
436 text: Option<String>,
437 #[serde(default)]
444 stdout: String,
445 #[serde(default)]
446 stderr: String,
447 #[serde(default)]
448 error: Option<String>,
449 #[serde(default)]
450 session_id: Option<String>,
451 #[serde(default)]
452 usage: Option<Usage>,
453 #[serde(default)]
454 failure_kind: Option<String>,
455}
456
/// Inputs for a single `oneharness run` invocation.
struct RunArgs<'a> {
    /// Value for `--harness`.
    harness: &'a str,
    /// Value for `--model`; the option is left out when this is empty.
    model: &'a str,
    /// Prompt text, delivered on the child's stdin.
    prompt: &'a str,
    /// Value for `--system`, when present.
    system: Option<&'a str>,
    /// Value for `--resume`, when present.
    resume: Option<&'a str>,
}
469
470struct RunOutcome {
472 text: String,
473 session_id: Option<String>,
474 usage: Option<Usage>,
475}
476
/// Chooses the reply text of a harness run.
///
/// Returns `text` when it holds something other than whitespace; otherwise
/// returns a copy of `stdout` when that is not blank; otherwise `None`.
/// Neither candidate is trimmed before being returned.
fn select_reply_text(text: Option<String>, stdout: &str) -> Option<String> {
    match text {
        Some(extracted) if !extracted.trim().is_empty() => Some(extracted),
        _ if stdout.trim().is_empty() => None,
        _ => Some(stdout.to_string()),
    }
}
488
489impl OneharnessProvider {
490 #[must_use]
492 pub fn new(config: &OneharnessConfig) -> Self {
493 Self {
494 bin: config.bin.clone(),
495 judge_harness: config.judge_harness.clone(),
496 timeout_secs: config.timeout_secs,
497 }
498 }
499
500 fn run(&self, args: &RunArgs<'_>) -> Result<RunOutcome> {
504 let timeout = self.timeout_secs.to_string();
505 let mut cmd = Command::new(&self.bin);
506 cmd.args([
513 "run",
514 "--harness",
515 args.harness,
516 "--compact",
517 "--timeout",
518 &timeout,
519 "--prompt-file",
520 "-",
521 ]);
522 if !args.model.is_empty() {
528 cmd.args(["--model", args.model]);
529 }
530 if let Some(system) = args.system {
531 cmd.args(["--system", system]);
532 }
533 if let Some(resume) = args.resume {
534 cmd.args(["--resume", resume]);
535 }
536
537 let mut child = cmd
538 .stdin(Stdio::piped())
539 .stdout(Stdio::piped())
540 .stderr(Stdio::piped())
541 .spawn()
542 .map_err(|e| {
543 Error::provider(
544 "oneharness",
545 format!(
546 "could not run `{}`: {e}. Is oneharness installed and on PATH?",
547 self.bin
548 ),
549 )
550 })?;
551
552 child
553 .stdin
554 .as_mut()
555 .ok_or_else(|| Error::provider("oneharness", "could not open oneharness stdin"))?
556 .write_all(args.prompt.as_bytes())
557 .map_err(|e| Error::provider("oneharness", format!("could not write prompt: {e}")))?;
558
559 let output = child.wait_with_output().map_err(|e| {
560 Error::provider("oneharness", format!("oneharness did not complete: {e}"))
561 })?;
562
563 let stdout = String::from_utf8_lossy(&output.stdout);
564 let envelope: OhEnvelope = serde_json::from_str(stdout.trim()).map_err(|e| {
565 Error::provider(
566 "oneharness",
567 format!(
568 "could not parse oneharness output: {e}; stderr: {}",
569 String::from_utf8_lossy(&output.stderr).trim()
570 ),
571 )
572 })?;
573
574 let result = envelope
575 .results
576 .into_iter()
577 .next()
578 .ok_or_else(|| Error::provider("oneharness", "oneharness returned no results"))?;
579
580 if result.status != "ok" {
581 let detail = result
582 .error
583 .filter(|e| !e.is_empty())
584 .or_else(|| Some(result.stderr.clone()).filter(|s| !s.is_empty()))
585 .unwrap_or_else(|| format!("status `{}`", result.status));
586 let context = format!("oneharness:{}", args.harness);
587 let message = format!("harness run failed: {detail}");
588 return Err(match result.failure_kind {
589 Some(kind) if !kind.is_empty() => {
590 Error::provider_classified(context, message, kind)
591 }
592 _ => Error::provider(context, message),
593 });
594 }
595
596 let text = select_reply_text(result.text, &result.stdout).ok_or_else(|| {
601 Error::provider(
602 format!("oneharness:{}", args.harness),
603 "harness produced neither extractable text nor stdout",
604 )
605 })?;
606 Ok(RunOutcome {
607 text,
608 session_id: result.session_id,
609 usage: result.usage,
610 })
611 }
612}
613
614impl Provider for OneharnessProvider {
615 fn respond(
616 &self,
617 platform: &str,
618 model: &str,
619 skill: &SkillRef<'_>,
620 messages: &[Message],
621 session: Option<&str>,
622 ) -> Result<AssistantTurn> {
623 let prompt = if session.is_some() {
628 latest_user_message(messages).unwrap_or_default()
629 } else {
630 render_transcript_for_respond(messages)
631 };
632 let outcome = self.run(&RunArgs {
633 harness: platform,
634 model,
635 prompt: &prompt,
636 system: Some(skill.instructions),
637 resume: session,
638 })?;
639 Ok(AssistantTurn {
640 message: outcome.text.trim().to_string(),
641 done: false,
642 usage: outcome.usage,
643 session_id: outcome.session_id,
644 })
645 }
646
647 fn simulate_user(&self, model: &str, persona: &str, messages: &[Message]) -> Result<UserTurn> {
648 let prompt = build_user_prompt(persona, messages);
649 let outcome = self.run(&RunArgs {
650 harness: &self.judge_harness,
651 model,
652 prompt: &prompt,
653 system: None,
654 resume: None,
655 })?;
656 Ok(UserTurn {
657 message: outcome.text.trim().to_string(),
658 stop: false,
659 usage: outcome.usage,
660 })
661 }
662
663 fn judge(
664 &self,
665 model: &str,
666 query: &JudgeQuery<'_>,
667 messages: &[Message],
668 ) -> Result<JudgeVerdict> {
669 let prompt = build_judge_prompt(query, messages);
670 let outcome = self.run(&RunArgs {
671 harness: &self.judge_harness,
672 model,
673 prompt: &prompt,
674 system: None,
675 resume: None,
676 })?;
677 let mut verdict = parse_verdict(query.kind, &outcome.text)?;
678 verdict.usage = outcome.usage;
679 Ok(verdict)
680 }
681
682 fn supports_resume(&self, platform: &str) -> bool {
683 supports_resume(platform)
684 }
685}
686
/// Reports whether `harness` is one of the harnesses this module resumes
/// sessions on: `claude-code`, `opencode`, or `cursor`. The comparison is
/// exact and case-sensitive.
#[must_use]
pub fn supports_resume(harness: &str) -> bool {
    const RESUMABLE: [&str; 3] = ["claude-code", "opencode", "cursor"];
    RESUMABLE.contains(&harness)
}
695
696pub struct ApiJudgeProvider {
718 vendor: ApiVendor,
719 api_key_env: String,
720 endpoint: String,
721 timeout_secs: u64,
722 curl_bin: String,
723 strict_json: bool,
724}
725
/// Number of times a retryable API failure is retried after the first attempt.
const MAX_RETRIES: u32 = 2;
729
730#[derive(Debug)]
732struct ChatOutcome {
733 text: String,
734 usage: Option<Usage>,
735}
736
/// System prompt sent with every request the API judge makes.
const JUDGE_SYSTEM: &str =
    "Follow the user's instructions exactly and respond with only what they ask for.";
741
742impl ApiJudgeProvider {
743 #[must_use]
746 pub fn new(config: &ApiJudgeConfig) -> Self {
747 let api_key_env = config
748 .api_key_env
749 .clone()
750 .unwrap_or_else(|| match config.vendor {
751 ApiVendor::Anthropic => "ANTHROPIC_API_KEY".to_string(),
752 ApiVendor::Openai => "OPENAI_API_KEY".to_string(),
753 });
754 let endpoint = config
755 .base_url
756 .clone()
757 .unwrap_or_else(|| match config.vendor {
758 ApiVendor::Anthropic => "https://api.anthropic.com/v1/messages".to_string(),
759 ApiVendor::Openai => "https://api.openai.com/v1/chat/completions".to_string(),
760 });
761 Self {
762 vendor: config.vendor,
763 api_key_env,
764 endpoint,
765 timeout_secs: config.timeout_secs,
766 curl_bin: config.curl_bin.clone(),
767 strict_json: config.strict_json,
768 }
769 }
770
771 fn chat(
776 &self,
777 model: &str,
778 system: &str,
779 user: &str,
780 schema: Option<serde_json::Value>,
781 ) -> Result<ChatOutcome> {
782 let key = std::env::var(&self.api_key_env).map_err(|_| {
783 Error::provider_classified(
784 "api-judge",
785 format!("API key env var `{}` is not set", self.api_key_env),
786 "auth",
787 )
788 })?;
789 let body = build_chat_body(self.vendor, model, system, user, schema);
790 let payload = serde_json::to_vec(&body)
791 .map_err(|e| Error::provider("api-judge", format!("could not encode request: {e}")))?;
792
793 let mut attempt = 0;
794 loop {
795 let result = self
796 .run_curl(&key, &payload)
797 .and_then(|raw| parse_chat_response(self.vendor, &raw));
798 match result {
799 Ok(outcome) => return Ok(outcome),
800 Err(err) if attempt < MAX_RETRIES && is_retryable(&err) => {
801 attempt += 1;
802 std::thread::sleep(std::time::Duration::from_millis(500 * (1 << attempt)));
803 }
804 Err(err) => return Err(err),
805 }
806 }
807 }
808
809 fn headers(&self, key: &str) -> Vec<(String, String)> {
811 match self.vendor {
812 ApiVendor::Anthropic => vec![
813 ("x-api-key".to_string(), key.to_string()),
814 ("anthropic-version".to_string(), "2023-06-01".to_string()),
815 ("content-type".to_string(), "application/json".to_string()),
816 ],
817 ApiVendor::Openai => vec![
818 ("authorization".to_string(), format!("Bearer {key}")),
819 ("content-type".to_string(), "application/json".to_string()),
820 ],
821 }
822 }
823
824 fn run_curl(&self, key: &str, body: &[u8]) -> Result<String> {
827 let path = std::env::temp_dir().join(format!(
828 "skilltest-judge-{}-{}.cfg",
829 std::process::id(),
830 curl_config_nonce()
831 ));
832 write_curl_config(&path, &self.endpoint, &self.headers(key), self.timeout_secs)?;
833 let outcome = self.exec_curl(&path, body);
834 let _ = std::fs::remove_file(&path);
836 outcome
837 }
838
839 fn exec_curl(&self, config_path: &std::path::Path, body: &[u8]) -> Result<String> {
840 let mut child = Command::new(&self.curl_bin)
841 .arg("--config")
842 .arg(config_path)
843 .arg("--data-binary")
844 .arg("@-")
845 .stdin(Stdio::piped())
846 .stdout(Stdio::piped())
847 .stderr(Stdio::piped())
848 .spawn()
849 .map_err(|e| {
850 Error::provider(
851 "api-judge",
852 format!(
853 "could not run `{}`: {e}. Is curl installed and on PATH?",
854 self.curl_bin
855 ),
856 )
857 })?;
858
859 child
860 .stdin
861 .as_mut()
862 .ok_or_else(|| Error::provider("api-judge", "could not open curl stdin"))?
863 .write_all(body)
864 .map_err(|e| Error::provider("api-judge", format!("could not write request: {e}")))?;
865
866 let output = child
867 .wait_with_output()
868 .map_err(|e| Error::provider("api-judge", format!("curl did not complete: {e}")))?;
869
870 if !output.status.success() {
871 let stderr = String::from_utf8_lossy(&output.stderr);
872 return Err(Error::provider(
873 "api-judge",
874 format!("curl failed ({}): {}", output.status, stderr.trim()),
875 ));
876 }
877 Ok(String::from_utf8_lossy(&output.stdout).into_owned())
878 }
879}
880
881impl Provider for ApiJudgeProvider {
882 fn respond(
883 &self,
884 _platform: &str,
885 _model: &str,
886 _skill: &SkillRef<'_>,
887 _messages: &[Message],
888 _session: Option<&str>,
889 ) -> Result<AssistantTurn> {
890 Err(Error::provider(
891 "api-judge",
892 "the API judge does not run skills; use it as the judge in a SplitProvider",
893 ))
894 }
895
896 fn simulate_user(&self, model: &str, persona: &str, messages: &[Message]) -> Result<UserTurn> {
897 let prompt = build_user_prompt(persona, messages);
898 let outcome = self.chat(model, JUDGE_SYSTEM, &prompt, None)?;
900 Ok(UserTurn {
901 message: outcome.text.trim().to_string(),
902 stop: false,
903 usage: outcome.usage,
904 })
905 }
906
907 fn judge(
908 &self,
909 model: &str,
910 query: &JudgeQuery<'_>,
911 messages: &[Message],
912 ) -> Result<JudgeVerdict> {
913 let prompt = build_judge_prompt(query, messages);
914 let schema = self.strict_json.then(|| verdict_schema(query.kind));
917 let outcome = self.chat(model, JUDGE_SYSTEM, &prompt, schema)?;
918 let mut verdict = parse_verdict(query.kind, &outcome.text)?;
919 verdict.usage = outcome.usage;
920 Ok(verdict)
921 }
922}
923
924pub struct SplitProvider {
930 responder: Box<dyn Provider>,
931 judge: ApiJudgeProvider,
932}
933
934impl SplitProvider {
935 #[must_use]
937 pub fn new(responder: Box<dyn Provider>, judge: ApiJudgeProvider) -> Self {
938 Self { responder, judge }
939 }
940}
941
942impl Provider for SplitProvider {
943 fn respond(
944 &self,
945 platform: &str,
946 model: &str,
947 skill: &SkillRef<'_>,
948 messages: &[Message],
949 session: Option<&str>,
950 ) -> Result<AssistantTurn> {
951 self.responder
952 .respond(platform, model, skill, messages, session)
953 }
954
955 fn simulate_user(&self, model: &str, persona: &str, messages: &[Message]) -> Result<UserTurn> {
956 self.judge.simulate_user(model, persona, messages)
957 }
958
959 fn judge(
960 &self,
961 model: &str,
962 query: &JudgeQuery<'_>,
963 messages: &[Message],
964 ) -> Result<JudgeVerdict> {
965 self.judge.judge(model, query, messages)
966 }
967
968 fn supports_resume(&self, platform: &str) -> bool {
969 self.responder.supports_resume(platform)
970 }
971}
972
/// Returns the next value of a process-wide counter that starts at zero.
///
/// Used to tell apart the curl config files written by one process.
fn curl_config_nonce() -> u64 {
    use std::sync::atomic::{AtomicU64, Ordering};

    static NEXT_NONCE: AtomicU64 = AtomicU64::new(0);

    // `fetch_add` hands back the value from before the increment.
    NEXT_NONCE.fetch_add(1, Ordering::Relaxed)
}
980
/// Escapes `value` for use inside a double-quoted curl config parameter.
///
/// Backslashes and double quotes are backslash-escaped. Line feeds, carriage
/// returns, and tabs are written as the `\n`, `\r`, and `\t` escape sequences
/// curl understands inside quotes, so a value containing a line break cannot
/// end the config line early and introduce extra options.
fn curl_escape(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '\\' => escaped.push_str("\\\\"),
            '"' => escaped.push_str("\\\""),
            '\n' => escaped.push_str("\\n"),
            '\r' => escaped.push_str("\\r"),
            '\t' => escaped.push_str("\\t"),
            other => escaped.push(other),
        }
    }
    escaped
}
985
986fn write_curl_config(
990 path: &std::path::Path,
991 url: &str,
992 headers: &[(String, String)],
993 timeout_secs: u64,
994) -> Result<()> {
995 let mut config = String::new();
996 config.push_str(&format!("url = \"{}\"\n", curl_escape(url)));
997 config.push_str("request = \"POST\"\n");
998 for (name, value) in headers {
999 config.push_str(&format!("header = \"{}: {}\"\n", name, curl_escape(value)));
1000 }
1001 config.push_str(&format!("max-time = {timeout_secs}\n"));
1002 config.push_str("silent\nshow-error\n");
1003
1004 let mut options = std::fs::OpenOptions::new();
1005 options.write(true).create(true).truncate(true);
1006 #[cfg(unix)]
1007 {
1008 use std::os::unix::fs::OpenOptionsExt as _;
1009 options.mode(0o600);
1010 }
1011 let mut file = options
1012 .open(path)
1013 .map_err(|e| Error::provider("api-judge", format!("could not write curl config: {e}")))?;
1014 file.write_all(config.as_bytes())
1015 .map_err(|e| Error::provider("api-judge", format!("could not write curl config: {e}")))?;
1016 Ok(())
1017}
1018
1019fn verdict_schema(kind: JudgeKind) -> serde_json::Value {
1024 let value_type = match kind {
1025 JudgeKind::Boolean => "boolean",
1026 JudgeKind::Numeric => "number",
1027 };
1028 serde_json::json!({
1029 "type": "object",
1030 "properties": {
1031 "value": { "type": value_type },
1032 "reason": { "type": "string" },
1033 },
1034 "required": ["value", "reason"],
1035 "additionalProperties": false,
1036 })
1037}
1038
1039fn build_chat_body(
1044 vendor: ApiVendor,
1045 model: &str,
1046 system: &str,
1047 user: &str,
1048 schema: Option<serde_json::Value>,
1049) -> serde_json::Value {
1050 match vendor {
1051 ApiVendor::Anthropic => {
1052 let mut body = serde_json::json!({
1053 "model": model,
1054 "max_tokens": 1024,
1055 "system": system,
1056 "messages": [{ "role": "user", "content": user }],
1057 });
1058 if let Some(schema) = schema {
1059 body["output_config"] =
1060 serde_json::json!({ "format": { "type": "json_schema", "schema": schema } });
1061 }
1062 body
1063 }
1064 ApiVendor::Openai => {
1065 let mut body = serde_json::json!({
1066 "model": model,
1067 "max_tokens": 1024,
1068 "messages": [
1069 { "role": "system", "content": system },
1070 { "role": "user", "content": user },
1071 ],
1072 });
1073 if let Some(schema) = schema {
1074 body["response_format"] = serde_json::json!({
1075 "type": "json_schema",
1076 "json_schema": { "name": "verdict", "strict": true, "schema": schema },
1077 });
1078 }
1079 body
1080 }
1081 }
1082}
1083
1084fn is_retryable(err: &Error) -> bool {
1086 matches!(
1087 err,
1088 Error::Provider { kind: Some(k), .. } if k == "rate_limit" || k == "overloaded"
1089 )
1090}
1091
1092#[derive(Deserialize)]
1096struct ApiErrorBody {
1097 #[serde(rename = "type", default)]
1098 kind: Option<String>,
1099 #[serde(default)]
1100 message: Option<String>,
1101}
1102
1103#[derive(Deserialize)]
1104struct AnthropicBlock {
1105 #[serde(rename = "type")]
1106 kind: String,
1107 #[serde(default)]
1108 text: Option<String>,
1109}
1110
1111#[derive(Deserialize)]
1112struct AnthropicUsage {
1113 #[serde(default)]
1114 input_tokens: Option<u64>,
1115 #[serde(default)]
1116 output_tokens: Option<u64>,
1117}
1118
1119#[derive(Deserialize)]
1120struct AnthropicResponse {
1121 #[serde(default)]
1122 content: Vec<AnthropicBlock>,
1123 #[serde(default)]
1124 usage: Option<AnthropicUsage>,
1125 #[serde(default)]
1126 stop_reason: Option<String>,
1127 #[serde(default)]
1128 error: Option<ApiErrorBody>,
1129}
1130
1131#[derive(Deserialize)]
1132struct OpenAiMessage {
1133 #[serde(default)]
1134 content: Option<String>,
1135}
1136
1137#[derive(Deserialize)]
1138struct OpenAiChoice {
1139 #[serde(default)]
1140 message: Option<OpenAiMessage>,
1141}
1142
1143#[derive(Deserialize)]
1144struct OpenAiUsage {
1145 #[serde(default)]
1146 prompt_tokens: Option<u64>,
1147 #[serde(default)]
1148 completion_tokens: Option<u64>,
1149}
1150
1151#[derive(Deserialize)]
1152struct OpenAiResponse {
1153 #[serde(default)]
1154 choices: Vec<OpenAiChoice>,
1155 #[serde(default)]
1156 usage: Option<OpenAiUsage>,
1157 #[serde(default)]
1158 error: Option<ApiErrorBody>,
1159}
1160
/// Maps a vendor error type onto this crate's failure classes.
///
/// Returns `auth`, `rate_limit`, `quota`, `model_not_found`, or `overloaded`
/// for the vendor types listed below, and `None` for a missing or
/// unrecognized type.
fn classify_api_error(kind: Option<&str>) -> Option<String> {
    let class = match kind? {
        "authentication_error" | "invalid_api_key" | "permission_error" => "auth",
        "rate_limit_error" | "rate_limit_exceeded" => "rate_limit",
        "insufficient_quota" | "billing_error" => "quota",
        "not_found_error" => "model_not_found",
        "overloaded_error" | "api_error" | "server_error" | "service_unavailable" => "overloaded",
        _ => return None,
    };
    Some(class.to_string())
}
1177
1178fn api_error(err: ApiErrorBody) -> Error {
1179 let message = err
1180 .message
1181 .unwrap_or_else(|| "API returned an error".to_string());
1182 match classify_api_error(err.kind.as_deref()) {
1183 Some(kind) => Error::provider_classified("api-judge", message, kind),
1184 None => Error::provider("api-judge", message),
1185 }
1186}
1187
/// Returns at most the first 500 characters of `raw`.
///
/// The cut falls on a character boundary, so multi-byte text is never split.
fn truncate_for_error(raw: &str) -> String {
    // Byte offset of the 501st character, if the text is that long.
    match raw.char_indices().nth(500) {
        Some((cut, _)) => raw[..cut].to_string(),
        None => raw.to_string(),
    }
}
1192
1193fn parse_chat_response(vendor: ApiVendor, raw: &str) -> Result<ChatOutcome> {
1195 match vendor {
1196 ApiVendor::Anthropic => {
1197 let resp: AnthropicResponse = serde_json::from_str(raw.trim()).map_err(|e| {
1198 Error::provider(
1199 "api-judge",
1200 format!(
1201 "could not parse API response: {e}; got: {}",
1202 truncate_for_error(raw)
1203 ),
1204 )
1205 })?;
1206 if let Some(err) = resp.error {
1207 return Err(api_error(err));
1208 }
1209 let text = resp
1210 .content
1211 .iter()
1212 .filter(|b| b.kind == "text")
1213 .filter_map(|b| b.text.as_deref())
1214 .collect::<String>();
1215 if text.trim().is_empty() {
1216 return Err(Error::provider(
1217 "api-judge",
1218 format!(
1219 "judge returned no text (stop_reason: {:?})",
1220 resp.stop_reason
1221 ),
1222 ));
1223 }
1224 let usage = resp.usage.map(|u| Usage {
1225 input_tokens: u.input_tokens,
1226 output_tokens: u.output_tokens,
1227 cost_usd: None,
1228 });
1229 Ok(ChatOutcome { text, usage })
1230 }
1231 ApiVendor::Openai => {
1232 let resp: OpenAiResponse = serde_json::from_str(raw.trim()).map_err(|e| {
1233 Error::provider(
1234 "api-judge",
1235 format!(
1236 "could not parse API response: {e}; got: {}",
1237 truncate_for_error(raw)
1238 ),
1239 )
1240 })?;
1241 if let Some(err) = resp.error {
1242 return Err(api_error(err));
1243 }
1244 let text = resp
1245 .choices
1246 .into_iter()
1247 .next()
1248 .and_then(|c| c.message)
1249 .and_then(|m| m.content)
1250 .unwrap_or_default();
1251 if text.trim().is_empty() {
1252 return Err(Error::provider("api-judge", "judge returned no text"));
1253 }
1254 let usage = resp.usage.map(|u| Usage {
1255 input_tokens: u.prompt_tokens,
1256 output_tokens: u.completion_tokens,
1257 cost_usd: None,
1258 });
1259 Ok(ChatOutcome { text, usage })
1260 }
1261 }
1262}
1263
1264fn render_transcript(messages: &[Message]) -> String {
1268 messages
1269 .iter()
1270 .map(|m| {
1271 let role = match m.role {
1272 Role::User => "User",
1273 Role::Assistant => "Assistant",
1274 Role::System => "System",
1275 };
1276 format!("{role}: {}", m.content)
1277 })
1278 .collect::<Vec<_>>()
1279 .join("\n")
1280}
1281
1282fn render_transcript_for_respond(messages: &[Message]) -> String {
1286 format!(
1287 "Conversation so far (most recent last):\n{}\n\n\
1288 Write only the assistant's next reply, following your system \
1289 instructions. Output the reply text and nothing else.",
1290 render_transcript(messages),
1291 )
1292}
1293
1294fn latest_user_message(messages: &[Message]) -> Option<String> {
1297 messages
1298 .iter()
1299 .rev()
1300 .find(|m| m.role == Role::User)
1301 .map(|m| m.content.clone())
1302}
1303
1304fn build_user_prompt(persona: &str, messages: &[Message]) -> String {
1305 format!(
1306 "You are role-playing the USER in a conversation with an AI assistant. \
1307 Stay in character:\n\n{persona}\n\n\
1308 Conversation so far (most recent last):\n{transcript}\n\n\
1309 Write only the user's next message. Output the message text and nothing \
1310 else.",
1311 transcript = render_transcript(messages),
1312 )
1313}
1314
1315fn build_judge_prompt(query: &JudgeQuery<'_>, messages: &[Message]) -> String {
1316 let transcript = render_transcript(messages);
1317 match query.kind {
1318 JudgeKind::Boolean => format!(
1319 "You are a strict, careful evaluator of an AI assistant's behavior.\n\n\
1320 Criterion: {criterion}\n\n\
1321 Transcript:\n{transcript}\n\n\
1322 Decide whether the criterion is satisfied. Respond with ONLY a \
1323 single-line JSON object and nothing else:\n\
1324 {{\"value\": true or false, \"reason\": \"<one short sentence>\"}}",
1325 criterion = query.criterion,
1326 ),
1327 JudgeKind::Numeric => {
1328 let (min, max) = query.scale.unwrap_or((0.0, 10.0));
1329 format!(
1330 "You are a strict, careful evaluator of an AI assistant's behavior.\n\n\
1331 Criterion: {criterion}\n\n\
1332 Transcript:\n{transcript}\n\n\
1333 Score how well the criterion is satisfied on a scale from {min} to \
1334 {max} (inclusive). Respond with ONLY a single-line JSON object and \
1335 nothing else:\n\
1336 {{\"value\": <number between {min} and {max}>, \"reason\": \"<one short sentence>\"}}",
1337 criterion = query.criterion,
1338 )
1339 }
1340 }
1341}
1342
/// Returns the slice of `text` from its first `{` through its last `}`.
///
/// Yields `None` when either brace is missing or the last `}` comes before
/// the first `{`. The slice is not checked for being valid JSON.
fn extract_json_object(text: &str) -> Option<&str> {
    match (text.find('{'), text.rfind('}')) {
        (Some(open), Some(close)) if close > open => Some(&text[open..=close]),
        _ => None,
    }
}
1354
1355fn parse_verdict(kind: JudgeKind, text: &str) -> Result<JudgeVerdict> {
1356 let json = extract_json_object(text).ok_or_else(|| {
1357 Error::provider(
1358 "oneharness:judge",
1359 format!("judge did not return a JSON object; got: {text}"),
1360 )
1361 })?;
1362 let value: serde_json::Value = serde_json::from_str(json).map_err(|e| {
1363 Error::provider(
1364 "oneharness:judge",
1365 format!("judge verdict was not valid JSON: {e}; got: {json}"),
1366 )
1367 })?;
1368 let reason = value
1369 .get("reason")
1370 .and_then(serde_json::Value::as_str)
1371 .unwrap_or("")
1372 .to_string();
1373 let raw = value
1374 .get("value")
1375 .ok_or_else(|| Error::provider("oneharness:judge", "judge verdict has no `value` field"))?;
1376
1377 let verdict_value = match kind {
1378 JudgeKind::Boolean => JudgeValue::Bool(raw.as_bool().ok_or_else(|| {
1379 Error::provider(
1380 "oneharness:judge",
1381 format!("boolean judge `value` was not a bool: {raw}"),
1382 )
1383 })?),
1384 JudgeKind::Numeric => JudgeValue::Number(raw.as_f64().ok_or_else(|| {
1385 Error::provider(
1386 "oneharness:judge",
1387 format!("numeric judge `value` was not a number: {raw}"),
1388 )
1389 })?),
1390 };
1391
1392 Ok(JudgeVerdict {
1393 value: verdict_value,
1394 reason,
1395 usage: None,
1396 })
1397}
1398
1399#[cfg(test)]
1400mod tests {
1401 use super::*;
1402
1403 #[test]
1404 fn empty_argv_is_rejected() {
1405 assert!(CommandProvider::new(vec![]).is_err());
1406 }
1407
1408 #[test]
1409 fn request_serializes_with_op_tag() {
1410 let req = Request::Judge {
1411 model: "m",
1412 kind: "numeric",
1413 criterion: "polite",
1414 min: Some(0.0),
1415 max: Some(10.0),
1416 messages: &[],
1417 };
1418 let json = serde_json::to_string(&req).unwrap();
1419 assert!(json.contains("\"op\":\"judge\""));
1420 assert!(json.contains("\"kind\":\"numeric\""));
1421 }
1422
/// The rendered transcript contains one labelled line per turn and never
/// mentions `SKILL`.
#[test]
fn respond_no_session_inlines_transcript_but_not_skill() {
    let history = [
        Message::user("Hi"),
        Message::assistant("Hello"),
        Message::user("Again?"),
    ];
    let rendered = render_transcript_for_respond(&history);

    for turn in ["User: Hi", "Assistant: Hello", "User: Again?"] {
        assert!(rendered.contains(turn));
    }
    assert!(!rendered.contains("SKILL"));
}
1439
/// For a user/assistant/user history, `latest_user_message` yields the text
/// of the final user turn.
#[test]
fn respond_with_session_sends_only_latest_user_message() {
    let history = [
        Message::user("Hi"),
        Message::assistant("Hello"),
        Message::user("Again?"),
    ];
    let latest = latest_user_message(&history);
    assert_eq!(latest.as_deref(), Some("Again?"));
}
1449
/// `extract_json_object` pulls the object out of a fenced block or of
/// surrounding prose, and returns `None` when the text has no object.
#[test]
fn extracts_json_from_fenced_or_prose_text() {
    // (input text, expected extracted slice)
    let cases = [
        (
            "```json\n{\"value\": true}\n```",
            Some("{\"value\": true}"),
        ),
        (
            "Sure! {\"value\": 8, \"reason\": \"x\"} done",
            Some("{\"value\": 8, \"reason\": \"x\"}"),
        ),
        ("no json here", None),
    ];
    for (text, expected) in cases {
        assert_eq!(extract_json_object(text), expected);
    }
}
1462
/// Well-formed verdicts parse into the `JudgeValue` variant matching the
/// requested kind, and the `reason` string is carried through in both cases.
#[test]
fn parses_boolean_and_numeric_verdicts() {
    let b = parse_verdict(JudgeKind::Boolean, "{\"value\": true, \"reason\": \"ok\"}").unwrap();
    assert!(matches!(b.value, JudgeValue::Bool(true)));
    assert_eq!(b.reason, "ok");

    let n = parse_verdict(JudgeKind::Numeric, "{\"value\": 8.5, \"reason\": \"good\"}").unwrap();
    assert!(matches!(n.value, JudgeValue::Number(v) if (v - 8.5).abs() < f64::EPSILON));
    // The reason is read before the kind-specific branch; pin it for the
    // numeric kind as well so a regression there cannot slip through.
    assert_eq!(n.reason, "good");
}
1473
/// A `value` of the wrong JSON type, or text with no JSON at all, makes
/// `parse_verdict` return an error.
#[test]
fn verdict_with_wrong_value_type_errors() {
    let rejected = [
        (JudgeKind::Boolean, "{\"value\": 3}"),
        (JudgeKind::Numeric, "{\"value\": true}"),
        (JudgeKind::Boolean, "no json"),
    ];
    for (kind, text) in rejected {
        assert!(parse_verdict(kind, text).is_err());
    }
}
1480
/// `Usage::add` sums each field on its own: a `None` on one side leaves the
/// running value for that field untouched.
#[test]
fn usage_accumulates_independently_per_field() {
    let first = Usage {
        input_tokens: Some(10),
        output_tokens: None,
        cost_usd: Some(0.01),
    };
    let second = Usage {
        input_tokens: Some(5),
        output_tokens: Some(3),
        cost_usd: None,
    };

    let mut running = Usage::default();
    for part in [&first, &second] {
        running.add(part);
    }

    assert_eq!(running.input_tokens, Some(15));
    assert_eq!(running.output_tokens, Some(3));
    let cost = running.cost_usd.unwrap();
    assert!((cost - 0.01).abs() < f64::EPSILON);
    assert!(!running.is_empty());
}
1499
/// `select_reply_text` returns the extracted text when it is non-blank,
/// otherwise the raw stdout, and `None` when both are blank.
#[test]
fn reply_text_prefers_extracted_then_falls_back_to_stdout() {
    // A usable extracted reply wins over stdout.
    let preferred = select_reply_text(Some("clean reply".into()), "raw noise");
    assert_eq!(preferred.as_deref(), Some("clean reply"));

    // Nothing extracted: stdout is returned as-is.
    let raw_event = "{\"type\":\"text\",\"part\":{\"text\":\"pong\"}}";
    let from_stdout = select_reply_text(None, raw_event);
    assert_eq!(from_stdout.as_deref(), Some(raw_event));

    // A whitespace-only extraction counts as missing.
    let blank_extracted = select_reply_text(Some(" ".into()), "fallback");
    assert_eq!(blank_extracted.as_deref(), Some("fallback"));

    // Both sources blank: there is no reply.
    assert!(select_reply_text(None, " \n").is_none());
    assert!(select_reply_text(Some(String::new()), "").is_none());
}
1521
/// `supports_resume` is true for claude-code, opencode and cursor, and false
/// for codex and goose.
#[test]
fn supports_resume_covers_known_harnesses() {
    for harness in ["claude-code", "opencode", "cursor"] {
        assert!(supports_resume(harness));
    }
    for harness in ["codex", "goose"] {
        assert!(!supports_resume(harness));
    }
}
1530
1531 fn api_config(vendor: ApiVendor) -> ApiJudgeConfig {
1532 ApiJudgeConfig {
1533 vendor,
1534 api_key_env: None,
1535 base_url: None,
1536 timeout_secs: 60,
1537 curl_bin: "curl".to_string(),
1538 strict_json: true,
1539 }
1540 }
1541
/// With no overrides configured, each vendor resolves to its own key
/// variable name and endpoint URL.
#[test]
fn api_judge_resolves_vendor_defaults() {
    // (vendor, expected key env var, expected endpoint)
    let defaults = [
        (
            ApiVendor::Anthropic,
            "ANTHROPIC_API_KEY",
            "https://api.anthropic.com/v1/messages",
        ),
        (
            ApiVendor::Openai,
            "OPENAI_API_KEY",
            "https://api.openai.com/v1/chat/completions",
        ),
    ];
    for (vendor, key_env, endpoint) in defaults {
        let provider = ApiJudgeProvider::new(&api_config(vendor));
        assert_eq!(provider.api_key_env, key_env);
        assert_eq!(provider.endpoint, endpoint);
    }
}
1555
/// An explicit `api_key_env` and `base_url` in the config replace the
/// vendor defaults on the resulting provider.
#[test]
fn api_judge_honors_overrides() {
    // Start from the shared OpenAI fixture and override only what this test cares about.
    let config = ApiJudgeConfig {
        api_key_env: Some("MY_KEY".to_string()),
        base_url: Some("https://proxy.example/v1/chat/completions".to_string()),
        timeout_secs: 5,
        ..api_config(ApiVendor::Openai)
    };
    let provider = ApiJudgeProvider::new(&config);

    assert_eq!(provider.api_key_env, "MY_KEY");
    assert_eq!(
        provider.endpoint,
        "https://proxy.example/v1/chat/completions"
    );
}
1572
/// Without a schema, the Anthropic body carries a top-level `system` and a
/// single user message, while the OpenAI body puts the system prompt first in
/// `messages`; neither carries a structured-output field.
#[test]
fn build_chat_body_shapes_per_vendor() {
    let anthropic = build_chat_body(ApiVendor::Anthropic, "claude-x", "sys", "hi", None);
    assert_eq!(anthropic["model"], "claude-x");
    assert_eq!(anthropic["system"], "sys");
    let anthropic_messages = anthropic["messages"].as_array().unwrap();
    assert_eq!(anthropic_messages[0]["role"], "user");
    assert_eq!(anthropic_messages.len(), 1);
    assert!(anthropic.get("output_config").is_none());

    let openai = build_chat_body(ApiVendor::Openai, "gpt-x", "sys", "hi", None);
    let openai_messages = &openai["messages"];
    assert_eq!(openai_messages[0]["role"], "system");
    assert_eq!(openai_messages[1]["role"], "user");
    for absent in ["system", "response_format"] {
        assert!(openai.get(absent).is_none());
    }
}
1590
/// When a schema is supplied, it is attached under `output_config.format`
/// for Anthropic and under `response_format.json_schema` (with `strict` set)
/// for OpenAI.
#[test]
fn build_chat_body_attaches_strict_schema_per_vendor() {
    let anthropic = build_chat_body(
        ApiVendor::Anthropic,
        "claude-x",
        "sys",
        "hi",
        Some(verdict_schema(JudgeKind::Boolean)),
    );
    let anthropic_format = &anthropic["output_config"]["format"];
    assert_eq!(anthropic_format["type"], "json_schema");
    assert_eq!(
        anthropic_format["schema"]["properties"]["value"]["type"],
        "boolean"
    );

    let openai = build_chat_body(
        ApiVendor::Openai,
        "gpt-x",
        "sys",
        "hi",
        Some(verdict_schema(JudgeKind::Numeric)),
    );
    let openai_format = &openai["response_format"];
    assert_eq!(openai_format["type"], "json_schema");
    assert_eq!(openai_format["json_schema"]["strict"], true);
    assert_eq!(
        openai_format["json_schema"]["schema"]["properties"]["value"]["type"],
        "number"
    );
}
1618
/// The verdict schema forbids additional properties and lists exactly
/// `value` and `reason`, in that order, as required.
#[test]
fn verdict_schema_requires_value_and_reason_with_no_extras() {
    let schema = verdict_schema(JudgeKind::Numeric);
    assert_eq!(schema["additionalProperties"], false);

    let mut required = Vec::new();
    for entry in schema["required"].as_array().unwrap() {
        required.push(entry.as_str().unwrap());
    }
    assert_eq!(required, ["value", "reason"]);
}
1631
/// An Anthropic success body yields the text content plus the token counts
/// from `usage`; no cost is reported.
#[test]
fn parses_anthropic_success_with_usage() {
    let body = r#"{"content":[{"type":"text","text":"{\"value\": true}"}],
        "stop_reason":"end_turn","usage":{"input_tokens":12,"output_tokens":3}}"#;
    let parsed = parse_chat_response(ApiVendor::Anthropic, body).unwrap();
    assert_eq!(parsed.text, "{\"value\": true}");

    let tally = parsed.usage.unwrap();
    assert_eq!(
        (tally.input_tokens, tally.output_tokens),
        (Some(12), Some(3))
    );
    assert!(tally.cost_usd.is_none());
}
1643
/// An OpenAI success body yields the first choice's content, with
/// `prompt_tokens`/`completion_tokens` surfaced as input/output tokens.
#[test]
fn parses_openai_success_with_usage() {
    let body = r#"{"choices":[{"message":{"content":"{\"value\": 8}"}}],
        "usage":{"prompt_tokens":20,"completion_tokens":4}}"#;
    let parsed = parse_chat_response(ApiVendor::Openai, body).unwrap();
    assert_eq!(parsed.text, "{\"value\": 8}");

    let tally = parsed.usage.unwrap();
    assert_eq!(
        (tally.input_tokens, tally.output_tokens),
        (Some(20), Some(4))
    );
}
1654
/// Error envelopes become `Error::Provider` values whose `kind` reflects the
/// vendor's error type.
#[test]
fn parses_and_classifies_api_errors() {
    // (vendor, error payload, expected provider error kind)
    let cases = [
        (
            ApiVendor::Anthropic,
            r#"{"error":{"type":"authentication_error","message":"bad key"}}"#,
            "auth",
        ),
        (
            ApiVendor::Openai,
            r#"{"error":{"type":"rate_limit_exceeded","message":"slow down"}}"#,
            "rate_limit",
        ),
    ];
    for (vendor, payload, expected_kind) in cases {
        let failure = parse_chat_response(vendor, payload).unwrap_err();
        assert!(matches!(failure, Error::Provider { kind: Some(k), .. } if k == expected_kind));
    }
}
1665
/// An Anthropic body with an empty `content` array is reported as an error.
#[test]
fn empty_reply_is_an_error() {
    let refusal = r#"{"content":[],"stop_reason":"refusal"}"#;
    let parsed = parse_chat_response(ApiVendor::Anthropic, refusal);
    assert!(parsed.is_err());
}
1671
/// Known vendor error types map onto their kind; unknown or absent types
/// map to `None`.
#[test]
fn classify_api_error_maps_known_kinds() {
    // (vendor error type, expected kind)
    let known = [
        ("invalid_api_key", "auth"),
        ("insufficient_quota", "quota"),
        ("not_found_error", "model_not_found"),
        ("overloaded_error", "overloaded"),
    ];
    for (error_type, kind) in known {
        assert_eq!(classify_api_error(Some(error_type)).as_deref(), Some(kind));
    }

    assert_eq!(classify_api_error(Some("something_else")), None);
    assert_eq!(classify_api_error(None), None);
}
1693
/// Overload and rate-limit errors are retryable; authentication errors are not.
#[test]
fn retryable_covers_transient_errors_only() {
    // (error payload, whether a retry is expected, failure message)
    let cases = [
        (
            r#"{"error":{"type":"overloaded_error","message":"busy"}}"#,
            true,
            "overload should retry",
        ),
        (
            r#"{"error":{"type":"rate_limit_error","message":"slow"}}"#,
            true,
            "rate limit should retry",
        ),
        (
            r#"{"error":{"type":"authentication_error","message":"bad key"}}"#,
            false,
            "auth must not retry",
        ),
    ];
    for (payload, should_retry, why) in cases {
        let failure = parse_chat_response(ApiVendor::Anthropic, payload).unwrap_err();
        assert!(is_retryable(&failure) == should_retry, "{why}");
    }
}
1708
/// `curl_escape` backslash-escapes both double quotes and backslashes.
#[test]
fn curl_escape_handles_quotes_and_backslashes() {
    let escaped = curl_escape(r#"a"b\c"#);
    assert_eq!(escaped, r#"a\"b\\c"#);
}
1713
1714 struct StubResponder;
1717
1718 impl Provider for StubResponder {
1719 fn respond(
1720 &self,
1721 _platform: &str,
1722 _model: &str,
1723 _skill: &SkillRef<'_>,
1724 _messages: &[Message],
1725 _session: Option<&str>,
1726 ) -> Result<AssistantTurn> {
1727 Ok(AssistantTurn {
1728 message: "stub reply".to_string(),
1729 ..Default::default()
1730 })
1731 }
1732
1733 fn simulate_user(
1734 &self,
1735 _model: &str,
1736 _persona: &str,
1737 _messages: &[Message],
1738 ) -> Result<UserTurn> {
1739 unreachable!("split provider routes user simulation to the judge")
1740 }
1741
1742 fn judge(
1743 &self,
1744 _model: &str,
1745 _query: &JudgeQuery<'_>,
1746 _messages: &[Message],
1747 ) -> Result<JudgeVerdict> {
1748 unreachable!("split provider routes judging to the judge")
1749 }
1750
1751 fn supports_resume(&self, platform: &str) -> bool {
1752 platform == "claude-code"
1753 }
1754 }
1755
/// A `SplitProvider` wrapping `StubResponder` gives the stub's answers for
/// both `supports_resume` and `respond`.
#[test]
fn split_provider_delegates_respond_and_resume() {
    let judge = ApiJudgeProvider::new(&api_config(ApiVendor::Anthropic));
    let split = SplitProvider::new(Box::new(StubResponder), judge);

    // The stub only claims resume support for "claude-code".
    assert!(split.supports_resume("claude-code"));
    assert!(!split.supports_resume("codex"));

    let skill = SkillRef {
        name: "s",
        dir: "/tmp/s",
        instructions: "do things",
    };
    let reply = split
        .respond("claude-code", "m", &skill, &[], None)
        .unwrap();
    assert_eq!(reply.message, "stub reply");
}
1775
/// Calling `respond` on an `ApiJudgeProvider` returns an error.
#[test]
fn api_judge_does_not_run_skills() {
    let skill = SkillRef {
        name: "s",
        dir: "/tmp/s",
        instructions: "x",
    };
    let judge_only = ApiJudgeProvider::new(&api_config(ApiVendor::Anthropic));
    let attempt = judge_only.respond("p", "m", &skill, &[], None);
    assert!(attempt.is_err());
}
1786}