1use std::io::Write as _;
14use std::process::{Command, Stdio};
15
16use serde::{Deserialize, Serialize};
17
18use crate::config::OneharnessConfig;
19use crate::conversation::{Message, Role};
20use crate::error::{Error, Result};
21use crate::eval::JudgeValue;
22
/// Borrowed description of the skill a provider is asked to exercise.
pub struct SkillRef<'a> {
    /// Skill name; the command provider forwards it as `skill.name`.
    pub name: &'a str,
    /// Skill directory; the command provider forwards it as `skill.path`.
    pub dir: &'a str,
    /// Skill instructions; the oneharness provider passes them via `--system`.
    pub instructions: &'a str,
}
29
/// The shape of value a judge is expected to return.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum JudgeKind {
    /// The verdict value is a `true`/`false` decision.
    Boolean,
    /// The verdict value is a number.
    Numeric,
}

impl JudgeKind {
    /// Wire name of the kind, as sent in the `kind` field of a judge request.
    fn as_str(self) -> &'static str {
        match self {
            Self::Boolean => "boolean",
            Self::Numeric => "numeric",
        }
    }
}
45
/// A single question put to the judge about a transcript.
pub struct JudgeQuery<'a> {
    /// Whether the verdict value must be a bool or a number.
    pub kind: JudgeKind,
    /// The criterion text the judge evaluates.
    pub criterion: &'a str,
    /// Optional `(min, max)` bounds. The command provider sends them as
    /// `min`/`max`; the oneharness numeric prompt falls back to `(0.0, 10.0)`
    /// when this is `None`.
    pub scale: Option<(f64, f64)>,
}
52
/// Token and cost accounting reported by a provider call.
///
/// Every field is optional: a missing field deserializes to `None`, and a
/// `None` field is omitted when serializing.
#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize, schemars::JsonSchema)]
pub struct Usage {
    /// Input token count, when reported.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub input_tokens: Option<u64>,
    /// Output token count, when reported.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub output_tokens: Option<u64>,
    /// Cost figure (`cost_usd`), when reported.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub cost_usd: Option<f64>,
}
68
69impl Usage {
70 #[must_use]
72 pub fn is_empty(&self) -> bool {
73 self.input_tokens.is_none() && self.output_tokens.is_none() && self.cost_usd.is_none()
74 }
75
76 pub fn add(&mut self, other: &Usage) {
79 if let Some(v) = other.input_tokens {
80 self.input_tokens = Some(self.input_tokens.unwrap_or(0) + v);
81 }
82 if let Some(v) = other.output_tokens {
83 self.output_tokens = Some(self.output_tokens.unwrap_or(0) + v);
84 }
85 if let Some(v) = other.cost_usd {
86 self.cost_usd = Some(self.cost_usd.unwrap_or(0.0) + v);
87 }
88 }
89}
90
/// Result of one [`Provider::respond`] call.
#[derive(Debug, Clone, Default)]
pub struct AssistantTurn {
    /// The assistant's reply text.
    pub message: String,
    /// Completion flag as reported by the provider. The command provider
    /// copies it from the response (defaulting to `false`); the oneharness
    /// provider always sets it to `false`.
    pub done: bool,
    /// Usage reported for this call, if any.
    pub usage: Option<Usage>,
    /// Session identifier returned by the provider, if any.
    /// NOTE(review): presumably fed back as `session` on the next `respond`
    /// call — confirm against the caller.
    pub session_id: Option<String>,
}
104
/// Result of one [`Provider::simulate_user`] call.
#[derive(Debug, Clone, Default)]
pub struct UserTurn {
    /// The simulated user's message text.
    pub message: String,
    /// Stop flag as reported by the provider. The command provider copies it
    /// from the response (defaulting to `false`); the oneharness provider
    /// always sets it to `false`.
    pub stop: bool,
    /// Usage reported for this call, if any.
    pub usage: Option<Usage>,
}
113
/// Result of one [`Provider::judge`] call.
#[derive(Debug, Clone)]
pub struct JudgeVerdict {
    /// The verdict value (boolean or numeric, depending on the query kind).
    pub value: JudgeValue,
    /// The judge's stated reason; empty when the judge gave none.
    pub reason: String,
    /// Usage reported for this call, if any.
    pub usage: Option<Usage>,
}
121
/// Backend that produces assistant replies, simulated user messages and
/// judge verdicts.
pub trait Provider {
    /// Produces the assistant's next turn for `skill` given `messages`.
    ///
    /// `session`, when present, is forwarded to the backend: the command
    /// provider sends it as the `session` request field, the oneharness
    /// provider passes it as `--resume`.
    ///
    /// # Errors
    ///
    /// Returns an error when the backend cannot be run or its output cannot
    /// be interpreted.
    fn respond(
        &self,
        platform: &str,
        model: &str,
        skill: &SkillRef<'_>,
        messages: &[Message],
        session: Option<&str>,
    ) -> Result<AssistantTurn>;

    /// Produces the next user message for `persona` given `messages`.
    ///
    /// # Errors
    ///
    /// Returns an error when the backend cannot be run or its output cannot
    /// be interpreted.
    fn simulate_user(&self, model: &str, persona: &str, messages: &[Message]) -> Result<UserTurn>;

    /// Evaluates `query` against `messages` and returns the verdict.
    ///
    /// # Errors
    ///
    /// Returns an error when the backend cannot be run or its output cannot
    /// be interpreted as a verdict.
    fn judge(
        &self,
        model: &str,
        query: &JudgeQuery<'_>,
        messages: &[Message],
    ) -> Result<JudgeVerdict>;

    /// Reports whether sessions can be resumed on `_platform`.
    ///
    /// The default implementation answers `false` for every platform.
    fn supports_resume(&self, _platform: &str) -> bool {
        false
    }
}
166
/// Wire form of a [`SkillRef`] inside a `respond` request.
#[derive(Serialize)]
struct SkillPayload<'a> {
    /// Filled from `SkillRef::name`.
    name: &'a str,
    /// Filled from `SkillRef::dir`.
    path: &'a str,
    /// Filled from `SkillRef::instructions`.
    instructions: &'a str,
}
177
/// JSON request written to a command provider's stdin.
///
/// Serialized as an object tagged with an `op` field holding the lowercase
/// variant name (`respond`, `user` or `judge`).
#[derive(Serialize)]
#[serde(tag = "op", rename_all = "lowercase")]
enum Request<'a> {
    /// Ask for the assistant's next reply.
    Respond {
        platform: &'a str,
        model: &'a str,
        skill: SkillPayload<'a>,
        messages: &'a [Message],
        /// Omitted from the JSON when `None`.
        #[serde(skip_serializing_if = "Option::is_none")]
        session: Option<&'a str>,
    },
    /// Ask for the simulated user's next message.
    User {
        model: &'a str,
        persona: &'a str,
        messages: &'a [Message],
    },
    /// Ask for a verdict on a criterion.
    Judge {
        model: &'a str,
        /// Wire name of the judge kind (`boolean` or `numeric`).
        kind: &'a str,
        criterion: &'a str,
        /// Lower scale bound; omitted from the JSON when `None`.
        #[serde(skip_serializing_if = "Option::is_none")]
        min: Option<f64>,
        /// Upper scale bound; omitted from the JSON when `None`.
        #[serde(skip_serializing_if = "Option::is_none")]
        max: Option<f64>,
        messages: &'a [Message],
    },
}
205
/// JSON response expected from a command provider for a `respond` request.
///
/// Only `message` is required; the other fields fall back to their defaults
/// when absent.
#[derive(Deserialize)]
struct RespondPayload {
    message: String,
    #[serde(default)]
    done: bool,
    #[serde(default)]
    usage: Option<Usage>,
    #[serde(default)]
    session_id: Option<String>,
}
216
/// JSON response expected from a command provider for a `user` request.
///
/// Only `message` is required; `stop` defaults to `false` and `usage` to
/// `None` when absent.
#[derive(Deserialize)]
struct UserPayload {
    message: String,
    #[serde(default)]
    stop: bool,
    #[serde(default)]
    usage: Option<Usage>,
}
225
/// JSON response expected from a command provider for a `judge` request.
///
/// Only `value` is required; `reason` defaults to an empty string and
/// `usage` to `None` when absent.
#[derive(Deserialize)]
struct JudgePayload {
    value: JudgeValue,
    #[serde(default)]
    reason: String,
    #[serde(default)]
    usage: Option<Usage>,
}
234
/// Provider backed by an external command that receives one JSON request on
/// stdin and answers with one JSON object on stdout.
pub struct CommandProvider {
    /// Program (`argv[0]`) followed by its arguments; never empty once
    /// constructed through [`CommandProvider::new`].
    argv: Vec<String>,
}
243
244impl CommandProvider {
245 pub fn new(argv: Vec<String>) -> Result<Self> {
251 if argv.is_empty() {
252 return Err(Error::Invalid("provider command is empty".into()));
253 }
254 Ok(Self { argv })
255 }
256
257 fn call<T: for<'de> Deserialize<'de>>(&self, request: &Request<'_>, op: &str) -> Result<T> {
259 let payload = serde_json::to_vec(request).map_err(|e| {
260 Error::provider(op.to_string(), format!("could not encode request: {e}"))
261 })?;
262
263 let mut child = Command::new(&self.argv[0])
264 .args(&self.argv[1..])
265 .stdin(Stdio::piped())
266 .stdout(Stdio::piped())
267 .stderr(Stdio::piped())
268 .spawn()
269 .map_err(|e| {
270 Error::provider(
271 op.to_string(),
272 format!(
273 "could not run provider `{}`: {e}. Is it installed and on PATH?",
274 self.argv[0]
275 ),
276 )
277 })?;
278
279 {
282 let stdin = child
283 .stdin
284 .as_mut()
285 .ok_or_else(|| Error::provider(op.to_string(), "could not open provider stdin"))?;
286 stdin
287 .write_all(&payload)
288 .and_then(|()| stdin.write_all(b"\n"))
289 .map_err(|e| {
290 Error::provider(op.to_string(), format!("could not write request: {e}"))
291 })?;
292 }
293
294 let output = child.wait_with_output().map_err(|e| {
295 Error::provider(op.to_string(), format!("provider did not complete: {e}"))
296 })?;
297
298 if !output.status.success() {
299 let stderr = String::from_utf8_lossy(&output.stderr);
300 return Err(Error::provider(
301 op.to_string(),
302 format!("provider exited with {}: {}", output.status, stderr.trim()),
303 ));
304 }
305
306 let stdout = String::from_utf8_lossy(&output.stdout);
307 let line = stdout.trim();
308 if line.is_empty() {
309 return Err(Error::provider(
310 op.to_string(),
311 "provider produced no output (expected one JSON response object)",
312 ));
313 }
314 serde_json::from_str(line).map_err(|e| {
315 Error::provider(
316 op.to_string(),
317 format!("provider response was not valid JSON for `{op}`: {e}; got: {line}"),
318 )
319 })
320 }
321}
322
323impl Provider for CommandProvider {
324 fn respond(
325 &self,
326 platform: &str,
327 model: &str,
328 skill: &SkillRef<'_>,
329 messages: &[Message],
330 session: Option<&str>,
331 ) -> Result<AssistantTurn> {
332 let request = Request::Respond {
333 platform,
334 model,
335 skill: SkillPayload {
336 name: skill.name,
337 path: skill.dir,
338 instructions: skill.instructions,
339 },
340 messages,
341 session,
342 };
343 let payload: RespondPayload = self.call(&request, "respond")?;
344 Ok(AssistantTurn {
345 message: payload.message,
346 done: payload.done,
347 usage: payload.usage,
348 session_id: payload.session_id,
349 })
350 }
351
352 fn simulate_user(&self, model: &str, persona: &str, messages: &[Message]) -> Result<UserTurn> {
353 let request = Request::User {
354 model,
355 persona,
356 messages,
357 };
358 let payload: UserPayload = self.call(&request, "user")?;
359 Ok(UserTurn {
360 message: payload.message,
361 stop: payload.stop,
362 usage: payload.usage,
363 })
364 }
365
366 fn judge(
367 &self,
368 model: &str,
369 query: &JudgeQuery<'_>,
370 messages: &[Message],
371 ) -> Result<JudgeVerdict> {
372 let (min, max) = match query.scale {
373 Some((lo, hi)) => (Some(lo), Some(hi)),
374 None => (None, None),
375 };
376 let request = Request::Judge {
377 model,
378 kind: query.kind.as_str(),
379 criterion: query.criterion,
380 min,
381 max,
382 messages,
383 };
384 let payload: JudgePayload = self.call(&request, "judge")?;
385 Ok(JudgeVerdict {
386 value: payload.value,
387 reason: payload.reason,
388 usage: payload.usage,
389 })
390 }
391}
392
/// Provider that shells out to the `oneharness` CLI.
pub struct OneharnessProvider {
    /// Program to execute.
    bin: String,
    /// Harness used for `simulate_user` and `judge` calls.
    judge_harness: String,
    /// Value passed to the CLI's `--timeout` flag.
    timeout_secs: u64,
}
425
/// Top-level JSON object printed by `oneharness run`.
#[derive(Deserialize)]
struct OhEnvelope {
    /// Per-run results; only the first entry is consumed.
    results: Vec<OhResult>,
}
431
/// One entry of [`OhEnvelope::results`].
///
/// Only `status` is required; every other field falls back to its default
/// when absent.
#[derive(Deserialize)]
struct OhResult {
    /// Run status; anything other than `"ok"` is treated as a failure.
    status: String,
    /// Extracted reply text; preferred over `stdout` when non-blank.
    #[serde(default)]
    text: Option<String>,
    /// Raw harness stdout; used as the reply when `text` is missing or blank.
    #[serde(default)]
    stdout: String,
    /// Raw harness stderr; used as the failure detail when `error` is
    /// missing or empty.
    #[serde(default)]
    stderr: String,
    /// Failure description; preferred failure detail when non-empty.
    #[serde(default)]
    error: Option<String>,
    /// Session identifier reported by the harness, if any.
    #[serde(default)]
    session_id: Option<String>,
    /// Usage reported by the harness, if any.
    #[serde(default)]
    usage: Option<Usage>,
    /// Failure classification; when non-empty it is attached to the error
    /// via `Error::provider_classified`.
    #[serde(default)]
    failure_kind: Option<String>,
}
456
/// Inputs for a single `oneharness run` invocation.
struct RunArgs<'a> {
    /// Passed as `--harness`.
    harness: &'a str,
    /// Passed as `--model`; the flag is omitted when this is empty.
    model: &'a str,
    /// Written to the child's stdin (the CLI is run with `--prompt-file -`).
    prompt: &'a str,
    /// Passed as `--system` when present.
    system: Option<&'a str>,
    /// Passed as `--resume` when present.
    resume: Option<&'a str>,
}
469
/// Successful result of a single `oneharness run` invocation.
struct RunOutcome {
    /// Reply text (extracted text, or raw stdout as a fallback); not trimmed.
    text: String,
    /// Session identifier reported by the harness, if any.
    session_id: Option<String>,
    /// Usage reported by the harness, if any.
    usage: Option<Usage>,
}
476
/// Picks the reply text for a harness run.
///
/// The extracted `text` wins when it contains anything besides whitespace.
/// Otherwise the raw `stdout` is returned unmodified, unless it too is blank,
/// in which case there is no reply at all.
fn select_reply_text(text: Option<String>, stdout: &str) -> Option<String> {
    match text {
        Some(extracted) if !extracted.trim().is_empty() => Some(extracted),
        _ if stdout.trim().is_empty() => None,
        _ => Some(stdout.to_owned()),
    }
}
488
489impl OneharnessProvider {
490 #[must_use]
492 pub fn new(config: &OneharnessConfig) -> Self {
493 Self {
494 bin: config.bin.clone(),
495 judge_harness: config.judge_harness.clone(),
496 timeout_secs: config.timeout_secs,
497 }
498 }
499
500 fn run(&self, args: &RunArgs<'_>) -> Result<RunOutcome> {
504 let timeout = self.timeout_secs.to_string();
505 let mut cmd = Command::new(&self.bin);
506 cmd.args([
513 "run",
514 "--harness",
515 args.harness,
516 "--compact",
517 "--timeout",
518 &timeout,
519 "--prompt-file",
520 "-",
521 ]);
522 if !args.model.is_empty() {
528 cmd.args(["--model", args.model]);
529 }
530 if let Some(system) = args.system {
531 cmd.args(["--system", system]);
532 }
533 if let Some(resume) = args.resume {
534 cmd.args(["--resume", resume]);
535 }
536
537 let mut child = cmd
538 .stdin(Stdio::piped())
539 .stdout(Stdio::piped())
540 .stderr(Stdio::piped())
541 .spawn()
542 .map_err(|e| {
543 Error::provider(
544 "oneharness",
545 format!(
546 "could not run `{}`: {e}. Is oneharness installed and on PATH?",
547 self.bin
548 ),
549 )
550 })?;
551
552 child
553 .stdin
554 .as_mut()
555 .ok_or_else(|| Error::provider("oneharness", "could not open oneharness stdin"))?
556 .write_all(args.prompt.as_bytes())
557 .map_err(|e| Error::provider("oneharness", format!("could not write prompt: {e}")))?;
558
559 let output = child.wait_with_output().map_err(|e| {
560 Error::provider("oneharness", format!("oneharness did not complete: {e}"))
561 })?;
562
563 let stdout = String::from_utf8_lossy(&output.stdout);
564 let envelope: OhEnvelope = serde_json::from_str(stdout.trim()).map_err(|e| {
565 Error::provider(
566 "oneharness",
567 format!(
568 "could not parse oneharness output: {e}; stderr: {}",
569 String::from_utf8_lossy(&output.stderr).trim()
570 ),
571 )
572 })?;
573
574 let result = envelope
575 .results
576 .into_iter()
577 .next()
578 .ok_or_else(|| Error::provider("oneharness", "oneharness returned no results"))?;
579
580 if result.status != "ok" {
581 let detail = result
582 .error
583 .filter(|e| !e.is_empty())
584 .or_else(|| Some(result.stderr.clone()).filter(|s| !s.is_empty()))
585 .unwrap_or_else(|| format!("status `{}`", result.status));
586 let context = format!("oneharness:{}", args.harness);
587 let message = format!("harness run failed: {detail}");
588 return Err(match result.failure_kind {
589 Some(kind) if !kind.is_empty() => {
590 Error::provider_classified(context, message, kind)
591 }
592 _ => Error::provider(context, message),
593 });
594 }
595
596 let text = select_reply_text(result.text, &result.stdout).ok_or_else(|| {
601 Error::provider(
602 format!("oneharness:{}", args.harness),
603 "harness produced neither extractable text nor stdout",
604 )
605 })?;
606 Ok(RunOutcome {
607 text,
608 session_id: result.session_id,
609 usage: result.usage,
610 })
611 }
612}
613
614impl Provider for OneharnessProvider {
615 fn respond(
616 &self,
617 platform: &str,
618 model: &str,
619 skill: &SkillRef<'_>,
620 messages: &[Message],
621 session: Option<&str>,
622 ) -> Result<AssistantTurn> {
623 let prompt = if session.is_some() {
628 latest_user_message(messages).unwrap_or_default()
629 } else {
630 render_transcript_for_respond(messages)
631 };
632 let outcome = self.run(&RunArgs {
633 harness: platform,
634 model,
635 prompt: &prompt,
636 system: Some(skill.instructions),
637 resume: session,
638 })?;
639 Ok(AssistantTurn {
640 message: outcome.text.trim().to_string(),
641 done: false,
642 usage: outcome.usage,
643 session_id: outcome.session_id,
644 })
645 }
646
647 fn simulate_user(&self, model: &str, persona: &str, messages: &[Message]) -> Result<UserTurn> {
648 let prompt = build_user_prompt(persona, messages);
649 let outcome = self.run(&RunArgs {
650 harness: &self.judge_harness,
651 model,
652 prompt: &prompt,
653 system: None,
654 resume: None,
655 })?;
656 Ok(UserTurn {
657 message: outcome.text.trim().to_string(),
658 stop: false,
659 usage: outcome.usage,
660 })
661 }
662
663 fn judge(
664 &self,
665 model: &str,
666 query: &JudgeQuery<'_>,
667 messages: &[Message],
668 ) -> Result<JudgeVerdict> {
669 let prompt = build_judge_prompt(query, messages);
670 let outcome = self.run(&RunArgs {
671 harness: &self.judge_harness,
672 model,
673 prompt: &prompt,
674 system: None,
675 resume: None,
676 })?;
677 let mut verdict = parse_verdict(query.kind, &outcome.text)?;
678 verdict.usage = outcome.usage;
679 Ok(verdict)
680 }
681
682 fn supports_resume(&self, platform: &str) -> bool {
683 supports_resume(platform)
684 }
685}
686
/// Reports whether `harness` is one of the harnesses for which sessions are
/// resumed: `claude-code`, `opencode` or `cursor`. The comparison is exact.
#[must_use]
pub fn supports_resume(harness: &str) -> bool {
    const RESUMABLE: [&str; 3] = ["claude-code", "opencode", "cursor"];
    RESUMABLE.contains(&harness)
}
695
696fn render_transcript(messages: &[Message]) -> String {
700 messages
701 .iter()
702 .map(|m| {
703 let role = match m.role {
704 Role::User => "User",
705 Role::Assistant => "Assistant",
706 Role::System => "System",
707 };
708 format!("{role}: {}", m.content)
709 })
710 .collect::<Vec<_>>()
711 .join("\n")
712}
713
714fn render_transcript_for_respond(messages: &[Message]) -> String {
718 format!(
719 "Conversation so far (most recent last):\n{}\n\n\
720 Write only the assistant's next reply, following your system \
721 instructions. Output the reply text and nothing else.",
722 render_transcript(messages),
723 )
724}
725
726fn latest_user_message(messages: &[Message]) -> Option<String> {
729 messages
730 .iter()
731 .rev()
732 .find(|m| m.role == Role::User)
733 .map(|m| m.content.clone())
734}
735
736fn build_user_prompt(persona: &str, messages: &[Message]) -> String {
737 format!(
738 "You are role-playing the USER in a conversation with an AI assistant. \
739 Stay in character:\n\n{persona}\n\n\
740 Conversation so far (most recent last):\n{transcript}\n\n\
741 Write only the user's next message. Output the message text and nothing \
742 else.",
743 transcript = render_transcript(messages),
744 )
745}
746
747fn build_judge_prompt(query: &JudgeQuery<'_>, messages: &[Message]) -> String {
748 let transcript = render_transcript(messages);
749 match query.kind {
750 JudgeKind::Boolean => format!(
751 "You are a strict, careful evaluator of an AI assistant's behavior.\n\n\
752 Criterion: {criterion}\n\n\
753 Transcript:\n{transcript}\n\n\
754 Decide whether the criterion is satisfied. Respond with ONLY a \
755 single-line JSON object and nothing else:\n\
756 {{\"value\": true or false, \"reason\": \"<one short sentence>\"}}",
757 criterion = query.criterion,
758 ),
759 JudgeKind::Numeric => {
760 let (min, max) = query.scale.unwrap_or((0.0, 10.0));
761 format!(
762 "You are a strict, careful evaluator of an AI assistant's behavior.\n\n\
763 Criterion: {criterion}\n\n\
764 Transcript:\n{transcript}\n\n\
765 Score how well the criterion is satisfied on a scale from {min} to \
766 {max} (inclusive). Respond with ONLY a single-line JSON object and \
767 nothing else:\n\
768 {{\"value\": <number between {min} and {max}>, \"reason\": \"<one short sentence>\"}}",
769 criterion = query.criterion,
770 )
771 }
772 }
773}
774
/// Locates the JSON object embedded in free-form `text` (prose, code fences,
/// trailing chatter) and returns it as a sub-slice.
///
/// Starting at the first `{`, the text is scanned for the matching closing
/// brace, skipping braces that occur inside JSON string literals (including
/// strings with escaped quotes). The first balanced object is returned, so
/// text after the object — even text containing braces — does not spoil the
/// result.
///
/// When no balanced object is found, the function falls back to the span from
/// the first `{` to the last `}`, so the caller's JSON parser can report what
/// is wrong with it. Returns `None` when there is no `{`, or no `}` after it.
///
/// The returned slice is not validated as JSON.
fn extract_json_object(text: &str) -> Option<&str> {
    let start = text.find('{')?;

    let mut depth = 0usize;
    let mut in_string = false;
    let mut escaped = false;
    // Scanning bytes is safe: every delimiter of interest is ASCII and ASCII
    // bytes never occur inside a multi-byte UTF-8 sequence.
    for (offset, &byte) in text.as_bytes()[start..].iter().enumerate() {
        if in_string {
            if escaped {
                escaped = false;
            } else if byte == b'\\' {
                escaped = true;
            } else if byte == b'"' {
                in_string = false;
            }
            continue;
        }
        match byte {
            b'"' => in_string = true,
            b'{' => depth += 1,
            b'}' => {
                // `depth` is at least 1 here: the scan starts on a `{` and
                // returns as soon as the count drops back to zero.
                depth -= 1;
                if depth == 0 {
                    return Some(&text[start..=start + offset]);
                }
            }
            _ => {}
        }
    }

    // Nothing balanced: keep the widest candidate span.
    let end = text.rfind('}')?;
    if end > start {
        Some(&text[start..=end])
    } else {
        None
    }
}
786
787fn parse_verdict(kind: JudgeKind, text: &str) -> Result<JudgeVerdict> {
788 let json = extract_json_object(text).ok_or_else(|| {
789 Error::provider(
790 "oneharness:judge",
791 format!("judge did not return a JSON object; got: {text}"),
792 )
793 })?;
794 let value: serde_json::Value = serde_json::from_str(json).map_err(|e| {
795 Error::provider(
796 "oneharness:judge",
797 format!("judge verdict was not valid JSON: {e}; got: {json}"),
798 )
799 })?;
800 let reason = value
801 .get("reason")
802 .and_then(serde_json::Value::as_str)
803 .unwrap_or("")
804 .to_string();
805 let raw = value
806 .get("value")
807 .ok_or_else(|| Error::provider("oneharness:judge", "judge verdict has no `value` field"))?;
808
809 let verdict_value = match kind {
810 JudgeKind::Boolean => JudgeValue::Bool(raw.as_bool().ok_or_else(|| {
811 Error::provider(
812 "oneharness:judge",
813 format!("boolean judge `value` was not a bool: {raw}"),
814 )
815 })?),
816 JudgeKind::Numeric => JudgeValue::Number(raw.as_f64().ok_or_else(|| {
817 Error::provider(
818 "oneharness:judge",
819 format!("numeric judge `value` was not a number: {raw}"),
820 )
821 })?),
822 };
823
824 Ok(JudgeVerdict {
825 value: verdict_value,
826 reason,
827 usage: None,
828 })
829}
830
/// Unit tests for the pure helpers in this module; none of them spawns a
/// process.
#[cfg(test)]
mod tests {
    use super::*;

    /// An empty command line cannot be turned into a provider.
    #[test]
    fn empty_argv_is_rejected() {
        assert!(CommandProvider::new(vec![]).is_err());
    }

    /// The request enum is tagged with a lowercase `op` field.
    #[test]
    fn request_serializes_with_op_tag() {
        let req = Request::Judge {
            model: "m",
            kind: "numeric",
            criterion: "polite",
            min: Some(0.0),
            max: Some(10.0),
            messages: &[],
        };
        let json = serde_json::to_string(&req).unwrap();
        assert!(json.contains("\"op\":\"judge\""));
        assert!(json.contains("\"kind\":\"numeric\""));
    }

    /// Without a session the whole transcript goes into the prompt, while
    /// the skill text does not (it travels separately as the system prompt).
    #[test]
    fn respond_no_session_inlines_transcript_but_not_skill() {
        let messages = [
            Message::user("Hi"),
            Message::assistant("Hello"),
            Message::user("Again?"),
        ];
        let prompt = render_transcript_for_respond(&messages);
        assert!(prompt.contains("User: Hi"));
        assert!(prompt.contains("Assistant: Hello"));
        assert!(prompt.contains("User: Again?"));
        assert!(!prompt.contains("SKILL"));
    }

    /// With a session only the most recent user message is selected.
    #[test]
    fn respond_with_session_sends_only_latest_user_message() {
        let messages = [
            Message::user("Hi"),
            Message::assistant("Hello"),
            Message::user("Again?"),
        ];
        assert_eq!(latest_user_message(&messages).as_deref(), Some("Again?"));
    }

    /// The JSON object is found inside code fences and surrounding prose.
    #[test]
    fn extracts_json_from_fenced_or_prose_text() {
        assert_eq!(
            extract_json_object("```json\n{\"value\": true}\n```"),
            Some("{\"value\": true}")
        );
        assert_eq!(
            extract_json_object("Sure! {\"value\": 8, \"reason\": \"x\"} done"),
            Some("{\"value\": 8, \"reason\": \"x\"}")
        );
        assert_eq!(extract_json_object("no json here"), None);
    }

    /// Well-formed verdicts of both kinds parse into the matching variant.
    #[test]
    fn parses_boolean_and_numeric_verdicts() {
        let b = parse_verdict(JudgeKind::Boolean, "{\"value\": true, \"reason\": \"ok\"}").unwrap();
        assert!(matches!(b.value, JudgeValue::Bool(true)));
        assert_eq!(b.reason, "ok");

        let n =
            parse_verdict(JudgeKind::Numeric, "{\"value\": 8.5, \"reason\": \"good\"}").unwrap();
        assert!(matches!(n.value, JudgeValue::Number(v) if (v - 8.5).abs() < f64::EPSILON));
    }

    /// A `value` of the wrong type, or no JSON at all, is an error.
    #[test]
    fn verdict_with_wrong_value_type_errors() {
        assert!(parse_verdict(JudgeKind::Boolean, "{\"value\": 3}").is_err());
        assert!(parse_verdict(JudgeKind::Numeric, "{\"value\": true}").is_err());
        assert!(parse_verdict(JudgeKind::Boolean, "no json").is_err());
    }

    /// Each usage field accumulates on its own; a `None` contribution leaves
    /// the running total for that field unchanged.
    #[test]
    fn usage_accumulates_independently_per_field() {
        let mut total = Usage::default();
        total.add(&Usage {
            input_tokens: Some(10),
            output_tokens: None,
            cost_usd: Some(0.01),
        });
        total.add(&Usage {
            input_tokens: Some(5),
            output_tokens: Some(3),
            cost_usd: None,
        });
        assert_eq!(total.input_tokens, Some(15));
        assert_eq!(total.output_tokens, Some(3));
        assert!((total.cost_usd.unwrap() - 0.01).abs() < f64::EPSILON);
        assert!(!total.is_empty());
    }

    /// Extracted text wins; raw stdout is the fallback; blank on both sides
    /// means no reply.
    #[test]
    fn reply_text_prefers_extracted_then_falls_back_to_stdout() {
        // Non-blank extracted text is used as-is.
        assert_eq!(
            select_reply_text(Some("clean reply".into()), "raw noise"),
            Some("clean reply".into())
        );
        // No extracted text: the raw stdout is returned unchanged.
        assert_eq!(
            select_reply_text(None, "{\"type\":\"text\",\"part\":{\"text\":\"pong\"}}"),
            Some("{\"type\":\"text\",\"part\":{\"text\":\"pong\"}}".into())
        );
        // Whitespace-only extracted text counts as missing.
        assert_eq!(
            select_reply_text(Some("  ".into()), "fallback"),
            Some("fallback".into())
        );
        // Whitespace-only stdout counts as missing too.
        assert_eq!(select_reply_text(None, "  \n"), None);
        assert_eq!(select_reply_text(Some(String::new()), ""), None);
    }

    /// The resume table lists exactly the three known harnesses.
    #[test]
    fn supports_resume_covers_known_harnesses() {
        assert!(supports_resume("claude-code"));
        assert!(supports_resume("opencode"));
        assert!(supports_resume("cursor"));
        assert!(!supports_resume("codex"));
        assert!(!supports_resume("goose"));
    }
}