1use crate::budget::budget_skipped_output;
20use crate::error::MultiError;
21use crate::mailbox::Mailbox;
22use crate::runner::AgentRunner;
23use crate::shared::SharedInfra;
24use crate::types::{AgentOutput, AgentSpec};
25use serde::{Deserialize, Serialize};
26use std::sync::Arc;
27use tracing::instrument;
28
29#[derive(Debug, Clone, Serialize, Deserialize)]
31pub struct MatchResult {
32 pub round: u32,
34 pub a: String,
36 pub b: String,
38 pub winner: String,
40 pub rationale: String,
42}
43
44#[derive(Debug, Clone, Serialize, Deserialize)]
46pub struct TournamentResult {
47 pub task: String,
48 pub candidates: Vec<AgentOutput>,
50 pub matches: Vec<MatchResult>,
52 pub winner_name: String,
54 pub winner_answer: String,
56 pub ranking: Vec<String>,
59}
60
61pub struct Tournament {
63 pub competitors: Vec<AgentSpec>,
65 pub judge: AgentSpec,
68}
69
70impl Tournament {
71 pub fn new(competitors: Vec<AgentSpec>, judge: AgentSpec) -> Self {
72 Self { competitors, judge }
73 }
74
75 #[instrument(name = "multi.tournament", skip_all)]
76 pub async fn run(
77 &self,
78 task: &str,
79 runner: &Arc<dyn AgentRunner>,
80 infra: &SharedInfra,
81 ) -> Result<TournamentResult, MultiError> {
82 let candidates = self.gather_candidates(task, runner, infra).await;
84
85 let mut alive: Vec<(String, String)> = candidates
87 .iter()
88 .filter(|o| o.succeeded())
89 .map(|o| (o.name.clone(), o.answer.clone()))
90 .collect();
91
92 let mut matches = Vec::new();
93 let mut elimination_order: Vec<String> = Vec::new();
96
97 if alive.is_empty() {
98 return Ok(TournamentResult {
99 task: task.to_string(),
100 candidates,
101 matches,
102 winner_name: String::new(),
103 winner_answer: String::new(),
104 ranking: Vec::new(),
105 });
106 }
107
108 let mut round = 1;
109 while alive.len() > 1 {
110 let mut next: Vec<(String, String)> = Vec::new();
111 let mut i = 0;
112 while i + 1 < alive.len() {
113 let (name_a, ans_a) = alive[i].clone();
114 let (name_b, ans_b) = alive[i + 1].clone();
115
116 let (winner_side, rationale) = self
117 .judge_pair(task, &name_a, &ans_a, &name_b, &ans_b, runner, infra)
118 .await;
119
120 let (winner_name, winner_ans, loser_name) = match winner_side {
121 Side::A => (name_a.clone(), ans_a.clone(), name_b.clone()),
122 Side::B => (name_b.clone(), ans_b.clone(), name_a.clone()),
123 };
124 matches.push(MatchResult {
125 round,
126 a: name_a,
127 b: name_b,
128 winner: winner_name.clone(),
129 rationale,
130 });
131 elimination_order.push(loser_name);
132 next.push((winner_name, winner_ans));
133 i += 2;
134 }
135 if i < alive.len() {
137 next.push(alive[i].clone());
138 }
139 alive = next;
140 round += 1;
141 }
142
143 let (winner_name, winner_answer) = alive
144 .into_iter()
145 .next()
146 .unwrap_or_else(|| (String::new(), String::new()));
147
148 let mut ranking = vec![winner_name.clone()];
150 ranking.extend(elimination_order.into_iter().rev());
151
152 Ok(TournamentResult {
153 task: task.to_string(),
154 candidates,
155 matches,
156 winner_name,
157 winner_answer,
158 ranking,
159 })
160 }
161
162 async fn gather_candidates(
164 &self,
165 task: &str,
166 runner: &Arc<dyn AgentRunner>,
167 infra: &SharedInfra,
168 ) -> Vec<AgentOutput> {
169 let mailbox = Arc::new(Mailbox::default());
170 enum Slot {
171 Spawned(usize),
172 Skipped(AgentOutput),
173 }
174 let mut handles: Vec<tokio::task::JoinHandle<Result<AgentOutput, MultiError>>> = Vec::new();
175 let mut slots: Vec<Slot> = Vec::new();
176
177 for spec in &self.competitors {
178 if let Err(e) = infra.begin_agent() {
179 slots.push(Slot::Skipped(budget_skipped_output(&spec.name, &e)));
180 continue;
181 }
182 let runner = Arc::clone(runner);
183 let spec = spec.clone();
184 let task = task.to_string();
185 let mailbox = Arc::clone(&mailbox);
186 let rt = infra.make_runtime();
189 for tool in &spec.tools {
190 rt.register_tool(tool).await;
191 }
192 handles.push(tokio::spawn(async move {
193 runner.run(&spec, &task, &rt, &mailbox).await
194 }));
195 slots.push(Slot::Spawned(handles.len() - 1));
196 }
197
198 let mut results: Vec<Option<_>> = futures::future::join_all(handles)
199 .await
200 .into_iter()
201 .map(Some)
202 .collect();
203 let mut outputs = Vec::new();
204 for (i, slot) in slots.into_iter().enumerate() {
205 let idx = match slot {
206 Slot::Skipped(out) => {
207 outputs.push(out);
208 continue;
209 }
210 Slot::Spawned(idx) => idx,
211 };
212 match results.get_mut(idx).and_then(Option::take) {
213 Some(Ok(Ok(out))) => {
214 infra.record_output(&out);
215 outputs.push(out);
216 }
217 Some(Ok(Err(e))) => outputs.push(failed_output(&self.competitors[i].name, e.to_string())),
218 Some(Err(e)) => {
219 outputs.push(failed_output(&self.competitors[i].name, format!("join error: {e}")))
220 }
221 None => outputs.push(failed_output(
222 &self.competitors[i].name,
223 "internal: missing join result".into(),
224 )),
225 }
226 }
227 outputs
228 }
229
230 async fn judge_pair(
233 &self,
234 task: &str,
235 name_a: &str,
236 ans_a: &str,
237 name_b: &str,
238 ans_b: &str,
239 runner: &Arc<dyn AgentRunner>,
240 infra: &SharedInfra,
241 ) -> (Side, String) {
242 if infra.begin_agent().is_err() {
243 return (Side::A, "budget exhausted: defaulted to first candidate".into());
244 }
245
246 let judge_task = format!(
247 r#"You are judging a head-to-head comparison for this task:
248
249## Task
250{task}
251
252## Candidate A
253{ans_a}
254
255## Candidate B
256{ans_b}
257
258Decide which candidate better accomplishes the task. Be decisive.
259Respond with a JSON object:
260```json
261{{"winner": "A", "rationale": "one sentence why"}}
262```
263`winner` must be exactly "A" or "B"."#,
264 );
265
266 let mut judge_spec = self.judge.clone();
267 judge_spec.name = format!("{}_{}_vs_{}", self.judge.name, name_a, name_b);
269
270 let mailbox = Mailbox::default();
271 let rt = infra.make_runtime();
272 match runner.run(&judge_spec, &judge_task, &rt, &mailbox).await {
273 Ok(out) => {
274 infra.record_output(&out);
275 parse_verdict(&out.answer)
276 }
277 Err(e) => (Side::A, format!("judge failed, defaulted to A: {e}")),
278 }
279 }
280}
281
282#[derive(Clone, Copy)]
283enum Side {
284 A,
285 B,
286}
287
288fn failed_output(name: &str, error: String) -> AgentOutput {
289 AgentOutput {
290 name: name.to_string(),
291 answer: String::new(),
292 turns: 0,
293 tool_calls: 0,
294 duration_ms: 0.0,
295 error: Some(error),
296 outcome: None,
297 tokens: None,
298 }
299}
300
301fn parse_verdict(answer: &str) -> (Side, String) {
309 if let Some(json) = car_ir::json_extract::extract_json_object(answer) {
310 if let Ok(v) = serde_json::from_str::<serde_json::Value>(&json) {
311 let rationale = v
312 .get("rationale")
313 .and_then(|r| r.as_str())
314 .unwrap_or("")
315 .to_string();
316 if let Some(w) = v.get("winner").and_then(|w| w.as_str()) {
317 let side = if w.trim().eq_ignore_ascii_case("b") {
318 Side::B
319 } else {
320 Side::A
321 };
322 return (side, rationale);
323 }
324 }
325 }
326
327 const FALLBACK_RATIONALE: &str = "verdict parsed heuristically (no JSON winner field)";
329 let upper = answer.to_uppercase();
330
331 let phrase_a = ["CANDIDATE A", "WINNER: A", "WINNER A", "\"A\"", "ANSWER A"]
333 .iter()
334 .filter_map(|p| upper.find(p))
335 .min();
336 let phrase_b = ["CANDIDATE B", "WINNER: B", "WINNER B", "\"B\"", "ANSWER B"]
337 .iter()
338 .filter_map(|p| upper.find(p))
339 .min();
340 match (phrase_a, phrase_b) {
341 (Some(a), Some(b)) => return (if b < a { Side::B } else { Side::A }, FALLBACK_RATIONALE.into()),
342 (Some(_), None) => return (Side::A, FALLBACK_RATIONALE.into()),
343 (None, Some(_)) => return (Side::B, FALLBACK_RATIONALE.into()),
344 (None, None) => {}
345 }
346
347 if let Some(side) = first_standalone_ab(&upper) {
349 return (side, FALLBACK_RATIONALE.into());
350 }
351
352 (Side::A, FALLBACK_RATIONALE.into())
354}
355
356fn first_standalone_ab(upper: &str) -> Option<Side> {
359 let bytes = upper.as_bytes();
360 for (i, &c) in bytes.iter().enumerate() {
361 if c == b'A' || c == b'B' {
362 let prev_alnum = i > 0 && bytes[i - 1].is_ascii_alphanumeric();
363 let next_alnum = i + 1 < bytes.len() && bytes[i + 1].is_ascii_alphanumeric();
364 if !prev_alnum && !next_alnum {
365 return Some(if c == b'B' { Side::B } else { Side::A });
366 }
367 }
368 }
369 None
370}
371
372#[cfg(test)]
373mod tests {
374 use super::*;
375 use car_engine::Runtime;
376
377 struct ScriptedRunner;
380
381 #[async_trait::async_trait]
382 impl AgentRunner for ScriptedRunner {
383 async fn run(
384 &self,
385 spec: &AgentSpec,
386 task: &str,
387 _runtime: &Runtime,
388 _mailbox: &Mailbox,
389 ) -> Result<AgentOutput, MultiError> {
390 let answer = if spec.name.contains("_vs_") {
393 let a = extract_block(task, "## Candidate A");
394 let b = extract_block(task, "## Candidate B");
395 let winner = if b > a { "B" } else { "A" };
396 format!("{{\"winner\": \"{winner}\", \"rationale\": \"later sorts higher\"}}")
397 } else {
398 spec.name.clone()
399 };
400 Ok(AgentOutput {
401 name: spec.name.clone(),
402 answer,
403 turns: 1,
404 tool_calls: 0,
405 duration_ms: 1.0,
406 error: None,
407 outcome: None,
408 tokens: None,
409 })
410 }
411 }
412
413 fn extract_block<'a>(task: &'a str, header: &str) -> &'a str {
414 task.split(header)
415 .nth(1)
416 .map(|s| s.split("##").next().unwrap_or("").trim())
417 .unwrap_or("")
418 }
419
420 #[tokio::test]
421 async fn single_elimination_picks_alphabetical_max() {
422 let competitors = vec![
423 AgentSpec::new("alpha", ""),
424 AgentSpec::new("bravo", ""),
425 AgentSpec::new("charlie", ""),
426 AgentSpec::new("delta", ""),
427 ];
428 let judge = AgentSpec::new("judge", "pick the better answer");
429 let runner: Arc<dyn AgentRunner> = Arc::new(ScriptedRunner);
430 let infra = SharedInfra::new();
431
432 let r = Tournament::new(competitors, judge)
433 .run("rank these", &runner, &infra)
434 .await
435 .unwrap();
436
437 assert_eq!(r.winner_name, "delta");
440 assert_eq!(r.winner_answer, "delta");
441 assert_eq!(r.matches.len(), 3);
443 assert_eq!(r.ranking.first().unwrap(), "delta");
444 assert_eq!(r.ranking.len(), 4);
445 }
446
447 #[tokio::test]
448 async fn odd_competitor_gets_a_bye() {
449 let competitors = vec![
450 AgentSpec::new("alpha", ""),
451 AgentSpec::new("bravo", ""),
452 AgentSpec::new("charlie", ""),
453 ];
454 let judge = AgentSpec::new("judge", "");
455 let runner: Arc<dyn AgentRunner> = Arc::new(ScriptedRunner);
456 let infra = SharedInfra::new();
457
458 let r = Tournament::new(competitors, judge)
459 .run("rank", &runner, &infra)
460 .await
461 .unwrap();
462
463 assert_eq!(r.matches.len(), 2);
466 assert_eq!(r.winner_name, "charlie"); }
468
469 #[test]
470 fn parse_verdict_handles_prose_not_just_json() {
471 let (s, _) = parse_verdict("Candidate B is clearly better, it covers more cases.");
473 assert!(matches!(s, Side::B), "phrase 'Candidate B' should win");
474 let (s, _) = parse_verdict("After careful analysis, B.");
475 assert!(matches!(s, Side::B), "standalone trailing 'B' should win");
476 let (s, _) = parse_verdict("Answer A is the stronger submission.");
477 assert!(matches!(s, Side::A));
478 let (s, r) = parse_verdict("I think... {\"winner\": \"B\", \"rationale\": \"x\"}");
480 assert!(matches!(s, Side::B));
481 assert_eq!(r, "x");
482 }
483
484 #[test]
485 fn parse_verdict_does_not_leak_full_answer_on_fallback() {
486 let huge = format!("Candidate B wins. {}", "blah ".repeat(500));
487 let (_, rationale) = parse_verdict(&huge);
488 assert!(
489 rationale.len() < 100,
490 "fallback rationale must not echo the whole answer"
491 );
492 }
493
494 #[tokio::test]
495 async fn budget_cap_limits_competitors() {
496 let competitors: Vec<AgentSpec> = (0..4)
497 .map(|i| AgentSpec::new(&format!("c{i}"), ""))
498 .collect();
499 let judge = AgentSpec::new("judge", "");
500 let runner: Arc<dyn AgentRunner> = Arc::new(ScriptedRunner);
501 let infra = SharedInfra::new().with_budget(crate::BudgetLimits {
503 max_agents: Some(2),
504 ..Default::default()
505 });
506
507 let r = Tournament::new(competitors, judge)
508 .run("rank", &runner, &infra)
509 .await
510 .unwrap();
511
512 let produced = r.candidates.iter().filter(|o| o.succeeded()).count();
513 assert_eq!(produced, 2, "only two competitors fit the agent budget");
514 }
515}