car_multi/patterns/
tournament.rs

1//! Tournament — rank competitors by pairwise comparative judgment.
2//!
3//! Each competitor produces one candidate answer to the task, then a fresh judge
4//! agent compares candidates **two at a time** and picks the better one. Unlike
5//! [`Vote`](crate::Vote) (single-shot majority over independent answers) or
6//! [`AdversarialReview`](crate::AdversarialReview) (pass/fail against a spec), a
7//! tournament produces a *relative ordering* — the blog-style "sort a large set
8//! by comparative judgment / run a bracket until a winner emerges" idiom.
9//!
10//! Single-elimination: competitors are paired each round, the judge picks a
11//! winner per pair, winners advance, and an odd one out gets a bye. After
12//! `ceil(log2(n))` rounds one winner remains. The round at which a competitor is
13//! eliminated yields a coarse ranking (later elimination = higher rank).
14//!
15//! Pairwise comparison is more reliable than asking a model to score N items on
16//! an absolute scale: a judge comparing exactly two answers has a much easier,
17//! lower-variance decision.
18
19use crate::budget::budget_skipped_output;
20use crate::error::MultiError;
21use crate::mailbox::Mailbox;
22use crate::runner::AgentRunner;
23use crate::shared::SharedInfra;
24use crate::types::{AgentOutput, AgentSpec};
25use serde::{Deserialize, Serialize};
26use std::sync::Arc;
27use tracing::instrument;
28
29/// One pairwise judgment in the bracket.
30#[derive(Debug, Clone, Serialize, Deserialize)]
31pub struct MatchResult {
32    /// 1-based round number.
33    pub round: u32,
34    /// Competitor name on side A.
35    pub a: String,
36    /// Competitor name on side B.
37    pub b: String,
38    /// The name that advanced (either `a` or `b`).
39    pub winner: String,
40    /// The judge's stated reason (best-effort; empty if unparseable).
41    pub rationale: String,
42}
43
44/// Result of a tournament.
45#[derive(Debug, Clone, Serialize, Deserialize)]
46pub struct TournamentResult {
47    pub task: String,
48    /// The competitors' initial candidate answers.
49    pub candidates: Vec<AgentOutput>,
50    /// Every pairwise judgment, in bracket order.
51    pub matches: Vec<MatchResult>,
52    /// The winning competitor's name (empty if no competitor produced an answer).
53    pub winner_name: String,
54    /// The winning competitor's answer.
55    pub winner_answer: String,
56    /// Competitor names from best to worst, by elimination round (the winner
57    /// first, then those eliminated in later rounds before earlier ones).
58    pub ranking: Vec<String>,
59}
60
61/// Rank competitors by single-elimination pairwise judgment.
62pub struct Tournament {
63    /// The competitors; each produces one candidate answer to the task.
64    pub competitors: Vec<AgentSpec>,
65    /// The judge agent: compares two candidates and picks the better. Always a
66    /// fresh agent per match (no carried context), like [`AdversarialReview`].
67    pub judge: AgentSpec,
68}
69
70impl Tournament {
71    pub fn new(competitors: Vec<AgentSpec>, judge: AgentSpec) -> Self {
72        Self { competitors, judge }
73    }
74
75    #[instrument(name = "multi.tournament", skip_all)]
76    pub async fn run(
77        &self,
78        task: &str,
79        runner: &Arc<dyn AgentRunner>,
80        infra: &SharedInfra,
81    ) -> Result<TournamentResult, MultiError> {
82        // Phase 1: each competitor produces a candidate answer (budget-gated).
83        let candidates = self.gather_candidates(task, runner, infra).await;
84
85        // Seed the bracket with competitors that actually produced an answer.
86        let mut alive: Vec<(String, String)> = candidates
87            .iter()
88            .filter(|o| o.succeeded())
89            .map(|o| (o.name.clone(), o.answer.clone()))
90            .collect();
91
92        let mut matches = Vec::new();
93        // Ranking accumulates losers in reverse: those eliminated later are
94        // better, so we push eliminations and reverse at the end.
95        let mut elimination_order: Vec<String> = Vec::new();
96
97        if alive.is_empty() {
98            return Ok(TournamentResult {
99                task: task.to_string(),
100                candidates,
101                matches,
102                winner_name: String::new(),
103                winner_answer: String::new(),
104                ranking: Vec::new(),
105            });
106        }
107
108        let mut round = 1;
109        while alive.len() > 1 {
110            let mut next: Vec<(String, String)> = Vec::new();
111            let mut i = 0;
112            while i + 1 < alive.len() {
113                let (name_a, ans_a) = alive[i].clone();
114                let (name_b, ans_b) = alive[i + 1].clone();
115
116                let (winner_side, rationale) = self
117                    .judge_pair(task, &name_a, &ans_a, &name_b, &ans_b, runner, infra)
118                    .await;
119
120                let (winner_name, winner_ans, loser_name) = match winner_side {
121                    Side::A => (name_a.clone(), ans_a.clone(), name_b.clone()),
122                    Side::B => (name_b.clone(), ans_b.clone(), name_a.clone()),
123                };
124                matches.push(MatchResult {
125                    round,
126                    a: name_a,
127                    b: name_b,
128                    winner: winner_name.clone(),
129                    rationale,
130                });
131                elimination_order.push(loser_name);
132                next.push((winner_name, winner_ans));
133                i += 2;
134            }
135            // Odd competitor out gets a bye to the next round.
136            if i < alive.len() {
137                next.push(alive[i].clone());
138            }
139            alive = next;
140            round += 1;
141        }
142
143        let (winner_name, winner_answer) = alive
144            .into_iter()
145            .next()
146            .unwrap_or_else(|| (String::new(), String::new()));
147
148        // Ranking: winner first, then losers from latest elimination to earliest.
149        let mut ranking = vec![winner_name.clone()];
150        ranking.extend(elimination_order.into_iter().rev());
151
152        Ok(TournamentResult {
153            task: task.to_string(),
154            candidates,
155            matches,
156            winner_name,
157            winner_answer,
158            ranking,
159        })
160    }
161
162    /// Run every competitor on the task, gated and recorded against the budget.
163    async fn gather_candidates(
164        &self,
165        task: &str,
166        runner: &Arc<dyn AgentRunner>,
167        infra: &SharedInfra,
168    ) -> Vec<AgentOutput> {
169        let mailbox = Arc::new(Mailbox::default());
170        enum Slot {
171            Spawned(usize),
172            Skipped(AgentOutput),
173        }
174        let mut handles: Vec<tokio::task::JoinHandle<Result<AgentOutput, MultiError>>> = Vec::new();
175        let mut slots: Vec<Slot> = Vec::new();
176
177        for spec in &self.competitors {
178            if let Err(e) = infra.begin_agent() {
179                slots.push(Slot::Skipped(budget_skipped_output(&spec.name, &e)));
180                continue;
181            }
182            let runner = Arc::clone(runner);
183            let spec = spec.clone();
184            let task = task.to_string();
185            let mailbox = Arc::clone(&mailbox);
186            // Provision the runtime before spawning (matches swarm), so the
187            // agent task never observes a half-registered tool set.
188            let rt = infra.make_runtime();
189            for tool in &spec.tools {
190                rt.register_tool(tool).await;
191            }
192            handles.push(tokio::spawn(async move {
193                runner.run(&spec, &task, &rt, &mailbox).await
194            }));
195            slots.push(Slot::Spawned(handles.len() - 1));
196        }
197
198        let mut results: Vec<Option<_>> = futures::future::join_all(handles)
199            .await
200            .into_iter()
201            .map(Some)
202            .collect();
203        let mut outputs = Vec::new();
204        for (i, slot) in slots.into_iter().enumerate() {
205            let idx = match slot {
206                Slot::Skipped(out) => {
207                    outputs.push(out);
208                    continue;
209                }
210                Slot::Spawned(idx) => idx,
211            };
212            match results.get_mut(idx).and_then(Option::take) {
213                Some(Ok(Ok(out))) => {
214                    infra.record_output(&out);
215                    outputs.push(out);
216                }
217                Some(Ok(Err(e))) => outputs.push(failed_output(&self.competitors[i].name, e.to_string())),
218                Some(Err(e)) => {
219                    outputs.push(failed_output(&self.competitors[i].name, format!("join error: {e}")))
220                }
221                None => outputs.push(failed_output(
222                    &self.competitors[i].name,
223                    "internal: missing join result".into(),
224                )),
225            }
226        }
227        outputs
228    }
229
230    /// Ask the judge to pick the better of two candidates. Defaults to side A on
231    /// a budget denial, judge error, or unparseable verdict (deterministic).
232    async fn judge_pair(
233        &self,
234        task: &str,
235        name_a: &str,
236        ans_a: &str,
237        name_b: &str,
238        ans_b: &str,
239        runner: &Arc<dyn AgentRunner>,
240        infra: &SharedInfra,
241    ) -> (Side, String) {
242        if infra.begin_agent().is_err() {
243            return (Side::A, "budget exhausted: defaulted to first candidate".into());
244        }
245
246        let judge_task = format!(
247            r#"You are judging a head-to-head comparison for this task:
248
249## Task
250{task}
251
252## Candidate A
253{ans_a}
254
255## Candidate B
256{ans_b}
257
258Decide which candidate better accomplishes the task. Be decisive.
259Respond with a JSON object:
260```json
261{{"winner": "A", "rationale": "one sentence why"}}
262```
263`winner` must be exactly "A" or "B"."#,
264        );
265
266        let mut judge_spec = self.judge.clone();
267        // Make the judge fresh per match so prior matches can't bias it.
268        judge_spec.name = format!("{}_{}_vs_{}", self.judge.name, name_a, name_b);
269
270        let mailbox = Mailbox::default();
271        let rt = infra.make_runtime();
272        match runner.run(&judge_spec, &judge_task, &rt, &mailbox).await {
273            Ok(out) => {
274                infra.record_output(&out);
275                parse_verdict(&out.answer)
276            }
277            Err(e) => (Side::A, format!("judge failed, defaulted to A: {e}")),
278        }
279    }
280}
281
282#[derive(Clone, Copy)]
283enum Side {
284    A,
285    B,
286}
287
288fn failed_output(name: &str, error: String) -> AgentOutput {
289    AgentOutput {
290        name: name.to_string(),
291        answer: String::new(),
292        turns: 0,
293        tool_calls: 0,
294        duration_ms: 0.0,
295        error: Some(error),
296        outcome: None,
297        tokens: None,
298    }
299}
300
301/// Parse the judge's verdict: prefer a JSON `{"winner": "A"|"B", "rationale"}`,
302/// else look for an explicit "Candidate A/B" / "winner: A/B" phrase, else the
303/// first *standalone* A/B token. Defaults to A only when nothing matches.
304///
305/// The fallback deliberately does NOT do a naive `find('A')`/`find('B')` over the
306/// whole string — almost any prose contains an 'A' inside a common word before
307/// the meaningful verdict, which would silently bias every match to A.
308fn parse_verdict(answer: &str) -> (Side, String) {
309    if let Some(json) = car_ir::json_extract::extract_json_object(answer) {
310        if let Ok(v) = serde_json::from_str::<serde_json::Value>(&json) {
311            let rationale = v
312                .get("rationale")
313                .and_then(|r| r.as_str())
314                .unwrap_or("")
315                .to_string();
316            if let Some(w) = v.get("winner").and_then(|w| w.as_str()) {
317                let side = if w.trim().eq_ignore_ascii_case("b") {
318                    Side::B
319                } else {
320                    Side::A
321                };
322                return (side, rationale);
323            }
324        }
325    }
326
327    // Fallback: never echo the full (possibly huge) answer into rationale.
328    const FALLBACK_RATIONALE: &str = "verdict parsed heuristically (no JSON winner field)";
329    let upper = answer.to_uppercase();
330
331    // 1. Explicit phrases, earliest one wins.
332    let phrase_a = ["CANDIDATE A", "WINNER: A", "WINNER A", "\"A\"", "ANSWER A"]
333        .iter()
334        .filter_map(|p| upper.find(p))
335        .min();
336    let phrase_b = ["CANDIDATE B", "WINNER: B", "WINNER B", "\"B\"", "ANSWER B"]
337        .iter()
338        .filter_map(|p| upper.find(p))
339        .min();
340    match (phrase_a, phrase_b) {
341        (Some(a), Some(b)) => return (if b < a { Side::B } else { Side::A }, FALLBACK_RATIONALE.into()),
342        (Some(_), None) => return (Side::A, FALLBACK_RATIONALE.into()),
343        (None, Some(_)) => return (Side::B, FALLBACK_RATIONALE.into()),
344        (None, None) => {}
345    }
346
347    // 2. First standalone 'A' or 'B' token (non-alphanumeric on both sides).
348    if let Some(side) = first_standalone_ab(&upper) {
349        return (side, FALLBACK_RATIONALE.into());
350    }
351
352    // 3. Nothing decisive — deterministic default.
353    (Side::A, FALLBACK_RATIONALE.into())
354}
355
356/// First standalone `A`/`B` (surrounded by non-alphanumerics, so not the 'a' in
357/// "candidate" or the 'b' in "because"). `None` if there is no isolated token.
358fn first_standalone_ab(upper: &str) -> Option<Side> {
359    let bytes = upper.as_bytes();
360    for (i, &c) in bytes.iter().enumerate() {
361        if c == b'A' || c == b'B' {
362            let prev_alnum = i > 0 && bytes[i - 1].is_ascii_alphanumeric();
363            let next_alnum = i + 1 < bytes.len() && bytes[i + 1].is_ascii_alphanumeric();
364            if !prev_alnum && !next_alnum {
365                return Some(if c == b'B' { Side::B } else { Side::A });
366            }
367        }
368    }
369    None
370}
371
372#[cfg(test)]
373mod tests {
374    use super::*;
375    use car_engine::Runtime;
376
377    /// A competitor echoes its name; the judge always picks the candidate whose
378    /// answer sorts later alphabetically (deterministic, so we can assert).
379    struct ScriptedRunner;
380
381    #[async_trait::async_trait]
382    impl AgentRunner for ScriptedRunner {
383        async fn run(
384            &self,
385            spec: &AgentSpec,
386            task: &str,
387            _runtime: &Runtime,
388            _mailbox: &Mailbox,
389        ) -> Result<AgentOutput, MultiError> {
390            // Judge specs carry the rendered candidates in the task; pick the
391            // one whose block sorts later by comparing the two answer blocks.
392            let answer = if spec.name.contains("_vs_") {
393                let a = extract_block(task, "## Candidate A");
394                let b = extract_block(task, "## Candidate B");
395                let winner = if b > a { "B" } else { "A" };
396                format!("{{\"winner\": \"{winner}\", \"rationale\": \"later sorts higher\"}}")
397            } else {
398                spec.name.clone()
399            };
400            Ok(AgentOutput {
401                name: spec.name.clone(),
402                answer,
403                turns: 1,
404                tool_calls: 0,
405                duration_ms: 1.0,
406                error: None,
407                outcome: None,
408                tokens: None,
409            })
410        }
411    }
412
413    fn extract_block<'a>(task: &'a str, header: &str) -> &'a str {
414        task.split(header)
415            .nth(1)
416            .map(|s| s.split("##").next().unwrap_or("").trim())
417            .unwrap_or("")
418    }
419
420    #[tokio::test]
421    async fn single_elimination_picks_alphabetical_max() {
422        let competitors = vec![
423            AgentSpec::new("alpha", ""),
424            AgentSpec::new("bravo", ""),
425            AgentSpec::new("charlie", ""),
426            AgentSpec::new("delta", ""),
427        ];
428        let judge = AgentSpec::new("judge", "pick the better answer");
429        let runner: Arc<dyn AgentRunner> = Arc::new(ScriptedRunner);
430        let infra = SharedInfra::new();
431
432        let r = Tournament::new(competitors, judge)
433            .run("rank these", &runner, &infra)
434            .await
435            .unwrap();
436
437        // Each competitor's answer is its name; judge prefers later-sorting, so
438        // "delta" wins the whole bracket.
439        assert_eq!(r.winner_name, "delta");
440        assert_eq!(r.winner_answer, "delta");
441        // 4 competitors -> 2 + 1 = 3 matches.
442        assert_eq!(r.matches.len(), 3);
443        assert_eq!(r.ranking.first().unwrap(), "delta");
444        assert_eq!(r.ranking.len(), 4);
445    }
446
447    #[tokio::test]
448    async fn odd_competitor_gets_a_bye() {
449        let competitors = vec![
450            AgentSpec::new("alpha", ""),
451            AgentSpec::new("bravo", ""),
452            AgentSpec::new("charlie", ""),
453        ];
454        let judge = AgentSpec::new("judge", "");
455        let runner: Arc<dyn AgentRunner> = Arc::new(ScriptedRunner);
456        let infra = SharedInfra::new();
457
458        let r = Tournament::new(competitors, judge)
459            .run("rank", &runner, &infra)
460            .await
461            .unwrap();
462
463        // 3 competitors: round 1 has one match (alpha vs bravo) + charlie bye;
464        // round 2 matches the winner vs charlie => 2 matches total.
465        assert_eq!(r.matches.len(), 2);
466        assert_eq!(r.winner_name, "charlie"); // sorts latest of {bravo, charlie}
467    }
468
469    #[test]
470    fn parse_verdict_handles_prose_not_just_json() {
471        // The old naive find('A')/find('B') mis-judged all of these to A.
472        let (s, _) = parse_verdict("Candidate B is clearly better, it covers more cases.");
473        assert!(matches!(s, Side::B), "phrase 'Candidate B' should win");
474        let (s, _) = parse_verdict("After careful analysis, B.");
475        assert!(matches!(s, Side::B), "standalone trailing 'B' should win");
476        let (s, _) = parse_verdict("Answer A is the stronger submission.");
477        assert!(matches!(s, Side::A));
478        // JSON path still authoritative even with prose around it.
479        let (s, r) = parse_verdict("I think... {\"winner\": \"B\", \"rationale\": \"x\"}");
480        assert!(matches!(s, Side::B));
481        assert_eq!(r, "x");
482    }
483
484    #[test]
485    fn parse_verdict_does_not_leak_full_answer_on_fallback() {
486        let huge = format!("Candidate B wins. {}", "blah ".repeat(500));
487        let (_, rationale) = parse_verdict(&huge);
488        assert!(
489            rationale.len() < 100,
490            "fallback rationale must not echo the whole answer"
491        );
492    }
493
494    #[tokio::test]
495    async fn budget_cap_limits_competitors() {
496        let competitors: Vec<AgentSpec> = (0..4)
497            .map(|i| AgentSpec::new(&format!("c{i}"), ""))
498            .collect();
499        let judge = AgentSpec::new("judge", "");
500        let runner: Arc<dyn AgentRunner> = Arc::new(ScriptedRunner);
501        // Allow only 2 competitor spawns (no budget for judges either).
502        let infra = SharedInfra::new().with_budget(crate::BudgetLimits {
503            max_agents: Some(2),
504            ..Default::default()
505        });
506
507        let r = Tournament::new(competitors, judge)
508            .run("rank", &runner, &infra)
509            .await
510            .unwrap();
511
512        let produced = r.candidates.iter().filter(|o| o.succeeded()).count();
513        assert_eq!(produced, 2, "only two competitors fit the agent budget");
514    }
515}
car_multi/patterns/tournament.rs

car_multi/patterns/
tournament.rs