Skip to main content

phi_core/agent_loop/
evaluation.rs

//! Pluggable evaluation strategies for [`agent_loop_parallel`].
//!
//! An [`EvaluationStrategy`] selects the best result after parallel branches finish.
//! Five built-in implementations cover the most common use cases; implement the
//! trait yourself for custom evaluation logic.
//!
//! # Built-in strategies
//!
//! | Strategy | Selection criterion | Use case |
//! |---|---|---|
//! | [`TransparentEvaluation`] | Pass-through (1 branch only) | Zero-overhead wrapper |
//! | [`PickFirstEvaluation`] | Always index 0 | Testing / deterministic default |
//! | [`TokenEfficientEvaluation`] | Lowest total token usage | Cost / latency priority |
//! | [`ElaborateEvaluation`] | Highest total token usage | Depth / thoroughness priority |
//! | [`LlmJudgeEvaluation`] | Separate LLM judge call | Best quality selection |
//!
//! [`agent_loop_parallel`]: crate::agent_loop::agent_loop_parallel
18
19// `EvaluationDecision` and `EvaluationStrategy` are defined in `types.rs` so that
20// `agent_loop_parallel` in `agent_loop.rs` can use them without a circular dependency.
21// They are re-exported here for ergonomic imports from `crate::evaluation`.
22pub use crate::types::{EvaluationDecision, EvaluationStrategy};
23
24use super::config::AgentLoopConfig;
25use super::core::agent_loop;
26use crate::types::*;
27use tokio::sync::mpsc;
28use tokio_util::sync::CancellationToken;
29
30// ── TransparentEvaluation ─────────────────────────────────────────────────────
31
32/// Single-branch pass-through — panics if more than one branch is present.
33///
34/// Use this when you want the evaluational-parallelism plumbing (tracing events,
35/// `ParallelLoopResult`) without the overhead of running multiple branches.
36/// The standard `agent_loop` case is just `agent_loop_parallel` with one config
37/// and this strategy.
38pub struct TransparentEvaluation;
39
40#[async_trait::async_trait]
41impl EvaluationStrategy for TransparentEvaluation {
42    async fn evaluate(
43        &self,
44        _prompts: &[AgentMessage],
45        outcomes: &[ParallelLoopOutcome],
46        _tx: &mpsc::UnboundedSender<AgentEvent>,
47        _cancel: CancellationToken,
48    ) -> (EvaluationDecision, Usage) {
49        assert_eq!(
50            outcomes.len(),
51            1,
52            "TransparentEvaluation requires exactly one branch, got {}",
53            outcomes.len()
54        );
55        (EvaluationDecision::Select(0), Usage::default())
56    }
57}
58
59// ── PickFirstEvaluation ───────────────────────────────────────────────────────
60
61/// Always selects the first branch (index 0), regardless of content.
62///
63/// Useful for testing, debugging, and as a safe deterministic default when
64/// you only care about the output of the first config.
65pub struct PickFirstEvaluation;
66
67#[async_trait::async_trait]
68impl EvaluationStrategy for PickFirstEvaluation {
69    async fn evaluate(
70        &self,
71        _prompts: &[AgentMessage],
72        _outcomes: &[ParallelLoopOutcome],
73        _tx: &mpsc::UnboundedSender<AgentEvent>,
74        _cancel: CancellationToken,
75    ) -> (EvaluationDecision, Usage) {
76        (EvaluationDecision::Select(0), Usage::default())
77    }
78}
79
80// ── TokenEfficientEvaluation ─────────────────────────────────────────────────
81
82/// Selects the branch with the **lowest total token usage**.
83///
84/// Prefer this strategy when cost or latency is the primary concern and you
85/// want the most concise result that still answers the question.
86pub struct TokenEfficientEvaluation;
87
88#[async_trait::async_trait]
89impl EvaluationStrategy for TokenEfficientEvaluation {
90    async fn evaluate(
91        &self,
92        _prompts: &[AgentMessage],
93        outcomes: &[ParallelLoopOutcome],
94        _tx: &mpsc::UnboundedSender<AgentEvent>,
95        _cancel: CancellationToken,
96    ) -> (EvaluationDecision, Usage) {
97        let idx = outcomes
98            .iter()
99            .enumerate()
100            .min_by_key(|(_, o)| o.usage.total_tokens)
101            .map(|(i, _)| i)
102            .unwrap_or(0);
103        (EvaluationDecision::Select(idx), Usage::default())
104    }
105}
106
107// ── ElaborateEvaluation ───────────────────────────────────────────────────────
108
109/// Selects the branch with the **highest total token usage**.
110///
111/// Prefer this strategy when depth and thoroughness are the priority and you
112/// want the most detailed response among the branches.
113pub struct ElaborateEvaluation;
114
115#[async_trait::async_trait]
116impl EvaluationStrategy for ElaborateEvaluation {
117    async fn evaluate(
118        &self,
119        _prompts: &[AgentMessage],
120        outcomes: &[ParallelLoopOutcome],
121        _tx: &mpsc::UnboundedSender<AgentEvent>,
122        _cancel: CancellationToken,
123    ) -> (EvaluationDecision, Usage) {
124        let idx = outcomes
125            .iter()
126            .enumerate()
127            .max_by_key(|(_, o)| o.usage.total_tokens)
128            .map(|(i, _)| i)
129            .unwrap_or(0);
130        (EvaluationDecision::Select(idx), Usage::default())
131    }
132}
133
134// ── LlmJudgeEvaluation ────────────────────────────────────────────────────────
135
136/// Uses a separate LLM call to judge which branch response is best.
137///
138/// # Judge prompt construction
139///
140/// The judge sees only clean, relevant content — never raw tool calls or intermediate
141/// steps from inside a branch:
142///
143/// - **Prior conversation context** *(when present)*: the conversation history before
144///   the user query, formatted as a human-readable transcript. Only `Content::Text`
145///   survives — tool call arguments and images are stripped. Omitted when empty.
146/// - **Original query**: text extracted from user messages in `prompts` (`agent_loop`
147///   mode), or from the last `Message::User` in
148///   `context.messages[..original_context_len]` (`agent_loop_continue` mode).
149/// - **Per-branch response**: the final assistant text from the last
150///   `Message::Assistant` in `outcome.new_messages`. Tool calls, tool results,
151///   and all multi-turn exchanges within a branch are stripped. The judge evaluates
152///   outcomes, not the reasoning trace.
153///
154/// # `agent_loop_continue` mode
155///
156/// When `prompts` is empty (continue mode), the judge locates the last
157/// `Message::User` in `context.messages[..original_context_len]` as the query.
158/// Everything before that message becomes the prior conversation context.
159///
160/// # Judge's comprehension criteria
161///
162/// All N branch final responses (plus prior context) must fit in the judge model's
163/// context window *simultaneously* for a fair comparison. The token budget is
164/// derived from `judge_config.context_config.max_context_tokens` (if set).
165/// When no context limit is configured, all content is passed through as-is.
166///
167/// # 2-iteration compaction strategy
168///
169/// When combined content exceeds the budget, compaction is applied in two iterations:
170///
171/// **Iteration 1 — compact prior context only, outputs intact.**
172/// The prior context is reduced through 3 progressive tiers while branch outputs
173/// are preserved verbatim:
174/// 1. **Tier 1**: keep only the last 80 lines.
175/// 2. **Tier 2**: keep first paragraph + last paragraph only.
176/// 3. **Tier 3**: hard char limit derived from remaining budget.
177///
178/// **Iteration 2 — compact both independently (if iteration 1 insufficient).**
179/// Context stays at tier-3 form; branch outputs are now compacted independently
180/// through the same 3-tier pipeline.
181///
182/// A [`AgentEvent::ProgressMessage`] warning is emitted to `tx` if the budget
183/// cannot be satisfied after both iterations.
184///
185/// The judge's decision applies to the **original** (uncompacted) branch responses.
186/// `ParallelLoopResult::selected_messages` always contains the uncompacted winner.
187///
188/// # Response parsing
189///
190/// The judge's reply is scanned for the first numeric token (e.g., "1", "2",
191/// "Response 2"). Falls back to index 0 if no number is found or parsing fails.
192///
193/// # Session traceability
194///
195/// The judge loop inherits the `session_id` from the branches so all events
196/// (including the judge's `AgentStart`) are visible in the same session trace.
197pub struct LlmJudgeEvaluation {
198    /// Config for the judge LLM call. Set `context_config.max_context_tokens`
199    /// to enable the comprehension-criteria compaction check.
200    pub judge_config: AgentLoopConfig,
201    /// Optional system prompt override. When `None`, a built-in evaluation prompt is used.
202    pub system_prompt: Option<String>,
203}
204
205// ── Internal helpers ──────────────────────────────────────────────────────────
206
207/// Extract only `Content::Text` items from a content slice (strips tool calls, images, thinking).
208fn extract_text_only(content: &[Content]) -> String {
209    content
210        .iter()
211        .filter_map(|c| match c {
212            Content::Text { text } => Some(text.as_str()),
213            _ => None,
214        })
215        .collect::<Vec<_>>()
216        .join("\n")
217}
218
219/// Extract plain text from user messages in `prompts`.
220/// Strips tool calls, images, and thinking blocks — the judge only needs the question.
221fn extract_query_text(prompts: &[AgentMessage]) -> String {
222    prompts
223        .iter()
224        .filter_map(|m| match m {
225            AgentMessage::Llm(LlmMessage {
226                message: Message::User { content, .. },
227                ..
228            }) => Some(content),
229            _ => None,
230        })
231        .flat_map(|content| {
232            content.iter().filter_map(|c| match c {
233                Content::Text { text } => Some(text.as_str()),
234                _ => None,
235            })
236        })
237        .collect::<Vec<_>>()
238        .join("\n")
239}
240
241/// Extract the text of the **last** `Message::User` in `messages`.
242/// Returns `None` if no user message with text content is found.
243fn extract_last_user_text(messages: &[AgentMessage]) -> Option<String> {
244    messages.iter().rev().find_map(|m| match m {
245        AgentMessage::Llm(LlmMessage {
246            message: Message::User { content, .. },
247            ..
248        }) => {
249            let text = extract_text_only(content);
250            if text.is_empty() {
251                None
252            } else {
253                Some(text)
254            }
255        }
256        _ => None,
257    })
258}
259
260/// Format `messages` as a human-readable conversation transcript for the judge.
261///
262/// Only `Content::Text` is included — tool calls, images, and thinking blocks
263/// are stripped. Returns an empty string if the slice is empty or has no text.
264fn format_prior_context(messages: &[AgentMessage]) -> String {
265    let mut parts: Vec<String> = Vec::new();
266    for m in messages {
267        match m {
268            AgentMessage::Llm(LlmMessage {
269                message: Message::User { content, .. },
270                ..
271            }) => {
272                let text = extract_text_only(content);
273                if !text.is_empty() {
274                    parts.push(format!("User: {}", text));
275                }
276            }
277            AgentMessage::Llm(LlmMessage {
278                message: Message::Assistant { content, .. },
279                ..
280            }) => {
281                let text = extract_text_only(content);
282                if !text.is_empty() {
283                    parts.push(format!("Assistant: {}", text));
284                }
285            }
286            AgentMessage::Llm(LlmMessage {
287                message:
288                    Message::ToolResult {
289                        tool_name, content, ..
290                    },
291                ..
292            }) => {
293                let text = extract_text_only(content);
294                if !text.is_empty() {
295                    parts.push(format!("Tool [{}]: {}", tool_name, text));
296                }
297            }
298            _ => {}
299        }
300    }
301    parts.join("\n")
302}
303
304/// Extract the final assistant text from a branch's new messages.
305/// Returns the text content of the last `Message::Assistant` found, or an empty
306/// string when the branch produced no assistant text.
307fn extract_final_assistant_text(messages: &[AgentMessage]) -> String {
308    messages
309        .iter()
310        .rev()
311        .find_map(|m| match m {
312            AgentMessage::Llm(LlmMessage {
313                message: Message::Assistant { content, .. },
314                ..
315            }) => {
316                let text = extract_text_only(content);
317                if text.is_empty() {
318                    None
319                } else {
320                    Some(text)
321                }
322            }
323            _ => None,
324        })
325        .unwrap_or_default()
326}
327
/// Tier 1: tail truncation — keep at most the last `max_lines` lines of `text`.
///
/// Text that already has `max_lines` lines or fewer is returned unchanged
/// (including any trailing newline). Otherwise the surviving lines are rejoined
/// with `\n`.
fn compact_tier1(text: &str, max_lines: usize) -> String {
    let total = text.lines().count();
    if total <= max_lines {
        return text.to_owned();
    }
    let kept: Vec<&str> = text.lines().skip(total - max_lines).collect();
    kept.join("\n")
}
337
/// Tier 2: paragraph summary — keep the first and the last paragraph only.
///
/// Paragraphs are the trimmed, non-empty pieces between blank lines (`\n\n`).
/// With two or more paragraphs the result is `first`, a `...` marker and
/// `last`, separated by blank lines. A single paragraph is returned trimmed;
/// text without any paragraph is returned unchanged.
fn compact_tier2(text: &str) -> String {
    let blocks: Vec<&str> = text
        .split("\n\n")
        .map(|block| block.trim())
        .filter(|block| !block.is_empty())
        .collect();

    match blocks.as_slice() {
        [] => text.to_string(),
        [only] => only.to_string(),
        [first, .., last] => format!("{}\n\n...\n\n{}", first, last),
    }
}
355
/// Tier 3: hard length-limit truncation with a trailing `...` marker.
///
/// `max_chars` is measured in **bytes** of the UTF-8 encoding (it is compared
/// against `str::len`). Text that already fits is returned unchanged. Otherwise
/// the text is cut so that the prefix plus the three-byte ellipsis does not
/// exceed `max_chars`, backing off to the nearest character boundary so that a
/// multi-byte character is never split.
///
/// When `max_chars` is smaller than 3 the result is just `"..."`, which is
/// longer than the requested limit.
fn compact_tier3(text: &str, max_chars: usize) -> String {
    if text.len() <= max_chars {
        return text.to_string();
    }

    // Reserve room for the ellipsis, then step back until the cut sits on a
    // UTF-8 character boundary. Offset 0 is always a boundary, so this ends.
    let mut cut = max_chars.saturating_sub(3);
    while !text.is_char_boundary(cut) {
        cut -= 1;
    }
    format!("{}...", &text[..cut])
}
366
/// Rough token estimate for `s`: one token per four bytes, rounded up.
///
/// Uses the UTF-8 byte length, which equals the character count for plain
/// English/ASCII text (the "1 token ≈ 4 chars" rule of thumb).
fn estimate_tokens(s: &str) -> usize {
    let bytes = s.len();
    let whole_groups = bytes / 4;
    let has_partial_group = bytes % 4 != 0;
    whole_groups + usize::from(has_partial_group)
}
371
372/// Apply iterative compaction until the combined response tokens fit within `token_budget`.
373///
374/// Returns the (possibly compacted) texts and a boolean indicating whether the
375/// comprehension criteria was satisfied after all tiers.
376fn compact_responses(responses: Vec<String>, token_budget: usize) -> (Vec<String>, bool) {
377    // Fast path — already within budget.
378    let mut current = responses;
379    if current.iter().map(|r| estimate_tokens(r)).sum::<usize>() <= token_budget {
380        return (current, true);
381    }
382
383    // Tier 1: tail truncation (keep last 80 lines per response).
384    current = current.into_iter().map(|r| compact_tier1(&r, 80)).collect();
385    if current.iter().map(|r| estimate_tokens(r)).sum::<usize>() <= token_budget {
386        return (current, true);
387    }
388
389    // Tier 2: paragraph summary (first + last paragraph per response).
390    current = current.into_iter().map(|r| compact_tier2(&r)).collect();
391    if current.iter().map(|r| estimate_tokens(r)).sum::<usize>() <= token_budget {
392        return (current, true);
393    }
394
395    // Tier 3: hard char limit derived from budget, minimum 200 chars per response.
396    let n = current.len().max(1);
397    let max_chars = std::cmp::max(200, (token_budget * 4) / n);
398    current = current
399        .into_iter()
400        .map(|r| compact_tier3(&r, max_chars))
401        .collect();
402
403    let satisfied = current.iter().map(|r| estimate_tokens(r)).sum::<usize>() <= token_budget;
404    (current, satisfied)
405}
406
407/// Two-iteration compaction for the combined judge input (prior context + branch outputs).
408///
409/// The `token_budget` covers `prior_context + outputs` only — the ~20% overhead for
410/// system prompt, query, and framing has already been deducted by the caller.
411///
412/// **Iteration 1** — compact `prior_context` progressively (tier 1 → 2 → 3) while
413/// keeping `outputs` intact. If the budget is satisfied after any tier, return early.
414///
415/// **Iteration 2** — if iteration 1 cannot satisfy the budget even at tier 3, the
416/// context stays at its most-compacted form and `outputs` are now compacted
417/// independently through the existing [`compact_responses`] pipeline.
418///
419/// Returns `(compacted_context, compacted_outputs, satisfied)`.
420fn compact_for_judge(
421    prior_context: String,
422    outputs: Vec<String>,
423    token_budget: usize,
424) -> (String, Vec<String>, bool) {
425    let out_tokens = || outputs.iter().map(|o| estimate_tokens(o)).sum::<usize>();
426
427    // Fast path — already within budget.
428    if estimate_tokens(&prior_context) + out_tokens() <= token_budget {
429        return (prior_context, outputs, true);
430    }
431
432    // ── Iteration 1: compact context only, outputs intact ────────────────────
433    let ctx1 = compact_tier1(&prior_context, 80);
434    if estimate_tokens(&ctx1) + out_tokens() <= token_budget {
435        return (ctx1, outputs, true);
436    }
437
438    let ctx2 = compact_tier2(&ctx1);
439    if estimate_tokens(&ctx2) + out_tokens() <= token_budget {
440        return (ctx2, outputs, true);
441    }
442
443    let n_out = outputs.len().max(1);
444    let ctx_budget_chars = (token_budget.saturating_sub(out_tokens()) * 4).max(200);
445    let ctx3 = compact_tier3(&ctx2, ctx_budget_chars);
446    if estimate_tokens(&ctx3) + out_tokens() <= token_budget {
447        return (ctx3, outputs, true);
448    }
449
450    // ── Iteration 2: compact outputs independently (context stays at tier-3) ─
451    let out_budget = token_budget
452        .saturating_sub(estimate_tokens(&ctx3))
453        .max(200 * n_out);
454    let (compacted_outputs, satisfied) = compact_responses(outputs, out_budget);
455    (ctx3, compacted_outputs, satisfied)
456}
457
/// Assemble the user prompt sent to the judge.
///
/// The prompt consists of, in order: the prior conversation context (skipped
/// when `None` or blank), the original query, every branch response numbered
/// from 1, and the closing instruction to answer with a response number only.
fn build_judge_user_message(
    prior_context: Option<&str>,
    query: &str,
    responses: &[String],
) -> String {
    let mut sections: Vec<String> = Vec::with_capacity(responses.len() + 3);

    if let Some(ctx) = prior_context {
        if !ctx.trim().is_empty() {
            sections.push(["Prior conversation context:\n", ctx, "\n\n"].concat());
        }
    }

    sections.push(format!("Original query:\n{}\n\n", query));

    // Responses are shown 1-based; the parser maps the answer back to 0-based.
    sections.extend(
        responses
            .iter()
            .enumerate()
            .map(|(position, body)| format!("Response {}:\n{}\n\n", position + 1, body)),
    );

    sections.push(String::from(
        "Which response is best? Reply with ONLY the response number (e.g., \"1\" or \"2\").",
    ));

    sections.concat()
}
480
/// Find the judge's choice in its reply.
///
/// The reply is scanned left to right for runs of consecutive ASCII digits.
/// The first run whose value lies in `1..=max_index + 1` wins and is returned
/// as a 0-based index. Digits separated by other characters are separate
/// numbers, so `"2/3"` reads as 2 and then 3, never as 23. Runs that are zero,
/// out of range, or too large to parse are skipped.
///
/// Returns 0 when no valid number is found.
fn parse_judge_selection(text: &str, max_index: usize) -> usize {
    text.split(|c: char| !c.is_ascii_digit())
        .filter(|run| !run.is_empty())
        .filter_map(|run| run.parse::<usize>().ok())
        // Convert to 0-based first; this rejects 0 and avoids computing
        // `max_index + 1`, which would overflow for `usize::MAX`.
        .filter_map(|number| number.checked_sub(1))
        .find(|&index| index <= max_index)
        .unwrap_or(0)
}
494
495// ── LlmJudgeEvaluation impl ───────────────────────────────────────────────────
496
497#[async_trait::async_trait]
498impl EvaluationStrategy for LlmJudgeEvaluation {
499    async fn evaluate(
500        &self,
501        prompts: &[AgentMessage],
502        outcomes: &[ParallelLoopOutcome],
503        tx: &mpsc::UnboundedSender<AgentEvent>,
504        cancel: CancellationToken,
505    ) -> (EvaluationDecision, Usage) {
506        // ── 1. Determine query and prior context ──────────────────────────────
507        //
508        // All branches share the same base context; outcomes[0] is representative.
509        // `original_context_len` marks the boundary between the shared base context
510        // and the new messages produced by each branch.
511        let orig_len = outcomes
512            .first()
513            .map(|o| o.original_context_len)
514            .unwrap_or(0);
515        let orig_ctx_msgs: &[AgentMessage] = outcomes
516            .first()
517            .map(|o| &o.context.messages[..orig_len])
518            .unwrap_or(&[]);
519
520        let (query, prior_context_msgs): (String, &[AgentMessage]) = if !prompts.is_empty() {
521            // agent_loop mode: query comes from prompts; the entire original context
522            // (base_context.messages) is the prior conversation history.
523            (extract_query_text(prompts), orig_ctx_msgs)
524        } else {
525            // agent_loop_continue mode: query is the last Message::User in the
526            // original context. Prior context = everything BEFORE that user message.
527            let last_user_pos = orig_ctx_msgs.iter().rposition(|m| {
528                matches!(
529                    m,
530                    AgentMessage::Llm(LlmMessage {
531                        message: Message::User { .. },
532                        ..
533                    })
534                )
535            });
536            match last_user_pos {
537                Some(pos) => (
538                    extract_last_user_text(&orig_ctx_msgs[pos..pos + 1]).unwrap_or_default(),
539                    &orig_ctx_msgs[..pos],
540                ),
541                None => (String::new(), orig_ctx_msgs),
542            }
543        };
544
545        let prior_context_text = format_prior_context(prior_context_msgs);
546
547        // ── 2. Extract per-branch final responses ─────────────────────────────
548        let raw_responses: Vec<String> = outcomes
549            .iter()
550            .map(|o| extract_final_assistant_text(&o.new_messages))
551            .collect();
552
553        // ── 3. Apply 2-iteration compaction (prior context first, then outputs)
554        //
555        // The budget (in tokens) comes from judge_config.context_config if set.
556        // Reserve ~20 % of the budget for system prompt, query, and framing overhead.
557        let token_budget = self
558            .judge_config
559            .context_config
560            .as_ref()
561            .map(|c| c.max_context_tokens);
562
563        let (prior_ctx_for_judge, responses) = if let Some(budget) = token_budget {
564            // 80% of budget for prior context + outputs; 20% for system/query/framing.
565            let content_budget = (budget * 4) / 5;
566            let (pc, resp, satisfied) =
567                compact_for_judge(prior_context_text, raw_responses, content_budget);
568            if !satisfied {
569                tx.send(AgentEvent::ProgressMessage {
570                    loop_id: String::new(),
571                    tool_call_id: "judge-compaction".into(),
572                    tool_name: "LlmJudgeEvaluation".into(),
573                    text: format!(
574                        "LlmJudgeEvaluation: could not fit prior context + {} branch \
575                         responses within the judge's context budget ({} tokens) after \
576                         2-iteration compaction. Proceeding best-effort — judge comparison \
577                         may be incomplete.",
578                        outcomes.len(),
579                        budget
580                    ),
581                })
582                .ok();
583            }
584            (pc, resp)
585        } else {
586            (prior_context_text, raw_responses)
587        };
588
589        // ── 4. Build judge context ────────────────────────────────────────────
590        let default_system = "You are an impartial judge evaluating AI assistant responses. \
591            Select the response that best answers the user's query. \
592            Reply with ONLY the response number (e.g., \"1\" or \"2\").";
593        let system_prompt = self
594            .system_prompt
595            .as_deref()
596            .unwrap_or(default_system)
597            .to_string();
598
599        let judge_user_text =
600            build_judge_user_message(Some(&prior_ctx_for_judge), &query, &responses);
601
602        // Inherit session_id from the branches for traceability.
603        let session_id = outcomes.first().and_then(|o| o.context.session_id.clone());
604
605        let mut judge_context = AgentContext {
606            system_prompt,
607            messages: vec![],
608            tools: vec![],
609            agent_id: None,
610            session_id,
611            loop_id: None,
612            parent_loop_id: None,
613            continuation_kind: None,
614            session: None,
615            user_context: Vec::new(),
616            inrun_context: Vec::new(),
617            active_node_id: None,
618            next_node_id: 0,
619        };
620
621        let judge_prompts = vec![AgentMessage::Llm(LlmMessage::new(Message::user(
622            judge_user_text,
623        )))];
624
625        // ── 5. Run judge loop — forward events and capture usage ──────────────
626        //
627        // The forwarder task is tokio::spawn-able because it only captures owned
628        // Send + 'static values: cloned tx, judge_rx (Receiver), usage_tx (oneshot).
629        let (judge_tx, judge_rx) = mpsc::unbounded_channel::<AgentEvent>();
630        let (usage_tx, usage_rx) = tokio::sync::oneshot::channel::<Usage>();
631
632        let main_tx = tx.clone();
633        tokio::spawn(async move {
634            let mut judge_rx = judge_rx;
635            let mut last_usage = Usage::default();
636            while let Some(event) = judge_rx.recv().await {
637                if let AgentEvent::AgentEnd { ref usage, .. } = event {
638                    last_usage = usage.clone();
639                }
640                main_tx.send(event).ok();
641            }
642            // judge_tx is dropped when agent_loop returns → recv() yields None →
643            // usage is sent back, unblocking usage_rx.await below.
644            usage_tx.send(last_usage).ok();
645        });
646
647        let judge_messages = agent_loop(
648            judge_prompts,
649            &mut judge_context,
650            &self.judge_config,
651            judge_tx,
652            cancel,
653        )
654        .await;
655
656        let judge_usage = usage_rx.await.unwrap_or_default();
657
658        // ── 6. Parse judge selection ──────────────────────────────────────────
659        let judge_text = extract_final_assistant_text(&judge_messages);
660        let selected = parse_judge_selection(&judge_text, outcomes.len() - 1);
661
662        (EvaluationDecision::Select(selected), judge_usage)
663    }
664}
665
666// ── Tests ─────────────────────────────────────────────────────────────────────
667
668#[cfg(test)]
669mod tests {
670    use super::*;
671
672    fn make_outcome(loop_id: &str, total_tokens: u64, final_text: &str) -> ParallelLoopOutcome {
673        let msg = AgentMessage::Llm(LlmMessage::new(Message::Assistant {
674            content: vec![Content::Text {
675                text: final_text.to_string(),
676            }],
677            stop_reason: StopReason::Stop,
678            model: "test".into(),
679            provider: "test".into(),
680            usage: Usage {
681                total_tokens,
682                ..Default::default()
683            },
684            timestamp: 0,
685            error_message: None,
686        }));
687        ParallelLoopOutcome {
688            config_index: 0,
689            loop_id: loop_id.to_string(),
690            context: AgentContext {
691                system_prompt: String::new(),
692                messages: vec![],
693                tools: vec![],
694                agent_id: None,
695                session_id: None,
696                loop_id: None,
697                parent_loop_id: None,
698                continuation_kind: None,
699                session: None,
700                user_context: Vec::new(),
701                inrun_context: Vec::new(),
702                active_node_id: None,
703                next_node_id: 0,
704            },
705            new_messages: vec![msg],
706            usage: Usage {
707                total_tokens,
708                ..Default::default()
709            },
710            original_context_len: 0,
711        }
712    }
713
714    fn dummy_tx() -> mpsc::UnboundedSender<AgentEvent> {
715        let (tx, _rx) = mpsc::unbounded_channel();
716        tx
717    }
718
719    #[tokio::test]
720    async fn test_transparent_single_branch() {
721        let outcomes = vec![make_outcome("loop1", 100, "hello")];
722        let (decision, usage) = TransparentEvaluation
723            .evaluate(&[], &outcomes, &dummy_tx(), CancellationToken::new())
724            .await;
725        assert!(matches!(decision, EvaluationDecision::Select(0)));
726        assert_eq!(usage.total_tokens, 0);
727    }
728
729    #[tokio::test]
730    #[should_panic(expected = "TransparentEvaluation requires exactly one branch")]
731    async fn test_transparent_panics_on_multiple() {
732        let outcomes = vec![
733            make_outcome("loop1", 100, "a"),
734            make_outcome("loop2", 200, "b"),
735        ];
736        TransparentEvaluation
737            .evaluate(&[], &outcomes, &dummy_tx(), CancellationToken::new())
738            .await;
739    }
740
741    #[tokio::test]
742    async fn test_pick_first() {
743        let outcomes = vec![
744            make_outcome("loop1", 300, "verbose"),
745            make_outcome("loop2", 50, "concise"),
746        ];
747        let (decision, _) = PickFirstEvaluation
748            .evaluate(&[], &outcomes, &dummy_tx(), CancellationToken::new())
749            .await;
750        assert!(matches!(decision, EvaluationDecision::Select(0)));
751    }
752
753    #[tokio::test]
754    async fn test_token_efficient() {
755        let outcomes = vec![
756            make_outcome("loop1", 500, "long verbose response"),
757            make_outcome("loop2", 50, "short"),
758            make_outcome("loop3", 200, "medium"),
759        ];
760        let (decision, _) = TokenEfficientEvaluation
761            .evaluate(&[], &outcomes, &dummy_tx(), CancellationToken::new())
762            .await;
763        assert!(matches!(decision, EvaluationDecision::Select(1)));
764    }
765
766    #[tokio::test]
767    async fn test_elaborate() {
768        let outcomes = vec![
769            make_outcome("loop1", 500, "long verbose response"),
770            make_outcome("loop2", 50, "short"),
771            make_outcome("loop3", 200, "medium"),
772        ];
773        let (decision, _) = ElaborateEvaluation
774            .evaluate(&[], &outcomes, &dummy_tx(), CancellationToken::new())
775            .await;
776        assert!(matches!(decision, EvaluationDecision::Select(0)));
777    }
778
779    #[test]
780    fn test_parse_judge_selection() {
781        assert_eq!(parse_judge_selection("2", 2), 1);
782        assert_eq!(parse_judge_selection("Response 1 is best.", 2), 0);
783        assert_eq!(parse_judge_selection("I pick 3.", 3), 2);
784        assert_eq!(parse_judge_selection("unclear", 2), 0); // fallback
785        assert_eq!(parse_judge_selection("5", 2), 0); // out of range → fallback
786    }
787
/// `compact_tier1` called with 80 on a 100-line input must return exactly
/// 80 lines.
#[test]
fn test_compact_tier1() {
    // 100 lines, "line 0" through "line 99", with no trailing newline.
    let numbered: Vec<String> = (0..100).map(|n| format!("line {}", n)).collect();
    let input = numbered.join("\n");
    let kept_lines = compact_tier1(&input, 80).lines().count();
    assert_eq!(kept_lines, 80);
}
797
/// `compact_tier2` must keep the first and last paragraphs of a
/// three-paragraph input and drop the middle one.
#[test]
fn test_compact_tier2() {
    let input = "First paragraph.\n\nMiddle paragraph.\n\nLast paragraph.";
    let output = compact_tier2(input);
    // Both outer paragraphs must survive.
    for kept in ["First paragraph.", "Last paragraph."] {
        assert!(output.contains(kept));
    }
    // The paragraph between them must be gone.
    let middle_survived = output.contains("Middle paragraph.");
    assert!(!middle_survived);
}
806
/// `extract_query_text` must join the text of two user prompts with a
/// newline, in order.
#[test]
fn test_extract_query_text() {
    // Builds a user message holding a single text part.
    let user = |text: &'static str| {
        AgentMessage::Llm(LlmMessage::new(Message::User {
            content: vec![Content::Text { text: text.into() }],
            timestamp: 0,
        }))
    };
    let prompts = vec![user("Hello"), user("World")];
    assert_eq!(extract_query_text(&prompts), "Hello\nWorld");
}
826
/// With two assistant messages in the history, `extract_final_assistant_text`
/// must return the text of the later one.
#[test]
fn test_extract_final_assistant_text() {
    // Builds an assistant message holding a single text part; every other
    // field is the same placeholder value for both messages.
    let assistant = |text: &'static str| {
        AgentMessage::Llm(LlmMessage::new(Message::Assistant {
            content: vec![Content::Text { text: text.into() }],
            stop_reason: StopReason::Stop,
            model: "m".into(),
            provider: "p".into(),
            usage: Usage::default(),
            timestamp: 0,
            error_message: None,
        }))
    };
    let history = vec![assistant("first"), assistant("final")];
    let last_reply = extract_final_assistant_text(&history);
    assert_eq!(last_reply, "final");
}
855
/// In a user / assistant / user history, `extract_last_user_text` must return
/// the text of the final user message, not the first.
#[test]
fn test_extract_last_user_text() {
    // Builds a user message holding a single text part.
    let user = |text: &'static str| {
        AgentMessage::Llm(LlmMessage::new(Message::User {
            content: vec![Content::Text { text: text.into() }],
            timestamp: 0,
        }))
    };
    // Builds an assistant message holding a single text part.
    let assistant = |text: &'static str| {
        AgentMessage::Llm(LlmMessage::new(Message::Assistant {
            content: vec![Content::Text { text: text.into() }],
            stop_reason: StopReason::Stop,
            model: "m".into(),
            provider: "p".into(),
            usage: Usage::default(),
            timestamp: 0,
            error_message: None,
        }))
    };
    let history = vec![user("first query"), assistant("answer"), user("follow-up")];
    // Should return the LAST user message text.
    let latest = extract_last_user_text(&history);
    assert_eq!(latest.as_deref(), Some("follow-up"));
}
889
/// With no messages at all, `extract_last_user_text` must return `None`.
#[test]
fn test_extract_last_user_text_none() {
    let no_messages = Vec::<AgentMessage>::new();
    assert!(extract_last_user_text(&no_messages).is_none());
}
895
/// `format_prior_context` must render a user turn as `User: …` and an
/// assistant turn as `Assistant: …`.
#[test]
fn test_format_prior_context() {
    let question = AgentMessage::Llm(LlmMessage::new(Message::User {
        content: vec![Content::Text {
            text: "Hello".into(),
        }],
        timestamp: 0,
    }));
    let reply = AgentMessage::Llm(LlmMessage::new(Message::Assistant {
        content: vec![Content::Text {
            text: "Hi there!".into(),
        }],
        stop_reason: StopReason::Stop,
        model: "m".into(),
        provider: "p".into(),
        usage: Usage::default(),
        timestamp: 0,
        error_message: None,
    }));
    let history = vec![question, reply];
    let transcript = format_prior_context(&history);
    // Each turn must appear with its role label.
    for fragment in ["User: Hello", "Assistant: Hi there!"] {
        assert!(transcript.contains(fragment));
    }
}
921
/// With a budget of 10_000 and a short context and response,
/// `compact_for_judge` must report success and hand both inputs back unchanged.
#[test]
fn test_compact_for_judge_no_compaction_needed() {
    let context = String::from("short context");
    let responses = vec![String::from("short response")];
    // Clones go in so the originals remain available for comparison.
    let (out_context, out_responses, within_budget) =
        compact_for_judge(context.clone(), responses.clone(), 10_000);
    assert!(within_budget);
    assert_eq!(out_context, context);
    assert_eq!(out_responses, responses);
}
931
/// A context too large for the budget must come back smaller, while a tiny
/// output list comes back unchanged.
///
/// The "smaller" check compares against the input's own token estimate, taken
/// before the call. A fixed bound alone would pass without any compaction
/// whenever the input already estimates below that bound.
#[test]
fn test_compact_for_judge_iter1_compacts_context_only() {
    // Make a large context and tiny outputs; budget forces context compaction.
    let many_lines: String = (0..200).map(|i| format!("line {}\n", i)).collect();
    let outputs = vec!["tiny".to_string()];
    // Measure before `many_lines` is moved into `compact_for_judge`.
    let tokens_before = estimate_tokens(&many_lines);
    // Budget intended to fit the outputs but not the full context.
    let budget = 100;
    // Whether the budget was finally met is not asserted; either way the
    // outputs must be intact.
    let (c, o, _satisfied) = compact_for_judge(many_lines, outputs.clone(), budget);
    // Outputs should be unchanged (iteration 1 only compacts context).
    assert_eq!(o, outputs);
    // Context should be strictly shorter than what went in.
    let tokens_after = estimate_tokens(&c);
    assert!(
        tokens_after < tokens_before,
        "context was not compacted: {} -> {} estimated tokens",
        tokens_before,
        tokens_after
    );
    // Original absolute bound, kept so the test is never weaker than before.
    assert!(tokens_after < 1000);
}
947}