Skip to main content

bamboo_engine/runtime/
gold_evaluation.rs

1use std::sync::Arc;
2
3use bamboo_agent_core::tools::{FunctionSchema, ToolCall, ToolSchema};
4use bamboo_agent_core::{AgentError, AgentEvent, GoldCheckpoint, GoldConfidence, GoldDecision};
5use bamboo_agent_core::{Message, Role, Session};
6use bamboo_compression::{TiktokenTokenCounter, TokenCounter};
7use bamboo_domain::ReasoningEffort;
8use bamboo_infrastructure::{LLMProvider, LLMRequestOptions};
9use chrono::Utc;
10use serde_json::json;
11use tokio::sync::mpsc;
12use tokio_util::sync::CancellationToken;
13
14use crate::metrics::TokenUsage as MetricsTokenUsage;
15use crate::runtime::config::GoldConfig;
16use crate::runtime::stream::handler::consume_llm_stream_silent;
17use crate::runtime::task_context::TaskLoopContext;
18
19/// Evaluation-scoped frame bundling parameters that identify and configure a
20/// single gold evaluation pass.  Passed into [`evaluate_gold`] to keep its
21/// parameter count below the clippy threshold.
22pub struct GoldEvalFrame<'a> {
23    pub event_tx: &'a mpsc::Sender<AgentEvent>,
24    pub session_id: &'a str,
25    pub model: &'a str,
26    pub reasoning_effort: Option<ReasoningEffort>,
27    pub checkpoint: GoldCheckpoint,
28    pub iteration: u32,
29}
30
31#[derive(Debug, Clone)]
32pub struct GoldEvaluationResult {
33    pub checkpoint: GoldCheckpoint,
34    pub iteration: u32,
35    pub decision: GoldDecision,
36    pub confidence: GoldConfidence,
37    pub reasoning: String,
38    /// Concrete information still missing to achieve the goal, if any.
39    pub missing_information: Vec<String>,
40    /// The single most useful next action for the agent to take, if the
41    /// evaluator can name one. Used to give the auto-continue nudge direction.
42    pub next_action: Option<String>,
43    pub prompt_tokens: u64,
44    pub completion_tokens: u64,
45}
46
47#[derive(Debug, Clone)]
48pub(crate) struct AsyncGoldEvaluationRequest {
49    pub(crate) session_id: String,
50    pub(crate) round_number: usize,
51    pub(crate) model_name: String,
52    pub(crate) reasoning_effort: Option<ReasoningEffort>,
53    pub(crate) checkpoint: GoldCheckpoint,
54    pub(crate) session_snapshot: Session,
55    pub(crate) task_context_snapshot: Option<TaskLoopContext>,
56    pub(crate) gold_config: GoldConfig,
57}
58
59#[derive(Debug, Clone)]
60pub(crate) struct AsyncGoldEvaluationResult {
61    pub(crate) round_number: usize,
62    pub(crate) model_name: String,
63    pub(crate) evaluation_result: GoldEvaluationResult,
64}
65
66fn normalize_lightweight_reasoning_effort(
67    reasoning_effort: Option<ReasoningEffort>,
68) -> Option<ReasoningEffort> {
69    reasoning_effort.map(|effort| match effort {
70        ReasoningEffort::Xhigh | ReasoningEffort::Max => ReasoningEffort::High,
71        other => other,
72    })
73}
74
75fn estimate_prompt_tokens(messages: &[Message]) -> u64 {
76    let counter = TiktokenTokenCounter::default();
77    u64::from(counter.count_messages(messages))
78}
79
80fn estimate_completion_tokens(content: &str, tool_calls: &[ToolCall]) -> u64 {
81    let counter = TiktokenTokenCounter::default();
82    let mut completion_surface = content.to_string();
83
84    for call in tool_calls {
85        if !completion_surface.is_empty() {
86            completion_surface.push('\n');
87        }
88        completion_surface.push_str(&call.function.name);
89        completion_surface.push('\n');
90        completion_surface.push_str(&call.function.arguments);
91    }
92
93    u64::from(counter.count_text(&completion_surface))
94}
95
96#[allow(clippy::too_many_arguments)]
97pub(crate) fn build_async_gold_evaluation_request(
98    task_context: &Option<TaskLoopContext>,
99    session: &Session,
100    session_id: &str,
101    round_number: usize,
102    model_name: Option<&str>,
103    reasoning_effort: Option<ReasoningEffort>,
104    checkpoint: GoldCheckpoint,
105    gold_config: &GoldConfig,
106) -> Result<Option<AsyncGoldEvaluationRequest>, AgentError> {
107    if !gold_config.enabled {
108        return Ok(None);
109    }
110
111    let model_name = gold_config
112        .model_name
113        .as_deref()
114        .or(model_name)
115        .ok_or_else(|| AgentError::LLM("gold evaluation model_name is required".to_string()))?;
116
117    Ok(Some(AsyncGoldEvaluationRequest {
118        session_id: session_id.to_string(),
119        round_number,
120        model_name: model_name.to_string(),
121        reasoning_effort,
122        checkpoint,
123        session_snapshot: session.clone(),
124        task_context_snapshot: task_context.clone(),
125        gold_config: gold_config.clone(),
126    }))
127}
128
129pub(crate) async fn execute_async_gold_evaluation(
130    request: AsyncGoldEvaluationRequest,
131    llm: Arc<dyn LLMProvider>,
132    event_tx: mpsc::Sender<AgentEvent>,
133) -> AsyncGoldEvaluationResult {
134    let evaluation_result = match evaluate_gold(
135        &request.session_snapshot,
136        request.task_context_snapshot.as_ref(),
137        &request.gold_config,
138        llm,
139        &GoldEvalFrame {
140            event_tx: &event_tx,
141            session_id: &request.session_id,
142            model: &request.model_name,
143            reasoning_effort: request.reasoning_effort,
144            checkpoint: request.checkpoint,
145            iteration: request.round_number as u32,
146        },
147    )
148    .await
149    {
150        Ok(result) => result,
151        Err(error) => GoldEvaluationResult {
152            checkpoint: request.checkpoint,
153            iteration: request.round_number as u32,
154            decision: GoldDecision::Continue,
155            confidence: GoldConfidence::Low,
156            reasoning: format!("Gold evaluation failed: {error}"),
157            missing_information: Vec::new(),
158            next_action: None,
159            prompt_tokens: 0,
160            completion_tokens: 0,
161        },
162    };
163
164    AsyncGoldEvaluationResult {
165        round_number: request.round_number,
166        model_name: request.model_name,
167        evaluation_result,
168    }
169}
170
171pub async fn evaluate_gold(
172    session: &Session,
173    task_context: Option<&TaskLoopContext>,
174    config: &GoldConfig,
175    llm: Arc<dyn LLMProvider>,
176    frame: &GoldEvalFrame<'_>,
177) -> Result<GoldEvaluationResult, AgentError> {
178    // Bind frame fields as locals so the rest of the function body stays unchanged.
179    let event_tx = frame.event_tx;
180    let session_id = frame.session_id;
181    let model = frame.model;
182    let reasoning_effort = frame.reasoning_effort;
183    let checkpoint = frame.checkpoint;
184    let iteration = frame.iteration;
185
186    let _ = event_tx
187        .send(AgentEvent::GoldEvaluationStarted {
188            session_id: session_id.to_string(),
189            checkpoint,
190            iteration,
191        })
192        .await;
193
194    let messages = build_gold_messages(session, task_context, config, checkpoint);
195    let prompt_tokens = estimate_prompt_tokens(&messages);
196    let tools = get_gold_evaluation_tools();
197
198    let request_reasoning_effort = normalize_lightweight_reasoning_effort(reasoning_effort);
199    let request_options = LLMRequestOptions {
200        session_id: Some(session_id.to_string()),
201        reasoning_effort: request_reasoning_effort,
202        parallel_tool_calls: None,
203        responses: None,
204        request_purpose: Some("gold_evaluation".to_string()),
205        cache: None,
206    };
207
208    match llm
209        .chat_stream_with_options(
210            &messages,
211            &tools,
212            Some(config.max_output_tokens),
213            model,
214            Some(&request_options),
215        )
216        .await
217    {
218        Ok(stream) => {
219            let stream_output =
220                consume_llm_stream_silent(stream, &CancellationToken::new(), session_id)
221                    .await
222                    .map_err(|error| AgentError::LLM(error.to_string()))?;
223
224            let result = parse_gold_evaluation(
225                &stream_output.content,
226                &stream_output.tool_calls,
227                checkpoint,
228                iteration,
229                prompt_tokens,
230            );
231
232            let _ = event_tx
233                .send(AgentEvent::GoldEvaluationCompleted {
234                    session_id: session_id.to_string(),
235                    checkpoint: result.checkpoint,
236                    iteration: result.iteration,
237                    decision: result.decision,
238                    confidence: result.confidence,
239                    reasoning: result.reasoning.clone(),
240                })
241                .await;
242
243            Ok(result)
244        }
245        Err(error) => Err(AgentError::LLM(error.to_string())),
246    }
247}
248
249pub fn build_gold_messages(
250    session: &Session,
251    task_context: Option<&TaskLoopContext>,
252    config: &GoldConfig,
253    checkpoint: GoldCheckpoint,
254) -> Vec<Message> {
255    let mut messages = Vec::new();
256
257    let mut system_prompt = String::from(
258        "You are a gold progress evaluator. Judge whether the agent has already achieved the user's goal, should continue execution, needs user input, is blocked, or is exhausted.\n\nRules:\n1. This phase is observe-only: do not mutate state or invent actions.\n2. You must call report_gold_evaluation exactly once.\n3. Use achieved only when the user's actual goal is satisfied.\n4. Use continue when more agent work is still appropriate.\n5. Use need_input only when missing user input is the true next blocker.\n6. Use blocked only for a concrete blocking condition.\n7. Use exhausted for loops, budget exhaustion, or clear inability to make progress.\n8. Keep reasoning short, concrete, and evidence-based."
259    );
260
261    if let Some(extra) = config
262        .evaluation_prompt
263        .as_deref()
264        .map(str::trim)
265        .filter(|value| !value.is_empty())
266    {
267        system_prompt.push_str("\n\nAdditional instructions:\n");
268        system_prompt.push_str(extra);
269    }
270
271    messages.push(Message::system(system_prompt));
272
273    let task_summary = task_context
274        .map(TaskLoopContext::format_for_prompt)
275        .filter(|value| !value.trim().is_empty())
276        .unwrap_or_else(|| "## Current Task List\nNo task list available.".to_string());
277
278    let pending_question_summary = session
279        .pending_question
280        .as_ref()
281        .map(|question| {
282            let options = if question.options.is_empty() {
283                "none".to_string()
284            } else {
285                question.options.join(" | ")
286            };
287            let tool_name = if question.tool_name.trim().is_empty() {
288                "unknown".to_string()
289            } else {
290                question.tool_name.clone()
291            };
292            format!(
293                "question={} | options={} | tool={} | source={:?}",
294                question.question, options, tool_name, question.source
295            )
296        })
297        .unwrap_or_else(|| "none".to_string());
298
299    let runtime_summary = session
300        .agent_runtime_state
301        .as_ref()
302        .map(|state| {
303            format!(
304                "status={:?} | current_round={} | max_rounds={} | suspend_reason={} | waiting_for_children={}",
305                state.status,
306                state.round.current_round,
307                state.round.max_rounds,
308                state
309                    .suspension
310                    .as_ref()
311                    .map(|s| s.reason.clone())
312                    .unwrap_or_else(|| "none".to_string()),
313                state.waiting_for_children.is_some()
314            )
315        })
316        .unwrap_or_else(|| "runtime_state=none".to_string());
317
318    let recent_messages = format_recent_messages(session, 6);
319
320    let goal_section = config
321        .effective_goal()
322        .map(|goal| format!("## Goal\n{goal}"))
323        .unwrap_or_else(|| {
324            "## Goal\nNo explicit goal set. Judge against the user's request inferred from the conversation.".to_string()
325        });
326
327    let user_prompt = format!(
328        "## Gold Checkpoint\ncheckpoint={}\n\n{}\n\n## Runtime\n{}\n\n## Pending Question\n{}\n\n{}\n\n## Recent Conversation\n{}\n\n## Instruction\nReport the best current Gold judgment for this checkpoint by measuring progress against the goal above. Remember: Phase 1 is observe-only, so only report decision/confidence/reasoning.",
329        checkpoint.as_str(),
330        goal_section,
331        runtime_summary,
332        pending_question_summary,
333        task_summary,
334        recent_messages,
335    );
336
337    messages.push(Message::user(user_prompt));
338    messages
339}
340
341fn format_recent_messages(session: &Session, limit: usize) -> String {
342    let start = session.messages.len().saturating_sub(limit);
343    let mut lines = Vec::new();
344
345    for message in session.messages.iter().skip(start) {
346        let role = match message.role {
347            Role::System => "system",
348            Role::User => "user",
349            Role::Assistant => "assistant",
350            Role::Tool => "tool",
351        };
352
353        let mut content = message.content.trim().replace('\n', " ");
354        if content.chars().count() > 240 {
355            content = format!("{}…", content.chars().take(240).collect::<String>());
356        }
357        if content.is_empty() {
358            content = "<empty>".to_string();
359        }
360
361        lines.push(format!("- [{}] {}", role, content));
362    }
363
364    if lines.is_empty() {
365        "- <no messages>".to_string()
366    } else {
367        lines.join("\n")
368    }
369}
370
371pub fn get_gold_evaluation_tools() -> Vec<ToolSchema> {
372    vec![ToolSchema {
373        schema_type: "function".to_string(),
374        function: FunctionSchema {
375            name: "report_gold_evaluation".to_string(),
376            description: "Report the current Gold evaluation decision for the session".to_string(),
377            parameters: json!({
378                "type": "object",
379                "properties": {
380                    "decision": {
381                        "type": "string",
382                        "enum": ["continue", "achieved", "blocked", "need_input", "exhausted"]
383                    },
384                    "confidence": {
385                        "type": "string",
386                        "enum": ["low", "medium", "high"]
387                    },
388                    "reasoning": {
389                        "type": "string",
390                        "description": "Short concrete reasoning for the decision"
391                    },
392                    "missing_information": {
393                        "type": "array",
394                        "items": { "type": "string" },
395                        "description": "Concrete pieces of information still missing to achieve the goal. Empty when nothing is missing."
396                    },
397                    "next_action": {
398                        "type": "string",
399                        "description": "The single most useful next action the agent should take. Provide when decision is continue."
400                    }
401                },
402                "required": ["decision", "confidence", "reasoning"],
403                "additionalProperties": false
404            }),
405        },
406    }]
407}
408
409pub fn parse_gold_evaluation(
410    content: &str,
411    tool_calls: &[ToolCall],
412    checkpoint: GoldCheckpoint,
413    iteration: u32,
414    prompt_tokens: u64,
415) -> GoldEvaluationResult {
416    let completion_tokens = estimate_completion_tokens(content, tool_calls);
417    let parsed = parse_gold_result_from_tool_calls(tool_calls);
418
419    let parsed = parsed.unwrap_or_else(|| {
420        let fallback_reasoning = content.trim().to_string();
421        ParsedGoldResult {
422            decision: GoldDecision::Continue,
423            confidence: GoldConfidence::Low,
424            reasoning: if fallback_reasoning.is_empty() {
425                "Gold evaluation returned no structured result; defaulting to continue.".to_string()
426            } else {
427                fallback_reasoning
428            },
429            missing_information: Vec::new(),
430            next_action: None,
431        }
432    });
433
434    GoldEvaluationResult {
435        checkpoint,
436        iteration,
437        decision: parsed.decision,
438        confidence: parsed.confidence,
439        reasoning: parsed.reasoning,
440        missing_information: parsed.missing_information,
441        next_action: parsed.next_action,
442        prompt_tokens,
443        completion_tokens,
444    }
445}
446
447struct ParsedGoldResult {
448    decision: GoldDecision,
449    confidence: GoldConfidence,
450    reasoning: String,
451    missing_information: Vec<String>,
452    next_action: Option<String>,
453}
454
455fn parse_gold_result_from_tool_calls(tool_calls: &[ToolCall]) -> Option<ParsedGoldResult> {
456    for tool_call in tool_calls {
457        if tool_call.function.name != "report_gold_evaluation" {
458            continue;
459        }
460
461        let Ok(args) = serde_json::from_str::<serde_json::Value>(&tool_call.function.arguments)
462        else {
463            continue;
464        };
465
466        let decision = match args.get("decision").and_then(|value| value.as_str()) {
467            Some("continue") => GoldDecision::Continue,
468            Some("achieved") => GoldDecision::Achieved,
469            Some("blocked") => GoldDecision::Blocked,
470            Some("need_input") => GoldDecision::NeedInput,
471            Some("exhausted") => GoldDecision::Exhausted,
472            _ => continue,
473        };
474
475        let confidence = match args.get("confidence").and_then(|value| value.as_str()) {
476            Some("low") => GoldConfidence::Low,
477            Some("medium") => GoldConfidence::Medium,
478            Some("high") => GoldConfidence::High,
479            _ => GoldConfidence::Low,
480        };
481
482        let reasoning = args
483            .get("reasoning")
484            .and_then(|value| value.as_str())
485            .map(str::trim)
486            .filter(|value| !value.is_empty())
487            .unwrap_or("Gold evaluation produced no reasoning")
488            .to_string();
489
490        let missing_information = args
491            .get("missing_information")
492            .and_then(|value| value.as_array())
493            .map(|items| {
494                items
495                    .iter()
496                    .filter_map(|item| item.as_str())
497                    .map(str::trim)
498                    .filter(|value| !value.is_empty())
499                    .map(str::to_string)
500                    .collect::<Vec<_>>()
501            })
502            .unwrap_or_default();
503
504        let next_action = args
505            .get("next_action")
506            .and_then(|value| value.as_str())
507            .map(str::trim)
508            .filter(|value| !value.is_empty())
509            .map(str::to_string);
510
511        return Some(ParsedGoldResult {
512            decision,
513            confidence,
514            reasoning,
515            missing_information,
516            next_action,
517        });
518    }
519
520    None
521}
522
523pub(crate) fn apply_gold_evaluation_result(
524    session: &mut Session,
525    result: &GoldEvaluationResult,
526) -> MetricsTokenUsage {
527    let evaluation_count = session
528        .metadata
529        .get("gold.evaluation_count")
530        .and_then(|value| value.parse::<u64>().ok())
531        .unwrap_or(0)
532        .saturating_add(1);
533
534    let summary = json!({
535        "checkpoint": result.checkpoint.as_str(),
536        "iteration": result.iteration,
537        "decision": result.decision.as_str(),
538        "confidence": result.confidence.as_str(),
539        "reasoning": result.reasoning,
540        "recorded_at": Utc::now().to_rfc3339(),
541    });
542
543    session
544        .metadata
545        .insert("gold.last_evaluation".to_string(), summary.to_string());
546    session.metadata.insert(
547        "gold.last_decision".to_string(),
548        result.decision.as_str().to_string(),
549    );
550    session.metadata.insert(
551        "gold.last_confidence".to_string(),
552        result.confidence.as_str().to_string(),
553    );
554    session
555        .metadata
556        .insert("gold.last_reasoning".to_string(), result.reasoning.clone());
557    session.metadata.insert(
558        "gold.last_checkpoint".to_string(),
559        result.checkpoint.as_str().to_string(),
560    );
561    session.metadata.insert(
562        "gold.last_iteration".to_string(),
563        result.iteration.to_string(),
564    );
565    session.metadata.insert(
566        "gold.evaluation_count".to_string(),
567        evaluation_count.to_string(),
568    );
569    session.updated_at = Utc::now();
570
571    let mut usage = MetricsTokenUsage {
572        prompt_tokens: result.prompt_tokens,
573        completion_tokens: result.completion_tokens,
574        ..Default::default()
575    };
576    usage.recompute_total();
577    usage
578}
579
#[cfg(test)]
mod tests {
    use super::*;
    use bamboo_agent_core::tools::FunctionCall;

    /// Builds a function-type tool call with a fixed id for the given tool
    /// name and raw argument string.
    fn tool_call(name: &str, arguments: String) -> ToolCall {
        ToolCall {
            id: "call-1".to_string(),
            tool_type: "function".to_string(),
            function: FunctionCall {
                name: name.to_string(),
                arguments,
            },
        }
    }

    /// Builds a `report_gold_evaluation` call carrying `arguments` as JSON.
    fn report_call(arguments: serde_json::Value) -> ToolCall {
        tool_call("report_gold_evaluation", arguments.to_string())
    }

    /// Looks up a metadata value on `session` as a string slice.
    fn metadata_value<'s>(session: &'s Session, key: &str) -> Option<&'s str> {
        session.metadata.get(key).map(String::as_str)
    }

    #[test]
    fn parse_gold_result_from_tool_calls_reads_structured_fields() {
        let calls = [report_call(json!({
            "decision": "blocked",
            "confidence": "high",
            "reasoning": "Missing credentials",
            "missing_information": ["API key", "  ", "Database URL"],
            "next_action": "  Ask the user for the API key  "
        }))];

        let parsed = parse_gold_result_from_tool_calls(&calls).expect("gold result should parse");

        assert_eq!(parsed.decision, GoldDecision::Blocked);
        assert_eq!(parsed.confidence, GoldConfidence::High);
        assert_eq!(parsed.reasoning, "Missing credentials");
        // The blank entry is dropped; the remaining entries keep their order.
        assert_eq!(
            parsed.missing_information,
            vec!["API key".to_string(), "Database URL".to_string()]
        );
        // Surrounding whitespace is trimmed from the suggested action.
        assert_eq!(
            parsed.next_action.as_deref(),
            Some("Ask the user for the API key")
        );
    }

    #[test]
    fn parse_gold_result_from_tool_calls_ignores_other_tools() {
        let calls = [tool_call("other_tool", "{}".to_string())];

        let parsed = parse_gold_result_from_tool_calls(&calls);

        assert!(parsed.is_none());
    }

    #[test]
    fn apply_gold_evaluation_result_updates_metadata_keys() {
        let mut session = Session::new("session-1", "model");
        let result = GoldEvaluationResult {
            checkpoint: GoldCheckpoint::PostRound,
            iteration: 2,
            decision: GoldDecision::Achieved,
            confidence: GoldConfidence::Medium,
            reasoning: "Goal satisfied".to_string(),
            missing_information: Vec::new(),
            next_action: None,
            prompt_tokens: 10,
            completion_tokens: 5,
        };

        let usage = apply_gold_evaluation_result(&mut session, &result);

        assert_eq!(
            metadata_value(&session, "gold.last_decision"),
            Some("achieved")
        );
        assert_eq!(
            metadata_value(&session, "gold.last_confidence"),
            Some("medium")
        );
        assert_eq!(
            metadata_value(&session, "gold.last_checkpoint"),
            Some("post_round")
        );
        assert_eq!(
            metadata_value(&session, "gold.evaluation_count"),
            Some("1")
        );
        assert_eq!(usage.prompt_tokens, 10);
        assert_eq!(usage.completion_tokens, 5);
        assert_eq!(usage.total_tokens, 15);
    }
}