1use std::sync::Arc;
2
3use bamboo_agent_core::tools::{FunctionSchema, ToolCall, ToolSchema};
4use bamboo_agent_core::{AgentError, AgentEvent, GoldCheckpoint, GoldConfidence, GoldDecision};
5use bamboo_agent_core::{Message, Role, Session};
6use bamboo_compression::{TiktokenTokenCounter, TokenCounter};
7use bamboo_domain::ReasoningEffort;
8use bamboo_infrastructure::{LLMProvider, LLMRequestOptions};
9use chrono::Utc;
10use serde_json::json;
11use tokio::sync::mpsc;
12use tokio_util::sync::CancellationToken;
13
14use crate::metrics::TokenUsage as MetricsTokenUsage;
15use crate::runtime::config::GoldConfig;
16use crate::runtime::stream::handler::consume_llm_stream_silent;
17use crate::runtime::task_context::TaskLoopContext;
18
19pub struct GoldEvalFrame<'a> {
23 pub event_tx: &'a mpsc::Sender<AgentEvent>,
24 pub session_id: &'a str,
25 pub model: &'a str,
26 pub reasoning_effort: Option<ReasoningEffort>,
27 pub checkpoint: GoldCheckpoint,
28 pub iteration: u32,
29}
30
31#[derive(Debug, Clone)]
32pub struct GoldEvaluationResult {
33 pub checkpoint: GoldCheckpoint,
34 pub iteration: u32,
35 pub decision: GoldDecision,
36 pub confidence: GoldConfidence,
37 pub reasoning: String,
38 pub missing_information: Vec<String>,
40 pub next_action: Option<String>,
43 pub prompt_tokens: u64,
44 pub completion_tokens: u64,
45}
46
47#[derive(Debug, Clone)]
48pub(crate) struct AsyncGoldEvaluationRequest {
49 pub(crate) session_id: String,
50 pub(crate) round_number: usize,
51 pub(crate) model_name: String,
52 pub(crate) reasoning_effort: Option<ReasoningEffort>,
53 pub(crate) checkpoint: GoldCheckpoint,
54 pub(crate) session_snapshot: Session,
55 pub(crate) task_context_snapshot: Option<TaskLoopContext>,
56 pub(crate) gold_config: GoldConfig,
57}
58
59#[derive(Debug, Clone)]
60pub(crate) struct AsyncGoldEvaluationResult {
61 pub(crate) round_number: usize,
62 pub(crate) model_name: String,
63 pub(crate) evaluation_result: GoldEvaluationResult,
64}
65
66fn normalize_lightweight_reasoning_effort(
67 reasoning_effort: Option<ReasoningEffort>,
68) -> Option<ReasoningEffort> {
69 reasoning_effort.map(|effort| match effort {
70 ReasoningEffort::Xhigh | ReasoningEffort::Max => ReasoningEffort::High,
71 other => other,
72 })
73}
74
75fn estimate_prompt_tokens(messages: &[Message]) -> u64 {
76 let counter = TiktokenTokenCounter::default();
77 u64::from(counter.count_messages(messages))
78}
79
80fn estimate_completion_tokens(content: &str, tool_calls: &[ToolCall]) -> u64 {
81 let counter = TiktokenTokenCounter::default();
82 let mut completion_surface = content.to_string();
83
84 for call in tool_calls {
85 if !completion_surface.is_empty() {
86 completion_surface.push('\n');
87 }
88 completion_surface.push_str(&call.function.name);
89 completion_surface.push('\n');
90 completion_surface.push_str(&call.function.arguments);
91 }
92
93 u64::from(counter.count_text(&completion_surface))
94}
95
96#[allow(clippy::too_many_arguments)]
97pub(crate) fn build_async_gold_evaluation_request(
98 task_context: &Option<TaskLoopContext>,
99 session: &Session,
100 session_id: &str,
101 round_number: usize,
102 model_name: Option<&str>,
103 reasoning_effort: Option<ReasoningEffort>,
104 checkpoint: GoldCheckpoint,
105 gold_config: &GoldConfig,
106) -> Result<Option<AsyncGoldEvaluationRequest>, AgentError> {
107 if !gold_config.enabled {
108 return Ok(None);
109 }
110
111 let model_name = gold_config
112 .model_name
113 .as_deref()
114 .or(model_name)
115 .ok_or_else(|| AgentError::LLM("gold evaluation model_name is required".to_string()))?;
116
117 Ok(Some(AsyncGoldEvaluationRequest {
118 session_id: session_id.to_string(),
119 round_number,
120 model_name: model_name.to_string(),
121 reasoning_effort,
122 checkpoint,
123 session_snapshot: session.clone(),
124 task_context_snapshot: task_context.clone(),
125 gold_config: gold_config.clone(),
126 }))
127}
128
129pub(crate) async fn execute_async_gold_evaluation(
130 request: AsyncGoldEvaluationRequest,
131 llm: Arc<dyn LLMProvider>,
132 event_tx: mpsc::Sender<AgentEvent>,
133) -> AsyncGoldEvaluationResult {
134 let evaluation_result = match evaluate_gold(
135 &request.session_snapshot,
136 request.task_context_snapshot.as_ref(),
137 &request.gold_config,
138 llm,
139 &GoldEvalFrame {
140 event_tx: &event_tx,
141 session_id: &request.session_id,
142 model: &request.model_name,
143 reasoning_effort: request.reasoning_effort,
144 checkpoint: request.checkpoint,
145 iteration: request.round_number as u32,
146 },
147 )
148 .await
149 {
150 Ok(result) => result,
151 Err(error) => GoldEvaluationResult {
152 checkpoint: request.checkpoint,
153 iteration: request.round_number as u32,
154 decision: GoldDecision::Continue,
155 confidence: GoldConfidence::Low,
156 reasoning: format!("Gold evaluation failed: {error}"),
157 missing_information: Vec::new(),
158 next_action: None,
159 prompt_tokens: 0,
160 completion_tokens: 0,
161 },
162 };
163
164 AsyncGoldEvaluationResult {
165 round_number: request.round_number,
166 model_name: request.model_name,
167 evaluation_result,
168 }
169}
170
171pub async fn evaluate_gold(
172 session: &Session,
173 task_context: Option<&TaskLoopContext>,
174 config: &GoldConfig,
175 llm: Arc<dyn LLMProvider>,
176 frame: &GoldEvalFrame<'_>,
177) -> Result<GoldEvaluationResult, AgentError> {
178 let event_tx = frame.event_tx;
180 let session_id = frame.session_id;
181 let model = frame.model;
182 let reasoning_effort = frame.reasoning_effort;
183 let checkpoint = frame.checkpoint;
184 let iteration = frame.iteration;
185
186 let _ = event_tx
187 .send(AgentEvent::GoldEvaluationStarted {
188 session_id: session_id.to_string(),
189 checkpoint,
190 iteration,
191 })
192 .await;
193
194 let messages = build_gold_messages(session, task_context, config, checkpoint);
195 let prompt_tokens = estimate_prompt_tokens(&messages);
196 let tools = get_gold_evaluation_tools();
197
198 let request_reasoning_effort = normalize_lightweight_reasoning_effort(reasoning_effort);
199 let request_options = LLMRequestOptions {
200 session_id: Some(session_id.to_string()),
201 reasoning_effort: request_reasoning_effort,
202 parallel_tool_calls: None,
203 responses: None,
204 request_purpose: Some("gold_evaluation".to_string()),
205 cache: None,
206 };
207
208 match llm
209 .chat_stream_with_options(
210 &messages,
211 &tools,
212 Some(config.max_output_tokens),
213 model,
214 Some(&request_options),
215 )
216 .await
217 {
218 Ok(stream) => {
219 let stream_output =
220 consume_llm_stream_silent(stream, &CancellationToken::new(), session_id)
221 .await
222 .map_err(|error| AgentError::LLM(error.to_string()))?;
223
224 let result = parse_gold_evaluation(
225 &stream_output.content,
226 &stream_output.tool_calls,
227 checkpoint,
228 iteration,
229 prompt_tokens,
230 );
231
232 let _ = event_tx
233 .send(AgentEvent::GoldEvaluationCompleted {
234 session_id: session_id.to_string(),
235 checkpoint: result.checkpoint,
236 iteration: result.iteration,
237 decision: result.decision,
238 confidence: result.confidence,
239 reasoning: result.reasoning.clone(),
240 })
241 .await;
242
243 Ok(result)
244 }
245 Err(error) => Err(AgentError::LLM(error.to_string())),
246 }
247}
248
249pub fn build_gold_messages(
250 session: &Session,
251 task_context: Option<&TaskLoopContext>,
252 config: &GoldConfig,
253 checkpoint: GoldCheckpoint,
254) -> Vec<Message> {
255 let mut messages = Vec::new();
256
257 let mut system_prompt = String::from(
258 "You are a gold progress evaluator. Judge whether the agent has already achieved the user's goal, should continue execution, needs user input, is blocked, or is exhausted.\n\nRules:\n1. This phase is observe-only: do not mutate state or invent actions.\n2. You must call report_gold_evaluation exactly once.\n3. Use achieved only when the user's actual goal is satisfied.\n4. Use continue when more agent work is still appropriate.\n5. Use need_input only when missing user input is the true next blocker.\n6. Use blocked only for a concrete blocking condition.\n7. Use exhausted for loops, budget exhaustion, or clear inability to make progress.\n8. Keep reasoning short, concrete, and evidence-based."
259 );
260
261 if let Some(extra) = config
262 .evaluation_prompt
263 .as_deref()
264 .map(str::trim)
265 .filter(|value| !value.is_empty())
266 {
267 system_prompt.push_str("\n\nAdditional instructions:\n");
268 system_prompt.push_str(extra);
269 }
270
271 messages.push(Message::system(system_prompt));
272
273 let task_summary = task_context
274 .map(TaskLoopContext::format_for_prompt)
275 .filter(|value| !value.trim().is_empty())
276 .unwrap_or_else(|| "## Current Task List\nNo task list available.".to_string());
277
278 let pending_question_summary = session
279 .pending_question
280 .as_ref()
281 .map(|question| {
282 let options = if question.options.is_empty() {
283 "none".to_string()
284 } else {
285 question.options.join(" | ")
286 };
287 let tool_name = if question.tool_name.trim().is_empty() {
288 "unknown".to_string()
289 } else {
290 question.tool_name.clone()
291 };
292 format!(
293 "question={} | options={} | tool={} | source={:?}",
294 question.question, options, tool_name, question.source
295 )
296 })
297 .unwrap_or_else(|| "none".to_string());
298
299 let runtime_summary = session
300 .agent_runtime_state
301 .as_ref()
302 .map(|state| {
303 format!(
304 "status={:?} | current_round={} | max_rounds={} | suspend_reason={} | waiting_for_children={}",
305 state.status,
306 state.round.current_round,
307 state.round.max_rounds,
308 state
309 .suspension
310 .as_ref()
311 .map(|s| s.reason.clone())
312 .unwrap_or_else(|| "none".to_string()),
313 state.waiting_for_children.is_some()
314 )
315 })
316 .unwrap_or_else(|| "runtime_state=none".to_string());
317
318 let recent_messages = format_recent_messages(session, 6);
319
320 let goal_section = config
321 .effective_goal()
322 .map(|goal| format!("## Goal\n{goal}"))
323 .unwrap_or_else(|| {
324 "## Goal\nNo explicit goal set. Judge against the user's request inferred from the conversation.".to_string()
325 });
326
327 let user_prompt = format!(
328 "## Gold Checkpoint\ncheckpoint={}\n\n{}\n\n## Runtime\n{}\n\n## Pending Question\n{}\n\n{}\n\n## Recent Conversation\n{}\n\n## Instruction\nReport the best current Gold judgment for this checkpoint by measuring progress against the goal above. Remember: Phase 1 is observe-only, so only report decision/confidence/reasoning.",
329 checkpoint.as_str(),
330 goal_section,
331 runtime_summary,
332 pending_question_summary,
333 task_summary,
334 recent_messages,
335 );
336
337 messages.push(Message::user(user_prompt));
338 messages
339}
340
341fn format_recent_messages(session: &Session, limit: usize) -> String {
342 let start = session.messages.len().saturating_sub(limit);
343 let mut lines = Vec::new();
344
345 for message in session.messages.iter().skip(start) {
346 let role = match message.role {
347 Role::System => "system",
348 Role::User => "user",
349 Role::Assistant => "assistant",
350 Role::Tool => "tool",
351 };
352
353 let mut content = message.content.trim().replace('\n', " ");
354 if content.chars().count() > 240 {
355 content = format!("{}…", content.chars().take(240).collect::<String>());
356 }
357 if content.is_empty() {
358 content = "<empty>".to_string();
359 }
360
361 lines.push(format!("- [{}] {}", role, content));
362 }
363
364 if lines.is_empty() {
365 "- <no messages>".to_string()
366 } else {
367 lines.join("\n")
368 }
369}
370
371pub fn get_gold_evaluation_tools() -> Vec<ToolSchema> {
372 vec![ToolSchema {
373 schema_type: "function".to_string(),
374 function: FunctionSchema {
375 name: "report_gold_evaluation".to_string(),
376 description: "Report the current Gold evaluation decision for the session".to_string(),
377 parameters: json!({
378 "type": "object",
379 "properties": {
380 "decision": {
381 "type": "string",
382 "enum": ["continue", "achieved", "blocked", "need_input", "exhausted"]
383 },
384 "confidence": {
385 "type": "string",
386 "enum": ["low", "medium", "high"]
387 },
388 "reasoning": {
389 "type": "string",
390 "description": "Short concrete reasoning for the decision"
391 },
392 "missing_information": {
393 "type": "array",
394 "items": { "type": "string" },
395 "description": "Concrete pieces of information still missing to achieve the goal. Empty when nothing is missing."
396 },
397 "next_action": {
398 "type": "string",
399 "description": "The single most useful next action the agent should take. Provide when decision is continue."
400 }
401 },
402 "required": ["decision", "confidence", "reasoning"],
403 "additionalProperties": false
404 }),
405 },
406 }]
407}
408
409pub fn parse_gold_evaluation(
410 content: &str,
411 tool_calls: &[ToolCall],
412 checkpoint: GoldCheckpoint,
413 iteration: u32,
414 prompt_tokens: u64,
415) -> GoldEvaluationResult {
416 let completion_tokens = estimate_completion_tokens(content, tool_calls);
417 let parsed = parse_gold_result_from_tool_calls(tool_calls);
418
419 let parsed = parsed.unwrap_or_else(|| {
420 let fallback_reasoning = content.trim().to_string();
421 ParsedGoldResult {
422 decision: GoldDecision::Continue,
423 confidence: GoldConfidence::Low,
424 reasoning: if fallback_reasoning.is_empty() {
425 "Gold evaluation returned no structured result; defaulting to continue.".to_string()
426 } else {
427 fallback_reasoning
428 },
429 missing_information: Vec::new(),
430 next_action: None,
431 }
432 });
433
434 GoldEvaluationResult {
435 checkpoint,
436 iteration,
437 decision: parsed.decision,
438 confidence: parsed.confidence,
439 reasoning: parsed.reasoning,
440 missing_information: parsed.missing_information,
441 next_action: parsed.next_action,
442 prompt_tokens,
443 completion_tokens,
444 }
445}
446
447struct ParsedGoldResult {
448 decision: GoldDecision,
449 confidence: GoldConfidence,
450 reasoning: String,
451 missing_information: Vec<String>,
452 next_action: Option<String>,
453}
454
455fn parse_gold_result_from_tool_calls(tool_calls: &[ToolCall]) -> Option<ParsedGoldResult> {
456 for tool_call in tool_calls {
457 if tool_call.function.name != "report_gold_evaluation" {
458 continue;
459 }
460
461 let Ok(args) = serde_json::from_str::<serde_json::Value>(&tool_call.function.arguments)
462 else {
463 continue;
464 };
465
466 let decision = match args.get("decision").and_then(|value| value.as_str()) {
467 Some("continue") => GoldDecision::Continue,
468 Some("achieved") => GoldDecision::Achieved,
469 Some("blocked") => GoldDecision::Blocked,
470 Some("need_input") => GoldDecision::NeedInput,
471 Some("exhausted") => GoldDecision::Exhausted,
472 _ => continue,
473 };
474
475 let confidence = match args.get("confidence").and_then(|value| value.as_str()) {
476 Some("low") => GoldConfidence::Low,
477 Some("medium") => GoldConfidence::Medium,
478 Some("high") => GoldConfidence::High,
479 _ => GoldConfidence::Low,
480 };
481
482 let reasoning = args
483 .get("reasoning")
484 .and_then(|value| value.as_str())
485 .map(str::trim)
486 .filter(|value| !value.is_empty())
487 .unwrap_or("Gold evaluation produced no reasoning")
488 .to_string();
489
490 let missing_information = args
491 .get("missing_information")
492 .and_then(|value| value.as_array())
493 .map(|items| {
494 items
495 .iter()
496 .filter_map(|item| item.as_str())
497 .map(str::trim)
498 .filter(|value| !value.is_empty())
499 .map(str::to_string)
500 .collect::<Vec<_>>()
501 })
502 .unwrap_or_default();
503
504 let next_action = args
505 .get("next_action")
506 .and_then(|value| value.as_str())
507 .map(str::trim)
508 .filter(|value| !value.is_empty())
509 .map(str::to_string);
510
511 return Some(ParsedGoldResult {
512 decision,
513 confidence,
514 reasoning,
515 missing_information,
516 next_action,
517 });
518 }
519
520 None
521}
522
523pub(crate) fn apply_gold_evaluation_result(
524 session: &mut Session,
525 result: &GoldEvaluationResult,
526) -> MetricsTokenUsage {
527 let evaluation_count = session
528 .metadata
529 .get("gold.evaluation_count")
530 .and_then(|value| value.parse::<u64>().ok())
531 .unwrap_or(0)
532 .saturating_add(1);
533
534 let summary = json!({
535 "checkpoint": result.checkpoint.as_str(),
536 "iteration": result.iteration,
537 "decision": result.decision.as_str(),
538 "confidence": result.confidence.as_str(),
539 "reasoning": result.reasoning,
540 "recorded_at": Utc::now().to_rfc3339(),
541 });
542
543 session
544 .metadata
545 .insert("gold.last_evaluation".to_string(), summary.to_string());
546 session.metadata.insert(
547 "gold.last_decision".to_string(),
548 result.decision.as_str().to_string(),
549 );
550 session.metadata.insert(
551 "gold.last_confidence".to_string(),
552 result.confidence.as_str().to_string(),
553 );
554 session
555 .metadata
556 .insert("gold.last_reasoning".to_string(), result.reasoning.clone());
557 session.metadata.insert(
558 "gold.last_checkpoint".to_string(),
559 result.checkpoint.as_str().to_string(),
560 );
561 session.metadata.insert(
562 "gold.last_iteration".to_string(),
563 result.iteration.to_string(),
564 );
565 session.metadata.insert(
566 "gold.evaluation_count".to_string(),
567 evaluation_count.to_string(),
568 );
569 session.updated_at = Utc::now();
570
571 let mut usage = MetricsTokenUsage {
572 prompt_tokens: result.prompt_tokens,
573 completion_tokens: result.completion_tokens,
574 ..Default::default()
575 };
576 usage.recompute_total();
577 usage
578}
579
#[cfg(test)]
mod tests {
    use super::*;
    use bamboo_agent_core::tools::FunctionCall;

    /// Builds a function tool call with a fixed id and the given name/arguments.
    fn tool_call(name: &str, arguments: String) -> ToolCall {
        ToolCall {
            id: "call-1".to_string(),
            tool_type: "function".to_string(),
            function: FunctionCall {
                name: name.to_string(),
                arguments,
            },
        }
    }

    /// Builds a `report_gold_evaluation` call carrying `arguments` as JSON text.
    fn report_call(arguments: serde_json::Value) -> ToolCall {
        tool_call("report_gold_evaluation", arguments.to_string())
    }

    /// Looks up a session metadata entry as a string slice.
    fn metadata_value<'a>(session: &'a Session, key: &str) -> Option<&'a str> {
        session.metadata.get(key).map(String::as_str)
    }

    #[test]
    fn parse_gold_result_from_tool_calls_reads_structured_fields() {
        let arguments = json!({
            "decision": "blocked",
            "confidence": "high",
            "reasoning": "Missing credentials",
            "missing_information": ["API key", " ", "Database URL"],
            "next_action": " Ask the user for the API key "
        });

        let parsed = parse_gold_result_from_tool_calls(&[report_call(arguments)])
            .expect("gold result should parse");

        assert_eq!(parsed.decision, GoldDecision::Blocked);
        assert_eq!(parsed.confidence, GoldConfidence::High);
        assert_eq!(parsed.reasoning, "Missing credentials");
        // The blank entry is dropped and the remaining ones are kept in order.
        assert_eq!(parsed.missing_information, ["API key", "Database URL"]);
        // Surrounding whitespace is trimmed.
        assert_eq!(
            parsed.next_action.as_deref(),
            Some("Ask the user for the API key")
        );
    }

    #[test]
    fn parse_gold_result_from_tool_calls_ignores_other_tools() {
        let unrelated = tool_call("other_tool", "{}".to_string());

        let parsed = parse_gold_result_from_tool_calls(&[unrelated]);

        assert!(parsed.is_none());
    }

    #[test]
    fn apply_gold_evaluation_result_updates_metadata_keys() {
        let mut session = Session::new("session-1", "model");
        let result = GoldEvaluationResult {
            checkpoint: GoldCheckpoint::PostRound,
            iteration: 2,
            decision: GoldDecision::Achieved,
            confidence: GoldConfidence::Medium,
            reasoning: "Goal satisfied".to_string(),
            missing_information: Vec::new(),
            next_action: None,
            prompt_tokens: 10,
            completion_tokens: 5,
        };

        let usage = apply_gold_evaluation_result(&mut session, &result);

        assert_eq!(
            metadata_value(&session, "gold.last_decision"),
            Some("achieved")
        );
        assert_eq!(
            metadata_value(&session, "gold.last_confidence"),
            Some("medium")
        );
        assert_eq!(
            metadata_value(&session, "gold.last_checkpoint"),
            Some("post_round")
        );
        assert_eq!(metadata_value(&session, "gold.evaluation_count"), Some("1"));

        assert_eq!(usage.prompt_tokens, 10);
        assert_eq!(usage.completion_tokens, 5);
        assert_eq!(usage.total_tokens, 15);
    }
}