bamboo-engine 2026.5.30

use std::sync::Arc;

use bamboo_agent_core::tools::{FunctionSchema, ToolCall, ToolSchema};
use bamboo_agent_core::{AgentError, AgentEvent, GoldCheckpoint, GoldConfidence, GoldDecision};
use bamboo_agent_core::{Message, Role, Session};
use bamboo_compression::{TiktokenTokenCounter, TokenCounter};
use bamboo_domain::ReasoningEffort;
use bamboo_infrastructure::{LLMProvider, LLMRequestOptions};
use chrono::Utc;
use serde_json::json;
use tokio::sync::mpsc;
use tokio_util::sync::CancellationToken;

use crate::metrics::TokenUsage as MetricsTokenUsage;
use crate::runtime::config::GoldConfig;
use crate::runtime::stream::handler::consume_llm_stream_silent;
use crate::runtime::task_context::TaskLoopContext;

#[derive(Debug, Clone)]
pub struct GoldEvaluationResult {
    pub checkpoint: GoldCheckpoint,
    pub iteration: u32,
    pub decision: GoldDecision,
    pub confidence: GoldConfidence,
    pub reasoning: String,
    /// Concrete information still missing to achieve the goal, if any.
    pub missing_information: Vec<String>,
    /// The single most useful next action for the agent to take, if the
    /// evaluator can name one. Used to give the auto-continue nudge direction.
    pub next_action: Option<String>,
    pub prompt_tokens: u64,
    pub completion_tokens: u64,
}

#[derive(Debug, Clone)]
pub(crate) struct AsyncGoldEvaluationRequest {
    pub(crate) session_id: String,
    pub(crate) round_number: usize,
    pub(crate) model_name: String,
    pub(crate) reasoning_effort: Option<ReasoningEffort>,
    pub(crate) checkpoint: GoldCheckpoint,
    pub(crate) session_snapshot: Session,
    pub(crate) task_context_snapshot: Option<TaskLoopContext>,
    pub(crate) gold_config: GoldConfig,
}

#[derive(Debug, Clone)]
pub(crate) struct AsyncGoldEvaluationResult {
    pub(crate) round_number: usize,
    pub(crate) model_name: String,
    pub(crate) evaluation_result: GoldEvaluationResult,
}

fn normalize_lightweight_reasoning_effort(
    reasoning_effort: Option<ReasoningEffort>,
) -> Option<ReasoningEffort> {
    reasoning_effort.map(|effort| match effort {
        ReasoningEffort::Xhigh | ReasoningEffort::Max => ReasoningEffort::High,
        other => other,
    })
}

fn estimate_prompt_tokens(messages: &[Message]) -> u64 {
    let counter = TiktokenTokenCounter::default();
    u64::from(counter.count_messages(messages))
}

fn estimate_completion_tokens(content: &str, tool_calls: &[ToolCall]) -> u64 {
    let counter = TiktokenTokenCounter::default();
    let mut completion_surface = content.to_string();

    for call in tool_calls {
        if !completion_surface.is_empty() {
            completion_surface.push('\n');
        }
        completion_surface.push_str(&call.function.name);
        completion_surface.push('\n');
        completion_surface.push_str(&call.function.arguments);
    }

    u64::from(counter.count_text(&completion_surface))
}

pub(crate) fn build_async_gold_evaluation_request(
    task_context: &Option<TaskLoopContext>,
    session: &Session,
    session_id: &str,
    round_number: usize,
    model_name: Option<&str>,
    reasoning_effort: Option<ReasoningEffort>,
    checkpoint: GoldCheckpoint,
    gold_config: &GoldConfig,
) -> Result<Option<AsyncGoldEvaluationRequest>, AgentError> {
    if !gold_config.enabled {
        return Ok(None);
    }

    let model_name = gold_config
        .model_name
        .as_deref()
        .or(model_name)
        .ok_or_else(|| AgentError::LLM("gold evaluation model_name is required".to_string()))?;

    Ok(Some(AsyncGoldEvaluationRequest {
        session_id: session_id.to_string(),
        round_number,
        model_name: model_name.to_string(),
        reasoning_effort,
        checkpoint,
        session_snapshot: session.clone(),
        task_context_snapshot: task_context.clone(),
        gold_config: gold_config.clone(),
    }))
}

pub(crate) async fn execute_async_gold_evaluation(
    request: AsyncGoldEvaluationRequest,
    llm: Arc<dyn LLMProvider>,
    event_tx: mpsc::Sender<AgentEvent>,
) -> AsyncGoldEvaluationResult {
    let evaluation_result = match evaluate_gold(
        &request.session_snapshot,
        request.task_context_snapshot.as_ref(),
        &request.gold_config,
        llm,
        &event_tx,
        &request.session_id,
        &request.model_name,
        request.reasoning_effort,
        request.checkpoint,
        request.round_number as u32,
    )
    .await
    {
        Ok(result) => result,
        Err(error) => GoldEvaluationResult {
            checkpoint: request.checkpoint,
            iteration: request.round_number as u32,
            decision: GoldDecision::Continue,
            confidence: GoldConfidence::Low,
            reasoning: format!("Gold evaluation failed: {error}"),
            missing_information: Vec::new(),
            next_action: None,
            prompt_tokens: 0,
            completion_tokens: 0,
        },
    };

    AsyncGoldEvaluationResult {
        round_number: request.round_number,
        model_name: request.model_name,
        evaluation_result,
    }
}

pub async fn evaluate_gold(
    session: &Session,
    task_context: Option<&TaskLoopContext>,
    config: &GoldConfig,
    llm: Arc<dyn LLMProvider>,
    event_tx: &mpsc::Sender<AgentEvent>,
    session_id: &str,
    model: &str,
    reasoning_effort: Option<ReasoningEffort>,
    checkpoint: GoldCheckpoint,
    iteration: u32,
) -> Result<GoldEvaluationResult, AgentError> {
    let _ = event_tx
        .send(AgentEvent::GoldEvaluationStarted {
            session_id: session_id.to_string(),
            checkpoint,
            iteration,
        })
        .await;

    let messages = build_gold_messages(session, task_context, config, checkpoint);
    let prompt_tokens = estimate_prompt_tokens(&messages);
    let tools = get_gold_evaluation_tools();

    let request_reasoning_effort = normalize_lightweight_reasoning_effort(reasoning_effort);
    let request_options = LLMRequestOptions {
        session_id: Some(session_id.to_string()),
        reasoning_effort: request_reasoning_effort,
        parallel_tool_calls: None,
        responses: None,
        request_purpose: Some("gold_evaluation".to_string()),
    };

    match llm
        .chat_stream_with_options(
            &messages,
            &tools,
            Some(config.max_output_tokens),
            model,
            Some(&request_options),
        )
        .await
    {
        Ok(stream) => {
            let stream_output =
                consume_llm_stream_silent(stream, &CancellationToken::new(), session_id)
                    .await
                    .map_err(|error| AgentError::LLM(error.to_string()))?;

            let result = parse_gold_evaluation(
                &stream_output.content,
                &stream_output.tool_calls,
                checkpoint,
                iteration,
                prompt_tokens,
            );

            let _ = event_tx
                .send(AgentEvent::GoldEvaluationCompleted {
                    session_id: session_id.to_string(),
                    checkpoint: result.checkpoint,
                    iteration: result.iteration,
                    decision: result.decision,
                    confidence: result.confidence,
                    reasoning: result.reasoning.clone(),
                })
                .await;

            Ok(result)
        }
        Err(error) => Err(AgentError::LLM(error.to_string())),
    }
}

pub fn build_gold_messages(
    session: &Session,
    task_context: Option<&TaskLoopContext>,
    config: &GoldConfig,
    checkpoint: GoldCheckpoint,
) -> Vec<Message> {
    let mut messages = Vec::new();

    let mut system_prompt = String::from(
        "You are a gold progress evaluator. Judge whether the agent has already achieved the user's goal, should continue execution, needs user input, is blocked, or is exhausted.\n\nRules:\n1. This phase is observe-only: do not mutate state or invent actions.\n2. You must call report_gold_evaluation exactly once.\n3. Use achieved only when the user's actual goal is satisfied.\n4. Use continue when more agent work is still appropriate.\n5. Use need_input only when missing user input is the true next blocker.\n6. Use blocked only for a concrete blocking condition.\n7. Use exhausted for loops, budget exhaustion, or clear inability to make progress.\n8. Keep reasoning short, concrete, and evidence-based."
    );

    if let Some(extra) = config
        .evaluation_prompt
        .as_deref()
        .map(str::trim)
        .filter(|value| !value.is_empty())
    {
        system_prompt.push_str("\n\nAdditional instructions:\n");
        system_prompt.push_str(extra);
    }

    messages.push(Message::system(system_prompt));

    let task_summary = task_context
        .map(TaskLoopContext::format_for_prompt)
        .filter(|value| !value.trim().is_empty())
        .unwrap_or_else(|| "## Current Task List\nNo task list available.".to_string());

    let pending_question_summary = session
        .pending_question
        .as_ref()
        .map(|question| {
            let options = if question.options.is_empty() {
                "none".to_string()
            } else {
                question.options.join(" | ")
            };
            let tool_name = if question.tool_name.trim().is_empty() {
                "unknown".to_string()
            } else {
                question.tool_name.clone()
            };
            format!(
                "question={} | options={} | tool={} | source={:?}",
                question.question, options, tool_name, question.source
            )
        })
        .unwrap_or_else(|| "none".to_string());

    let runtime_summary = session
        .agent_runtime_state
        .as_ref()
        .map(|state| {
            format!(
                "status={:?} | current_round={} | max_rounds={} | suspend_reason={} | waiting_for_children={}",
                state.status,
                state.round.current_round,
                state.round.max_rounds,
                state
                    .suspension
                    .as_ref()
                    .map(|s| s.reason.clone())
                    .unwrap_or_else(|| "none".to_string()),
                state.waiting_for_children.is_some()
            )
        })
        .unwrap_or_else(|| "runtime_state=none".to_string());

    let recent_messages = format_recent_messages(session, 6);

    let goal_section = config
        .effective_goal()
        .map(|goal| format!("## Goal\n{goal}"))
        .unwrap_or_else(|| {
            "## Goal\nNo explicit goal set. Judge against the user's request inferred from the conversation.".to_string()
        });

    let user_prompt = format!(
        "## Gold Checkpoint\ncheckpoint={}\n\n{}\n\n## Runtime\n{}\n\n## Pending Question\n{}\n\n{}\n\n## Recent Conversation\n{}\n\n## Instruction\nReport the best current Gold judgment for this checkpoint by measuring progress against the goal above. Remember: Phase 1 is observe-only, so only report decision/confidence/reasoning.",
        checkpoint.as_str(),
        goal_section,
        runtime_summary,
        pending_question_summary,
        task_summary,
        recent_messages,
    );

    messages.push(Message::user(user_prompt));
    messages
}

fn format_recent_messages(session: &Session, limit: usize) -> String {
    let start = session.messages.len().saturating_sub(limit);
    let mut lines = Vec::new();

    for message in session.messages.iter().skip(start) {
        let role = match message.role {
            Role::System => "system",
            Role::User => "user",
            Role::Assistant => "assistant",
            Role::Tool => "tool",
        };

        let mut content = message.content.trim().replace('\n', " ");
        if content.chars().count() > 240 {
            content = format!("{}…", content.chars().take(240).collect::<String>());
        }
        if content.is_empty() {
            content = "<empty>".to_string();
        }

        lines.push(format!("- [{}] {}", role, content));
    }

    if lines.is_empty() {
        "- <no messages>".to_string()
    } else {
        lines.join("\n")
    }
}

pub fn get_gold_evaluation_tools() -> Vec<ToolSchema> {
    vec![ToolSchema {
        schema_type: "function".to_string(),
        function: FunctionSchema {
            name: "report_gold_evaluation".to_string(),
            description: "Report the current Gold evaluation decision for the session".to_string(),
            parameters: json!({
                "type": "object",
                "properties": {
                    "decision": {
                        "type": "string",
                        "enum": ["continue", "achieved", "blocked", "need_input", "exhausted"]
                    },
                    "confidence": {
                        "type": "string",
                        "enum": ["low", "medium", "high"]
                    },
                    "reasoning": {
                        "type": "string",
                        "description": "Short concrete reasoning for the decision"
                    },
                    "missing_information": {
                        "type": "array",
                        "items": { "type": "string" },
                        "description": "Concrete pieces of information still missing to achieve the goal. Empty when nothing is missing."
                    },
                    "next_action": {
                        "type": "string",
                        "description": "The single most useful next action the agent should take. Provide when decision is continue."
                    }
                },
                "required": ["decision", "confidence", "reasoning"],
                "additionalProperties": false
            }),
        },
    }]
}

pub fn parse_gold_evaluation(
    content: &str,
    tool_calls: &[ToolCall],
    checkpoint: GoldCheckpoint,
    iteration: u32,
    prompt_tokens: u64,
) -> GoldEvaluationResult {
    let completion_tokens = estimate_completion_tokens(content, tool_calls);
    let parsed = parse_gold_result_from_tool_calls(tool_calls);

    let parsed = parsed.unwrap_or_else(|| {
        let fallback_reasoning = content.trim().to_string();
        ParsedGoldResult {
            decision: GoldDecision::Continue,
            confidence: GoldConfidence::Low,
            reasoning: if fallback_reasoning.is_empty() {
                "Gold evaluation returned no structured result; defaulting to continue.".to_string()
            } else {
                fallback_reasoning
            },
            missing_information: Vec::new(),
            next_action: None,
        }
    });

    GoldEvaluationResult {
        checkpoint,
        iteration,
        decision: parsed.decision,
        confidence: parsed.confidence,
        reasoning: parsed.reasoning,
        missing_information: parsed.missing_information,
        next_action: parsed.next_action,
        prompt_tokens,
        completion_tokens,
    }
}

struct ParsedGoldResult {
    decision: GoldDecision,
    confidence: GoldConfidence,
    reasoning: String,
    missing_information: Vec<String>,
    next_action: Option<String>,
}

fn parse_gold_result_from_tool_calls(tool_calls: &[ToolCall]) -> Option<ParsedGoldResult> {
    for tool_call in tool_calls {
        if tool_call.function.name != "report_gold_evaluation" {
            continue;
        }

        let Ok(args) = serde_json::from_str::<serde_json::Value>(&tool_call.function.arguments)
        else {
            continue;
        };

        let decision = match args.get("decision").and_then(|value| value.as_str()) {
            Some("continue") => GoldDecision::Continue,
            Some("achieved") => GoldDecision::Achieved,
            Some("blocked") => GoldDecision::Blocked,
            Some("need_input") => GoldDecision::NeedInput,
            Some("exhausted") => GoldDecision::Exhausted,
            _ => continue,
        };

        let confidence = match args.get("confidence").and_then(|value| value.as_str()) {
            Some("low") => GoldConfidence::Low,
            Some("medium") => GoldConfidence::Medium,
            Some("high") => GoldConfidence::High,
            _ => GoldConfidence::Low,
        };

        let reasoning = args
            .get("reasoning")
            .and_then(|value| value.as_str())
            .map(str::trim)
            .filter(|value| !value.is_empty())
            .unwrap_or("Gold evaluation produced no reasoning")
            .to_string();

        let missing_information = args
            .get("missing_information")
            .and_then(|value| value.as_array())
            .map(|items| {
                items
                    .iter()
                    .filter_map(|item| item.as_str())
                    .map(str::trim)
                    .filter(|value| !value.is_empty())
                    .map(str::to_string)
                    .collect::<Vec<_>>()
            })
            .unwrap_or_default();

        let next_action = args
            .get("next_action")
            .and_then(|value| value.as_str())
            .map(str::trim)
            .filter(|value| !value.is_empty())
            .map(str::to_string);

        return Some(ParsedGoldResult {
            decision,
            confidence,
            reasoning,
            missing_information,
            next_action,
        });
    }

    None
}

pub(crate) fn apply_gold_evaluation_result(
    session: &mut Session,
    result: &GoldEvaluationResult,
) -> MetricsTokenUsage {
    let evaluation_count = session
        .metadata
        .get("gold.evaluation_count")
        .and_then(|value| value.parse::<u64>().ok())
        .unwrap_or(0)
        .saturating_add(1);

    let summary = json!({
        "checkpoint": result.checkpoint.as_str(),
        "iteration": result.iteration,
        "decision": result.decision.as_str(),
        "confidence": result.confidence.as_str(),
        "reasoning": result.reasoning,
        "recorded_at": Utc::now().to_rfc3339(),
    });

    session
        .metadata
        .insert("gold.last_evaluation".to_string(), summary.to_string());
    session.metadata.insert(
        "gold.last_decision".to_string(),
        result.decision.as_str().to_string(),
    );
    session.metadata.insert(
        "gold.last_confidence".to_string(),
        result.confidence.as_str().to_string(),
    );
    session
        .metadata
        .insert("gold.last_reasoning".to_string(), result.reasoning.clone());
    session.metadata.insert(
        "gold.last_checkpoint".to_string(),
        result.checkpoint.as_str().to_string(),
    );
    session.metadata.insert(
        "gold.last_iteration".to_string(),
        result.iteration.to_string(),
    );
    session.metadata.insert(
        "gold.evaluation_count".to_string(),
        evaluation_count.to_string(),
    );
    session.updated_at = Utc::now();

    let mut usage = MetricsTokenUsage {
        prompt_tokens: result.prompt_tokens,
        completion_tokens: result.completion_tokens,
        ..Default::default()
    };
    usage.recompute_total();
    usage
}

#[cfg(test)]
mod tests {
    use super::*;
    use bamboo_agent_core::tools::FunctionCall;

    fn report_call(arguments: serde_json::Value) -> ToolCall {
        ToolCall {
            id: "call-1".to_string(),
            tool_type: "function".to_string(),
            function: FunctionCall {
                name: "report_gold_evaluation".to_string(),
                arguments: arguments.to_string(),
            },
        }
    }

    #[test]
    fn parse_gold_result_from_tool_calls_reads_structured_fields() {
        let parsed = parse_gold_result_from_tool_calls(&[report_call(json!({
            "decision": "blocked",
            "confidence": "high",
            "reasoning": "Missing credentials",
            "missing_information": ["API key", "  ", "Database URL"],
            "next_action": "  Ask the user for the API key  "
        }))])
        .expect("gold result should parse");

        assert_eq!(parsed.decision, GoldDecision::Blocked);
        assert_eq!(parsed.confidence, GoldConfidence::High);
        assert_eq!(parsed.reasoning, "Missing credentials");
        assert_eq!(
            parsed.missing_information,
            vec!["API key".to_string(), "Database URL".to_string()]
        );
        assert_eq!(
            parsed.next_action.as_deref(),
            Some("Ask the user for the API key")
        );
    }

    #[test]
    fn parse_gold_result_from_tool_calls_ignores_other_tools() {
        let parsed = parse_gold_result_from_tool_calls(&[ToolCall {
            id: "call-1".to_string(),
            tool_type: "function".to_string(),
            function: FunctionCall {
                name: "other_tool".to_string(),
                arguments: "{}".to_string(),
            },
        }]);

        assert!(parsed.is_none());
    }

    #[test]
    fn apply_gold_evaluation_result_updates_metadata_keys() {
        let mut session = Session::new("session-1", "model");
        let result = GoldEvaluationResult {
            checkpoint: GoldCheckpoint::PostRound,
            iteration: 2,
            decision: GoldDecision::Achieved,
            confidence: GoldConfidence::Medium,
            reasoning: "Goal satisfied".to_string(),
            missing_information: Vec::new(),
            next_action: None,
            prompt_tokens: 10,
            completion_tokens: 5,
        };

        let usage = apply_gold_evaluation_result(&mut session, &result);

        assert_eq!(
            session
                .metadata
                .get("gold.last_decision")
                .map(String::as_str),
            Some("achieved")
        );
        assert_eq!(
            session
                .metadata
                .get("gold.last_confidence")
                .map(String::as_str),
            Some("medium")
        );
        assert_eq!(
            session
                .metadata
                .get("gold.last_checkpoint")
                .map(String::as_str),
            Some("post_round")
        );
        assert_eq!(
            session
                .metadata
                .get("gold.evaluation_count")
                .map(String::as_str),
            Some("1")
        );
        assert_eq!(usage.prompt_tokens, 10);
        assert_eq!(usage.completion_tokens, 5);
        assert_eq!(usage.total_tokens, 15);
    }
}