Skip to main content

bamboo_agent/agent/loop_module/
todo_evaluation.rs

// TodoList Evaluation Module
// At the end of every agent-loop round, asks the LLM to evaluate task progress.
3
4use crate::agent::core::tools::{FunctionSchema, ToolSchema};
5use crate::agent::core::{AgentEvent, Session, TodoItemStatus};
6use crate::agent::llm::LLMProvider;
7use serde_json::json;
8use std::sync::Arc;
9use tokio::sync::mpsc;
10
11use crate::agent::loop_module::todo_context::TodoLoopContext;
12
13/// 评估结果
14#[derive(Debug, Clone)]
15pub struct TodoEvaluationResult {
16    /// 是否需要评估(有 in_progress 的任务)
17    pub needs_evaluation: bool,
18    /// LLM 建议更新的项目
19    pub updates: Vec<TodoItemUpdate>,
20    /// LLM 的推理说明
21    pub reasoning: String,
22}
23
24#[derive(Debug, Clone)]
25pub struct TodoItemUpdate {
26    pub item_id: String,
27    pub status: TodoItemStatus,
28    pub notes: Option<String>,
29}
30
31/// 构建用于 TodoList 评估的 messages
32pub fn build_todo_evaluation_messages(
33    ctx: &TodoLoopContext,
34    _session: &Session,
35) -> Vec<crate::agent::core::Message> {
36    let mut messages = Vec::new();
37
38    // System prompt:专门用于 TodoList 评估
39    let system_prompt = r#"You are a task progress evaluator. Your job is to evaluate whether tasks are complete based on the execution context.
40
41## Your Task
42Review the todo list and execution history, then decide if any tasks should be marked as completed or blocked.
43
44## Rules
451. Mark as "completed" if the task goal has been achieved
462. Mark as "blocked" if there are unresolvable issues
473. Keep as "in_progress" if more work is needed
484. Add brief notes explaining your decision
49
50## Available Actions
51- update_todo_item: Update the status of a todo item
52
53## Constraints
54- Only update items that are currently "in_progress"
55- You MUST call update_todo_item if a task is complete
56- Provide clear reasoning in notes
57"#;
58
59    messages.push(crate::agent::core::Message::system(system_prompt));
60
61    // 构建 todo list 上下文
62    let todo_context = format!(
63        r#"
64## Current Todo List (Round {}/{})
65
66{}
67
68## Recent Tool Executions
69{}
70
71## Instructions
72Review each "in_progress" task above. For each task:
731. Check if the goal has been achieved based on tool execution results
742. If complete, call update_todo_item with status="completed" and brief notes
753. If blocked, call update_todo_item with status="blocked" and explain the issue
76
77Remember: You are NOT executing the task. You are only evaluating if existing work has completed it.
78"#,
79        ctx.current_round + 1,
80        ctx.max_rounds,
81        ctx.format_for_prompt(),
82        format_recent_tools(ctx, 5), // 最近 5 个 tool 调用
83    );
84
85    messages.push(crate::agent::core::Message::user(todo_context));
86
87    messages
88}
89
90/// 格式化最近的 tool 调用(用于 context)
91fn format_recent_tools(ctx: &TodoLoopContext, limit: usize) -> String {
92    let mut all_calls: Vec<(
93        String,
94        &crate::agent::loop_module::todo_context::ToolCallRecord,
95    )> = Vec::new();
96
97    for item in &ctx.items {
98        for call in &item.tool_calls {
99            all_calls.push((item.description.clone(), call));
100        }
101    }
102
103    // 按时间排序,取最近的 N 个
104    all_calls.sort_by_key(|(_, call)| std::cmp::Reverse(call.timestamp));
105
106    let recent: Vec<_> = all_calls.into_iter().take(limit).collect();
107
108    if recent.is_empty() {
109        return "No tool executions yet.".to_string();
110    }
111
112    let mut output = String::new();
113    for (i, (task_desc, call)) in recent.iter().enumerate() {
114        output.push_str(&format!(
115            "{}. [{}] Tool: {} ({})\n   Task: {}\n",
116            i + 1,
117            if call.success { "✓" } else { "✗" },
118            call.tool_name,
119            call.round + 1,
120            task_desc
121        ));
122    }
123
124    output
125}
126
127/// 获取 TodoList 评估的 tool schemas
128pub fn get_todo_evaluation_tools() -> Vec<ToolSchema> {
129    vec![ToolSchema {
130        schema_type: "function".to_string(),
131        function: FunctionSchema {
132            name: "update_todo_item".to_string(),
133            description: "Update the status of a todo item based on evaluation".to_string(),
134            parameters: json!({
135                "type": "object",
136                "properties": {
137                    "item_id": {
138                        "type": "string",
139                        "description": "The ID of the todo item to update"
140                    },
141                    "status": {
142                        "type": "string",
143                        "enum": ["completed", "blocked"],
144                        "description": "New status for the item"
145                    },
146                    "notes": {
147                        "type": "string",
148                        "description": "Brief explanation of why the status changed"
149                    }
150                },
151                "required": ["item_id", "status"]
152            }),
153        },
154    }]
155}
156
157/// 执行 TodoList 评估
158pub async fn evaluate_todo_progress(
159    ctx: &TodoLoopContext,
160    session: &Session,
161    llm: Arc<dyn LLMProvider>,
162    event_tx: &mpsc::Sender<AgentEvent>,
163    session_id: &str,
164    model: &str, // Add model parameter (required)
165) -> Result<TodoEvaluationResult, crate::agent::core::AgentError> {
166    use crate::agent::loop_module::stream::handler::consume_llm_stream;
167
168    // 检查是否有需要评估的任务
169    let in_progress_count = ctx
170        .items
171        .iter()
172        .filter(|item| matches!(item.status, TodoItemStatus::InProgress))
173        .count();
174
175    if in_progress_count == 0 {
176        return Ok(TodoEvaluationResult {
177            needs_evaluation: false,
178            updates: Vec::new(),
179            reasoning: "No in-progress tasks to evaluate".to_string(),
180        });
181    }
182
183    log::info!(
184        "[{}] Evaluating {} in-progress todo items",
185        session_id,
186        in_progress_count
187    );
188
189    // 发送评估开始事件
190    let _ = event_tx
191        .send(AgentEvent::TodoEvaluationStarted {
192            session_id: session_id.to_string(),
193            items_count: in_progress_count,
194        })
195        .await;
196
197    // 构建评估消息
198    let messages = build_todo_evaluation_messages(ctx, session);
199    let tools = get_todo_evaluation_tools();
200
201    // Use model from parameter (passed from config), not from session
202    log::debug!("[{}] Todo evaluation using model: {}", session_id, model);
203
204    // 调用 LLM(限制 output tokens)
205    match llm.chat_stream(&messages, &tools, Some(500), model).await {
206        Ok(stream) => {
207            // 消费流
208            let stream_output = consume_llm_stream(
209                stream,
210                event_tx,
211                &tokio_util::sync::CancellationToken::new(),
212                session_id,
213            )
214            .await
215            .map_err(|e| crate::agent::core::AgentError::LLM(e.to_string()))?;
216
217            log::info!(
218                "[{}] Todo evaluation completed: {} tokens, {} tool calls",
219                session_id,
220                stream_output.token_count,
221                stream_output.tool_calls.len()
222            );
223
224            // 解析 LLM 的决策
225            let mut updates = Vec::new();
226            for tool_call in &stream_output.tool_calls {
227                if tool_call.function.name == "update_todo_item" {
228                    if let Ok(args) =
229                        serde_json::from_str::<serde_json::Value>(&tool_call.function.arguments)
230                    {
231                        if let (Some(item_id), Some(status_str)) =
232                            (args["item_id"].as_str(), args["status"].as_str())
233                        {
234                            let status = match status_str {
235                                "completed" => TodoItemStatus::Completed,
236                                "blocked" => TodoItemStatus::Blocked,
237                                _ => continue,
238                            };
239
240                            updates.push(TodoItemUpdate {
241                                item_id: item_id.to_string(),
242                                status,
243                                notes: args["notes"].as_str().map(String::from),
244                            });
245                        }
246                    }
247                }
248            }
249
250            // 发送评估完成事件
251            let _ = event_tx
252                .send(AgentEvent::TodoEvaluationCompleted {
253                    session_id: session_id.to_string(),
254                    updates_count: updates.len(),
255                    reasoning: stream_output.content.clone(),
256                })
257                .await;
258
259            Ok(TodoEvaluationResult {
260                needs_evaluation: true,
261                updates,
262                reasoning: stream_output.content,
263            })
264        }
265        Err(e) => {
266            log::warn!("[{}] Todo evaluation failed: {}", session_id, e);
267            Ok(TodoEvaluationResult {
268                needs_evaluation: false,
269                updates: Vec::new(),
270                reasoning: format!("Evaluation failed: {}", e),
271            })
272        }
273    }
274}
275
#[cfg(test)]
mod tests {
    use super::*;
    use crate::agent::core::todo::{TodoItem, TodoList};
    use crate::agent::loop_module::todo_context::{TodoLoopContext, TodoLoopItem};
    use chrono::Utc;

    /// Builds a context with a single in-progress item that has two successful
    /// tool calls recorded (`read_file` in round 0, `write_file` in round 1).
    fn create_test_context() -> TodoLoopContext {
        let mut session = crate::agent::core::Session::new("test", "test-model");
        let todo_list = TodoList {
            session_id: "test".to_string(),
            title: "Test Tasks".to_string(),
            items: vec![TodoItem {
                id: "1".to_string(),
                description: "Fix bug in authentication".to_string(),
                status: TodoItemStatus::InProgress,
                depends_on: Vec::new(),
                notes: String::new(),
            }],
            created_at: Utc::now(),
            updated_at: Utc::now(),
        };
        session.set_todo_list(todo_list);

        let mut ctx = TodoLoopContext::from_session(&session).unwrap();
        ctx.items = vec![TodoLoopItem {
            id: "1".to_string(),
            description: "Fix bug in authentication".to_string(),
            status: TodoItemStatus::InProgress,
            tool_calls: vec![
                crate::agent::loop_module::todo_context::ToolCallRecord {
                    round: 0,
                    tool_name: "read_file".to_string(),
                    success: true,
                    timestamp: Utc::now(),
                },
                crate::agent::loop_module::todo_context::ToolCallRecord {
                    round: 1,
                    tool_name: "write_file".to_string(),
                    success: true,
                    timestamp: Utc::now(),
                },
            ],
            started_at_round: Some(0),
            completed_at_round: None,
        }];

        ctx
    }

    #[test]
    fn test_build_evaluation_messages() {
        let ctx = create_test_context();
        let session = crate::agent::core::Session::new("test", "test-model");

        let messages = build_todo_evaluation_messages(&ctx, &session);

        assert_eq!(messages.len(), 2);
        assert!(messages[0].content.contains("task progress evaluator"));
        assert!(messages[1].content.contains("Fix bug in authentication"));
    }

    #[test]
    fn test_format_recent_tools() {
        let ctx = create_test_context();
        let output = format_recent_tools(&ctx, 5);

        assert!(output.contains("read_file"));
        assert!(output.contains("write_file"));
        assert!(output.contains("✓"));
    }

    #[test]
    fn test_format_recent_tools_respects_limit() {
        let ctx = create_test_context();
        let output = format_recent_tools(&ctx, 1);

        // Exactly one numbered entry is rendered.
        assert!(output.starts_with("1. ["));
        assert!(!output.contains("2. ["));
    }

    #[test]
    fn test_format_recent_tools_without_calls() {
        let mut ctx = create_test_context();
        ctx.items[0].tool_calls.clear();

        assert_eq!(format_recent_tools(&ctx, 5), "No tool executions yet.");
    }

    #[test]
    fn test_evaluation_tool_schema() {
        let tools = get_todo_evaluation_tools();

        assert_eq!(tools.len(), 1);
        let function = &tools[0].function;
        assert_eq!(function.name, "update_todo_item");
        assert_eq!(
            function.parameters["required"],
            serde_json::json!(["item_id", "status"])
        );
        // "in_progress" is intentionally not an accepted target status.
        assert_eq!(
            function.parameters["properties"]["status"]["enum"],
            serde_json::json!(["completed", "blocked"])
        );
    }

    #[test]
    fn test_needs_evaluation() {
        let mut ctx = create_test_context();

        // In-progress task needs evaluation
        assert!(ctx
            .items
            .iter()
            .any(|i| matches!(i.status, TodoItemStatus::InProgress)));

        // Completed task doesn't need evaluation
        ctx.items[0].status = TodoItemStatus::Completed;
        assert!(!ctx
            .items
            .iter()
            .any(|i| matches!(i.status, TodoItemStatus::InProgress)));
    }

    // ========== MODEL REQUIREMENT ARCHITECTURE TESTS ==========
    // These tests ensure the design principle:
    // "Todo evaluation must receive model as parameter, not use session.model"

    /// Compile-time check that `evaluate_todo_progress` takes the model as an
    /// explicit argument.
    ///
    /// The nested function is never executed; it only has to type-check. If the
    /// trailing `model: &str` parameter were removed from the signature, the call
    /// below would stop compiling and this test module would fail to build.
    #[test]
    fn todo_evaluation_requires_model_parameter() {
        async fn call_with_explicit_model(
            ctx: &TodoLoopContext,
            session: &Session,
            llm: Arc<dyn LLMProvider>,
            event_tx: &mpsc::Sender<AgentEvent>,
        ) -> Result<TodoEvaluationResult, crate::agent::core::AgentError> {
            evaluate_todo_progress(ctx, session, llm, event_tx, "session-id", "explicit-model")
                .await
        }

        // Referencing the function keeps it from being reported as dead code.
        let _ = call_with_explicit_model;
    }
}
391            true,
392            "Model parameter requirement is enforced by function signature"
393        );
394    }
395}