1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
use base64::Engine;
use tokio::time::Duration;
use tokio_stream::StreamExt;
use tracing::{debug, warn};
use kodegen_candle_agent::prelude::*;
use kodegen_mcp_schema::browser::{BROWSER_EXTRACT_TEXT, BROWSER_SCREENSHOT};
use cyrup_sugars::prelude::MessageChunk;
use crate::agent::{AgentError, AgentResult, BrowserExtractTextResponse, BrowserScreenshotResponse};
use super::processor::AgentInner;
/// Snapshot of the browser handed to the LLM: a text summary of the page plus
/// an optional screenshot and an optional vision-generated description of it.
///
/// Instances are built by `AgentInner::get_browser_state` and enriched in place
/// by `AgentInner::format_browser_state_with_vision`.
#[derive(Debug, Clone)]
pub(super) struct BrowserStateWithScreenshot {
/// Text summary of the page: the extracted content length followed by a
/// leading sample of the extracted text.
pub(super) state: String,
/// Path of the temporary PNG file the screenshot was written to, or `None`
/// when no screenshot could be obtained or saved.
/// `format_browser_state_with_vision` deletes the file after use but leaves
/// this path in place, so it may refer to a file that no longer exists.
pub(super) screenshot_path: Option<String>,
/// Vision-model description of the screenshot. `None` as returned by
/// `get_browser_state`; set by `format_browser_state_with_vision` after a
/// successful analysis and reused on later calls instead of re-running it.
pub(super) visual_description: Option<String>,
}
/// Browser state management implementation
impl AgentInner {
/// Get current browser state for LLM context (HOT PATH!)
///
/// Fetches page content and optional screenshot via MCP tools.
/// This provides the LLM with current browser context for action planning.
///
/// Uses:
/// - browser_extract_text: Get page text content
/// - browser_screenshot: Get base64-encoded screenshot (optional)
///
/// Returns BrowserStateWithScreenshot with text summary and screenshot.
pub(super) async fn get_browser_state(&self) -> AgentResult<BrowserStateWithScreenshot> {
// Extract page content via MCP (HOT PATH!)
let content = match self
.mcp_client
.call_tool(BROWSER_EXTRACT_TEXT, serde_json::json!({}))
.await
{
Ok(result) => {
// Parse text from tool response
// browser_extract_text returns: {"success": true, "text": "...", "length": N, ...}
result
.content
.first()
.and_then(|c| c.as_text())
.and_then(|t| {
serde_json::from_str::<BrowserExtractTextResponse>(&t.text)
.ok()
.map(|response| response.text)
})
.unwrap_or_else(|| {
warn!("Failed to parse browser_extract_text response, using empty content");
String::new()
})
}
Err(e) => {
warn!("browser_extract_text failed: {}, using empty content", e);
String::new()
}
};
// Get screenshot via MCP and save to temp file (HOT PATH!)
let screenshot_path = match self
.mcp_client
.call_tool(BROWSER_SCREENSHOT, serde_json::json!({}))
.await
{
Ok(result) => {
// Parse base64 image from tool response
// ⚠️ CRITICAL: browser_screenshot returns {"image": base64}, NOT {"base64": base64}!
let screenshot_base64 =
result
.content
.first()
.and_then(|c| c.as_text())
.and_then(|t| {
serde_json::from_str::<BrowserScreenshotResponse>(&t.text)
.ok()
.map(|response| response.image)
});
// Save base64 to temp file for vision API
if let Some(base64_data) = screenshot_base64 {
// ✅ FIX 1: Move CPU-intensive base64 decode to blocking thread pool
let decoded_bytes = tokio::task::spawn_blocking(move || {
base64::engine::general_purpose::STANDARD.decode(&base64_data)
})
.await
.map_err(|e| {
AgentError::UnexpectedError(format!("Base64 decode task failed: {}", e))
})?
.map_err(|e| {
AgentError::UnexpectedError(format!("Base64 decode failed: {}", e))
})?;
// Create unique temp file path with nanosecond precision + PID
let temp_dir = std::env::temp_dir();
let duration = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.map_err(|e| {
AgentError::BrowserError(format!("System time error: {}", e))
})?;
let filename = format!(
"browser_screenshot_{}_{:09}_{}.png",
duration.as_secs(),
duration.subsec_nanos(),
std::process::id()
);
let temp_path = temp_dir.join(filename);
// ✅ FIX 2: Use async file write instead of blocking std::fs::write
match tokio::fs::write(&temp_path, decoded_bytes).await {
Ok(_) => Some(temp_path.to_string_lossy().to_string()),
Err(e) => {
warn!("Failed to write screenshot to file: {}", e);
None
}
}
} else {
None
}
}
Err(e) => {
warn!(
"browser_screenshot failed: {}, continuing without screenshot",
e
);
None
}
};
// Build state representation for LLM
let state = format!(
"Content Length: {} characters\nContent Sample: {}{}",
content.len(),
&content[0..content.len().min(500)],
if content.len() > 500 { "..." } else { "" }
);
// Store state for recovery if needed
let mut agent_state = self.agent_state.lock().await;
agent_state.set_last_valid_state(state.clone());
drop(agent_state);
Ok(BrowserStateWithScreenshot {
state,
screenshot_path,
visual_description: None,
})
}
/// Format browser state with vision-based screenshot analysis
///
/// Uses CandleFluentAi::vision() to analyze screenshots and generate
/// detailed visual descriptions of UI elements and layout.
///
/// Populates browser_state.visual_description with the vision analysis result
/// for potential caching/reuse.
pub(super) async fn format_browser_state_with_vision(
&self,
browser_state: &mut BrowserStateWithScreenshot,
) -> AgentResult<String> {
let mut state_description = format!("Current browser state:\n{}", browser_state.state);
// Add vision-based screenshot analysis if available
if let Some(screenshot_path) = &browser_state.screenshot_path {
state_description.push_str("\n\nVisual Analysis:\n");
// Check if we already have cached visual description
let visual_desc = if let Some(ref cached) = browser_state.visual_description {
debug!("Using cached visual description");
cached.clone()
} else {
// Generate new vision analysis
let vision_query = "Describe the visible UI elements, their layout, and any interactive components (buttons, links, forms, input fields, etc.) in detail.";
// Wrap entire stream consumption in timeout
let vision_timeout = Duration::from_secs(self.vision_timeout_secs);
let result = tokio::time::timeout(vision_timeout, async {
let mut description = String::with_capacity(4096);
let mut stream =
CandleFluentAi::vision().describe_image(screenshot_path, vision_query);
while let Some(chunk) = stream.next().await {
if let Some(error) = chunk.error() {
return Err(format!("Vision analysis error: {}", error));
}
if !chunk.text.is_empty() {
description.push_str(&chunk.text);
}
if chunk.is_final {
if let Some(stats) = &chunk.stats {
debug!(
"Vision analysis: {} tokens generated",
stats.tokens_generated
);
}
return Ok(description);
}
}
Err("Vision stream ended without final chunk".to_string())
})
.await;
match result {
Ok(Ok(desc)) => {
browser_state.visual_description = Some(desc.clone());
desc
}
Ok(Err(e)) => {
warn!("Vision analysis failed: {}", e);
format!("[Vision analysis failed: {}]", e)
}
Err(_) => {
warn!(
"Vision analysis timed out after {}s",
self.vision_timeout_secs
);
format!(
"[Vision analysis timed out after {}s]",
self.vision_timeout_secs
)
}
}
};
state_description.push_str(&visual_desc);
state_description.push('\n');
// Clean up temp screenshot file after vision analysis completes
if let Err(e) = tokio::fs::remove_file(screenshot_path).await {
warn!(
"Failed to cleanup screenshot file {}: {}",
screenshot_path, e
);
}
}
Ok(state_description)
}
}