Skip to main content

localgpt_cli_tools/
browser.rs

1//! Browser automation tool via Chrome DevTools Protocol (CDP).
2//!
3//! Uses headless Chrome with direct WebSocket JSON-RPC — no external CDP crate.
4//! Chrome is lazily launched on first tool invocation and reused for subsequent calls.
5
6use anyhow::{Context, Result, bail};
7use async_trait::async_trait;
8use futures_util::{SinkExt, StreamExt};
9use serde_json::{Value, json};
10use std::sync::Arc;
11use std::time::Duration;
12use tokio::sync::Mutex;
13use tokio_tungstenite::tungstenite::Message;
14use tracing::debug;
15
16use localgpt_core::agent::providers::ToolSchema;
17use localgpt_core::agent::tools::Tool;
18
/// Time limit (30 seconds) used when opening the CDP WebSocket and when
/// waiting for a reply to a CDP command.
const CDP_TIMEOUT: Duration = Duration::from_secs(30);
21
22/// Browser tool — provides navigate, screenshot, and text extraction.
23pub struct BrowserTool {
24    port: u16,
25    state: Arc<Mutex<BrowserState>>,
26}
27
28enum BrowserState {
29    NotStarted,
30    Running {
31        _process: tokio::process::Child, // kept alive for kill_on_drop
32        ws_url: String,
33    },
34}
35
36impl BrowserTool {
37    pub fn new(port: u16) -> Self {
38        Self {
39            port,
40            state: Arc::new(Mutex::new(BrowserState::NotStarted)),
41        }
42    }
43
44    /// Ensure Chrome is running and return the WebSocket debug URL.
45    async fn ensure_browser(&self) -> Result<String> {
46        let mut state = self.state.lock().await;
47        match &*state {
48            BrowserState::Running { ws_url, .. } => Ok(ws_url.clone()),
49            BrowserState::NotStarted => {
50                let chrome_path = detect_chrome()?;
51                debug!("Launching headless Chrome: {}", chrome_path);
52
53                let user_data = std::env::temp_dir().join("localgpt-chrome");
54                let process = tokio::process::Command::new(&chrome_path)
55                    .args([
56                        "--headless=new",
57                        "--disable-gpu",
58                        "--no-sandbox",
59                        "--disable-dev-shm-usage",
60                        &format!("--remote-debugging-port={}", self.port),
61                        &format!("--user-data-dir={}", user_data.display()),
62                        "--no-first-run",
63                        "--no-default-browser-check",
64                        "about:blank",
65                    ])
66                    .stdout(std::process::Stdio::null())
67                    .stderr(std::process::Stdio::null())
68                    .kill_on_drop(true)
69                    .spawn()
70                    .context("Failed to launch Chrome")?;
71
72                // Wait for Chrome to start accepting connections
73                let ws_url = wait_for_chrome(self.port).await?;
74                debug!("Chrome ready at {}", ws_url);
75
76                *state = BrowserState::Running {
77                    _process: process,
78                    ws_url: ws_url.clone(),
79                };
80                Ok(ws_url)
81            }
82        }
83    }
84
85    async fn do_navigate(&self, url: &str) -> Result<String> {
86        let ws_url = self.ensure_browser().await?;
87        let mut cdp = CdpConnection::connect(&ws_url).await?;
88
89        cdp.send("Page.enable", json!({})).await?;
90        cdp.send("Page.navigate", json!({"url": url})).await?;
91
92        // Wait for page load
93        tokio::time::sleep(Duration::from_secs(2)).await;
94
95        // Extract page content
96        let result = cdp
97            .send(
98                "Runtime.evaluate",
99                json!({"expression": "document.title + '\\n\\n' + document.body?.innerText?.substring(0, 10000) || ''"}),
100            )
101            .await?;
102
103        let text = result["result"]["value"]
104            .as_str()
105            .unwrap_or("(empty page)")
106            .to_string();
107
108        Ok(format!("URL: {}\n\n{}", url, text))
109    }
110
111    async fn do_screenshot(&self, url: Option<&str>) -> Result<String> {
112        let ws_url = self.ensure_browser().await?;
113        let mut cdp = CdpConnection::connect(&ws_url).await?;
114
115        if let Some(url) = url {
116            cdp.send("Page.enable", json!({})).await?;
117            cdp.send("Page.navigate", json!({"url": url})).await?;
118            tokio::time::sleep(Duration::from_secs(2)).await;
119        }
120
121        let result = cdp
122            .send("Page.captureScreenshot", json!({"format": "png"}))
123            .await?;
124
125        let data = result["data"]
126            .as_str()
127            .ok_or_else(|| anyhow::anyhow!("No screenshot data returned"))?;
128
129        Ok(format!("Screenshot captured ({} bytes base64)", data.len()))
130    }
131
132    async fn do_text(&self, selector: Option<&str>) -> Result<String> {
133        let ws_url = self.ensure_browser().await?;
134        let mut cdp = CdpConnection::connect(&ws_url).await?;
135
136        let expr = if let Some(sel) = selector {
137            format!(
138                "document.querySelector('{}')?.innerText || '(element not found)'",
139                sel.replace('\'', "\\'")
140            )
141        } else {
142            "document.body?.innerText || '(empty)'".to_string()
143        };
144
145        let result = cdp
146            .send("Runtime.evaluate", json!({"expression": expr}))
147            .await?;
148
149        Ok(result["result"]["value"]
150            .as_str()
151            .unwrap_or("(no text)")
152            .to_string())
153    }
154
155    async fn do_click(&self, selector: &str) -> Result<String> {
156        let ws_url = self.ensure_browser().await?;
157        let mut cdp = CdpConnection::connect(&ws_url).await?;
158
159        let expr = format!(
160            "(() => {{ const el = document.querySelector('{}'); if (el) {{ el.click(); return 'clicked'; }} return 'element not found'; }})()",
161            selector.replace('\'', "\\'")
162        );
163
164        let result = cdp
165            .send("Runtime.evaluate", json!({"expression": expr}))
166            .await?;
167
168        Ok(result["result"]["value"]
169            .as_str()
170            .unwrap_or("unknown")
171            .to_string())
172    }
173}
174
175#[async_trait]
176impl Tool for BrowserTool {
177    fn name(&self) -> &str {
178        "browser"
179    }
180
181    fn schema(&self) -> ToolSchema {
182        ToolSchema {
183            name: "browser".to_string(),
184            description: "Interact with web pages via headless Chrome. Actions: navigate (load URL and get text), screenshot (capture page as PNG), text (extract text by CSS selector), click (click element by selector).".to_string(),
185            parameters: json!({
186                "type": "object",
187                "properties": {
188                    "action": {
189                        "type": "string",
190                        "enum": ["navigate", "screenshot", "text", "click"],
191                        "description": "Action to perform"
192                    },
193                    "url": {
194                        "type": "string",
195                        "description": "URL to navigate to (required for navigate/screenshot)"
196                    },
197                    "selector": {
198                        "type": "string",
199                        "description": "CSS selector (for text/click actions)"
200                    }
201                },
202                "required": ["action"]
203            }),
204        }
205    }
206
207    async fn execute(&self, arguments: &str) -> Result<String> {
208        let args: Value = serde_json::from_str(arguments)?;
209        let action = args["action"]
210            .as_str()
211            .ok_or_else(|| anyhow::anyhow!("Missing action"))?;
212
213        match action {
214            "navigate" => {
215                let url = args["url"]
216                    .as_str()
217                    .ok_or_else(|| anyhow::anyhow!("navigate requires 'url'"))?;
218                self.do_navigate(url).await
219            }
220            "screenshot" => {
221                let url = args["url"].as_str();
222                self.do_screenshot(url).await
223            }
224            "text" => {
225                let selector = args["selector"].as_str();
226                self.do_text(selector).await
227            }
228            "click" => {
229                let selector = args["selector"]
230                    .as_str()
231                    .ok_or_else(|| anyhow::anyhow!("click requires 'selector'"))?;
232                self.do_click(selector).await
233            }
234            other => bail!(
235                "Unknown browser action: '{}'. Use: navigate, screenshot, text, click",
236                other
237            ),
238        }
239    }
240}
241
242impl Drop for BrowserTool {
243    fn drop(&mut self) {
244        // kill_on_drop handles Chrome cleanup
245    }
246}
247
248// ── CDP WebSocket connection ──
249
250struct CdpConnection {
251    ws: tokio_tungstenite::WebSocketStream<
252        tokio_tungstenite::MaybeTlsStream<tokio::net::TcpStream>,
253    >,
254    next_id: u32,
255}
256
257impl CdpConnection {
258    async fn connect(ws_url: &str) -> Result<Self> {
259        let (ws, _) = tokio::time::timeout(CDP_TIMEOUT, tokio_tungstenite::connect_async(ws_url))
260            .await
261            .map_err(|_| anyhow::anyhow!("CDP WebSocket connection timed out"))??;
262
263        Ok(Self { ws, next_id: 1 })
264    }
265
266    async fn send(&mut self, method: &str, params: Value) -> Result<Value> {
267        let id = self.next_id;
268        self.next_id += 1;
269
270        let msg = json!({
271            "id": id,
272            "method": method,
273            "params": params
274        });
275
276        self.ws
277            .send(Message::Text(msg.to_string().into()))
278            .await
279            .context("Failed to send CDP command")?;
280
281        // Read responses until we get our ID
282        loop {
283            let response = tokio::time::timeout(CDP_TIMEOUT, self.ws.next())
284                .await
285                .map_err(|_| anyhow::anyhow!("CDP response timed out for {}", method))?
286                .ok_or_else(|| anyhow::anyhow!("CDP WebSocket closed"))??;
287
288            if let Message::Text(text) = response {
289                let parsed: Value = serde_json::from_str(text.as_ref())?;
290                if parsed["id"].as_u64() == Some(id as u64) {
291                    if let Some(error) = parsed.get("error") {
292                        bail!("CDP error for {}: {}", method, error);
293                    }
294                    return Ok(parsed["result"].clone());
295                }
296                // Skip events (no matching id)
297            }
298        }
299    }
300}
301
302// ── Chrome detection and startup ──
303
304/// Detect Chrome/Chromium binary path.
305pub fn detect_chrome() -> Result<String> {
306    let candidates = if cfg!(target_os = "macos") {
307        vec![
308            "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
309            "/Applications/Chromium.app/Contents/MacOS/Chromium",
310        ]
311    } else {
312        vec![]
313    };
314
315    // Check well-known paths first
316    for path in &candidates {
317        if std::path::Path::new(path).exists() {
318            return Ok(path.to_string());
319        }
320    }
321
322    // Try PATH
323    for name in &["google-chrome", "chromium", "chromium-browser", "chrome"] {
324        if which::which(name).is_ok() {
325            return Ok(name.to_string());
326        }
327    }
328
329    bail!(
330        "Chrome/Chromium not found. Install Google Chrome or Chromium.\n\
331         Searched: {:?} + PATH lookup for google-chrome, chromium, chrome",
332        candidates
333    )
334}
335
336/// Wait for Chrome to start accepting CDP connections.
337async fn wait_for_chrome(port: u16) -> Result<String> {
338    let url = format!("http://127.0.0.1:{}/json/version", port);
339    let client = reqwest::Client::new();
340
341    for attempt in 0..20 {
342        tokio::time::sleep(Duration::from_millis(250)).await;
343
344        if let Ok(resp) = client.get(&url).send().await
345            && let Ok(info) = resp.json::<Value>().await
346            && let Some(ws_url) = info["webSocketDebuggerUrl"].as_str()
347        {
348            return Ok(ws_url.to_string());
349        }
350
351        if attempt > 0 && attempt % 5 == 0 {
352            debug!("Waiting for Chrome to start (attempt {})", attempt);
353        }
354    }
355
356    bail!("Chrome failed to start within 5 seconds on port {}", port)
357}
358
#[cfg(test)]
mod tests {
    use super::*;

    /// Port used by the argument-validation tests; they fail before any
    /// browser launch is attempted.
    const UNLIKELY_PORT: u16 = 19222;

    /// Run `execute` with `arguments`, require an error, and return its message.
    async fn error_message(arguments: &str) -> String {
        let outcome = BrowserTool::new(UNLIKELY_PORT).execute(arguments).await;
        assert!(outcome.is_err());
        outcome.unwrap_err().to_string()
    }

    #[test]
    fn test_browser_tool_schema() {
        let tool = BrowserTool::new(9222);
        assert_eq!(tool.name(), "browser");

        let schema = tool.schema();
        assert_eq!(schema.name, "browser");

        let properties = &schema.parameters["properties"];
        for key in ["action", "url", "selector"] {
            assert!(properties[key].is_object());
        }
        assert_eq!(schema.parameters["required"][0], "action");
    }

    #[test]
    fn test_detect_chrome_path() {
        // Either outcome is acceptable (Chrome may or may not be installed);
        // the point is that detection does not panic and never yields "".
        if let Ok(path) = detect_chrome() {
            assert!(!path.is_empty());
        }
    }

    #[tokio::test]
    async fn test_browser_missing_action() {
        let message = error_message(r#"{"url": "https://example.com"}"#).await;
        assert!(message.contains("Missing action"));
    }

    #[tokio::test]
    async fn test_browser_unknown_action() {
        let message = error_message(r#"{"action": "fly"}"#).await;
        assert!(message.contains("Unknown browser action"));
    }

    #[tokio::test]
    async fn test_browser_navigate_missing_url() {
        let message = error_message(r#"{"action": "navigate"}"#).await;
        assert!(message.contains("requires 'url'"));
    }

    #[tokio::test]
    async fn test_browser_click_missing_selector() {
        let message = error_message(r#"{"action": "click"}"#).await;
        assert!(message.contains("requires 'selector'"));
    }
}