Skip to main content

web_capture/
browser.rs

1//! Browser automation module
2//!
3//! This module provides headless browser operations for rendering pages
4//! and capturing screenshots. Note: Full browser automation requires
5//! browser-commander, which depends on having Chrome installed.
6//!
7//! For simpler HTTP fetching without JavaScript rendering, see the html module.
8
9use crate::{Result, WebCaptureError};
10use std::path::{Path, PathBuf};
11use std::sync::atomic::{AtomicU64, Ordering};
12use std::time::{Duration, SystemTime, UNIX_EPOCH};
13use tokio::process::Command;
14use tracing::info;
15
/// Process-wide sequence number mixed into temporary user-data directory names.
///
/// `temporary_user_data_dir` bumps this on every call so that two directories
/// requested by the same process still get distinct names even when the
/// timestamp component is identical.
static USER_DATA_DIR_COUNTER: AtomicU64 = AtomicU64::new(0);
17
/// Browser engine type
///
/// Identifies which browser backend should be used. Only one variant exists
/// at the moment; it is also what `BrowserEngine::default()` yields because it
/// carries the `#[default]` attribute.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum BrowserEngine {
    /// Chromiumoxide engine (default)
    #[default]
    Chromiumoxide,
}
25
26impl std::fmt::Display for BrowserEngine {
27    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
28        match self {
29            Self::Chromiumoxide => write!(f, "chromiumoxide"),
30        }
31    }
32}
33
34impl std::str::FromStr for BrowserEngine {
35    type Err = WebCaptureError;
36
37    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
38        match s.to_lowercase().as_str() {
39            "chromiumoxide" | "chromium" | "chrome" => Ok(Self::Chromiumoxide),
40            _ => Err(WebCaptureError::BrowserError(format!(
41                "Unknown browser engine: {s}"
42            ))),
43        }
44    }
45}
46
47/// Render HTML content from a URL using a headless browser
48///
49/// This function uses browser-commander to launch a headless browser,
50/// navigate to the URL, and return the rendered HTML content.
51///
52/// Note: This requires Chrome/Chromium to be installed on the system.
53///
54/// # Arguments
55///
56/// * `url` - The URL to render
57///
58/// # Returns
59///
60/// The rendered HTML content as a string
61///
62/// # Errors
63///
64/// Returns an error if browser operations fail
65pub async fn render_html(url: &str) -> Result<String> {
66    info!("Rendering HTML for URL: {}", url);
67
68    let chrome = find_chrome_executable().ok_or_else(|| {
69        WebCaptureError::BrowserError(
70            "Chrome/Chromium executable was not found. Set WEB_CAPTURE_CHROME, CHROME_PATH, or GOOGLE_CHROME_BIN.".to_string(),
71        )
72    })?;
73    let user_data_dir = temporary_user_data_dir();
74    std::fs::create_dir_all(&user_data_dir)?;
75
76    let output = tokio::time::timeout(
77        Duration::from_secs(60),
78        Command::new(&chrome)
79            .arg("--headless=new")
80            .arg("--disable-gpu")
81            .arg("--disable-extensions")
82            .arg("--disable-dev-shm-usage")
83            .arg("--no-sandbox")
84            .arg("--dump-dom")
85            .arg(format!("--user-data-dir={}", user_data_dir.display()))
86            .arg(url)
87            .output(),
88    )
89    .await
90    .map_err(|_| {
91        WebCaptureError::BrowserError(format!(
92            "Timed out waiting for headless Chrome to render {url}"
93        ))
94    })?
95    .map_err(|e| WebCaptureError::BrowserError(format!("Failed to launch Chrome: {e}")))?;
96
97    let _ = std::fs::remove_dir_all(&user_data_dir);
98
99    if !output.status.success() {
100        return Err(WebCaptureError::BrowserError(format!(
101            "Headless Chrome failed with status {}: {}",
102            output.status,
103            String::from_utf8_lossy(&output.stderr)
104        )));
105    }
106
107    let html = String::from_utf8(output.stdout)
108        .map_err(|e| WebCaptureError::BrowserError(format!("Chrome output was not UTF-8: {e}")))?;
109
110    info!("Successfully rendered HTML ({} bytes)", html.len());
111    Ok(html)
112}
113
114fn find_chrome_executable() -> Option<PathBuf> {
115    for env_var in [
116        "WEB_CAPTURE_CHROME",
117        "CHROME_PATH",
118        "GOOGLE_CHROME_BIN",
119        "CHROMIUM_PATH",
120    ] {
121        if let Ok(path) = std::env::var(env_var) {
122            let candidate = PathBuf::from(path);
123            if candidate.exists() {
124                return Some(candidate);
125            }
126        }
127    }
128
129    for name in [
130        "google-chrome",
131        "google-chrome-stable",
132        "chromium",
133        "chromium-browser",
134        "chrome",
135    ] {
136        if let Some(path) = find_on_path(name) {
137            return Some(path);
138        }
139    }
140
141    for path in [
142        "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
143        "/Applications/Chromium.app/Contents/MacOS/Chromium",
144        r"C:\Program Files\Google\Chrome\Application\chrome.exe",
145        r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe",
146        r"C:\Program Files\Chromium\Application\chrome.exe",
147    ] {
148        let candidate = PathBuf::from(path);
149        if candidate.exists() {
150            return Some(candidate);
151        }
152    }
153
154    None
155}
156
/// Searches the directories listed in the `PATH` environment variable for a
/// file called `name`.
///
/// Directories are visited in `PATH` order. On Windows, `{name}.exe` is tried
/// in each directory right after the bare name. Only regular files count: a
/// directory that merely shares the name is skipped so the search can go on
/// to later `PATH` entries.
///
/// Returns `None` when `PATH` is unset or no entry contains a match.
fn find_on_path(name: &str) -> Option<PathBuf> {
    let paths = std::env::var_os("PATH")?;
    std::env::split_paths(&paths).find_map(|dir| {
        let plain = dir.join(name);
        if plain.is_file() {
            return Some(plain);
        }
        #[cfg(windows)]
        {
            let with_exe = dir.join(format!("{name}.exe"));
            if with_exe.is_file() {
                return Some(with_exe);
            }
        }
        None
    })
}
174
175fn temporary_user_data_dir() -> PathBuf {
176    let nonce = SystemTime::now()
177        .duration_since(UNIX_EPOCH)
178        .map_or(0, |duration| duration.as_nanos());
179    let seq = USER_DATA_DIR_COUNTER.fetch_add(1, Ordering::Relaxed);
180    std::env::temp_dir().join(format!(
181        "web-capture-chrome-{}-{nonce}-{seq}",
182        std::process::id()
183    ))
184}
185
186/// Capture a PNG screenshot of a URL
187///
188/// This function launches headless Chrome/Chromium, navigates to the URL,
189/// and captures a full-page PNG screenshot.
190///
191/// # Arguments
192///
193/// * `url` - The URL to capture
194///
195/// # Returns
196///
197/// The PNG image data as bytes
198///
199/// # Errors
200///
201/// Returns an error if Chrome/Chromium is unavailable or screenshot capture fails.
202pub async fn capture_screenshot(url: &str) -> Result<Vec<u8>> {
203    info!("Capturing screenshot for URL: {}", url);
204
205    let chrome = find_chrome_executable().ok_or_else(|| {
206        WebCaptureError::ScreenshotError(
207            "Chrome/Chromium executable was not found. Set WEB_CAPTURE_CHROME, CHROME_PATH, or GOOGLE_CHROME_BIN.".to_string(),
208        )
209    })?;
210
211    let user_data_dir = temporary_user_data_dir();
212    std::fs::create_dir_all(&user_data_dir).map_err(|e| {
213        WebCaptureError::ScreenshotError(format!("Failed to create temp user data dir: {e}"))
214    })?;
215
216    let screenshot_path = temporary_screenshot_path();
217    let screenshot_arg = format!("--screenshot={}", screenshot_path.display());
218
219    let output = tokio::time::timeout(
220        Duration::from_secs(60),
221        Command::new(&chrome)
222            .arg("--headless=new")
223            .arg("--disable-gpu")
224            .arg("--disable-extensions")
225            .arg("--disable-dev-shm-usage")
226            .arg("--no-sandbox")
227            .arg("--hide-scrollbars")
228            .arg("--window-size=1280,800")
229            .arg(&screenshot_arg)
230            .arg(format!("--user-data-dir={}", user_data_dir.display()))
231            .arg(url)
232            .output(),
233    )
234    .await
235    .map_err(|_| {
236        WebCaptureError::ScreenshotError(format!(
237            "Timed out waiting for headless Chrome to capture {url}"
238        ))
239    })?
240    .map_err(|e| WebCaptureError::ScreenshotError(format!("Failed to launch Chrome: {e}")))?;
241
242    let _ = std::fs::remove_dir_all(&user_data_dir);
243
244    if !output.status.success() {
245        let _ = std::fs::remove_file(&screenshot_path);
246        return Err(WebCaptureError::ScreenshotError(format!(
247            "Headless Chrome failed with status {}: {}",
248            output.status,
249            String::from_utf8_lossy(&output.stderr)
250        )));
251    }
252
253    let bytes = read_screenshot_bytes(&screenshot_path)?;
254    let _ = std::fs::remove_file(&screenshot_path);
255
256    if bytes.len() < 8 || &bytes[..8] != b"\x89PNG\r\n\x1a\n" {
257        return Err(WebCaptureError::ScreenshotError(
258            "Chrome screenshot output was not a valid PNG".to_string(),
259        ));
260    }
261
262    info!("Successfully captured screenshot ({} bytes)", bytes.len());
263    Ok(bytes)
264}
265
/// Builds a fresh path for a temporary screenshot file.
///
/// The path lives under the system temp directory and is named
/// `web-capture-screenshot-<pid>-<nanos since epoch>-<sequence>.png`. The
/// timestamp falls back to `0` if the clock reports a time before the Unix
/// epoch. The per-process sequence number guarantees that two calls from this
/// process never yield the same path, even when they observe the same
/// timestamp (concurrent captures, coarse clocks). Nothing is created on disk
/// here.
fn temporary_screenshot_path() -> PathBuf {
    // Kept local: only this function needs it, and it only has to be unique
    // within the process, so relaxed ordering is sufficient.
    static SCREENSHOT_COUNTER: AtomicU64 = AtomicU64::new(0);

    let nonce = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .map_or(0, |duration| duration.as_nanos());
    let seq = SCREENSHOT_COUNTER.fetch_add(1, Ordering::Relaxed);
    std::env::temp_dir().join(format!(
        "web-capture-screenshot-{}-{nonce}-{seq}.png",
        std::process::id()
    ))
}
275
276fn read_screenshot_bytes(path: &Path) -> Result<Vec<u8>> {
277    std::fs::read(path).map_err(|e| {
278        WebCaptureError::ScreenshotError(format!("Failed to read screenshot file: {e}"))
279    })
280}