Skip to main content

web_capture/
browser.rs

1//! Browser automation module
2//!
3//! This module provides headless browser operations for rendering pages
4//! and capturing screenshots. Note: Full browser automation requires
5//! browser-commander, which depends on having Chrome installed.
6//!
7//! For simpler HTTP fetching without JavaScript rendering, see the html module.
8
9use crate::{Result, WebCaptureError};
10use std::path::{Path, PathBuf};
11use std::sync::atomic::{AtomicU64, Ordering};
12use std::time::{Duration, SystemTime, UNIX_EPOCH};
13use tokio::process::Command;
14use tracing::{debug, info};
15
/// Process-wide sequence number that `temporary_user_data_dir` mixes into each
/// generated directory name, so two names produced at the same timestamp
/// within this process still differ.
static USER_DATA_DIR_COUNTER: AtomicU64 = AtomicU64::new(0);
17
/// Identifies a browser engine.
///
/// Exactly one engine is defined, and it doubles as the [`Default`] value.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum BrowserEngine {
    /// The Chromiumoxide engine; selected by `BrowserEngine::default()`.
    #[default]
    Chromiumoxide,
}
25
26impl std::fmt::Display for BrowserEngine {
27    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
28        match self {
29            Self::Chromiumoxide => write!(f, "chromiumoxide"),
30        }
31    }
32}
33
34impl std::str::FromStr for BrowserEngine {
35    type Err = WebCaptureError;
36
37    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
38        match s.to_lowercase().as_str() {
39            "chromiumoxide" | "chromium" | "chrome" => Ok(Self::Chromiumoxide),
40            _ => Err(WebCaptureError::BrowserError(format!(
41                "Unknown browser engine: {s}"
42            ))),
43        }
44    }
45}
46
47/// Render HTML content from a URL using a headless browser
48///
49/// This function uses browser-commander to launch a headless browser,
50/// navigate to the URL, and return the rendered HTML content.
51///
52/// Note: This requires Chrome/Chromium to be installed on the system.
53///
54/// # Arguments
55///
56/// * `url` - The URL to render
57///
58/// # Returns
59///
60/// The rendered HTML content as a string
61///
62/// # Errors
63///
64/// Returns an error if browser operations fail
65pub async fn render_html(url: &str) -> Result<String> {
66    render_html_with_timeout(url, Duration::from_secs(60)).await
67}
68
69/// Render HTML content from a URL using a headless browser and caller-provided timeout.
70///
71/// # Errors
72///
73/// Returns an error if Chrome is unavailable, fails, or does not finish before `timeout`.
74pub async fn render_html_with_timeout(url: &str, timeout: Duration) -> Result<String> {
75    info!("Rendering HTML for URL: {}", url);
76
77    let chrome = find_chrome_executable().ok_or_else(|| {
78        WebCaptureError::BrowserError(
79            "Chrome/Chromium executable was not found. Set WEB_CAPTURE_CHROME, CHROME_PATH, or GOOGLE_CHROME_BIN.".to_string(),
80        )
81    })?;
82    let user_data_dir = temporary_user_data_dir();
83    std::fs::create_dir_all(&user_data_dir)?;
84    let args = chrome_render_args(&user_data_dir, url);
85    debug!(
86        chrome = %chrome.display(),
87        user_data_dir = %user_data_dir.display(),
88        args = ?args,
89        "launching headless Chrome for DOM capture"
90    );
91
92    let mut command = Command::new(&chrome);
93    command.args(&args).kill_on_drop(true);
94    let output_result = tokio::time::timeout(timeout, command.output()).await;
95    let _ = std::fs::remove_dir_all(&user_data_dir);
96
97    let output = output_result
98        .map_err(|_| {
99            WebCaptureError::BrowserError(format!(
100                "Timed out waiting for headless Chrome to render {url}"
101            ))
102        })?
103        .map_err(|e| WebCaptureError::BrowserError(format!("Failed to launch Chrome: {e}")))?;
104    debug!(
105        status = %output.status,
106        stdout_bytes = output.stdout.len(),
107        stderr_bytes = output.stderr.len(),
108        stderr = %String::from_utf8_lossy(&output.stderr),
109        "headless Chrome DOM capture finished"
110    );
111
112    if !output.status.success() {
113        return Err(WebCaptureError::BrowserError(format!(
114            "Headless Chrome failed with status {}: {}",
115            output.status,
116            String::from_utf8_lossy(&output.stderr)
117        )));
118    }
119
120    let html = String::from_utf8(output.stdout)
121        .map_err(|e| WebCaptureError::BrowserError(format!("Chrome output was not UTF-8: {e}")))?;
122
123    info!("Successfully rendered HTML ({} bytes)", html.len());
124    Ok(html)
125}
126
127fn chrome_render_args(user_data_dir: &Path, url: &str) -> Vec<String> {
128    let mut args = common_chrome_args(user_data_dir);
129    args.extend([
130        "--dump-dom".to_string(),
131        "--timeout=30000".to_string(),
132        "--virtual-time-budget=8000".to_string(),
133        "--run-all-compositor-stages-before-draw".to_string(),
134        "--window-size=1280,800".to_string(),
135        url.to_string(),
136    ]);
137    args
138}
139
/// Builds the Chrome flags shared by every headless invocation in this module.
///
/// The fixed flags come first, in a stable order, and the
/// `--user-data-dir=<user_data_dir>` flag is always last.
fn common_chrome_args(user_data_dir: &Path) -> Vec<String> {
    const FIXED_FLAGS: &[&str] = &[
        "--headless=new",
        "--disable-gpu",
        "--disable-extensions",
        "--disable-dev-shm-usage",
        "--disable-background-networking",
        "--disable-component-update",
        "--disable-default-apps",
        "--disable-sync",
        "--metrics-recording-only",
        "--no-default-browser-check",
        "--no-first-run",
        "--no-sandbox",
    ];

    let mut args: Vec<String> = FIXED_FLAGS
        .iter()
        .map(|flag| (*flag).to_string())
        .collect();
    args.push(format!("--user-data-dir={}", user_data_dir.display()));
    args
}
157
158pub(crate) fn find_chrome_executable() -> Option<PathBuf> {
159    for env_var in [
160        "WEB_CAPTURE_CHROME",
161        "CHROME_PATH",
162        "GOOGLE_CHROME_BIN",
163        "CHROMIUM_PATH",
164    ] {
165        if let Ok(path) = std::env::var(env_var) {
166            let candidate = PathBuf::from(path);
167            if candidate.exists() {
168                return Some(candidate);
169            }
170        }
171    }
172
173    for name in [
174        "google-chrome",
175        "google-chrome-stable",
176        "chromium",
177        "chromium-browser",
178        "chrome",
179    ] {
180        if let Some(path) = find_on_path(name) {
181            return Some(path);
182        }
183    }
184
185    for path in [
186        "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
187        "/Applications/Chromium.app/Contents/MacOS/Chromium",
188        r"C:\Program Files\Google\Chrome\Application\chrome.exe",
189        r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe",
190        r"C:\Program Files\Chromium\Application\chrome.exe",
191    ] {
192        let candidate = PathBuf::from(path);
193        if candidate.exists() {
194            return Some(candidate);
195        }
196    }
197
198    None
199}
200
/// Searches the directories listed in the `PATH` environment variable for a
/// file called `name`.
///
/// Returns the first match, in `PATH` order. Only regular files count:
/// a directory that happens to be called `name` is skipped, since it could
/// never be launched. On Windows, `name.exe` is also tried in each directory.
/// Returns `None` if `PATH` is unset or nothing matches.
fn find_on_path(name: &str) -> Option<PathBuf> {
    let search_path = std::env::var_os("PATH")?;
    std::env::split_paths(&search_path).find_map(|dir| {
        let candidate = dir.join(name);
        if candidate.is_file() {
            return Some(candidate);
        }
        #[cfg(windows)]
        {
            let with_exe = dir.join(format!("{name}.exe"));
            if with_exe.is_file() {
                return Some(with_exe);
            }
        }
        None
    })
}
218
219pub(crate) fn temporary_user_data_dir() -> PathBuf {
220    let nonce = SystemTime::now()
221        .duration_since(UNIX_EPOCH)
222        .map_or(0, |duration| duration.as_nanos());
223    let seq = USER_DATA_DIR_COUNTER.fetch_add(1, Ordering::Relaxed);
224    std::env::temp_dir().join(format!(
225        "web-capture-chrome-{}-{nonce}-{seq}",
226        std::process::id()
227    ))
228}
229
230/// Capture a PNG screenshot of a URL
231///
232/// This function launches headless Chrome/Chromium, navigates to the URL,
233/// and captures a full-page PNG screenshot.
234///
235/// # Arguments
236///
237/// * `url` - The URL to capture
238///
239/// # Returns
240///
241/// The PNG image data as bytes
242///
243/// # Errors
244///
245/// Returns an error if Chrome/Chromium is unavailable or screenshot capture fails.
246pub async fn capture_screenshot(url: &str) -> Result<Vec<u8>> {
247    info!("Capturing screenshot for URL: {}", url);
248
249    let chrome = find_chrome_executable().ok_or_else(|| {
250        WebCaptureError::ScreenshotError(
251            "Chrome/Chromium executable was not found. Set WEB_CAPTURE_CHROME, CHROME_PATH, or GOOGLE_CHROME_BIN.".to_string(),
252        )
253    })?;
254
255    let user_data_dir = temporary_user_data_dir();
256    std::fs::create_dir_all(&user_data_dir).map_err(|e| {
257        WebCaptureError::ScreenshotError(format!("Failed to create temp user data dir: {e}"))
258    })?;
259
260    let screenshot_path = temporary_screenshot_path();
261    let args = chrome_screenshot_args(&user_data_dir, &screenshot_path, url);
262    debug!(
263        chrome = %chrome.display(),
264        user_data_dir = %user_data_dir.display(),
265        screenshot_path = %screenshot_path.display(),
266        args = ?args,
267        "launching headless Chrome for screenshot capture"
268    );
269
270    let output_result = tokio::time::timeout(
271        Duration::from_secs(60),
272        Command::new(&chrome).args(&args).output(),
273    )
274    .await;
275    let _ = std::fs::remove_dir_all(&user_data_dir);
276
277    let output = output_result
278        .map_err(|_| {
279            WebCaptureError::ScreenshotError(format!(
280                "Timed out waiting for headless Chrome to capture {url}"
281            ))
282        })?
283        .map_err(|e| WebCaptureError::ScreenshotError(format!("Failed to launch Chrome: {e}")))?;
284    debug!(
285        status = %output.status,
286        stdout_bytes = output.stdout.len(),
287        stderr_bytes = output.stderr.len(),
288        stderr = %String::from_utf8_lossy(&output.stderr),
289        "headless Chrome screenshot capture finished"
290    );
291
292    if !output.status.success() {
293        let _ = std::fs::remove_file(&screenshot_path);
294        return Err(WebCaptureError::ScreenshotError(format!(
295            "Headless Chrome failed with status {}: {}",
296            output.status,
297            String::from_utf8_lossy(&output.stderr)
298        )));
299    }
300
301    let bytes = read_screenshot_bytes(&screenshot_path)?;
302    let _ = std::fs::remove_file(&screenshot_path);
303
304    if bytes.len() < 8 || &bytes[..8] != b"\x89PNG\r\n\x1a\n" {
305        return Err(WebCaptureError::ScreenshotError(
306            "Chrome screenshot output was not a valid PNG".to_string(),
307        ));
308    }
309
310    info!("Successfully captured screenshot ({} bytes)", bytes.len());
311    Ok(bytes)
312}
313
/// Produces a fresh `.png` path under the system temp directory for Chrome to
/// write a screenshot to.
///
/// The file name combines the process id, the current time in nanoseconds
/// since the Unix epoch (0 if the clock reads earlier than the epoch) and a
/// per-process sequence number. The sequence number keeps two paths
/// generated within the same clock tick from colliding. The file itself is
/// not created here.
fn temporary_screenshot_path() -> PathBuf {
    // Private to this function: only screenshot paths draw from this sequence.
    static SCREENSHOT_PATH_COUNTER: AtomicU64 = AtomicU64::new(0);

    let nonce = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .map_or(0, |duration| duration.as_nanos());
    let seq = SCREENSHOT_PATH_COUNTER.fetch_add(1, Ordering::Relaxed);
    std::env::temp_dir().join(format!(
        "web-capture-screenshot-{}-{nonce}-{seq}.png",
        std::process::id()
    ))
}
323
324fn chrome_screenshot_args(user_data_dir: &Path, screenshot_path: &Path, url: &str) -> Vec<String> {
325    let mut args = common_chrome_args(user_data_dir);
326    args.extend([
327        "--hide-scrollbars".to_string(),
328        "--window-size=1280,800".to_string(),
329        "--timeout=30000".to_string(),
330        format!("--screenshot={}", screenshot_path.display()),
331        url.to_string(),
332    ]);
333    args
334}
335
336fn read_screenshot_bytes(path: &Path) -> Result<Vec<u8>> {
337    std::fs::read(path).map_err(|e| {
338        WebCaptureError::ScreenshotError(format!("Failed to read screenshot file: {e}"))
339    })
340}