use anyhow::{Context, Result};
use chromiumoxide::cdp::browser_protocol::page::{
EventLifecycleEvent, SetLifecycleEventsEnabledParams,
};
use chromiumoxide::{Browser, BrowserConfig, Page};
use futures::StreamExt;
use std::path::PathBuf;
use std::sync::Arc;
use std::time::Duration;
use tokio::sync::Semaphore;
/// Upper bound, in milliseconds, on how long `wait_for_network_idle` waits for a
/// `networkIdle` lifecycle event before giving up.
const NETWORK_IDLE_TIMEOUT_MS: u64 = 10000;
/// Locates a Chrome/Chromium executable by probing well-known install paths.
///
/// The list of locations is chosen at compile time for the target OS (Linux,
/// macOS or Windows; it is empty on any other target). Locations are probed in
/// order and the first one that exists on disk is returned. `None` means that
/// none of the probed paths exist.
fn detect_chrome_path() -> Option<PathBuf> {
    #[cfg(target_os = "linux")]
    let known_locations = [
        "/usr/bin/google-chrome-stable",
        "/usr/bin/google-chrome",
        "/usr/bin/chromium-browser",
        "/usr/bin/chromium",
        "/snap/bin/chromium",
        "/usr/local/bin/chrome",
        "/usr/local/bin/chromium",
    ];

    #[cfg(target_os = "macos")]
    let known_locations = [
        "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
        "/Applications/Chromium.app/Contents/MacOS/Chromium",
        "/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary",
    ];

    // Per-user install location; an unset LOCALAPPDATA yields an empty prefix.
    #[cfg(target_os = "windows")]
    let per_user_install = format!(
        r"{}\Google\Chrome\Application\chrome.exe",
        std::env::var("LOCALAPPDATA").unwrap_or_default()
    );
    #[cfg(target_os = "windows")]
    let known_locations = [
        r"C:\Program Files\Google\Chrome\Application\chrome.exe",
        r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe",
        per_user_install.as_str(),
    ];

    #[cfg(not(any(target_os = "linux", target_os = "macos", target_os = "windows")))]
    let known_locations: [&str; 0] = [];

    // Lazy chain: stops probing the filesystem at the first hit.
    known_locations
        .iter()
        .copied()
        .map(PathBuf::from)
        .find(|location| location.exists())
}
/// A single launched browser shared by all pages, with page creation gated by
/// a semaphore.
pub struct BrowserPool {
/// Handle to the launched Chrome/Chromium instance.
browser: Browser,
/// Limits how many `BrowserPage`s may be outstanding at once; each page
/// returned by `new_page` holds one permit.
semaphore: Arc<Semaphore>,
/// User-Agent string applied to every page created by `new_page`.
user_agent: String,
}
impl BrowserPool {
    /// Launches a headless Chrome/Chromium instance and builds a pool around it.
    ///
    /// `concurrency` is the maximum number of pages that may be open at the
    /// same time. A value of `0` is treated as `1`: a semaphore with zero
    /// permits would make every [`BrowserPool::new_page`] call wait forever.
    ///
    /// # Errors
    ///
    /// Fails if no browser executable is found at the probed locations, if the
    /// browser configuration is rejected, or if the browser cannot be launched.
    pub async fn new(concurrency: usize) -> Result<Self> {
        let chrome_path = detect_chrome_path().ok_or_else(|| {
            anyhow::anyhow!(
                "Chrome/Chromium not found. Searched paths:\n \
                 Linux: /usr/bin/google-chrome-stable, /usr/bin/google-chrome, /usr/bin/chromium-browser, /usr/bin/chromium\n \
                 macOS: /Applications/Google Chrome.app/...\n \
                 Windows: C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe\n\
                 Please install Chrome or Chromium."
            )
        })?;

        let config = BrowserConfig::builder()
            .chrome_executable(chrome_path)
            .no_sandbox()
            .arg("--disable-gpu")
            .arg("--disable-dev-shm-usage")
            .arg("--disable-setuid-sandbox")
            .arg("--no-first-run")
            .arg("--headless=new")
            .build()
            .map_err(|e| anyhow::anyhow!("Browser config error: {e}"))?;

        let (browser, mut handler) = Browser::launch(config)
            .await
            .context("Failed to launch Chrome")?;

        // Keep polling the handler stream in the background for as long as it
        // yields items; individual items (including errors) are ignored.
        tokio::spawn(async move { while handler.next().await.is_some() {} });

        // Never create a zero-permit semaphore: `new_page` would block forever.
        let max_pages = concurrency.max(1);

        Ok(Self {
            browser,
            semaphore: Arc::new(Semaphore::new(max_pages)),
            user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36".to_string(),
        })
    }

    /// Opens a new blank page, waiting first for a free concurrency slot.
    ///
    /// The page gets the pool's User-Agent override and has lifecycle events
    /// enabled. The returned [`BrowserPage`] owns the semaphore permit, so the
    /// slot is released when the page value is dropped.
    ///
    /// # Errors
    ///
    /// Fails if the semaphore has been closed, if the page cannot be created,
    /// or if either setup command is rejected by the browser.
    pub async fn new_page(&self) -> Result<BrowserPage> {
        let permit = self
            .semaphore
            .clone()
            .acquire_owned()
            .await
            .context("Browser pool semaphore is closed")?;

        let page = self
            .browser
            .new_page("about:blank")
            .await
            .context("Failed to open a new page")?;

        page.execute(
            chromiumoxide::cdp::browser_protocol::network::SetUserAgentOverrideParams::new(
                &self.user_agent,
            ),
        )
        .await
        .context("Failed to set the user agent override")?;

        page.execute(SetLifecycleEventsEnabledParams::new(true))
            .await
            .context("Failed to enable lifecycle events")?;

        Ok(BrowserPage {
            page,
            _permit: permit,
        })
    }

    /// Asks the browser to close, consuming the pool.
    ///
    /// # Errors
    ///
    /// Fails if the close request is rejected or cannot be delivered.
    pub async fn close(mut self) -> Result<()> {
        self.browser
            .close()
            .await
            .context("Failed to close the browser")?;
        Ok(())
    }
}
/// A browser page handed out by `BrowserPool::new_page`.
pub struct BrowserPage {
/// The underlying chromiumoxide page.
page: Page,
/// Semaphore permit held for as long as this value lives; never read, it
/// exists only so that dropping the `BrowserPage` frees the pool slot.
_permit: tokio::sync::OwnedSemaphorePermit,
}
impl BrowserPage {
    /// Navigates to `url` and summarizes the outcome as a [`PageResult`].
    ///
    /// `timeout_ms` bounds only the navigation itself. After a successful
    /// navigation the method additionally waits (best effort, bounded by
    /// `NETWORK_IDLE_TIMEOUT_MS`) for a `networkIdle` lifecycle event.
    ///
    /// Navigation failures and timeouts are not returned as `Err`; they are
    /// reported through `PageResult::error` with `status == 0`.
    ///
    /// # Errors
    ///
    /// Returns `Err` only if subscribing to lifecycle events fails.
    pub async fn goto(&self, url: &str, timeout_ms: u64) -> Result<PageResult> {
        // Subscribed before navigating, presumably so that lifecycle events
        // emitted while the navigation is in flight are still observed.
        let mut lifecycle = self
            .page
            .event_listener::<EventLifecycleEvent>()
            .await
            .context("Failed to subscribe to lifecycle events")?;

        let nav_result =
            tokio::time::timeout(Duration::from_millis(timeout_ms), self.page.goto(url)).await;

        match nav_result {
            Ok(Ok(_)) => {
                self.wait_for_network_idle(&mut lifecycle).await;
                // Read the title once and derive the status from that same
                // value, so the two fields cannot disagree and only one
                // round trip to the browser is needed.
                let title = self.page.get_title().await.ok().flatten();
                let status = title.as_deref().map_or(200, Self::status_from_title);
                Ok(PageResult {
                    status,
                    title,
                    error: None,
                })
            }
            Ok(Err(e)) => {
                let message = e.to_string();
                let (status, _) = parse_error(&message);
                Ok(PageResult {
                    status,
                    title: None,
                    error: Some(message),
                })
            }
            Err(_) => Ok(PageResult {
                status: 0,
                title: None,
                error: Some("Navigation timeout".to_string()),
            }),
        }
    }

    /// Consumes lifecycle events until one named `networkIdle` arrives, the
    /// stream ends, or `NETWORK_IDLE_TIMEOUT_MS` elapses.
    ///
    /// This is best effort: the reason the wait stopped is deliberately
    /// discarded and the caller proceeds in every case.
    async fn wait_for_network_idle(
        &self,
        lifecycle: &mut chromiumoxide::listeners::EventStream<EventLifecycleEvent>,
    ) {
        let until_idle = async {
            while let Some(event) = lifecycle.next().await {
                if event.name == "networkIdle" {
                    return WaitResult::NetworkIdle;
                }
            }
            WaitResult::StreamEnded
        };
        let _ = tokio::time::timeout(
            Duration::from_millis(NETWORK_IDLE_TIMEOUT_MS),
            until_idle,
        )
        .await;
    }

    /// Guesses an HTTP-like status code from a page title.
    ///
    /// This is a case-insensitive substring heuristic, not the real response
    /// status: it returns 404, 403 or 500 when the title mentions the matching
    /// code or phrase (checked in that order) and 200 otherwise. Titles that
    /// merely contain such digits (e.g. "Top 500") are therefore misreported.
    fn status_from_title(title: &str) -> u16 {
        let lowered = title.to_lowercase();
        let mentions =
            |needles: &[&str]| needles.iter().any(|needle| lowered.contains(*needle));

        if mentions(&["404", "not found"]) {
            404
        } else if mentions(&["403", "forbidden", "access denied"]) {
            403
        } else if mentions(&["500", "internal server error"]) {
            500
        } else {
            200
        }
    }

    /// Returns the page's current HTML content.
    ///
    /// # Errors
    ///
    /// Fails if the content cannot be retrieved from the browser.
    pub async fn content(&self) -> Result<String> {
        self.page
            .content()
            .await
            .context("Failed to get page content")
    }

    /// Returns the page's current URL, or `None` if it is unavailable or the
    /// lookup fails.
    pub async fn current_url(&self) -> Option<String> {
        self.page.url().await.ok().flatten()
    }
}
/// Reason the event loop inside `wait_for_network_idle` stopped.
enum WaitResult {
/// A lifecycle event named `networkIdle` was observed.
NetworkIdle,
/// The lifecycle event stream ended without a `networkIdle` event.
StreamEnded,
}
/// Outcome of a `BrowserPage::goto` call.
#[derive(Debug)]
pub struct PageResult {
/// `0` when navigation failed or timed out; otherwise a status guessed from
/// the page title (404, 403 or 500 when the title suggests it, else 200).
pub status: u16,
/// Page title, set only after a successful navigation and only if a title
/// could be read.
pub title: Option<String>,
/// Error text when navigation failed or timed out; `None` on success.
pub error: Option<String>,
}
/// Classifies a navigation error message into a `(status, category)` pair.
///
/// The status is always `0`, since a failed navigation has no HTTP status.
/// The category is picked by the first rule whose marker occurs in `error`
/// (case-sensitive substring match):
///
/// * `ERR_NAME_NOT_RESOLVED` -> `DNS_FAILED`
/// * `ERR_CONNECTION_REFUSED` -> `CONNECTION_REFUSED`
/// * `ERR_CONNECTION_TIMED_OUT` -> `TIMEOUT`
/// * `ERR_CERT` or `SSL` -> `SSL_ERROR`
/// * `ERR_TIMED_OUT` -> `TIMEOUT`
/// * `ERR_NAME_RESOLUTION_FAILED` -> `DNS_FAILED`
///
/// Anything else, including an empty message, is `NETWORK_ERROR`.
fn parse_error(error: &str) -> (u16, String) {
    // Ordered rules. The last two cover Chrome's generic timeout and resolver
    // failure codes; they sit after the older rules so that every message
    // those rules already classified keeps its category.
    const RULES: [(&[&str], &str); 6] = [
        (&["ERR_NAME_NOT_RESOLVED"], "DNS_FAILED"),
        (&["ERR_CONNECTION_REFUSED"], "CONNECTION_REFUSED"),
        (&["ERR_CONNECTION_TIMED_OUT"], "TIMEOUT"),
        (&["ERR_CERT", "SSL"], "SSL_ERROR"),
        (&["ERR_TIMED_OUT"], "TIMEOUT"),
        (&["ERR_NAME_RESOLUTION_FAILED"], "DNS_FAILED"),
    ];

    let category = RULES
        .iter()
        .find(|rule| rule.0.iter().any(|marker| error.contains(*marker)))
        .map_or("NETWORK_ERROR", |rule| rule.1);

    (0, category.to_string())
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Known error markers map to their category; anything else is generic.
    #[test]
    fn test_parse_error() {
        let cases = [
            ("net::ERR_NAME_NOT_RESOLVED", "DNS_FAILED"),
            ("ERR_CONNECTION_REFUSED", "CONNECTION_REFUSED"),
            ("random error", "NETWORK_ERROR"),
        ];
        for (message, expected) in cases {
            let (_, category) = parse_error(message);
            assert_eq!(category, expected);
        }
    }

    /// Whatever path detection reports must actually exist on disk.
    #[test]
    fn test_detect_chrome_path() {
        if let Some(found) = detect_chrome_path() {
            assert!(found.exists());
        }
    }

    /// Guards the idle timeout against being tuned outside a sensible window.
    #[test]
    fn test_network_idle_timeout_reasonable() {
        let timeout_ms = NETWORK_IDLE_TIMEOUT_MS;
        assert!(timeout_ms >= 5000, "Timeout too short for SPAs");
        assert!(
            timeout_ms <= 30000,
            "Timeout too long, will slow down all fetches"
        );
    }

    /// Both wait outcomes can be constructed and told apart.
    #[test]
    fn test_wait_result_variants() {
        let saw_idle = WaitResult::NetworkIdle;
        let stream_closed = WaitResult::StreamEnded;
        assert!(matches!(saw_idle, WaitResult::NetworkIdle));
        assert!(matches!(stream_closed, WaitResult::StreamEnded));
    }
}