use std::sync::atomic::{AtomicU64, Ordering as AtomicOrdering};
use std::time::Duration;
use chromiumoxide::Handler;
use chromiumoxide::browser::{Browser, BrowserConfig as ChromeBrowserConfig};
use chromiumoxide::cdp::browser_protocol::emulation::SetDeviceMetricsOverrideParams;
use chromiumoxide::cdp::browser_protocol::network::{Headers, SetCookieParams, SetExtraHttpHeadersParams};
use tokio_stream::StreamExt;
use tracing::Instrument as _;
use crate::browser_pool::BrowserPool;
use crate::error::CrawlError;
use crate::http::HttpResponse;
use crate::telemetry::attributes::{CRAWL_BROWSER_BACKEND, CRAWL_BROWSER_SESSION_ID, CRAWL_PAGES_RENDERED};
use crate::telemetry::metrics::registry;
use crate::types::{AuthConfig, BrowserBackend, BrowserWait, CookieInfo, CrawlConfig};
/// Process-wide counter used to hand out browser session ids; starts at 1 and is
/// incremented once per chromiumoxide fetch.
static BROWSER_SESSION_COUNTER: AtomicU64 = AtomicU64::new(1);
/// Renders `url` with the browser backend selected in `config.browser.backend`.
///
/// * `Chromiumoxide` is served by `chromiumoxide_fetch`, optionally drawing pages
///   from `pool`.
/// * `Native` is served by `native_fetch`. The `native_executor` parameter only
///   exists when the crate is compiled with the `browser-native` feature; without
///   that feature `native_fetch` takes no executor.
///
/// `prior_cookies`, when present, are forwarded unchanged to the selected backend.
pub(crate) async fn browser_fetch(
    url: &str,
    config: &CrawlConfig,
    prior_cookies: Option<&[CookieInfo]>,
    pool: Option<&BrowserPool>,
    #[cfg(feature = "browser-native")] native_executor: Option<&crawlberg_browser::adapter::NativeBrowserExecutor>,
) -> Result<HttpResponse, CrawlError> {
    match config.browser.backend {
        BrowserBackend::Native => {
            // Exactly one of these two statements survives conditional compilation.
            #[cfg(feature = "browser-native")]
            let outcome = native_fetch(url, config, prior_cookies, native_executor).await;
            #[cfg(not(feature = "browser-native"))]
            let outcome = native_fetch(url, config, prior_cookies).await;
            outcome
        }
        BrowserBackend::Chromiumoxide => chromiumoxide_fetch(url, config, prior_cookies, pool).await,
    }
}
/// Runs a single chromiumoxide fetch inside a `crawl.browser.session` tracing span.
///
/// A fresh session id is taken from `BROWSER_SESSION_COUNTER` and attached to the
/// span. The `browser_sessions_active` metric is incremented for the duration of
/// the call; the matching decrement lives in a `Drop` guard so it also runs when
/// the fetch returns an error or the future is dropped early.
async fn chromiumoxide_fetch(
    url: &str,
    config: &CrawlConfig,
    prior_cookies: Option<&[CookieInfo]>,
    pool: Option<&BrowserPool>,
) -> Result<HttpResponse, CrawlError> {
    /// Decrements the active-session metric when it goes out of scope.
    struct ActiveSession;

    impl Drop for ActiveSession {
        fn drop(&mut self) {
            registry().browser_sessions_active.add(-1, &[]);
        }
    }

    let session_id = BROWSER_SESSION_COUNTER.fetch_add(1, AtomicOrdering::Relaxed);
    let session_span = tracing::info_span!(
        "crawl.browser.session",
        { CRAWL_BROWSER_BACKEND } = "chromiumoxide",
        { CRAWL_BROWSER_SESSION_ID } = %session_id,
        { CRAWL_PAGES_RENDERED } = 1_i64,
    );

    registry().browser_sessions_active.add(1, &[]);
    let _active = ActiveSession;

    let fetch = chromiumoxide_fetch_inner(url, config, prior_cookies, pool);
    fetch.instrument(session_span).await
}
/// Fetches `url` with chromiumoxide, either through a shared `pool` or through a
/// browser that is launched (or connected to) just for this call.
///
/// # Pooled path (`pool` is `Some`)
/// When `config.browser.session_affinity` is set, a page is first looked up in
/// `config.browser_session_pool` under a key derived from the URL and proxy; if
/// none is available a page is acquired from `pool`. After a successful fetch with
/// affinity enabled the page is handed back to the session pool; in every other
/// case the page is closed.
///
/// # One-shot path (`pool` is `None`)
/// A browser is obtained from `launch_or_connect`, a blank page is opened and
/// fetched, and then the page, the browser, the handler task and any temporary
/// user-data directory are torn down. Teardown runs even when opening the page
/// fails, so a failed `new_page` no longer leaves the browser unclosed or the
/// temporary directory behind.
///
/// # Errors
/// Returns `CrawlError::BrowserError` when session affinity is enabled without a
/// session pool or when the page cannot be created, and propagates errors from
/// key derivation, page acquisition, browser launch and `page_fetch`.
async fn chromiumoxide_fetch_inner(
    url: &str,
    config: &CrawlConfig,
    prior_cookies: Option<&[CookieInfo]>,
    pool: Option<&BrowserPool>,
) -> Result<HttpResponse, CrawlError> {
    if let Some(pool) = pool {
        let page = if config.browser.session_affinity {
            let session_key = crate::browser_session_pool::SessionKey::from_url(
                url,
                config.browser.proxy.as_ref().map(|p| p.url.as_str()),
            )?;
            let session_pool = config.browser_session_pool.as_deref().ok_or_else(|| {
                CrawlError::BrowserError("session_affinity enabled but session pool is not configured".into())
            })?;
            if let Some(pooled_page) = session_pool.acquire(&session_key).await {
                pooled_page
            } else {
                // NOTE(review): `pooled` is dropped at the end of this block while the
                // cloned page is still in use - confirm that dropping the pool guard
                // does not close or recycle the underlying page.
                let pooled = pool.acquire_page().await?;
                pooled.page().clone()
            }
        } else {
            let pooled = pool.acquire_page().await?;
            pooled.page().clone()
        };

        let result = page_fetch(url, config, &page, prior_cookies).await;

        // Only a page that rendered successfully is worth keeping for reuse;
        // anything else is closed on a best-effort basis.
        if config.browser.session_affinity
            && result.is_ok()
            && let Ok(session_key) = crate::browser_session_pool::SessionKey::from_url(
                url,
                config.browser.proxy.as_ref().map(|p| p.url.as_str()),
            )
            && let Some(session_pool) = config.browser_session_pool.as_deref()
        {
            session_pool.insert(session_key, page).await;
        } else {
            let _ = page.close().await;
        }
        result
    } else {
        let (mut browser, mut handler, data_dir) = launch_or_connect(config).await?;
        // The handler stream must be polled continuously for the browser connection
        // to make progress; drain it on a background task.
        let handler_handle = tokio::spawn(async move { while handler.next().await.is_some() {} });

        // Convert the error eagerly and do NOT return early: the teardown below has
        // to run whether or not the page could be created.
        let page_result = browser
            .new_page("about:blank")
            .await
            .map_err(|e| CrawlError::BrowserError(format!("failed to create page: {e}")));
        let result = match page_result {
            Ok(page) => {
                let fetched = page_fetch(url, config, &page, prior_cookies).await;
                let _ = page.close().await;
                fetched
            }
            Err(e) => Err(e),
        };

        // Best-effort teardown; failures here must not mask the fetch result.
        let _ = browser.close().await;
        let _ = browser.wait().await;
        drop(browser);
        // Give the handler task a bounded amount of time to finish draining.
        let _ = tokio::time::timeout(Duration::from_secs(5), handler_handle).await;
        if let Some(dir) = data_dir {
            let _ = std::fs::remove_dir_all(&dir);
        }
        result
    }
}
/// Fetches `url` through the native browser backend (`browser-native` builds only).
///
/// # Errors
/// Returns `CrawlError::BrowserError` when no executor was supplied; otherwise
/// returns whatever `native_browser_fetch` produces.
#[cfg(feature = "browser-native")]
async fn native_fetch(
    url: &str,
    config: &CrawlConfig,
    prior_cookies: Option<&[CookieInfo]>,
    native_executor: Option<&crawlberg_browser::adapter::NativeBrowserExecutor>,
) -> Result<HttpResponse, CrawlError> {
    let Some(executor) = native_executor else {
        return Err(CrawlError::BrowserError(
            "native browser executor is not available for BrowserBackend::Native".into(),
        ));
    };
    crate::native_browser::native_browser_fetch(url, config, prior_cookies, executor).await
}
/// Stand-in for the native backend when the `browser-native` feature is disabled.
///
/// All arguments are ignored.
///
/// # Errors
/// Always returns `CrawlError::InvalidConfig`.
#[cfg(not(feature = "browser-native"))]
async fn native_fetch(
    _url: &str,
    _config: &CrawlConfig,
    _prior_cookies: Option<&[CookieInfo]>,
) -> Result<HttpResponse, CrawlError> {
    let reason = "browser.backend = native requires the browser-native feature";
    Err(CrawlError::InvalidConfig(reason.into()))
}
async fn page_fetch(
url: &str,
config: &CrawlConfig,
page: &chromiumoxide::Page,
prior_cookies: Option<&[CookieInfo]>,
) -> Result<HttpResponse, CrawlError> {
let stealth = matches!(config.browser.mode, crate::types::BrowserMode::Stealth);
if stealth {
crate::stealth::apply_stealth_patches(page).await;
}
let resolved_ua = if let Some(ref ua) = config.user_agent {
ua.clone()
} else if stealth {
resolve_default_user_agent().to_string()
} else {
"".to_string()
};
if !resolved_ua.is_empty() {
page.set_user_agent(&resolved_ua)
.await
.map_err(|e| CrawlError::BrowserError(format!("failed to set user agent: {e}")))?;
}
if stealth && let Err(e) = set_viewport(page, 1920, 1080).await {
return Err(CrawlError::BrowserError(format!("failed to set viewport: {e}")));
}
if let Some(cookies) = prior_cookies {
for cookie in cookies {
let mut builder = SetCookieParams::builder().name(&cookie.name).value(&cookie.value);
if let Some(ref domain) = cookie.domain {
builder = builder.domain(domain);
}
if let Some(ref path) = cookie.path {
builder = builder.path(path);
}
if let Ok(params) = builder.build() {
let _ = page.execute(params).await;
}
}
}
let mut extra_headers = serde_json::Map::new();
for (k, v) in &config.custom_headers {
extra_headers.insert(k.clone(), serde_json::Value::String(v.clone()));
}
match config.auth {
Some(AuthConfig::Bearer { ref token }) => {
extra_headers.insert(
"Authorization".to_owned(),
serde_json::Value::String(format!("Bearer {token}")),
);
}
Some(AuthConfig::Header { ref name, ref value }) => {
extra_headers.insert(name.clone(), serde_json::Value::String(value.clone()));
}
_ => {}
}
if !extra_headers.is_empty() {
let params = SetExtraHttpHeadersParams::new(Headers::new(serde_json::Value::Object(extra_headers)));
page.execute(params)
.await
.map_err(|e| CrawlError::BrowserError(format!("failed to set headers: {e}")))?;
}
let timeout = config.browser.timeout;
tokio::time::timeout(timeout, async {
page.goto(url)
.await
.map_err(|e| CrawlError::BrowserError(format!("navigation failed: {e}")))?;
wait_for_ready(page, config)
.await
.map_err(|e| CrawlError::BrowserError(format!("wait failed: {e}")))?;
Ok::<(), CrawlError>(())
})
.await
.map_err(|_| CrawlError::BrowserTimeout(format!("browser timed out after {timeout:?}")))??;
if let Some(extra) = config.browser.extra_wait {
tokio::time::sleep(extra).await;
}
let html = page
.content()
.await
.map_err(|e| CrawlError::BrowserError(format!("failed to extract HTML: {e}")))?;
let body_bytes = html.as_bytes().to_vec();
Ok(HttpResponse {
status: 200,
content_type: "text/html".to_owned(),
body: html,
body_bytes,
headers: std::collections::HashMap::new(),
browser_extras: None,
final_url: url.to_owned(),
})
}
/// Waits until the page is considered ready according to `config.browser.wait`.
///
/// * `NetworkIdle` - sleeps for 500 ms.
/// * `Selector` - polls for `config.browser.wait_selector` every 100 ms until the
///   element is found or `config.browser.timeout` has elapsed. A single lookup is
///   not enough because the element may be rendered after navigation completes.
///   If no selector is configured, sleeps for 500 ms instead.
/// * `Fixed` - sleeps for 2 s.
///
/// # Errors
/// In `Selector` mode, returns the error of the last lookup once the deadline has
/// passed without the element being found.
async fn wait_for_ready(
    page: &chromiumoxide::Page,
    config: &CrawlConfig,
) -> Result<(), chromiumoxide::error::CdpError> {
    /// Delay between two consecutive selector lookups.
    const SELECTOR_POLL_INTERVAL: Duration = Duration::from_millis(100);

    match config.browser.wait {
        BrowserWait::NetworkIdle => {
            tokio::time::sleep(Duration::from_millis(500)).await;
        }
        BrowserWait::Selector => {
            if let Some(ref selector) = config.browser.wait_selector {
                // Local deadline so this function terminates on its own even if a
                // caller does not wrap it in a timeout.
                let deadline = tokio::time::Instant::now() + config.browser.timeout;
                loop {
                    // The lookup result is fully consumed inside this statement so
                    // that nothing from it is held across the sleep below.
                    match page.find_element(selector).await {
                        Ok(_) => break,
                        Err(e) => {
                            if tokio::time::Instant::now() >= deadline {
                                return Err(e);
                            }
                        }
                    }
                    tokio::time::sleep(SELECTOR_POLL_INTERVAL).await;
                }
            } else {
                tokio::time::sleep(Duration::from_millis(500)).await;
            }
        }
        BrowserWait::Fixed => {
            tokio::time::sleep(Duration::from_secs(2)).await;
        }
    }
    Ok(())
}
/// Obtains a browser for a one-shot fetch.
///
/// If `config.browser.endpoint` is set, connects to that endpoint and returns no
/// data directory. Otherwise launches a headless browser with a user-data
/// directory under the system temp dir, named from the process id and a
/// per-process launch counter, and returns that directory so the caller can
/// remove it afterwards.
///
/// # Errors
/// Returns `CrawlError::BrowserError` when connecting fails, when the browser
/// configuration is invalid, or when the launch fails. On launch failure the
/// user-data directory is removed (best effort) before returning.
async fn launch_or_connect(config: &CrawlConfig) -> Result<(Browser, Handler, Option<std::path::PathBuf>), CrawlError> {
    use std::sync::atomic::{AtomicU64, Ordering as AtomicOrdering};
    static LAUNCH_COUNTER: AtomicU64 = AtomicU64::new(0);

    // Remote endpoint configured: connect and stop here.
    if let Some(endpoint) = config.browser.endpoint.as_ref() {
        return Browser::connect(endpoint)
            .await
            .map(|(browser, handler)| (browser, handler, None))
            .map_err(|e| CrawlError::BrowserError(format!("failed to connect to {endpoint}: {e}")));
    }

    let launch_seq = LAUNCH_COUNTER.fetch_add(1, AtomicOrdering::Relaxed);
    let dir_name = format!("crawlberg-browser-{}-{}", std::process::id(), launch_seq);
    let user_data_dir = std::env::temp_dir().join(dir_name);

    let base = ChromeBrowserConfig::builder()
        .no_sandbox()
        .new_headless_mode()
        .user_data_dir(&user_data_dir)
        .disable_default_args()
        .env("OBJC_DISABLE_INITIALIZE_FORK_SAFETY", "YES")
        .env("OS_ACTIVITY_MODE", "disable");
    // Default args were disabled above; add back the crate's own vetted set.
    let builder = crate::browser_pool::safe_default_args()
        .into_iter()
        .fold(base, |acc, arg| acc.arg(arg));

    let browser_config = builder
        .build()
        .map_err(|e| CrawlError::BrowserError(format!("invalid browser config: {e}")))?;

    let (browser, handler) = match Browser::launch(browser_config).await {
        Ok(launched) => launched,
        Err(e) => {
            let _ = std::fs::remove_dir_all(&user_data_dir);
            return Err(CrawlError::BrowserError(format!("failed to launch browser: {e}")));
        }
    };
    Ok((browser, handler, Some(user_data_dir)))
}
/// Returns the built-in user-agent string (a desktop Chrome on Linux x86_64).
fn resolve_default_user_agent() -> &'static str {
    const DEFAULT_USER_AGENT: &str =
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36";
    DEFAULT_USER_AGENT
}
/// Overrides the device metrics of `page` to a `width` x `height` viewport with a
/// device scale factor of 1.0.
///
/// # Errors
/// Returns an error when the override parameters cannot be built or when the
/// command sent to the page fails.
async fn set_viewport(page: &chromiumoxide::Page, width: u32, height: u32) -> Result<(), Box<dyn std::error::Error>> {
    let metrics = SetDeviceMetricsOverrideParams::builder()
        .width(width)
        .height(height)
        .device_scale_factor(1.0)
        .build()?;
    let _response = page.execute(metrics).await?;
    Ok(())
}