use std::time::Duration;
use chromiumoxide::Handler;
use chromiumoxide::browser::{Browser, BrowserConfig as ChromeBrowserConfig};
use chromiumoxide::cdp::browser_protocol::network::{Headers, SetCookieParams, SetExtraHttpHeadersParams};
use tokio_stream::StreamExt;
use crate::browser_pool::BrowserPool;
use crate::error::CrawlError;
use crate::http::HttpResponse;
use crate::types::{AuthConfig, BrowserWait, CookieInfo, CrawlConfig};
/// Fetch `url` by rendering it in a real browser.
///
/// Uses a page from `pool` when one is provided; otherwise launches (or
/// connects to) a browser just for this request and tears it down afterwards.
///
/// Fixes over the previous version:
/// - a `new_page` failure no longer early-returns past cleanup (the handler
///   task and the temp profile directory were leaked before);
/// - the self-launched browser is closed and waited on before its profile
///   directory is deleted, instead of being dropped while Chrome may still
///   be writing to it;
/// - the handler task is aborted if it fails to finish within the grace period.
pub(crate) async fn browser_fetch(
url: &str,
config: &CrawlConfig,
prior_cookies: Option<&[CookieInfo]>,
pool: Option<&BrowserPool>,
) -> Result<HttpResponse, CrawlError> {
if let Some(pool) = pool {
let pooled = pool.acquire_page().await?;
let result = page_fetch(url, config, pooled.page(), prior_cookies).await;
pooled.close().await;
result
} else {
let (mut browser, mut handler, data_dir) = launch_or_connect(config).await?;
// Drive the CDP event stream; it ends when the browser connection closes.
let mut handler_handle =
tokio::spawn(async move { while handler.next().await.is_some() {} });
// Do NOT `?` here: an early return would skip the cleanup below.
let result = match browser
.new_page("about:blank")
.await
.map_err(|e| CrawlError::BrowserError(format!("failed to create page: {e}")))
{
Ok(page) => page_fetch(url, config, &page, prior_cookies).await,
Err(e) => Err(e),
};
// Gracefully shut the browser down so Chrome flushes and releases its
// profile directory before we delete it. Best-effort: errors ignored.
let _ = browser.close().await;
let _ = browser.wait().await;
// Closing the browser ends the handler stream; bound the join and
// abort the task outright if it somehow hangs.
if tokio::time::timeout(Duration::from_secs(5), &mut handler_handle)
.await
.is_err()
{
handler_handle.abort();
}
if let Some(dir) = data_dir {
let _ = std::fs::remove_dir_all(&dir);
}
result
}
}
/// Drive `page` through a complete fetch of `url`: apply the configured
/// identity (user agent, replayed cookies, extra headers), navigate, wait for
/// the configured readiness condition, and return the rendered DOM.
///
/// NOTE(review): the real HTTP status and content type are not surfaced by
/// this path — the response is always reported as `200 text/html`.
async fn page_fetch(
url: &str,
config: &CrawlConfig,
page: &chromiumoxide::Page,
prior_cookies: Option<&[CookieInfo]>,
) -> Result<HttpResponse, CrawlError> {
// User agent must be in place before any navigation happens.
if let Some(ref ua) = config.user_agent {
page.set_user_agent(ua)
.await
.map_err(|e| CrawlError::BrowserError(format!("failed to set user agent: {e}")))?;
}
// Replay cookies collected earlier in the crawl; best-effort, so both a
// builder failure and a CDP failure are silently skipped.
for cookie in prior_cookies.unwrap_or(&[]) {
let mut params = SetCookieParams::builder().name(&cookie.name).value(&cookie.value);
if let Some(ref domain) = cookie.domain {
params = params.domain(domain);
}
if let Some(ref path) = cookie.path {
params = params.path(path);
}
if let Ok(built) = params.build() {
let _ = page.execute(built).await;
}
}
// Merge user-supplied headers and the auth header into one CDP call.
let mut header_map = serde_json::Map::new();
header_map.extend(
config
.custom_headers
.iter()
.map(|(k, v)| (k.clone(), serde_json::Value::String(v.clone()))),
);
match config.auth {
Some(AuthConfig::Bearer { ref token }) => {
header_map.insert(
"Authorization".to_owned(),
serde_json::Value::String(format!("Bearer {token}")),
);
}
Some(AuthConfig::Header { ref name, ref value }) => {
header_map.insert(name.clone(), serde_json::Value::String(value.clone()));
}
_ => {}
}
if !header_map.is_empty() {
let headers = Headers::new(serde_json::Value::Object(header_map));
page.execute(SetExtraHttpHeadersParams::new(headers))
.await
.map_err(|e| CrawlError::BrowserError(format!("failed to set headers: {e}")))?;
}
// Navigation and the readiness wait share a single overall deadline.
let timeout = config.browser.timeout;
let navigate_and_settle = async {
page.goto(url)
.await
.map_err(|e| CrawlError::BrowserError(format!("navigation failed: {e}")))?;
wait_for_ready(page, config)
.await
.map_err(|e| CrawlError::BrowserError(format!("wait failed: {e}")))?;
Ok::<(), CrawlError>(())
};
tokio::time::timeout(timeout, navigate_and_settle)
.await
.map_err(|_| CrawlError::BrowserTimeout(format!("browser timed out after {timeout:?}")))??;
// Optional extra settle time after readiness (e.g. for late-running JS).
if let Some(extra) = config.browser.extra_wait {
tokio::time::sleep(extra).await;
}
let html = page
.content()
.await
.map_err(|e| CrawlError::BrowserError(format!("failed to extract HTML: {e}")))?;
Ok(HttpResponse {
status: 200,
content_type: "text/html".to_owned(),
body_bytes: html.as_bytes().to_vec(),
body: html,
})
}
/// Wait until the page satisfies the configured readiness condition.
///
/// This function is only called from `page_fetch`, which wraps it in the
/// overall browser timeout; the selector polling loop below relies on that
/// outer deadline for termination.
async fn wait_for_ready(
page: &chromiumoxide::Page,
config: &CrawlConfig,
) -> Result<(), chromiumoxide::error::CdpError> {
match config.browser.wait {
BrowserWait::NetworkIdle => {
// NOTE(review): real CDP network-idle tracking is not wired up;
// approximated with a fixed grace period.
tokio::time::sleep(Duration::from_millis(500)).await;
}
BrowserWait::Selector => {
if let Some(ref selector) = config.browser.wait_selector {
// Poll instead of a one-shot query: `find_element` fails
// immediately when the node has not rendered yet, so a single
// call would error out before the page finished loading.
// Bounded by the caller's timeout (see doc comment).
while page.find_element(selector).await.is_err() {
tokio::time::sleep(Duration::from_millis(100)).await;
}
} else {
// Selector mode without a selector: fall back to a short wait.
tokio::time::sleep(Duration::from_millis(500)).await;
}
}
BrowserWait::Fixed => {
tokio::time::sleep(Duration::from_secs(2)).await;
}
}
Ok(())
}
/// Obtain a browser: attach to the configured remote DevTools endpoint if one
/// is set, otherwise spawn a headless Chrome locally with a unique temporary
/// profile directory.
///
/// Returns the browser, its CDP event handler stream, and — only for locally
/// launched instances — the profile directory the caller must remove once the
/// browser has shut down.
async fn launch_or_connect(config: &CrawlConfig) -> Result<(Browser, Handler, Option<std::path::PathBuf>), CrawlError> {
match config.browser.endpoint {
Some(ref endpoint) => {
let (browser, handler) = Browser::connect(endpoint)
.await
.map_err(|e| CrawlError::BrowserError(format!("failed to connect to {endpoint}: {e}")))?;
Ok((browser, handler, None))
}
None => {
use std::sync::atomic::{AtomicU64, Ordering as AtomicOrdering};
// pid + per-process counter keeps concurrent launches from ever
// colliding on the same profile directory.
static LAUNCH_COUNTER: AtomicU64 = AtomicU64::new(0);
let seq = LAUNCH_COUNTER.fetch_add(1, AtomicOrdering::Relaxed);
let profile_dir = std::env::temp_dir()
.join(format!("kreuzcrawl-browser-{}-{}", std::process::id(), seq));
let launch_opts = ChromeBrowserConfig::builder()
.no_sandbox()
.new_headless_mode()
.user_data_dir(&profile_dir)
.build()
.map_err(|e| CrawlError::BrowserError(format!("invalid browser config: {e}")))?;
match Browser::launch(launch_opts).await {
Ok((browser, handler)) => Ok((browser, handler, Some(profile_dir))),
Err(e) => {
// Launch failed: remove whatever Chrome half-created.
let _ = std::fs::remove_dir_all(&profile_dir);
Err(CrawlError::BrowserError(format!("failed to launch browser: {e}")))
}
}
}
}
}