use std::time::Duration;
use chromiumoxide::Handler;
use chromiumoxide::browser::{Browser, BrowserConfig as ChromeBrowserConfig};
use chromiumoxide::cdp::browser_protocol::network::{Headers, SetExtraHttpHeadersParams};
use chromiumoxide::cdp::browser_protocol::page::CaptureScreenshotFormat;
use chromiumoxide::page::ScreenshotParams;
use serde_json::json;
use tokio_stream::StreamExt;
use super::{PageAction, ScrollDirection};
use crate::error::CrawlError;
use crate::types::{ActionResult, AuthConfig, BrowserWait, CrawlConfig, InteractionResult};
pub(super) async fn run(
url: &str,
actions: &[PageAction],
config: &CrawlConfig,
) -> Result<InteractionResult, CrawlError> {
let (mut browser, mut handler, data_dir) = launch_or_connect(config).await?;
let handler_handle = tokio::spawn(async move { while handler.next().await.is_some() {} });
let result = run_with_browser(&browser, url, actions, config).await;
let _ = browser.close().await;
let _ = browser.wait().await;
drop(browser);
let _ = tokio::time::timeout(Duration::from_secs(5), handler_handle).await;
if let Some(dir) = data_dir {
let _ = std::fs::remove_dir_all(dir);
}
result
}
async fn run_with_browser(
browser: &Browser,
url: &str,
actions: &[PageAction],
config: &CrawlConfig,
) -> Result<InteractionResult, CrawlError> {
let page = browser
.new_page("about:blank")
.await
.map_err(|e| CrawlError::BrowserError(format!("failed to create page: {e}")))?;
let result = async {
prepare_page(&page, config).await?;
navigate_and_wait(&page, url, config).await?;
if let Some(ref script) = config.browser.eval_script {
evaluate_json(&page, script).await.map_err(|e| {
CrawlError::BrowserError(format!(
"post-navigation eval_script failed before interaction actions: {e}"
))
})?;
}
let mut action_results = Vec::with_capacity(actions.len());
let mut screenshot = None;
for (index, action) in actions.iter().enumerate() {
match execute_action(&page, action).await {
Ok(action_data) => {
if let Some(bytes) = action_data.screenshot {
screenshot = Some(bytes);
}
action_results.push(ActionResult {
action_index: index,
action_type: action_type(action).into(),
success: true,
data: action_data.data,
error: None,
});
}
Err(error) => {
action_results.push(ActionResult {
action_index: index,
action_type: action_type(action).into(),
success: false,
data: None,
error: Some(error.to_string()),
});
}
}
}
let final_html = page
.content()
.await
.map_err(|e| CrawlError::BrowserError(format!("failed to extract final HTML: {e}")))?;
let final_url = evaluate_json(&page, "location.href")
.await
.ok()
.and_then(|value| value.as_str().map(str::to_owned))
.unwrap_or_else(|| url.to_owned());
Ok(InteractionResult {
action_results,
final_html,
final_url,
screenshot,
})
}
.await;
let _ = page.close().await;
result
}
async fn prepare_page(page: &chromiumoxide::Page, config: &CrawlConfig) -> Result<(), CrawlError> {
if matches!(config.browser.mode, crate::types::BrowserMode::Stealth) {
crate::stealth::apply_stealth_patches(page).await;
}
if let Some(ref ua) = config.user_agent {
page.set_user_agent(ua)
.await
.map_err(|e| CrawlError::BrowserError(format!("failed to set user agent: {e}")))?;
}
let mut extra_headers = serde_json::Map::new();
for (key, value) in &config.custom_headers {
extra_headers.insert(key.clone(), serde_json::Value::String(value.clone()));
}
match config.auth {
Some(AuthConfig::Bearer { ref token }) => {
extra_headers.insert(
"Authorization".to_owned(),
serde_json::Value::String(format!("Bearer {token}")),
);
}
Some(AuthConfig::Header { ref name, ref value }) => {
extra_headers.insert(name.clone(), serde_json::Value::String(value.clone()));
}
_ => {}
}
if !extra_headers.is_empty() {
let params = SetExtraHttpHeadersParams::new(Headers::new(serde_json::Value::Object(extra_headers)));
page.execute(params)
.await
.map_err(|e| CrawlError::BrowserError(format!("failed to set headers: {e}")))?;
}
Ok(())
}
async fn navigate_and_wait(page: &chromiumoxide::Page, url: &str, config: &CrawlConfig) -> Result<(), CrawlError> {
let timeout = config.browser.timeout;
tokio::time::timeout(timeout, async {
page.goto(url)
.await
.map_err(|e| CrawlError::BrowserError(format!("navigation failed: {e}")))?;
wait_for_ready(page, config)
.await
.map_err(|e| CrawlError::BrowserError(format!("wait failed: {e}")))?;
Ok::<(), CrawlError>(())
})
.await
.map_err(|_| CrawlError::BrowserTimeout(format!("browser timed out after {timeout:?}")))??;
if let Some(extra) = config.browser.extra_wait {
tokio::time::sleep(extra).await;
}
Ok(())
}
async fn wait_for_ready(
page: &chromiumoxide::Page,
config: &CrawlConfig,
) -> Result<(), chromiumoxide::error::CdpError> {
match config.browser.wait {
BrowserWait::NetworkIdle => tokio::time::sleep(Duration::from_millis(500)).await,
BrowserWait::Selector => {
if let Some(ref selector) = config.browser.wait_selector {
page.find_element(selector).await?;
} else {
tokio::time::sleep(Duration::from_millis(500)).await;
}
}
BrowserWait::Fixed => tokio::time::sleep(Duration::from_secs(2)).await,
}
Ok(())
}
struct ActionData {
data: Option<serde_json::Value>,
screenshot: Option<Vec<u8>>,
}
impl ActionData {
fn empty() -> Self {
Self {
data: None,
screenshot: None,
}
}
fn data(data: serde_json::Value) -> Self {
Self {
data: Some(data),
screenshot: None,
}
}
}
async fn execute_action(page: &chromiumoxide::Page, action: &PageAction) -> Result<ActionData, CrawlError> {
match action {
PageAction::Click { selector } => {
page.find_element(selector)
.await
.map_err(|e| CrawlError::BrowserError(format!("failed to find click target {selector:?}: {e}")))?
.click()
.await
.map_err(|e| CrawlError::BrowserError(format!("failed to click {selector:?}: {e}")))?;
Ok(ActionData::empty())
}
PageAction::TypeText { selector, text } => {
let element = page
.find_element(selector)
.await
.map_err(|e| CrawlError::BrowserError(format!("failed to find type target {selector:?}: {e}")))?;
element
.click()
.await
.map_err(|e| CrawlError::BrowserError(format!("failed to focus {selector:?}: {e}")))?;
element
.type_str(text)
.await
.map_err(|e| CrawlError::BrowserError(format!("failed to type into {selector:?}: {e}")))?;
Ok(ActionData::empty())
}
PageAction::Press { key } => {
dispatch_key_event(page, key).await?;
Ok(ActionData::empty())
}
PageAction::Scroll {
direction,
selector,
amount,
} => {
scroll(page, *direction, selector.as_deref(), *amount).await?;
Ok(ActionData::empty())
}
PageAction::Wait { milliseconds, selector } => {
if let Some(selector) = selector {
page.find_element(selector)
.await
.map_err(|e| CrawlError::BrowserError(format!("failed waiting for selector {selector:?}: {e}")))?;
} else if let Some(ms) = milliseconds {
tokio::time::sleep(Duration::from_millis(*ms as u64)).await;
}
Ok(ActionData::empty())
}
PageAction::Screenshot { full_page } => {
let params = ScreenshotParams::builder()
.format(CaptureScreenshotFormat::Png)
.full_page(full_page.unwrap_or(false))
.build();
let bytes = page
.screenshot(params)
.await
.map_err(|e| CrawlError::BrowserError(format!("failed to capture screenshot: {e}")))?;
let len = bytes.len();
Ok(ActionData {
data: Some(json!({ "bytes": len, "format": "png" })),
screenshot: Some(bytes),
})
}
PageAction::ExecuteJs { script } => {
let value = evaluate_json(page, script).await?;
Ok(ActionData::data(value))
}
PageAction::Scrape => {
let html = page
.content()
.await
.map_err(|e| CrawlError::BrowserError(format!("failed to scrape current page: {e}")))?;
Ok(ActionData::data(json!({ "html": html })))
}
}
}
async fn evaluate_json(page: &chromiumoxide::Page, script: &str) -> Result<serde_json::Value, CrawlError> {
let wrapped;
let effective_script = if script.contains("return ") || script.trim_start().starts_with("return") {
wrapped = format!("(function() {{ {script} }})()");
wrapped.as_str()
} else {
script
};
let result = page
.evaluate(effective_script)
.await
.map_err(|e| CrawlError::BrowserError(format!("failed to evaluate JavaScript: {e}")))?;
Ok(result.value().cloned().unwrap_or(serde_json::Value::Null))
}
async fn dispatch_key_event(page: &chromiumoxide::Page, key: &str) -> Result<(), CrawlError> {
let key_json = serde_json::to_string(key).map_err(|e| CrawlError::Other(format!("failed to encode key: {e}")))?;
let script = format!(
r#"
(() => {{
const key = {key_json};
const target = document.activeElement || document.body || document;
for (const type of ["keydown", "keyup"]) {{
target.dispatchEvent(new KeyboardEvent(type, {{ key, bubbles: true, cancelable: true }}));
}}
return true;
}})()
"#
);
evaluate_json(page, &script).await?;
Ok(())
}
async fn scroll(
page: &chromiumoxide::Page,
direction: ScrollDirection,
selector: Option<&str>,
amount: Option<i64>,
) -> Result<(), CrawlError> {
let amount = amount.unwrap_or(800).unsigned_abs();
let signed_amount = match direction {
ScrollDirection::Up => format!("-{amount}"),
ScrollDirection::Down => amount.to_string(),
};
let selector_json =
serde_json::to_string(&selector).map_err(|e| CrawlError::Other(format!("failed to encode selector: {e}")))?;
let script = format!(
r#"
(() => {{
const selector = {selector_json};
const target = selector ? document.querySelector(selector) : window;
if (!target) {{
throw new Error(`scroll target not found: ${{selector}}`);
}}
if (target === window) {{
window.scrollBy(0, {signed_amount});
}} else {{
target.scrollTop += {signed_amount};
}}
return true;
}})()
"#
);
evaluate_json(page, &script).await?;
Ok(())
}
fn action_type(action: &PageAction) -> &'static str {
match action {
PageAction::Click { .. } => "click",
PageAction::TypeText { .. } => "type",
PageAction::Press { .. } => "press",
PageAction::Scroll { .. } => "scroll",
PageAction::Wait { .. } => "wait",
PageAction::Screenshot { .. } => "screenshot",
PageAction::ExecuteJs { .. } => "executeJs",
PageAction::Scrape => "scrape",
}
}
async fn launch_or_connect(config: &CrawlConfig) -> Result<(Browser, Handler, Option<std::path::PathBuf>), CrawlError> {
if let Some(ref endpoint) = config.browser.endpoint {
let (browser, handler) = Browser::connect(endpoint)
.await
.map_err(|e| CrawlError::BrowserError(format!("failed to connect to {endpoint}: {e}")))?;
Ok((browser, handler, None))
} else {
use std::sync::atomic::{AtomicU64, Ordering as AtomicOrdering};
static LAUNCH_COUNTER: AtomicU64 = AtomicU64::new(0);
let user_data_dir = std::env::temp_dir().join(format!(
"crawlberg-interact-{}-{}",
std::process::id(),
LAUNCH_COUNTER.fetch_add(1, AtomicOrdering::Relaxed),
));
let mut builder = ChromeBrowserConfig::builder()
.no_sandbox()
.new_headless_mode()
.user_data_dir(&user_data_dir)
.disable_default_args();
for arg in safe_default_args() {
builder = builder.arg(arg);
}
if let Some(proxy) = config.browser.proxy.as_ref().or(config.proxy.as_ref()) {
builder = builder.arg(format!("--proxy-server={}", proxy.url));
}
let browser_config = builder
.build()
.map_err(|e| CrawlError::BrowserError(format!("invalid browser config: {e}")))?;
match Browser::launch(browser_config).await {
Ok((browser, handler)) => Ok((browser, handler, Some(user_data_dir))),
Err(e) => {
let _ = std::fs::remove_dir_all(&user_data_dir);
Err(CrawlError::BrowserError(format!("failed to launch browser: {e}")))
}
}
}
}
fn safe_default_args() -> Vec<&'static str> {
let all_args = vec![
"--disable-background-networking",
"--enable-features=NetworkService,NetworkServiceInProcess",
"--disable-background-timer-throttling",
"--disable-backgrounding-occluded-windows",
"--disable-breakpad",
"--disable-client-side-phishing-detection",
"--disable-component-extensions-with-background-pages",
"--disable-default-apps",
"--disable-dev-shm-usage",
"--disable-features=TranslateUI",
"--disable-hang-monitor",
"--disable-ipc-flooding-protection",
"--disable-popup-blocking",
"--disable-prompt-on-repost",
"--disable-renderer-backgrounding",
"--disable-sync",
"--force-color-profile=srgb",
"--metrics-recording-only",
"--no-first-run",
"--password-store=basic",
"--lang=en_US",
];
if std::path::Path::new("/snap/chromium/current/usr/bin/chromium").exists() {
all_args
.into_iter()
.filter(|&arg| {
!matches!(
arg,
"--disable-background-networking"
| "--enable-features=NetworkService,NetworkServiceInProcess"
| "--disable-background-timer-throttling"
| "--metrics-recording-only"
)
})
.collect()
} else {
all_args
}
}