crawlberg 1.0.1

High-performance web crawling engine
Documentation
use std::time::Duration;

use crawlberg_browser::adapter::{
    NativeActionResult, NativeBrowserConfig, NativeBrowserExecutor, NativeBrowserWait, NativeCookie,
    NativeInteractionResult, NativePageAction, NativeScrollDirection,
};

use super::{PageAction, ScrollDirection};
use crate::error::CrawlError;
use crate::types::{ActionResult, AuthConfig, BrowserWait, CrawlConfig, InteractionResult};

/// Runs a sequence of page actions against `url` using the native browser backend.
///
/// The crate-level configuration and actions are translated into their native
/// counterparts, handed to `NativeBrowserExecutor::interact_url`, and the native
/// result is mapped back into an [`InteractionResult`].
///
/// # Errors
///
/// - [`CrawlError::InvalidConfig`] if `config.browser.endpoint` is set; this
///   function rejects it before doing any work.
/// - [`CrawlError::BrowserTimeout`] if the executor's error message contains
///   the text "timed out".
/// - [`CrawlError::BrowserError`] for every other executor failure.
pub(super) async fn run(
    url: &str,
    actions: &[PageAction],
    config: &CrawlConfig,
    native_executor: &NativeBrowserExecutor,
) -> Result<InteractionResult, CrawlError> {
    // A remote endpoint is rejected up front by this backend.
    if config.browser.endpoint.is_some() {
        return Err(CrawlError::InvalidConfig(
            "browser.endpoint is only supported by the chromiumoxide backend".into(),
        ));
    }

    let native_config = build_native_config(config);
    let native_actions: Vec<_> = actions.iter().map(map_action).collect();
    let settle_delay = post_navigation_wait(config);
    // Captured before the call so the timeout error can report the configured limit.
    let limit = config.browser.timeout;

    let outcome = native_executor
        .interact_url(url, &native_config, &native_actions, settle_delay)
        .await;

    match outcome {
        Ok(native_result) => Ok(map_result(native_result)),
        Err(err) => {
            // Timeouts are recognised by the wording of the error message.
            let message = err.to_string();
            let mapped = if message.contains("timed out") {
                CrawlError::BrowserTimeout(format!("browser timed out after {limit:?}"))
            } else {
                CrawlError::BrowserError(format!("native browser interact failed: {message}"))
            };
            Err(mapped)
        }
    }
}

/// Builds the native browser configuration from the crate-level [`CrawlConfig`].
///
/// Extra headers start as a copy of `config.custom_headers`. A bearer token adds
/// an `Authorization: Bearer <token>` header, and a header-style auth entry adds
/// its name/value pair. Any other auth setting (including none) adds nothing.
/// `prior_cookies` always starts empty.
fn build_native_config(config: &CrawlConfig) -> NativeBrowserConfig {
    let browser = &config.browser;

    // Auth-derived headers are inserted after the custom ones, so they replace
    // a custom header with the same key.
    let mut extra_headers = config.custom_headers.clone();
    match config.auth.as_ref() {
        Some(AuthConfig::Bearer { token }) => {
            extra_headers.insert("Authorization".to_owned(), format!("Bearer {token}"));
        }
        Some(AuthConfig::Header { name, value }) => {
            extra_headers.insert(name.clone(), value.clone());
        }
        _ => {}
    }

    // A fixed wait maps to the native `Load` strategy; the fixed delay itself is
    // passed to the executor separately by the caller.
    let wait_until = match browser.wait {
        BrowserWait::Fixed => NativeBrowserWait::Load,
        BrowserWait::Selector => NativeBrowserWait::Selector,
        BrowserWait::NetworkIdle => NativeBrowserWait::NetworkIdle,
    };

    let stealth = matches!(browser.mode, crate::types::BrowserMode::Stealth);
    let proxy_url = resolved_proxy(config);

    NativeBrowserConfig {
        user_agent: config.user_agent.clone(),
        timeout: browser.timeout,
        wait_until,
        extra_headers,
        respect_robots_txt: config.respect_robots_txt,
        stealth,
        proxy_url,
        prior_cookies: Vec::<NativeCookie>::new(),
        block_url_patterns: browser.block_url_patterns.clone(),
        eval_script: browser.eval_script.clone(),
        wait_selector: browser.wait_selector.clone(),
        robots_user_agent: browser.robots_user_agent.clone(),
        capture_network_events: browser.capture_network_events,
    }
}

/// Returns the proxy URL to hand to the native browser, if a proxy is configured.
///
/// The browser-specific proxy (`config.browser.proxy`) takes precedence over the
/// global one (`config.proxy`). When a username or password is present and the
/// URL starts with `http://` or `https://`, the credentials are spliced in as
/// `scheme://user:pass@rest`; a missing half is written as an empty string.
/// In every other case the URL is returned unchanged.
///
/// NOTE(review): credentials are inserted verbatim (no percent-encoding) and are
/// not applied to URLs with other schemes — confirm this is intended.
fn resolved_proxy(config: &CrawlConfig) -> Option<String> {
    let proxy = config.browser.proxy.as_ref().or(config.proxy.as_ref())?;

    // No credentials at all: the URL is used exactly as configured.
    if proxy.username.is_none() && proxy.password.is_none() {
        return Some(proxy.url.clone());
    }

    let user = proxy.username.as_deref().unwrap_or("");
    let pass = proxy.password.as_deref().unwrap_or("");

    for scheme in ["http://", "https://"] {
        if let Some(rest) = proxy.url.strip_prefix(scheme) {
            return Some(format!("{scheme}{user}:{pass}@{rest}"));
        }
    }

    // Unrecognised scheme: fall back to the raw URL.
    Some(proxy.url.clone())
}

/// Computes how long to pause after navigation before running page actions.
///
/// When the browser wait strategy is [`BrowserWait::Fixed`], a base delay of two
/// seconds applies; `config.browser.extra_wait`, if set, is added on top. For any
/// other strategy only `extra_wait` (possibly `None`) is returned.
///
/// The sum saturates at `Duration::MAX` rather than panicking, so an extremely
/// large configured `extra_wait` cannot abort the process.
fn post_navigation_wait(config: &CrawlConfig) -> Option<Duration> {
    /// Base delay used by the fixed wait strategy.
    const FIXED_WAIT: Duration = Duration::from_secs(2);

    let fixed_wait = (config.browser.wait == BrowserWait::Fixed).then(|| FIXED_WAIT);
    match (fixed_wait, config.browser.extra_wait) {
        // `Duration + Duration` panics on overflow; saturate instead.
        (Some(base), Some(extra)) => Some(base.saturating_add(extra)),
        (Some(base), None) => Some(base),
        (None, extra) => extra,
    }
}

fn map_action(action: &PageAction) -> NativePageAction {
    match action {
        PageAction::Click { selector } => NativePageAction::Click {
            selector: selector.clone(),
        },
        PageAction::TypeText { selector, text } => NativePageAction::TypeText {
            selector: selector.clone(),
            text: text.clone(),
        },
        PageAction::Press { key } => NativePageAction::Press { key: key.clone() },
        PageAction::Scroll {
            direction,
            selector,
            amount,
        } => NativePageAction::Scroll {
            direction: map_scroll_direction(*direction),
            selector: selector.clone(),
            amount: *amount,
        },
        PageAction::Wait { milliseconds, selector } => NativePageAction::Wait {
            milliseconds: *milliseconds,
            selector: selector.clone(),
        },
        PageAction::Screenshot { full_page } => NativePageAction::Screenshot { full_page: *full_page },
        PageAction::ExecuteJs { script } => NativePageAction::ExecuteJs { script: script.clone() },
        PageAction::Scrape => NativePageAction::Scrape,
    }
}

fn map_scroll_direction(direction: ScrollDirection) -> NativeScrollDirection {
    match direction {
        ScrollDirection::Up => NativeScrollDirection::Up,
        ScrollDirection::Down => NativeScrollDirection::Down,
    }
}

/// Converts the native interaction result into the crate-level [`InteractionResult`].
///
/// Per-action results are converted one by one via `map_action_result`; the
/// final HTML, final URL and screenshot are moved across unchanged.
fn map_result(result: NativeInteractionResult) -> InteractionResult {
    let action_results = result
        .action_results
        .into_iter()
        .map(map_action_result)
        .collect();

    InteractionResult {
        action_results,
        final_html: result.final_html,
        final_url: result.final_url,
        screenshot: result.screenshot,
    }
}

fn map_action_result(result: NativeActionResult) -> ActionResult {
    ActionResult {
        action_index: result.action_index,
        action_type: result.action_type.into(),
        success: result.success,
        data: result.data,
        error: result.error,
    }
}