use std::sync::Arc;
use crate::defaults;
use crate::error::CrawlError;
#[cfg(not(target_arch = "wasm32"))]
use crate::sink::EventSink;
use crate::traits::*;
use crate::types::*;
use super::CrawlEngine;
/// Builder that assembles a [`CrawlEngine`] from optional, pluggable components.
///
/// Every field starts as `None` (see [`CrawlEngineBuilder::new`]);
/// [`CrawlEngineBuilder::build`] decides what happens to the ones never set.
pub struct CrawlEngineBuilder {
/// Crawl configuration; `build` starts from `CrawlConfig`'s default value when unset.
config: Option<CrawlConfig>,
/// Custom frontier; `build` falls back to `defaults::InMemoryFrontier`.
frontier: Option<Arc<dyn Frontier>>,
/// Custom rate limiter; `build` falls back to `defaults::PerDomainThrottle`,
/// constructed with `config.rate_limit_ms` (or 200 ms when that is unset).
rate_limiter: Option<Arc<dyn RateLimiter>>,
/// Custom store; `build` falls back to `defaults::NoopStore`.
store: Option<Arc<dyn CrawlStore>>,
/// Custom event emitter; `build` falls back to `defaults::NoopEmitter`.
event_emitter: Option<Arc<dyn EventEmitter>>,
/// Custom crawl strategy; `build` falls back to `defaults::BfsStrategy`.
strategy: Option<Arc<dyn CrawlStrategy>>,
/// Custom content filter; `build` falls back to `defaults::NoopFilter`.
content_filter: Option<Arc<dyn ContentFilter>>,
/// Custom cache; `build` falls back to `defaults::NoopCache`.
cache: Option<Arc<dyn CrawlCache>>,
/// Optional event sink, handed to the engine as-is (it stays `None` when unset).
/// Not compiled for `wasm32` targets.
#[cfg(not(target_arch = "wasm32"))]
event_sink: Option<Arc<dyn EventSink>>,
/// Custom page budget; `build` falls back to `crate::budget::DefaultPageBudget`.
page_budget: Option<Arc<dyn crate::budget::PageBudget>>,
/// Browser pool that, when set, replaces `config.browser_pool` during `build`.
/// Only compiled with the `browser` feature.
#[cfg(feature = "browser")]
browser_pool: Option<Arc<crate::browser_pool::BrowserPool>>,
/// Pre-built native browser executor; when set, `build` uses it instead of
/// calling `build_native_browser_executor`. Only compiled off `wasm32` with the
/// `browser-native` feature.
#[cfg(all(not(target_arch = "wasm32"), feature = "browser-native"))]
native_executor: Option<Arc<crawlberg_browser::adapter::NativeBrowserExecutor>>,
/// Proxy provider that, when set, replaces `config.proxy_provider` during `build`.
proxy_provider: Option<Arc<dyn crate::ProxyProvider>>,
}
impl CrawlEngineBuilder {
    /// Creates a builder with no component configured.
    ///
    /// Every slot starts as `None`; [`build`](Self::build) resolves the unset ones.
    pub fn new() -> Self {
        Self {
            config: None,
            frontier: None,
            rate_limiter: None,
            store: None,
            event_emitter: None,
            strategy: None,
            content_filter: None,
            cache: None,
            #[cfg(not(target_arch = "wasm32"))]
            event_sink: None,
            page_budget: None,
            #[cfg(feature = "browser")]
            browser_pool: None,
            #[cfg(all(not(target_arch = "wasm32"), feature = "browser-native"))]
            native_executor: None,
            proxy_provider: None,
        }
    }

    /// Sets the crawl configuration the engine is built from.
    ///
    /// When never called, [`build`](Self::build) starts from `CrawlConfig`'s default.
    pub fn config(self, config: CrawlConfig) -> Self {
        Self {
            config: Some(config),
            ..self
        }
    }

    /// Installs a custom [`Frontier`], replacing any previously set one.
    ///
    /// Without it, [`build`](Self::build) uses `defaults::InMemoryFrontier`.
    #[allow(dead_code)]
    #[cfg_attr(alef, alef(skip))]
    pub fn frontier(self, frontier: impl Frontier + 'static) -> Self {
        Self {
            frontier: Some(Arc::new(frontier)),
            ..self
        }
    }

    /// Installs a custom [`RateLimiter`], replacing any previously set one.
    ///
    /// Without it, [`build`](Self::build) uses `defaults::PerDomainThrottle`.
    #[allow(dead_code)]
    #[cfg_attr(alef, alef(skip))]
    pub fn rate_limiter(self, rate_limiter: impl RateLimiter + 'static) -> Self {
        Self {
            rate_limiter: Some(Arc::new(rate_limiter)),
            ..self
        }
    }

    /// Installs a custom [`CrawlStore`], replacing any previously set one.
    ///
    /// Without it, [`build`](Self::build) uses `defaults::NoopStore`.
    #[allow(dead_code)]
    #[cfg_attr(alef, alef(skip))]
    pub fn store(self, store: impl CrawlStore + 'static) -> Self {
        Self {
            store: Some(Arc::new(store)),
            ..self
        }
    }

    /// Installs a custom [`EventEmitter`], replacing any previously set one.
    ///
    /// Without it, [`build`](Self::build) uses `defaults::NoopEmitter`.
    #[allow(dead_code)]
    #[cfg_attr(alef, alef(skip))]
    pub fn event_emitter(self, event_emitter: impl EventEmitter + 'static) -> Self {
        Self {
            event_emitter: Some(Arc::new(event_emitter)),
            ..self
        }
    }

    /// Installs a custom [`CrawlStrategy`], replacing any previously set one.
    ///
    /// Without it, [`build`](Self::build) uses `defaults::BfsStrategy`.
    #[allow(dead_code)]
    #[cfg_attr(alef, alef(skip))]
    pub fn strategy(self, strategy: impl CrawlStrategy + 'static) -> Self {
        Self {
            strategy: Some(Arc::new(strategy)),
            ..self
        }
    }

    /// Installs a custom [`ContentFilter`], replacing any previously set one.
    ///
    /// Without it, [`build`](Self::build) uses `defaults::NoopFilter`.
    #[allow(dead_code)]
    #[cfg_attr(alef, alef(skip))]
    pub fn content_filter(self, content_filter: impl ContentFilter + 'static) -> Self {
        Self {
            content_filter: Some(Arc::new(content_filter)),
            ..self
        }
    }

    /// Installs a custom [`CrawlCache`], replacing any previously set one.
    ///
    /// Without it, [`build`](Self::build) uses `defaults::NoopCache`.
    #[allow(dead_code)]
    #[cfg_attr(alef, alef(skip))]
    pub fn cache(self, cache: impl CrawlCache + 'static) -> Self {
        Self {
            cache: Some(Arc::new(cache)),
            ..self
        }
    }

    /// Attaches an [`EventSink`]; the engine receives it as an optional component.
    ///
    /// Not available on `wasm32` targets.
    #[cfg(not(target_arch = "wasm32"))]
    #[allow(dead_code)]
    #[cfg_attr(alef, alef(skip))]
    pub fn event_sink(self, event_sink: impl EventSink + 'static) -> Self {
        Self {
            event_sink: Some(Arc::new(event_sink)),
            ..self
        }
    }

    /// Installs a custom page budget, replacing any previously set one.
    ///
    /// Without it, [`build`](Self::build) uses `crate::budget::DefaultPageBudget`.
    #[allow(dead_code)]
    #[cfg_attr(alef, alef(skip))]
    pub fn page_budget(self, page_budget: impl crate::budget::PageBudget + 'static) -> Self {
        Self {
            page_budget: Some(Arc::new(page_budget)),
            ..self
        }
    }

    /// Supplies a shared browser pool that [`build`](Self::build) writes into
    /// `config.browser_pool`, overriding whatever the config carried.
    #[cfg(feature = "browser")]
    #[cfg_attr(alef, alef(skip))]
    pub fn with_browser_pool(self, pool: Arc<crate::browser_pool::BrowserPool>) -> Self {
        Self {
            browser_pool: Some(pool),
            ..self
        }
    }

    /// Supplies an already-constructed native browser executor.
    ///
    /// When set, [`build`](Self::build) uses it directly instead of creating one
    /// from the configuration.
    #[cfg(all(not(target_arch = "wasm32"), feature = "browser-native"))]
    #[cfg_attr(alef, alef(skip))]
    pub fn with_native_executor(self, executor: Arc<crawlberg_browser::adapter::NativeBrowserExecutor>) -> Self {
        Self {
            native_executor: Some(executor),
            ..self
        }
    }

    /// Supplies a proxy provider that [`build`](Self::build) writes into
    /// `config.proxy_provider`, overriding whatever the config carried.
    #[cfg_attr(alef, alef(skip))]
    pub fn with_proxy_provider(self, provider: Arc<dyn crate::ProxyProvider>) -> Self {
        Self {
            proxy_provider: Some(provider),
            ..self
        }
    }

    /// Consumes the builder and produces a [`CrawlEngine`].
    ///
    /// Steps, in order:
    /// 1. Take the supplied config, or `CrawlConfig`'s default.
    /// 2. Copy an explicitly supplied browser pool / proxy provider into the config.
    /// 3. If the `CRAWLBERG_ALLOW_PRIVATE_NETWORK` environment variable equals `1`
    ///    or `true` (compared after lowercasing), set `config.ssrf.deny_private`
    ///    to `false`.
    /// 4. Replace an empty `config.ssrf.scheme_allowlist` with
    ///    `crate::net::ssrf::default_scheme_allowlist()`.
    /// 5. Fill every component that was not set with its default implementation.
    ///
    /// # Errors
    ///
    /// The only fallible step is creating the native browser executor (off
    /// `wasm32`, with the `browser-native` feature, when none was injected); its
    /// error is propagated unchanged.
    pub fn build(self) -> Result<CrawlEngine, CrawlError> {
        // Milliseconds handed to the default `PerDomainThrottle` when the config
        // does not specify `rate_limit_ms`.
        const DEFAULT_RATE_LIMIT_MS: u64 = 200;

        let mut config = self.config.unwrap_or_default();

        // Explicit builder inputs win over whatever the config already holds.
        #[cfg(feature = "browser")]
        if let Some(shared_pool) = self.browser_pool {
            config.browser_pool = Some(shared_pool);
        }
        if let Some(proxies) = self.proxy_provider {
            config.proxy_provider = Some(proxies);
        }

        // Environment opt-out of the private-network SSRF guard.
        let allow_private_network = match std::env::var("CRAWLBERG_ALLOW_PRIVATE_NETWORK") {
            Ok(raw) => matches!(raw.to_lowercase().as_str(), "1" | "true"),
            Err(_) => false,
        };
        if allow_private_network {
            config.ssrf.deny_private = false;
        }

        if config.ssrf.scheme_allowlist.is_empty() {
            config.ssrf.scheme_allowlist = crate::net::ssrf::default_scheme_allowlist();
        }

        // Read everything needed from `config` before it is moved into the engine.
        let throttle_delay =
            std::time::Duration::from_millis(config.rate_limit_ms.unwrap_or(DEFAULT_RATE_LIMIT_MS));

        #[cfg(not(target_arch = "wasm32"))]
        let ua_rotation = crate::tower::UaRotationLayer::new(config.user_agents.clone());

        // An injected executor takes precedence; otherwise derive one from the config.
        #[cfg(all(not(target_arch = "wasm32"), feature = "browser-native"))]
        let native_browser_executor = match self.native_executor {
            injected @ Some(_) => injected,
            None => build_native_browser_executor(&config)?,
        };

        let frontier = self
            .frontier
            .unwrap_or_else(|| Arc::new(defaults::InMemoryFrontier::new()));
        let rate_limiter = self
            .rate_limiter
            .unwrap_or_else(|| Arc::new(defaults::PerDomainThrottle::new(throttle_delay)));
        let store = self.store.unwrap_or_else(|| Arc::new(defaults::NoopStore));
        let event_emitter = self
            .event_emitter
            .unwrap_or_else(|| Arc::new(defaults::NoopEmitter));
        let strategy = self.strategy.unwrap_or_else(|| Arc::new(defaults::BfsStrategy));
        let content_filter = self
            .content_filter
            .unwrap_or_else(|| Arc::new(defaults::NoopFilter));
        let cache = self.cache.unwrap_or_else(|| Arc::new(defaults::NoopCache));
        let page_budget = self
            .page_budget
            .unwrap_or_else(|| Arc::new(crate::budget::DefaultPageBudget));

        Ok(CrawlEngine {
            config,
            frontier,
            rate_limiter,
            store,
            event_emitter,
            strategy,
            content_filter,
            cache,
            #[cfg(not(target_arch = "wasm32"))]
            event_sink: self.event_sink,
            page_budget,
            #[cfg(not(target_arch = "wasm32"))]
            ua_rotation,
            #[cfg(all(not(target_arch = "wasm32"), feature = "browser-native"))]
            native_browser_executor,
        })
    }
}
/// Creates the native browser executor implied by `config`, if any.
///
/// Returns `Ok(None)` when `config.browser.backend` is not `BrowserBackend::Native`.
/// Otherwise the executor is configured with `config.max_concurrent` workers when
/// that value is set and greater than zero, and with the executor's default
/// configuration in every other case.
///
/// # Errors
///
/// Returns `CrawlError::BrowserError` carrying the underlying error text when the
/// executor cannot be constructed.
#[cfg(all(not(target_arch = "wasm32"), feature = "browser-native"))]
fn build_native_browser_executor(
    config: &CrawlConfig,
) -> Result<Option<Arc<crawlberg_browser::adapter::NativeBrowserExecutor>>, CrawlError> {
    use crawlberg_browser::adapter::{NativeBrowserExecutor, NativeBrowserExecutorConfig};

    if config.browser.backend != BrowserBackend::Native {
        return Ok(None);
    }

    // A missing or zero worker count means "use the executor's own default".
    let executor_config = match config.max_concurrent.filter(|&workers| workers > 0) {
        Some(workers) => NativeBrowserExecutorConfig::with_workers(workers),
        None => NativeBrowserExecutorConfig::default(),
    };

    NativeBrowserExecutor::new(executor_config)
        .map(|executor| Some(Arc::new(executor)))
        .map_err(|e| CrawlError::BrowserError(format!("failed to start native browser executor: {e}")))
}
impl Default for CrawlEngineBuilder {
fn default() -> Self {
Self::new()
}
}