use std::collections::HashMap;
use std::path::PathBuf;
use std::time::Duration;
use super::antibot::DynAntibotStrategy;
use super::bypass::DynBypassProvider;
use super::config::{AuthConfig, BrowserConfig, ContentConfig, CrawlConfig, ProxyConfig};
use super::discovery::AssetCategory;
use super::dispatch::{
DispatchProfile, DynDomainStatePort, DynEscalationBudget, DynRetryPolicy, DynWafClassifier, EscalationStrategy,
};
use crate::net::HostMatcher;
/// Fluent, by-value builder for [`CrawlConfig`].
///
/// The derived `Default` starts the builder from `CrawlConfig::default()`.
/// Every setter consumes the builder and returns it, so calls are meant to be
/// chained and finished with `build()`.
///
/// Because each setter takes `self` by value, dropping a returned builder
/// discards the configuration accumulated so far; `#[must_use]` turns that
/// mistake into a compiler warning.
#[must_use = "builder setters return the updated builder; chain them and finish with `.build()`"]
#[derive(Default)]
pub struct CrawlConfigBuilder {
    /// Configuration being assembled; returned as-is by `build()`.
    inner: CrawlConfig,
}
impl CrawlConfigBuilder {
    /// Runs `apply` against the wrapped config, then hands the builder back.
    ///
    /// Private plumbing shared by the setters below so that each one is a
    /// single expression.
    fn patch(mut self, apply: impl FnOnce(&mut CrawlConfig)) -> Self {
        apply(&mut self.inner);
        self
    }

    /// Stores `Some(value)` in `max_depth`.
    pub fn max_depth(self, value: usize) -> Self {
        self.patch(|cfg| cfg.max_depth = Some(value))
    }

    /// Stores `Some(value)` in `max_pages`.
    pub fn max_pages(self, value: usize) -> Self {
        self.patch(|cfg| cfg.max_pages = Some(value))
    }

    /// Stores `Some(value)` in `max_concurrent`.
    pub fn max_concurrent(self, value: usize) -> Self {
        self.patch(|cfg| cfg.max_concurrent = Some(value))
    }

    /// Overwrites the `respect_robots_txt` flag.
    pub fn respect_robots_txt(self, value: bool) -> Self {
        self.patch(|cfg| cfg.respect_robots_txt = value)
    }

    /// Overwrites the `soft_http_errors` flag.
    pub fn soft_http_errors(self, value: bool) -> Self {
        self.patch(|cfg| cfg.soft_http_errors = value)
    }

    /// Converts `value` into a `String` and stores it as `Some(..)` in `user_agent`.
    pub fn user_agent(self, value: impl Into<String>) -> Self {
        self.patch(|cfg| cfg.user_agent = Some(value.into()))
    }

    /// Overwrites the `stay_on_domain` flag.
    pub fn stay_on_domain(self, value: bool) -> Self {
        self.patch(|cfg| cfg.stay_on_domain = value)
    }

    /// Overwrites the `allow_subdomains` flag.
    pub fn allow_subdomains(self, value: bool) -> Self {
        self.patch(|cfg| cfg.allow_subdomains = value)
    }

    /// Replaces the whole `include_paths` list with `value`.
    pub fn include_paths(self, value: Vec<String>) -> Self {
        self.patch(|cfg| cfg.include_paths = value)
    }

    /// Replaces the whole `exclude_paths` list with `value`.
    pub fn exclude_paths(self, value: Vec<String>) -> Self {
        self.patch(|cfg| cfg.exclude_paths = value)
    }

    /// Replaces the whole `custom_headers` map with `value`.
    pub fn custom_headers(self, value: HashMap<String, String>) -> Self {
        self.patch(|cfg| cfg.custom_headers = value)
    }

    /// Overwrites `request_timeout`.
    pub fn request_timeout(self, value: Duration) -> Self {
        self.patch(|cfg| cfg.request_timeout = value)
    }

    /// Stores `Some(value)` in `rate_limit_ms`.
    pub fn rate_limit_ms(self, value: u64) -> Self {
        self.patch(|cfg| cfg.rate_limit_ms = Some(value))
    }

    /// Overwrites `max_redirects`.
    pub fn max_redirects(self, value: usize) -> Self {
        self.patch(|cfg| cfg.max_redirects = value)
    }

    /// Overwrites `retry_count`.
    pub fn retry_count(self, value: usize) -> Self {
        self.patch(|cfg| cfg.retry_count = value)
    }

    /// Replaces the whole `retry_codes` list with `value`.
    pub fn retry_codes(self, value: Vec<u16>) -> Self {
        self.patch(|cfg| cfg.retry_codes = value)
    }

    /// Overwrites the `cookies_enabled` flag.
    pub fn cookies_enabled(self, value: bool) -> Self {
        self.patch(|cfg| cfg.cookies_enabled = value)
    }

    /// Stores `Some(value)` in `auth`.
    pub fn auth(self, value: AuthConfig) -> Self {
        self.patch(|cfg| cfg.auth = Some(value))
    }

    /// Stores `Some(value)` in `max_body_size`.
    pub fn max_body_size(self, value: usize) -> Self {
        self.patch(|cfg| cfg.max_body_size = Some(value))
    }

    /// Replaces the whole `remove_tags` list with `value`.
    pub fn remove_tags(self, value: Vec<String>) -> Self {
        self.patch(|cfg| cfg.remove_tags = value)
    }

    /// Replaces the nested `content` configuration wholesale.
    pub fn content(self, value: ContentConfig) -> Self {
        self.patch(|cfg| cfg.content = value)
    }

    /// Stores `Some(value)` in `map_limit`.
    pub fn map_limit(self, value: usize) -> Self {
        self.patch(|cfg| cfg.map_limit = Some(value))
    }

    /// Converts `value` into a `String` and stores it as `Some(..)` in `map_search`.
    pub fn map_search(self, value: impl Into<String>) -> Self {
        self.patch(|cfg| cfg.map_search = Some(value.into()))
    }

    /// Overwrites the `download_assets` flag.
    pub fn download_assets(self, value: bool) -> Self {
        self.patch(|cfg| cfg.download_assets = value)
    }

    /// Replaces the whole `asset_types` list with `value`.
    pub fn asset_types(self, value: Vec<AssetCategory>) -> Self {
        self.patch(|cfg| cfg.asset_types = value)
    }

    /// Stores `Some(value)` in `max_asset_size`.
    pub fn max_asset_size(self, value: usize) -> Self {
        self.patch(|cfg| cfg.max_asset_size = Some(value))
    }

    /// Replaces the nested `browser` configuration wholesale.
    pub fn browser(self, value: BrowserConfig) -> Self {
        self.patch(|cfg| cfg.browser = value)
    }

    /// Stores `Some(value)` in `proxy`.
    pub fn proxy(self, value: ProxyConfig) -> Self {
        self.patch(|cfg| cfg.proxy = Some(value))
    }

    /// Replaces the whole `user_agents` list with `value`.
    ///
    /// This writes a different field from [`Self::user_agent`]; neither setter
    /// touches the other's field.
    pub fn user_agents(self, value: Vec<String>) -> Self {
        self.patch(|cfg| cfg.user_agents = value)
    }

    /// Overwrites the `capture_screenshot` flag.
    pub fn capture_screenshot(self, value: bool) -> Self {
        self.patch(|cfg| cfg.capture_screenshot = value)
    }

    /// Overwrites the `download_documents` flag.
    pub fn download_documents(self, value: bool) -> Self {
        self.patch(|cfg| cfg.download_documents = value)
    }

    /// Stores `Some(value)` in `document_max_size`.
    pub fn document_max_size(self, value: usize) -> Self {
        self.patch(|cfg| cfg.document_max_size = Some(value))
    }

    /// Replaces the whole `document_mime_types` list with `value`.
    pub fn document_mime_types(self, value: Vec<String>) -> Self {
        self.patch(|cfg| cfg.document_mime_types = value)
    }

    /// Stores `Some(value)` in `warc_output`.
    pub fn warc_output(self, value: PathBuf) -> Self {
        self.patch(|cfg| cfg.warc_output = Some(value))
    }

    /// Converts `value` into a `String` and stores it as `Some(..)` in `browser_profile`.
    pub fn browser_profile(self, value: impl Into<String>) -> Self {
        self.patch(|cfg| cfg.browser_profile = Some(value.into()))
    }

    /// Overwrites the `save_browser_profile` flag.
    pub fn save_browser_profile(self, value: bool) -> Self {
        self.patch(|cfg| cfg.save_browser_profile = value)
    }

    /// Writes the negation of `allow` into `ssrf.deny_private`.
    ///
    /// The parameter is phrased as a permission while the stored field is a
    /// denial, so `true` here clears the deny flag and `false` sets it.
    pub fn allow_private_networks(self, allow: bool) -> Self {
        self.patch(|cfg| cfg.ssrf.deny_private = !allow)
    }

    /// Pushes `matcher` onto `ssrf.allowlist`.
    ///
    /// Unlike the list setters above, this adds to the existing value instead
    /// of overwriting it, so it can be called repeatedly.
    pub fn ssrf_allowlist_host(self, matcher: HostMatcher) -> Self {
        self.patch(|cfg| {
            cfg.ssrf.allowlist.push(matcher);
        })
    }

    /// Stores `Some(value)` in `dispatch`.
    pub fn dispatch(self, value: DispatchProfile) -> Self {
        self.patch(|cfg| cfg.dispatch = Some(value))
    }

    /// Consumes the builder and returns the assembled [`CrawlConfig`] unchanged.
    ///
    /// No validation is performed at this point.
    pub fn build(self) -> CrawlConfig {
        let Self { inner } = self;
        inner
    }
}
/// Fluent, by-value builder for [`DispatchProfile`].
///
/// The derived `Default` starts the builder from `DispatchProfile::default()`.
/// Every setter consumes the builder and returns it, so calls are meant to be
/// chained and finished with `build()`.
///
/// Because each setter takes `self` by value, dropping a returned builder
/// discards the profile accumulated so far; `#[must_use]` turns that mistake
/// into a compiler warning.
#[must_use = "builder setters return the updated builder; chain them and finish with `.build()`"]
#[derive(Default)]
pub struct DispatchProfileBuilder {
    /// Profile being assembled; returned as-is by `build()`.
    inner: DispatchProfile,
}
impl DispatchProfileBuilder {
    /// Runs `apply` against the wrapped profile, then hands the builder back.
    ///
    /// Private plumbing shared by the setters below so that each one is a
    /// single expression.
    fn patch(mut self, apply: impl FnOnce(&mut DispatchProfile)) -> Self {
        apply(&mut self.inner);
        self
    }

    /// Stores `Some(provider)` in `bypass`.
    pub fn bypass(self, provider: DynBypassProvider) -> Self {
        self.patch(|profile| profile.bypass = Some(provider))
    }

    /// Overwrites `strategy`.
    pub fn strategy(self, value: EscalationStrategy) -> Self {
        self.patch(|profile| profile.strategy = value)
    }

    /// Stores `Some(policy)` in `retry_policy`.
    pub fn retry_policy(self, policy: DynRetryPolicy) -> Self {
        self.patch(|profile| profile.retry_policy = Some(policy))
    }

    /// Stores `Some(classifier)` in `waf_classifier`.
    pub fn waf_classifier(self, classifier: DynWafClassifier) -> Self {
        self.patch(|profile| profile.waf_classifier = Some(classifier))
    }

    /// Stores `Some(state)` in `domain_state`.
    pub fn domain_state(self, state: DynDomainStatePort) -> Self {
        self.patch(|profile| profile.domain_state = Some(state))
    }

    /// Stores `Some(budget)` in `escalation_budget`.
    pub fn escalation_budget(self, budget: DynEscalationBudget) -> Self {
        self.patch(|profile| profile.escalation_budget = Some(budget))
    }

    /// Overwrites `max_total_attempts`.
    pub fn max_total_attempts(self, value: u32) -> Self {
        self.patch(|profile| profile.max_total_attempts = value)
    }

    /// Stores `Some(strategy)` in `antibot_strategy`.
    pub fn antibot_strategy(self, strategy: DynAntibotStrategy) -> Self {
        self.patch(|profile| profile.antibot_strategy = Some(strategy))
    }

    /// Consumes the builder and returns the assembled [`DispatchProfile`] unchanged.
    ///
    /// No validation is performed at this point.
    pub fn build(self) -> DispatchProfile {
        let Self { inner } = self;
        inner
    }
}