Skip to main content

spider/
configuration.rs

1use crate::compact_str::CompactString;
2use crate::features::chrome_common::RequestInterceptConfiguration;
3pub use crate::features::chrome_common::{
4    AuthChallengeResponse, AuthChallengeResponseResponse, AutomationScripts, AutomationScriptsMap,
5    CaptureScreenshotFormat, CaptureScreenshotParams, ClipViewport, ExecutionScripts,
6    ExecutionScriptsMap, ScreenShotConfig, ScreenshotParams, Viewport, WaitFor, WaitForDelay,
7    WaitForIdleNetwork, WaitForSelector, WebAutomation,
8};
9pub use crate::features::gemini_common::GeminiConfigs;
10pub use crate::features::openai_common::GPTConfigs;
11#[cfg(feature = "search")]
12pub use crate::features::search::{
13    SearchError, SearchOptions, SearchResult, SearchResults, TimeRange,
14};
15pub use crate::features::webdriver_common::{WebDriverBrowser, WebDriverConfig};
16use crate::utils::get_domain_from_url;
17use crate::utils::BasicCachePolicy;
18use crate::website::CronType;
19use reqwest::header::{AsHeaderName, HeaderMap, HeaderName, HeaderValue, IntoHeaderName};
20use std::net::IpAddr;
21use std::sync::Arc;
22use std::time::Duration;
23
24#[cfg(feature = "chrome")]
25pub use spider_fingerprint::Fingerprint;
26
/// How HTTP redirects are handled while performing requests.
#[derive(Debug, Default, Clone, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum RedirectPolicy {
    /// Loose policy: every redirect is allowed up to the redirect limit.
    /// This is the default variant.
    #[default]
    #[cfg_attr(
        feature = "serde",
        serde(alias = "Loose", alias = "loose", alias = "LOOSE")
    )]
    Loose,
    /// Strict policy: only requests matching the domain set for crawling are allowed.
    #[cfg_attr(
        feature = "serde",
        serde(alias = "Strict", alias = "strict", alias = "STRICT")
    )]
    Strict,
    /// Prevent all redirects.
    #[cfg_attr(
        feature = "serde",
        serde(alias = "None", alias = "none", alias = "NONE")
    )]
    None,
}
51
52#[cfg(not(feature = "regex"))]
53/// Allow list normal matching paths.
54pub type AllowList = Vec<CompactString>;
55
56#[cfg(feature = "regex")]
57/// Allow list regex.
58pub type AllowList = Box<regex::RegexSet>;
59
60/// Whitelist or Blacklist
61#[derive(Debug, Default, Clone)]
62#[cfg_attr(not(feature = "regex"), derive(PartialEq, Eq))]
63pub struct AllowListSet(pub AllowList);
64
/// Flags selecting which chrome events are tracked.
#[cfg(feature = "chrome")]
#[derive(Debug, PartialEq, Eq, Clone, Default)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct ChromeEventTracker {
    /// Whether responses are tracked.
    pub responses: bool,
    /// Whether requests are tracked.
    pub requests: bool,
    /// Whether the changes between web automation steps are tracked.
    pub automation: bool,
}
77
#[cfg(feature = "chrome")]
impl ChromeEventTracker {
    /// Build a tracker from the request and response tracking flags.
    ///
    /// Automation tracking is always switched on by this constructor
    /// (unlike `Default`, which leaves every flag `false`).
    pub fn new(requests: bool, responses: bool) -> Self {
        Self {
            responses,
            requests,
            automation: true,
        }
    }
}
89
/// Records which sitemap entries were added to the whitelist.
#[cfg(feature = "sitemap")]
#[derive(Debug, Default)]
pub struct SitemapWhitelistChanges {
    /// The default `sitemap.xml` entry was added to the whitelist.
    pub added_default: bool,
    /// The custom sitemap path was added to the whitelist.
    pub added_custom: bool,
}
99
#[cfg(feature = "sitemap")]
impl SitemapWhitelistChanges {
    /// Returns `true` when at least one entry was added to the whitelist.
    pub(crate) fn modified(&self) -> bool {
        let Self {
            added_default,
            added_custom,
        } = *self;

        added_default || added_custom
    }
}
107
/// Selects the request type, if any, for which a proxy is ignored.
#[derive(Debug, Default, Clone, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum ProxyIgnore {
    /// Chrome proxy.
    Chrome,
    /// HTTP proxy.
    Http,
    /// Do not ignore the proxy. This is the default variant.
    #[default]
    No,
}
120
121/// The networking proxy to use.
122#[derive(Debug, Default, Clone, PartialEq)]
123#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
124pub struct RequestProxy {
125    /// The proxy address.
126    pub addr: String,
127    /// Ignore the proxy when running a request type.
128    pub ignore: ProxyIgnore,
129}
130
131/// Structure to configure `Website` crawler
132/// ```rust
133/// use spider::website::Website;
134/// let mut website: Website = Website::new("https://choosealicense.com");
135/// website.configuration.blacklist_url.insert(Default::default()).push("https://choosealicense.com/licenses/".to_string().into());
136/// website.configuration.respect_robots_txt = true;
137/// website.configuration.subdomains = true;
138/// website.configuration.tld = true;
139/// ```
140#[derive(Debug, Default, Clone)]
141#[cfg_attr(
142    all(
143        not(feature = "regex"),
144        not(feature = "openai"),
145        not(feature = "cache_openai"),
146        not(feature = "gemini"),
147        not(feature = "cache_gemini")
148    ),
149    derive(PartialEq)
150)]
151#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
152#[cfg_attr(feature = "serde", serde(default))]
153pub struct Configuration {
154    /// Respect robots.txt file and not scrape not allowed files. This may slow down crawls if robots.txt file has a delay included.
155    pub respect_robots_txt: bool,
156    /// Allow sub-domains.
157    pub subdomains: bool,
158    /// Allow all tlds for domain.
159    pub tld: bool,
160    /// The max timeout for the crawl.
161    pub crawl_timeout: Option<Duration>,
162    /// Preserve the HTTP host header from being included.
163    pub preserve_host_header: bool,
164    /// List of pages to not crawl. [optional: regex pattern matching]
165    pub blacklist_url: Option<Vec<CompactString>>,
166    /// List of pages to only crawl. [optional: regex pattern matching]
167    pub whitelist_url: Option<Vec<CompactString>>,
168    /// User-Agent for request.
169    pub user_agent: Option<Box<CompactString>>,
170    /// Polite crawling delay in milli seconds.
171    pub delay: u64,
172    /// Request max timeout per page. By default the request times out in 15s. Set to None to disable.
173    pub request_timeout: Option<Duration>,
174    /// Use HTTP2 for connection. Enable if you know the website has http2 support.
175    pub http2_prior_knowledge: bool,
176    /// Use proxy list for performing network request.
177    pub proxies: Option<Vec<RequestProxy>>,
178    /// Headers to include with request.
179    pub headers: Option<Box<SerializableHeaderMap>>,
180    #[cfg(feature = "sitemap")]
181    /// Include a sitemap in response of the crawl.
182    pub sitemap_url: Option<Box<CompactString>>,
183    #[cfg(feature = "sitemap")]
184    /// Prevent including the sitemap links with the crawl.
185    pub ignore_sitemap: bool,
186    /// The max redirections allowed for request.
187    pub redirect_limit: usize,
188    /// The redirect policy type to use.
189    pub redirect_policy: RedirectPolicy,
190    #[cfg(feature = "cookies")]
191    /// Cookie string to use for network requests ex: "foo=bar; Domain=blog.spider"
192    pub cookie_str: String,
193    #[cfg(feature = "wreq")]
194    /// The type of request emulation. This does nothing without the flag `sync` enabled.
195    pub emulation: Option<wreq_util::Emulation>,
196    #[cfg(feature = "cron")]
197    /// Cron string to perform crawls - use <https://crontab.guru/> to help generate a valid cron for needs.
198    pub cron_str: String,
199    #[cfg(feature = "cron")]
200    /// The type of cron to run either crawl or scrape.
201    pub cron_type: CronType,
202    /// The max depth to crawl for a website. Defaults to 25 to help prevent infinite recursion.
203    pub depth: usize,
204    /// The depth to crawl pertaining to the root.
205    pub depth_distance: usize,
206    /// Use stealth mode for requests.
207    pub stealth_mode: spider_fingerprint::configs::Tier,
208    /// Configure the viewport for chrome and viewport headers.
209    pub viewport: Option<Viewport>,
210    /// Crawl budget for the paths. This helps prevent crawling extra pages and limiting the amount.
211    pub budget: Option<hashbrown::HashMap<case_insensitive_string::CaseInsensitiveString, u32>>,
212    /// If wild card budgeting is found for the website.
213    pub wild_card_budgeting: bool,
214    /// External domains to include case-insensitive.
215    pub external_domains_caseless:
216        Arc<hashbrown::HashSet<case_insensitive_string::CaseInsensitiveString>>,
217    /// Collect all the resources found on the page.
218    pub full_resources: bool,
219    /// Dangerously accept invalid certficates.
220    pub accept_invalid_certs: bool,
221    /// The auth challenge response. The 'chrome_intercept' flag is also required in order to intercept the response.
222    pub auth_challenge_response: Option<AuthChallengeResponse>,
223    /// The OpenAI configs to use to help drive the chrome browser. This does nothing without the 'openai' flag.
224    pub openai_config: Option<Box<GPTConfigs>>,
225    /// The Gemini configs to use to help drive the chrome browser. This does nothing without the 'gemini' flag.
226    pub gemini_config: Option<Box<GeminiConfigs>>,
227    /// Remote multimodal automation config (vision + LLM-driven steps).
228    /// Requires the `agent` feature for full functionality, uses stub type otherwise.
229    pub remote_multimodal: Option<Box<crate::features::automation::RemoteMultimodalConfigs>>,
230    /// Use a shared queue strategy when crawling. This can scale workloads evenly that do not need priority.
231    pub shared_queue: bool,
232    /// Return the page links in the subscription channels. This does nothing without the flag `sync` enabled.
233    pub return_page_links: bool,
234    /// Retry count to attempt to swap proxies etc.
235    pub retry: u8,
236    /// Skip spawning a control thread that can pause, start, and shutdown the crawl.
237    pub no_control_thread: bool,
238    /// The blacklist urls.
239    blacklist: AllowListSet,
240    /// The whitelist urls.
241    whitelist: AllowListSet,
242    /// Crawl budget for the paths. This helps prevent crawling extra pages and limiting the amount.
243    pub(crate) inner_budget:
244        Option<hashbrown::HashMap<case_insensitive_string::CaseInsensitiveString, u32>>,
245    /// Expect only to handle HTML to save on resources. This mainly only blocks the crawling and returning of resources from the server.
246    pub only_html: bool,
247    /// The concurrency limits to apply.
248    pub concurrency_limit: Option<usize>,
249    /// Normalize the html de-deplucating the content.
250    pub normalize: bool,
251    /// Share the state of the crawl requires the 'disk' feature flag.
252    pub shared: bool,
253    /// Modify the headers to act like a real-browser
254    pub modify_headers: bool,
255    /// Modify the HTTP client headers only to act like a real-browser
256    pub modify_http_client_headers: bool,
257    /// Cache the page following HTTP caching rules.
258    #[cfg(any(
259        feature = "cache_request",
260        feature = "chrome",
261        feature = "chrome_remote_cache"
262    ))]
263    pub cache: bool,
264    /// Skip browser rendering entirely if cached response exists.
265    /// When enabled, returns cached HTML directly without launching Chrome.
266    #[cfg(any(
267        feature = "cache_request",
268        feature = "chrome",
269        feature = "chrome_remote_cache"
270    ))]
271    pub cache_skip_browser: bool,
272    #[cfg(feature = "chrome")]
273    /// Enable or disable service workers. Enabled by default.
274    pub service_worker_enabled: bool,
275    #[cfg(feature = "chrome")]
276    /// Overrides default host system timezone with the specified one.
277    #[cfg(feature = "chrome")]
278    pub timezone_id: Option<Box<String>>,
279    /// Overrides default host system locale with the specified one.
280    #[cfg(feature = "chrome")]
281    pub locale: Option<Box<String>>,
282    /// Set a custom script to eval on each new document.
283    #[cfg(feature = "chrome")]
284    pub evaluate_on_new_document: Option<Box<String>>,
285    #[cfg(feature = "chrome")]
286    /// Dismiss dialogs.
287    pub dismiss_dialogs: Option<bool>,
288    #[cfg(feature = "chrome")]
289    /// Wait for options for the page.
290    pub wait_for: Option<WaitFor>,
291    #[cfg(feature = "chrome")]
292    /// Take a screenshot of the page.
293    pub screenshot: Option<ScreenShotConfig>,
294    #[cfg(feature = "chrome")]
295    /// Track the events made via chrome.
296    pub track_events: Option<ChromeEventTracker>,
297    #[cfg(feature = "chrome")]
298    /// Setup fingerprint ID on each document. This does nothing without the flag `chrome` enabled.
299    pub fingerprint: Fingerprint,
300    #[cfg(feature = "chrome")]
301    /// The chrome connection url. Useful for targeting different headless instances. Defaults to using the env CHROME_URL.
302    pub chrome_connection_url: Option<String>,
303    /// Scripts to execute for individual pages, the full path of the url is required for an exact match. This is useful for running one off JS on pages like performing custom login actions.
304    #[cfg(feature = "chrome")]
305    pub execution_scripts: Option<ExecutionScripts>,
306    /// Web automation scripts to run up to a duration of 60 seconds.
307    #[cfg(feature = "chrome")]
308    pub automation_scripts: Option<AutomationScripts>,
309    /// Setup network interception for request. This does nothing without the flag `chrome_intercept` enabled.
310    #[cfg(feature = "chrome")]
311    pub chrome_intercept: RequestInterceptConfiguration,
312    /// The referer to use.
313    pub referer: Option<String>,
314    /// Determine the max bytes per page.
315    pub max_page_bytes: Option<f64>,
316    /// Determine the max bytes per browser context.
317    pub max_bytes_allowed: Option<u64>,
318    #[cfg(feature = "chrome")]
319    /// Disables log domain, prevents further log entries from being reported to the client. This does nothing without the flag `chrome` enabled.
320    pub disable_log: bool,
321    #[cfg(feature = "chrome")]
322    /// Automatic locale and timezone handling via third party. This does nothing without the flag `chrome` enabled.
323    pub auto_geolocation: bool,
324    /// The cache policy to use.
325    pub cache_policy: Option<BasicCachePolicy>,
326    #[cfg(feature = "chrome")]
327    /// Enables bypassing CSP. This does nothing without the flag `chrome` enabled.
328    pub bypass_csp: bool,
329    /// Bind the connections only on the network interface.
330    pub network_interface: Option<String>,
331    /// Bind to a local IP Address.
332    pub local_address: Option<IpAddr>,
333    /// The default http connect timeout
334    pub default_http_connect_timeout: Option<Duration>,
335    /// The default http read timeout
336    pub default_http_read_timeout: Option<Duration>,
337    #[cfg(feature = "webdriver")]
338    /// WebDriver configuration for browser automation. This does nothing without the `webdriver` flag enabled.
339    pub webdriver_config: Option<Box<WebDriverConfig>>,
340    #[cfg(feature = "search")]
341    /// Search provider configuration for web search integration. This does nothing without the `search` flag enabled.
342    pub search_config: Option<Box<SearchConfig>>,
343    #[cfg(feature = "spider_cloud")]
344    /// Spider Cloud config. See <https://spider.cloud>.
345    pub spider_cloud: Option<Box<SpiderCloudConfig>>,
346    #[cfg(feature = "hedge")]
347    /// Hedged request configuration for work-stealing on slow requests.
348    /// When enabled, fires a duplicate request on a different proxy after a delay.
349    pub hedge: Option<crate::utils::hedge::HedgeConfig>,
350}
351
352#[derive(Default, Debug, Clone, PartialEq, Eq)]
353/// Serializable HTTP headers.
354pub struct SerializableHeaderMap(pub HeaderMap);
355
356impl SerializableHeaderMap {
357    /// Innter HeaderMap.
358    pub fn inner(&self) -> &HeaderMap {
359        &self.0
360    }
361    /// Returns true if the map contains a value for the specified key.
362    pub fn contains_key<K>(&self, key: K) -> bool
363    where
364        K: AsHeaderName,
365    {
366        self.0.contains_key(key)
367    }
368    /// Inserts a key-value pair into the map.
369    pub fn insert<K>(
370        &mut self,
371        key: K,
372        val: reqwest::header::HeaderValue,
373    ) -> Option<reqwest::header::HeaderValue>
374    where
375        K: IntoHeaderName,
376    {
377        self.0.insert(key, val)
378    }
379    /// Extend a `HeaderMap` with the contents of another `HeaderMap`.
380    pub fn extend<I>(&mut self, iter: I)
381    where
382        I: IntoIterator<Item = (Option<HeaderName>, HeaderValue)>,
383    {
384        self.0.extend(iter);
385    }
386}
387
388/// Get a cloned copy of the `Referer` header as a `String` (if it exists and is valid UTF-8).
389pub fn get_referer(header_map: &Option<Box<SerializableHeaderMap>>) -> Option<String> {
390    match header_map {
391        Some(header_map) => {
392            header_map
393                .0
394                .get(crate::client::header::REFERER) // Retrieves the "Referer" HeaderValue if it exists
395                .and_then(|value| value.to_str().ok()) // &str from HeaderValue
396                .map(String::from) // Convert &str to String (owned)
397        }
398        _ => None,
399    }
400}
401
402impl From<HeaderMap> for SerializableHeaderMap {
403    fn from(header_map: HeaderMap) -> Self {
404        SerializableHeaderMap(header_map)
405    }
406}
407
#[cfg(feature = "serde")]
impl serde::Serialize for SerializableHeaderMap {
    /// Serialize the headers as a sorted `name -> value` string map.
    ///
    /// Values rejected by `HeaderValue::to_str` are written as an empty string,
    /// and when a name occurs several times the last value seen wins.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        use std::collections::BTreeMap;

        let mut entries: BTreeMap<String, String> = BTreeMap::new();

        for (name, value) in self.0.iter() {
            let text = value.to_str().unwrap_or("");
            entries.insert(name.as_str().to_owned(), text.to_owned());
        }

        serde::Serialize::serialize(&entries, serializer)
    }
}
422
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for SerializableHeaderMap {
    /// Rebuild the header map from a `name -> value` string map.
    ///
    /// Fails with a custom serde error as soon as a name or a value is not a
    /// valid HTTP header name / value.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        use serde::de::Error as DeError;
        use std::collections::BTreeMap;

        let entries =
            <BTreeMap<String, String> as serde::Deserialize>::deserialize(deserializer)?;
        let mut headers = HeaderMap::with_capacity(entries.len());

        for (name, raw_value) in &entries {
            // The name is validated before the value, so a bad name is reported first.
            let key = HeaderName::from_bytes(name.as_bytes()).map_err(DeError::custom)?;
            let value = HeaderValue::from_str(raw_value).map_err(DeError::custom)?;
            headers.insert(key, value);
        }

        Ok(Self(headers))
    }
}
441
#[cfg(feature = "serde")]
impl serde::Serialize for AllowListSet {
    /// Serialize the list as a sequence of strings: the plain entries without
    /// the `regex` feature, the source patterns of the regex set with it.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        #[cfg(feature = "regex")]
        {
            serializer.collect_seq(self.0.patterns())
        }

        #[cfg(not(feature = "regex"))]
        {
            serde::Serialize::serialize(&self.0, serializer)
        }
    }
}
463
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for AllowListSet {
    /// Read the list back from a sequence of strings.
    ///
    /// With the `regex` feature the strings are compiled into a regex set and
    /// a compilation failure is reported as a custom serde error.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        #[cfg(feature = "regex")]
        {
            let patterns = <Vec<String> as serde::Deserialize>::deserialize(deserializer)?;

            regex::RegexSet::new(&patterns)
                .map(|set| AllowListSet(Box::new(set)))
                .map_err(serde::de::Error::custom)
        }

        #[cfg(not(feature = "regex"))]
        {
            <Vec<CompactString> as serde::Deserialize>::deserialize(deserializer)
                .map(AllowListSet)
        }
    }
}
484
485/// Get the user agent from the top agent list randomly.
486#[cfg(feature = "ua_generator")]
487pub fn get_ua(chrome: bool) -> &'static str {
488    if chrome {
489        ua_generator::ua::spoof_chrome_ua()
490    } else {
491        ua_generator::ua::spoof_ua()
492    }
493}
494
495/// Get the user agent via cargo package + version.
496#[cfg(not(feature = "ua_generator"))]
497pub fn get_ua(_chrome: bool) -> &'static str {
498    use std::env;
499
500    lazy_static! {
501        static ref AGENT: &'static str =
502            concat!(env!("CARGO_PKG_NAME"), '/', env!("CARGO_PKG_VERSION"));
503    };
504
505    AGENT.as_ref()
506}
507
508impl Configuration {
509    /// Represents crawl configuration for a website.
510    #[cfg(not(feature = "chrome"))]
511    pub fn new() -> Self {
512        Self {
513            delay: 0,
514            depth: 25,
515            redirect_limit: 7,
516            request_timeout: Some(Duration::from_secs(120)),
517            only_html: true,
518            modify_headers: true,
519            ..Default::default()
520        }
521    }
522
523    /// Represents crawl configuration for a website.
524    #[cfg(feature = "chrome")]
525    pub fn new() -> Self {
526        Self {
527            delay: 0,
528            depth: 25,
529            redirect_limit: 7,
530            request_timeout: Some(Duration::from_secs(120)),
531            chrome_intercept: RequestInterceptConfiguration::new(cfg!(
532                feature = "chrome_intercept"
533            )),
534            user_agent: Some(Box::new(get_ua(true).into())),
535            only_html: true,
536            cache: true,
537            modify_headers: true,
538            service_worker_enabled: true,
539            fingerprint: Fingerprint::Basic,
540            auto_geolocation: false,
541            ..Default::default()
542        }
543    }
544
545    /// Build a `RemoteMultimodalEngine` from `RemoteMultimodalConfigs`.
546    /// Requires the `agent` feature.
547    #[cfg(feature = "agent")]
548    pub fn build_remote_multimodal_engine(
549        &self,
550    ) -> Option<crate::features::automation::RemoteMultimodalEngine> {
551        let cfgs = self.remote_multimodal.as_ref()?;
552        let sem = cfgs
553            .concurrency_limit
554            .filter(|&n| n > 0)
555            .map(|n| std::sync::Arc::new(tokio::sync::Semaphore::new(n)));
556
557        #[allow(unused_mut)]
558        let mut engine = crate::features::automation::RemoteMultimodalEngine::new(
559            cfgs.api_url.clone(),
560            cfgs.model_name.clone(),
561            cfgs.system_prompt.clone(),
562        )
563        .with_api_key(cfgs.api_key.as_deref())
564        .with_system_prompt_extra(cfgs.system_prompt_extra.as_deref())
565        .with_user_message_extra(cfgs.user_message_extra.as_deref())
566        .with_remote_multimodal_config(cfgs.cfg.clone())
567        .with_prompt_url_gate(cfgs.prompt_url_gate.clone())
568        .with_vision_model(cfgs.vision_model.clone())
569        .with_text_model(cfgs.text_model.clone())
570        .with_vision_route_mode(cfgs.vision_route_mode)
571        .with_chrome_ai(cfgs.use_chrome_ai)
572        .with_semaphore(sem)
573        .to_owned();
574
575        #[cfg(feature = "agent_skills")]
576        if let Some(ref registry) = cfgs.skill_registry {
577            engine.with_skill_registry(Some(registry.clone()));
578        }
579
580        // Build per-round complexity router from model pool (3+ models required)
581        let model_pool = cfgs.model_pool.clone();
582        if model_pool.len() >= 3 {
583            let model_names: Vec<&str> =
584                model_pool.iter().map(|ep| ep.model_name.as_str()).collect();
585            let policy = crate::features::automation::auto_policy(&model_names);
586            engine.model_router = Some(crate::features::automation::ModelRouter::with_policy(
587                policy,
588            ));
589        }
590        engine.model_pool = model_pool;
591
592        Some(engine)
593    }
594
595    /// Determine if the agent should be set to a Chrome Agent.
596    #[cfg(not(feature = "chrome"))]
597    pub(crate) fn only_chrome_agent(&self) -> bool {
598        false
599    }
600
601    /// Determine if the agent should be set to a Chrome Agent.
602    #[cfg(feature = "chrome")]
603    pub(crate) fn only_chrome_agent(&self) -> bool {
604        self.chrome_connection_url.is_some()
605            || self.wait_for.is_some()
606            || self.chrome_intercept.enabled
607            || self.stealth_mode.stealth()
608            || self.fingerprint.valid()
609    }
610
611    #[cfg(feature = "regex")]
612    /// Compile the regex for the blacklist.
613    pub fn get_blacklist(&self) -> Box<regex::RegexSet> {
614        match &self.blacklist_url {
615            Some(blacklist) => match regex::RegexSet::new(&**blacklist) {
616                Ok(s) => Box::new(s),
617                _ => Default::default(),
618            },
619            _ => Default::default(),
620        }
621    }
622
623    #[cfg(not(feature = "regex"))]
624    /// Handle the blacklist options.
625    pub fn get_blacklist(&self) -> AllowList {
626        match &self.blacklist_url {
627            Some(blacklist) => blacklist.to_owned(),
628            _ => Default::default(),
629        }
630    }
631
632    /// Set the blacklist
633    pub(crate) fn set_blacklist(&mut self) {
634        self.blacklist = AllowListSet(self.get_blacklist());
635    }
636
637    /// Set the whitelist
638    pub fn set_whitelist(&mut self) {
639        self.whitelist = AllowListSet(self.get_whitelist());
640    }
641
642    /// Configure the allow list.
643    pub fn configure_allowlist(&mut self) {
644        self.set_whitelist();
645        self.set_blacklist();
646    }
647
648    /// Get the blacklist compiled.
649    pub fn get_blacklist_compiled(&self) -> &AllowList {
650        &self.blacklist.0
651    }
652
653    /// Setup the budget for crawling.
654    pub fn configure_budget(&mut self) {
655        self.inner_budget.clone_from(&self.budget);
656    }
657
658    /// Get the whitelist compiled.
659    pub fn get_whitelist_compiled(&self) -> &AllowList {
660        &self.whitelist.0
661    }
662
663    #[cfg(feature = "regex")]
664    /// Compile the regex for the whitelist.
665    pub fn get_whitelist(&self) -> Box<regex::RegexSet> {
666        match &self.whitelist_url {
667            Some(whitelist) => match regex::RegexSet::new(&**whitelist) {
668                Ok(s) => Box::new(s),
669                _ => Default::default(),
670            },
671            _ => Default::default(),
672        }
673    }
674
675    #[cfg(not(feature = "regex"))]
676    /// Handle the whitelist options.
677    pub fn get_whitelist(&self) -> AllowList {
678        match &self.whitelist_url {
679            Some(whitelist) => whitelist.to_owned(),
680            _ => Default::default(),
681        }
682    }
683
684    #[cfg(feature = "sitemap")]
685    /// Add sitemap paths to the whitelist and track what was added.
686    pub fn add_sitemap_to_whitelist(&mut self) -> SitemapWhitelistChanges {
687        let mut changes = SitemapWhitelistChanges::default();
688
689        if self.ignore_sitemap && self.whitelist_url.is_none() {
690            return changes;
691        }
692
693        if let Some(list) = self.whitelist_url.as_mut() {
694            if list.is_empty() {
695                return changes;
696            }
697
698            let default = CompactString::from("sitemap.xml");
699
700            if !list.contains(&default) {
701                list.push(default);
702                changes.added_default = true;
703            }
704
705            if let Some(custom) = &self.sitemap_url {
706                if !list.contains(custom) {
707                    list.push(*custom.clone());
708                    changes.added_custom = true;
709                }
710            }
711        }
712
713        changes
714    }
715
716    #[cfg(feature = "sitemap")]
717    /// Revert any changes made to the whitelist by `add_sitemap_to_whitelist`.
718    pub fn remove_sitemap_from_whitelist(&mut self, changes: SitemapWhitelistChanges) {
719        if let Some(list) = self.whitelist_url.as_mut() {
720            if changes.added_default {
721                let default = CompactString::from("sitemap.xml");
722                if let Some(pos) = list.iter().position(|s| s == default) {
723                    list.remove(pos);
724                }
725            }
726            if changes.added_custom {
727                if let Some(custom) = &self.sitemap_url {
728                    if let Some(pos) = list.iter().position(|s| *s == **custom) {
729                        list.remove(pos);
730                    }
731                }
732            }
733            if list.is_empty() {
734                self.whitelist_url = None;
735            }
736        }
737    }
738
739    /// Respect robots.txt file.
740    pub fn with_respect_robots_txt(&mut self, respect_robots_txt: bool) -> &mut Self {
741        self.respect_robots_txt = respect_robots_txt;
742        self
743    }
744
745    /// Include subdomains detection.
746    pub fn with_subdomains(&mut self, subdomains: bool) -> &mut Self {
747        self.subdomains = subdomains;
748        self
749    }
750
751    /// Bypass CSP protection detection. This does nothing without the feat flag `chrome` enabled.
752    #[cfg(feature = "chrome")]
753    pub fn with_csp_bypass(&mut self, enabled: bool) -> &mut Self {
754        self.bypass_csp = enabled;
755        self
756    }
757
758    /// Bypass CSP protection detection. This does nothing without the feat flag `chrome` enabled.
759    #[cfg(not(feature = "chrome"))]
760    pub fn with_csp_bypass(&mut self, _enabled: bool) -> &mut Self {
761        self
762    }
763
764    /// Bind the connections only on the network interface.
765    pub fn with_network_interface(&mut self, network_interface: Option<String>) -> &mut Self {
766        self.network_interface = network_interface;
767        self
768    }
769
770    /// Bind to a local IP Address.
771    pub fn with_local_address(&mut self, local_address: Option<IpAddr>) -> &mut Self {
772        self.local_address = local_address;
773        self
774    }
775
776    /// Include tld detection.
777    pub fn with_tld(&mut self, tld: bool) -> &mut Self {
778        self.tld = tld;
779        self
780    }
781
782    /// The max duration for the crawl. This is useful when websites use a robots.txt with long durations and throttle the timeout removing the full concurrency.
783    pub fn with_crawl_timeout(&mut self, crawl_timeout: Option<Duration>) -> &mut Self {
784        self.crawl_timeout = crawl_timeout;
785        self
786    }
787
788    /// Delay between request as ms.
789    pub fn with_delay(&mut self, delay: u64) -> &mut Self {
790        self.delay = delay;
791        self
792    }
793
794    /// Only use HTTP/2.
795    pub fn with_http2_prior_knowledge(&mut self, http2_prior_knowledge: bool) -> &mut Self {
796        self.http2_prior_knowledge = http2_prior_knowledge;
797        self
798    }
799
800    /// Max time to wait for request. By default request times out in 15s. Set to None to disable.
801    pub fn with_request_timeout(&mut self, request_timeout: Option<Duration>) -> &mut Self {
802        match request_timeout {
803            Some(timeout) => self.request_timeout = Some(timeout),
804            _ => self.request_timeout = None,
805        };
806
807        self
808    }
809
810    #[cfg(feature = "sitemap")]
811    /// Set the sitemap url. This does nothing without the `sitemap` feature flag.
812    pub fn with_sitemap(&mut self, sitemap_url: Option<&str>) -> &mut Self {
813        match sitemap_url {
814            Some(sitemap_url) => {
815                self.sitemap_url = Some(CompactString::new(sitemap_url.to_string()).into())
816            }
817            _ => self.sitemap_url = None,
818        };
819        self
820    }
821
822    #[cfg(not(feature = "sitemap"))]
823    /// Set the sitemap url. This does nothing without the `sitemap` feature flag.
824    pub fn with_sitemap(&mut self, _sitemap_url: Option<&str>) -> &mut Self {
825        self
826    }
827
828    #[cfg(feature = "sitemap")]
829    /// Ignore the sitemap when crawling. This method does nothing if the `sitemap` is not enabled.
830    pub fn with_ignore_sitemap(&mut self, ignore_sitemap: bool) -> &mut Self {
831        self.ignore_sitemap = ignore_sitemap;
832        self
833    }
834
835    #[cfg(not(feature = "sitemap"))]
836    /// Ignore the sitemap when crawling. This method does nothing if the `sitemap` is not enabled.
837    pub fn with_ignore_sitemap(&mut self, _ignore_sitemap: bool) -> &mut Self {
838        self
839    }
840
841    /// Add user agent to request.
842    pub fn with_user_agent(&mut self, user_agent: Option<&str>) -> &mut Self {
843        match user_agent {
844            Some(agent) => self.user_agent = Some(CompactString::new(agent).into()),
845            _ => self.user_agent = None,
846        };
847        self
848    }
849
850    /// Preserve the HOST header.
851    pub fn with_preserve_host_header(&mut self, preserve: bool) -> &mut Self {
852        self.preserve_host_header = preserve;
853        self
854    }
855
856    /// Use a remote multimodal model to drive browser automation.
857    /// Requires the `agent` feature.
858    #[cfg(feature = "agent")]
859    pub fn with_remote_multimodal(
860        &mut self,
861        remote_multimodal: Option<crate::features::automation::RemoteMultimodalConfigs>,
862    ) -> &mut Self {
863        self.remote_multimodal = remote_multimodal.map(Box::new);
864        self
865    }
866
867    /// Use a remote multimodal model to drive browser automation.
868    /// When the `agent` feature is not enabled, this uses a stub type.
869    #[cfg(not(feature = "agent"))]
870    pub fn with_remote_multimodal(
871        &mut self,
872        remote_multimodal: Option<crate::features::automation::RemoteMultimodalConfigs>,
873    ) -> &mut Self {
874        self.remote_multimodal = remote_multimodal.map(Box::new);
875        self
876    }
877
878    #[cfg(not(feature = "openai"))]
879    /// The OpenAI configs to use to drive the browser. This method does nothing if the `openai` is not enabled.
880    pub fn with_openai(&mut self, _openai_config: Option<GPTConfigs>) -> &mut Self {
881        self
882    }
883
884    /// The OpenAI configs to use to drive the browser. This method does nothing if the `openai` is not enabled.
885    #[cfg(feature = "openai")]
886    pub fn with_openai(&mut self, openai_config: Option<GPTConfigs>) -> &mut Self {
887        match openai_config {
888            Some(openai_config) => self.openai_config = Some(Box::new(openai_config)),
889            _ => self.openai_config = None,
890        };
891        self
892    }
893
894    #[cfg(not(feature = "gemini"))]
895    /// The Gemini configs to use to drive the browser. This method does nothing if the `gemini` is not enabled.
896    pub fn with_gemini(&mut self, _gemini_config: Option<GeminiConfigs>) -> &mut Self {
897        self
898    }
899
900    /// The Gemini configs to use to drive the browser. This method does nothing if the `gemini` is not enabled.
901    #[cfg(feature = "gemini")]
902    pub fn with_gemini(&mut self, gemini_config: Option<GeminiConfigs>) -> &mut Self {
903        match gemini_config {
904            Some(gemini_config) => self.gemini_config = Some(Box::new(gemini_config)),
905            _ => self.gemini_config = None,
906        };
907        self
908    }
909
910    #[cfg(feature = "cookies")]
911    /// Cookie string to use in request. This does nothing without the `cookies` flag enabled.
912    pub fn with_cookies(&mut self, cookie_str: &str) -> &mut Self {
913        self.cookie_str = cookie_str.into();
914        self
915    }
916
917    #[cfg(not(feature = "cookies"))]
918    /// Cookie string to use in request. This does nothing without the `cookies` flag enabled.
919    pub fn with_cookies(&mut self, _cookie_str: &str) -> &mut Self {
920        self
921    }
922
923    #[cfg(feature = "chrome")]
924    /// Set custom fingerprint ID for request. This does nothing without the `chrome` flag enabled.
925    pub fn with_fingerprint(&mut self, fingerprint: bool) -> &mut Self {
926        if fingerprint {
927            self.fingerprint = Fingerprint::Basic;
928        } else {
929            self.fingerprint = Fingerprint::None;
930        }
931        self
932    }
933
934    #[cfg(feature = "chrome")]
935    /// Set custom fingerprint ID for request. This does nothing without the `chrome` flag enabled.
936    pub fn with_fingerprint_advanced(&mut self, fingerprint: Fingerprint) -> &mut Self {
937        self.fingerprint = fingerprint;
938        self
939    }
940
941    #[cfg(not(feature = "chrome"))]
942    /// Set custom fingerprint ID for request. This does nothing without the `chrome` flag enabled.
943    pub fn with_fingerprint(&mut self, _fingerprint: bool) -> &mut Self {
944        self
945    }
946
947    /// Use proxies for request.
948    pub fn with_proxies(&mut self, proxies: Option<Vec<String>>) -> &mut Self {
949        self.proxies = proxies.map(|p| {
950            p.iter()
951                .map(|addr| RequestProxy {
952                    addr: addr.to_owned(),
953                    ..Default::default()
954                })
955                .collect::<Vec<RequestProxy>>()
956        });
957        self
958    }
959
960    /// Use proxies for request with control between chrome and http.
961    pub fn with_proxies_direct(&mut self, proxies: Option<Vec<RequestProxy>>) -> &mut Self {
962        self.proxies = proxies;
963        self
964    }
965
966    /// Use a shared semaphore to evenly handle workloads. The default is false.
967    pub fn with_shared_queue(&mut self, shared_queue: bool) -> &mut Self {
968        self.shared_queue = shared_queue;
969        self
970    }
971
972    /// Add blacklist urls to ignore.
973    pub fn with_blacklist_url<T>(&mut self, blacklist_url: Option<Vec<T>>) -> &mut Self
974    where
975        Vec<CompactString>: From<Vec<T>>,
976    {
977        match blacklist_url {
978            Some(p) => self.blacklist_url = Some(p.into()),
979            _ => self.blacklist_url = None,
980        };
981        self
982    }
983
984    /// Add whitelist urls to allow.
985    pub fn with_whitelist_url<T>(&mut self, whitelist_url: Option<Vec<T>>) -> &mut Self
986    where
987        Vec<CompactString>: From<Vec<T>>,
988    {
989        match whitelist_url {
990            Some(p) => self.whitelist_url = Some(p.into()),
991            _ => self.whitelist_url = None,
992        };
993        self
994    }
995
996    /// Return the links found on the page in the channel subscriptions. This method does nothing if the `decentralized` is enabled.
997    pub fn with_return_page_links(&mut self, return_page_links: bool) -> &mut Self {
998        self.return_page_links = return_page_links;
999        self
1000    }
1001
1002    /// Set HTTP headers for request using [reqwest::header::HeaderMap](https://docs.rs/reqwest/latest/reqwest/header/struct.HeaderMap.html).
1003    pub fn with_headers(&mut self, headers: Option<reqwest::header::HeaderMap>) -> &mut Self {
1004        match headers {
1005            Some(m) => self.headers = Some(SerializableHeaderMap::from(m).into()),
1006            _ => self.headers = None,
1007        };
1008        self
1009    }
1010
1011    /// Set the max redirects allowed for request.
1012    pub fn with_redirect_limit(&mut self, redirect_limit: usize) -> &mut Self {
1013        self.redirect_limit = redirect_limit;
1014        self
1015    }
1016
1017    /// Set the redirect policy to use.
1018    pub fn with_redirect_policy(&mut self, policy: RedirectPolicy) -> &mut Self {
1019        self.redirect_policy = policy;
1020        self
1021    }
1022
1023    /// Add a referer (mis-spelling) to the request.
1024    pub fn with_referer(&mut self, referer: Option<String>) -> &mut Self {
1025        self.referer = referer;
1026        self
1027    }
1028
1029    /// Add a referer to the request.
1030    pub fn with_referrer(&mut self, referer: Option<String>) -> &mut Self {
1031        self.referer = referer;
1032        self
1033    }
1034
1035    /// Determine whether to collect all the resources found on pages.
1036    pub fn with_full_resources(&mut self, full_resources: bool) -> &mut Self {
1037        self.full_resources = full_resources;
1038        self
1039    }
1040
1041    /// Determine whether to dismiss dialogs. This method does nothing if the `chrome` is enabled.
1042    #[cfg(feature = "chrome")]
1043    pub fn with_dismiss_dialogs(&mut self, dismiss_dialogs: bool) -> &mut Self {
1044        self.dismiss_dialogs = Some(dismiss_dialogs);
1045        self
1046    }
1047
1048    /// Determine whether to dismiss dialogs. This method does nothing if the `chrome` is enabled.
1049    #[cfg(not(feature = "chrome"))]
1050    pub fn with_dismiss_dialogs(&mut self, _dismiss_dialogs: bool) -> &mut Self {
1051        self
1052    }
1053
1054    /// Set the request emuluation. This method does nothing if the `wreq` flag is not enabled.
1055    #[cfg(feature = "wreq")]
1056    pub fn with_emulation(&mut self, emulation: Option<wreq_util::Emulation>) -> &mut Self {
1057        self.emulation = emulation;
1058        self
1059    }
1060
1061    #[cfg(feature = "cron")]
1062    /// Setup cron jobs to run. This does nothing without the `cron` flag enabled.
1063    pub fn with_cron(&mut self, cron_str: &str, cron_type: CronType) -> &mut Self {
1064        self.cron_str = cron_str.into();
1065        self.cron_type = cron_type;
1066        self
1067    }
1068
1069    #[cfg(not(feature = "cron"))]
1070    /// Setup cron jobs to run. This does nothing without the `cron` flag enabled.
1071    pub fn with_cron(&mut self, _cron_str: &str, _cron_type: CronType) -> &mut Self {
1072        self
1073    }
1074
1075    /// Set a crawl page limit. If the value is 0 there is no limit.
1076    pub fn with_limit(&mut self, limit: u32) -> &mut Self {
1077        self.with_budget(Some(hashbrown::HashMap::from([("*", limit)])));
1078        self
1079    }
1080
1081    /// Set the concurrency limits. If you set the value to None to use the default limits using the system CPU cors * n.
1082    pub fn with_concurrency_limit(&mut self, limit: Option<usize>) -> &mut Self {
1083        self.concurrency_limit = limit;
1084        self
1085    }
1086
1087    #[cfg(feature = "chrome")]
1088    /// Set the authentiation challenge response. This does nothing without the feat flag `chrome` enabled.
1089    pub fn with_auth_challenge_response(
1090        &mut self,
1091        auth_challenge_response: Option<AuthChallengeResponse>,
1092    ) -> &mut Self {
1093        self.auth_challenge_response = auth_challenge_response;
1094        self
1095    }
1096
1097    #[cfg(feature = "chrome")]
1098    /// Set a custom script to evaluate on new document creation. This does nothing without the feat flag `chrome` enabled.
1099    pub fn with_evaluate_on_new_document(
1100        &mut self,
1101        evaluate_on_new_document: Option<Box<String>>,
1102    ) -> &mut Self {
1103        self.evaluate_on_new_document = evaluate_on_new_document;
1104        self
1105    }
1106
1107    #[cfg(not(feature = "chrome"))]
1108    /// Set a custom script to evaluate on new document creation. This does nothing without the feat flag `chrome` enabled.
1109    pub fn with_evaluate_on_new_document(
1110        &mut self,
1111        _evaluate_on_new_document: Option<Box<String>>,
1112    ) -> &mut Self {
1113        self
1114    }
1115
1116    #[cfg(not(feature = "chrome"))]
1117    /// Set the authentiation challenge response. This does nothing without the feat flag `chrome` enabled.
1118    pub fn with_auth_challenge_response(
1119        &mut self,
1120        _auth_challenge_response: Option<AuthChallengeResponse>,
1121    ) -> &mut Self {
1122        self
1123    }
1124
1125    /// Set a crawl depth limit. If the value is 0 there is no limit.
1126    pub fn with_depth(&mut self, depth: usize) -> &mut Self {
1127        self.depth = depth;
1128        self
1129    }
1130
1131    #[cfg(any(feature = "cache_request", feature = "chrome_remote_cache"))]
1132    /// Cache the page following HTTP rules. This method does nothing if the `cache` feature is not enabled.
1133    pub fn with_caching(&mut self, cache: bool) -> &mut Self {
1134        self.cache = cache;
1135        self
1136    }
1137
1138    #[cfg(not(any(feature = "cache_request", feature = "chrome_remote_cache")))]
1139    /// Cache the page following HTTP rules. This method does nothing if the `cache` feature is not enabled.
1140    pub fn with_caching(&mut self, _cache: bool) -> &mut Self {
1141        self
1142    }
1143
1144    #[cfg(any(feature = "cache_request", feature = "chrome_remote_cache"))]
1145    /// Skip browser rendering entirely if cached response exists.
1146    /// When enabled with caching, returns cached HTML directly without launching Chrome.
1147    /// This is useful for performance when you only need the cached content.
1148    pub fn with_cache_skip_browser(&mut self, skip: bool) -> &mut Self {
1149        self.cache_skip_browser = skip;
1150        self
1151    }
1152
1153    #[cfg(not(any(feature = "cache_request", feature = "chrome_remote_cache")))]
1154    /// Skip browser rendering entirely if cached response exists.
1155    /// This method does nothing if the cache features are not enabled.
1156    pub fn with_cache_skip_browser(&mut self, _skip: bool) -> &mut Self {
1157        self
1158    }
1159
1160    #[cfg(feature = "chrome")]
1161    /// Enable or disable Service Workers. This method does nothing if the `chrome` feature is not enabled.
1162    pub fn with_service_worker_enabled(&mut self, enabled: bool) -> &mut Self {
1163        self.service_worker_enabled = enabled;
1164        self
1165    }
1166
1167    #[cfg(not(feature = "chrome"))]
1168    /// Enable or disable Service Workers. This method does nothing if the `chrome` feature is not enabled.
1169    pub fn with_service_worker_enabled(&mut self, _enabled: bool) -> &mut Self {
1170        self
1171    }
1172
1173    /// Automatically setup geo-location configurations when using a proxy. This method does nothing if the `chrome` feature is not enabled.
1174    #[cfg(not(feature = "chrome"))]
1175    pub fn with_auto_geolocation(&mut self, _enabled: bool) -> &mut Self {
1176        self
1177    }
1178
1179    /// Automatically setup geo-location configurations when using a proxy. This method does nothing if the `chrome` feature is not enabled.
1180    #[cfg(feature = "chrome")]
1181    pub fn with_auto_geolocation(&mut self, enabled: bool) -> &mut Self {
1182        self.auto_geolocation = enabled;
1183        self
1184    }
1185
1186    /// Set the retry limit for request. Set the value to 0 for no retries. The default is 0.
1187    pub fn with_retry(&mut self, retry: u8) -> &mut Self {
1188        self.retry = retry;
1189        self
1190    }
1191
1192    /// The default http connect timeout.
1193    pub fn with_default_http_connect_timeout(
1194        &mut self,
1195        default_http_connect_timeout: Option<Duration>,
1196    ) -> &mut Self {
1197        self.default_http_connect_timeout = default_http_connect_timeout;
1198        self
1199    }
1200
1201    /// The default http read timeout.
1202    pub fn with_default_http_read_timeout(
1203        &mut self,
1204        default_http_read_timeout: Option<Duration>,
1205    ) -> &mut Self {
1206        self.default_http_read_timeout = default_http_read_timeout;
1207        self
1208    }
1209
1210    /// Skip setting up a control thread for pause, start, and shutdown programmatic handling. This does nothing without the 'control' flag enabled.
1211    pub fn with_no_control_thread(&mut self, no_control_thread: bool) -> &mut Self {
1212        self.no_control_thread = no_control_thread;
1213        self
1214    }
1215
1216    /// Configures the viewport of the browser, which defaults to 800x600. This method does nothing if the 'chrome' feature is not enabled.
1217    pub fn with_viewport(&mut self, viewport: Option<crate::configuration::Viewport>) -> &mut Self {
1218        self.viewport = viewport.map(|vp| vp);
1219        self
1220    }
1221
1222    #[cfg(feature = "chrome")]
1223    /// Use stealth mode for the request. This does nothing without the `chrome` flag enabled.
1224    pub fn with_stealth(&mut self, stealth_mode: bool) -> &mut Self {
1225        if stealth_mode {
1226            self.stealth_mode = spider_fingerprint::configs::Tier::Basic;
1227        } else {
1228            self.stealth_mode = spider_fingerprint::configs::Tier::None;
1229        }
1230        self
1231    }
1232
1233    #[cfg(feature = "chrome")]
1234    /// Use stealth mode for the request. This does nothing without the `chrome` flag enabled.
1235    pub fn with_stealth_advanced(
1236        &mut self,
1237        stealth_mode: spider_fingerprint::configs::Tier,
1238    ) -> &mut Self {
1239        self.stealth_mode = stealth_mode;
1240        self
1241    }
1242
1243    #[cfg(not(feature = "chrome"))]
1244    /// Use stealth mode for the request. This does nothing without the `chrome` flag enabled.
1245    pub fn with_stealth(&mut self, _stealth_mode: bool) -> &mut Self {
1246        self
1247    }
1248
1249    #[cfg(feature = "chrome")]
1250    /// Wait for network request to be idle within a time frame period (500ms no network connections). This does nothing without the `chrome` flag enabled.
1251    pub fn with_wait_for_idle_network(
1252        &mut self,
1253        wait_for_idle_network: Option<WaitForIdleNetwork>,
1254    ) -> &mut Self {
1255        match self.wait_for.as_mut() {
1256            Some(wait_for) => wait_for.idle_network = wait_for_idle_network,
1257            _ => {
1258                let mut wait_for = WaitFor::default();
1259                wait_for.idle_network = wait_for_idle_network;
1260                self.wait_for = Some(wait_for);
1261            }
1262        }
1263        self
1264    }
1265
1266    #[cfg(feature = "chrome")]
1267    /// Wait for network request with a max timeout. This does nothing without the `chrome` flag enabled.
1268    pub fn with_wait_for_idle_network0(
1269        &mut self,
1270        wait_for_idle_network0: Option<WaitForIdleNetwork>,
1271    ) -> &mut Self {
1272        match self.wait_for.as_mut() {
1273            Some(wait_for) => wait_for.idle_network0 = wait_for_idle_network0,
1274            _ => {
1275                let mut wait_for = WaitFor::default();
1276                wait_for.idle_network0 = wait_for_idle_network0;
1277                self.wait_for = Some(wait_for);
1278            }
1279        }
1280        self
1281    }
1282
1283    #[cfg(feature = "chrome")]
1284    /// Wait for network to be almost idle with a max timeout. This does nothing without the `chrome` flag enabled.
1285    pub fn with_wait_for_almost_idle_network0(
1286        &mut self,
1287        wait_for_almost_idle_network0: Option<WaitForIdleNetwork>,
1288    ) -> &mut Self {
1289        match self.wait_for.as_mut() {
1290            Some(wait_for) => wait_for.almost_idle_network0 = wait_for_almost_idle_network0,
1291            _ => {
1292                let mut wait_for = WaitFor::default();
1293                wait_for.almost_idle_network0 = wait_for_almost_idle_network0;
1294                self.wait_for = Some(wait_for);
1295            }
1296        }
1297        self
1298    }
1299
1300    #[cfg(not(feature = "chrome"))]
1301    /// Wait for network to be almost idle with a max timeout. This does nothing without the `chrome` flag enabled.
1302    pub fn with_wait_for_almost_idle_network0(
1303        &mut self,
1304        _wait_for_almost_idle_network0: Option<WaitForIdleNetwork>,
1305    ) -> &mut Self {
1306        self
1307    }
1308
1309    #[cfg(not(feature = "chrome"))]
1310    /// Wait for network request with a max timeout. This does nothing without the `chrome` flag enabled.
1311    pub fn with_wait_for_idle_network0(
1312        &mut self,
1313        _wait_for_idle_network0: Option<WaitForIdleNetwork>,
1314    ) -> &mut Self {
1315        self
1316    }
1317
1318    #[cfg(not(feature = "chrome"))]
1319    /// Wait for idle network request. This method does nothing if the `chrome` feature is not enabled.
1320    pub fn with_wait_for_idle_network(
1321        &mut self,
1322        _wait_for_idle_network: Option<WaitForIdleNetwork>,
1323    ) -> &mut Self {
1324        self
1325    }
1326
1327    #[cfg(feature = "chrome")]
1328    /// Wait for idle dom mutations for target element. This method does nothing if the [chrome] feature is not enabled.
1329    pub fn with_wait_for_idle_dom(
1330        &mut self,
1331        wait_for_idle_dom: Option<WaitForSelector>,
1332    ) -> &mut Self {
1333        match self.wait_for.as_mut() {
1334            Some(wait_for) => wait_for.dom = wait_for_idle_dom,
1335            _ => {
1336                let mut wait_for = WaitFor::default();
1337                wait_for.dom = wait_for_idle_dom;
1338                self.wait_for = Some(wait_for);
1339            }
1340        }
1341        self
1342    }
1343
1344    #[cfg(not(feature = "chrome"))]
1345    /// Wait for idle dom mutations for target element. This method does nothing if the `chrome` feature is not enabled.
1346    pub fn with_wait_for_idle_dom(
1347        &mut self,
1348        _wait_for_idle_dom: Option<WaitForSelector>,
1349    ) -> &mut Self {
1350        self
1351    }
1352
1353    #[cfg(feature = "chrome")]
1354    /// Wait for a selector. This method does nothing if the `chrome` feature is not enabled.
1355    pub fn with_wait_for_selector(
1356        &mut self,
1357        wait_for_selector: Option<WaitForSelector>,
1358    ) -> &mut Self {
1359        match self.wait_for.as_mut() {
1360            Some(wait_for) => wait_for.selector = wait_for_selector,
1361            _ => {
1362                let mut wait_for = WaitFor::default();
1363                wait_for.selector = wait_for_selector;
1364                self.wait_for = Some(wait_for);
1365            }
1366        }
1367        self
1368    }
1369
1370    #[cfg(not(feature = "chrome"))]
1371    /// Wait for a selector. This method does nothing if the `chrome` feature is not enabled.
1372    pub fn with_wait_for_selector(
1373        &mut self,
1374        _wait_for_selector: Option<WaitForSelector>,
1375    ) -> &mut Self {
1376        self
1377    }
1378
1379    #[cfg(feature = "chrome")]
1380    /// Wait for with delay. Should only be used for testing. This method does nothing if the 'chrome' feature is not enabled.
1381    pub fn with_wait_for_delay(&mut self, wait_for_delay: Option<WaitForDelay>) -> &mut Self {
1382        match self.wait_for.as_mut() {
1383            Some(wait_for) => wait_for.delay = wait_for_delay,
1384            _ => {
1385                let mut wait_for = WaitFor::default();
1386                wait_for.delay = wait_for_delay;
1387                self.wait_for = Some(wait_for);
1388            }
1389        }
1390        self
1391    }
1392
1393    #[cfg(not(feature = "chrome"))]
1394    /// Wait for with delay. Should only be used for testing. This method does nothing if the 'chrome' feature is not enabled.
1395    pub fn with_wait_for_delay(&mut self, _wait_for_delay: Option<WaitForDelay>) -> &mut Self {
1396        self
1397    }
1398
1399    #[cfg(feature = "chrome_intercept")]
1400    /// Use request intercept for the request to only allow content that matches the host. If the content is from a 3rd party it needs to be part of our include list. This method does nothing if the `chrome_intercept` is not enabled.
1401    pub fn with_chrome_intercept(
1402        &mut self,
1403        chrome_intercept: RequestInterceptConfiguration,
1404        url: &Option<Box<url::Url>>,
1405    ) -> &mut Self {
1406        self.chrome_intercept = chrome_intercept;
1407        self.chrome_intercept.setup_intercept_manager(url);
1408        self
1409    }
1410
1411    #[cfg(not(feature = "chrome_intercept"))]
1412    /// Use request intercept for the request to only allow content required for the page that matches the host. If the content is from a 3rd party it needs to be part of our include list. This method does nothing if the `chrome_intercept` is not enabled.
1413    pub fn with_chrome_intercept(
1414        &mut self,
1415        _chrome_intercept: RequestInterceptConfiguration,
1416        _url: &Option<Box<url::Url>>,
1417    ) -> &mut Self {
1418        self
1419    }
1420
1421    #[cfg(feature = "chrome")]
1422    /// Set the connection url for the chrome instance. This method does nothing if the `chrome` is not enabled.
1423    pub fn with_chrome_connection(&mut self, chrome_connection_url: Option<String>) -> &mut Self {
1424        self.chrome_connection_url = chrome_connection_url;
1425        self
1426    }
1427
1428    #[cfg(not(feature = "chrome"))]
1429    /// Set the connection url for the chrome instance. This method does nothing if the `chrome` is not enabled.
1430    pub fn with_chrome_connection(&mut self, _chrome_connection_url: Option<String>) -> &mut Self {
1431        self
1432    }
1433
1434    #[cfg(not(feature = "chrome"))]
1435    /// Set JS to run on certain pages. This method does nothing if the `chrome` is not enabled.
1436    pub fn with_execution_scripts(
1437        &mut self,
1438        _execution_scripts: Option<ExecutionScriptsMap>,
1439    ) -> &mut Self {
1440        self
1441    }
1442
1443    #[cfg(feature = "chrome")]
1444    /// Set JS to run on certain pages. This method does nothing if the `chrome` is not enabled.
1445    pub fn with_execution_scripts(
1446        &mut self,
1447        execution_scripts: Option<ExecutionScriptsMap>,
1448    ) -> &mut Self {
1449        self.execution_scripts =
1450            crate::features::chrome_common::convert_to_trie_execution_scripts(&execution_scripts);
1451        self
1452    }
1453
1454    #[cfg(not(feature = "chrome"))]
1455    /// Run web automated actions on certain pages. This method does nothing if the `chrome` is not enabled.
1456    pub fn with_automation_scripts(
1457        &mut self,
1458        _automation_scripts: Option<AutomationScriptsMap>,
1459    ) -> &mut Self {
1460        self
1461    }
1462
1463    #[cfg(feature = "chrome")]
1464    /// Run web automated actions on certain pages. This method does nothing if the `chrome` is not enabled.
1465    pub fn with_automation_scripts(
1466        &mut self,
1467        automation_scripts: Option<AutomationScriptsMap>,
1468    ) -> &mut Self {
1469        self.automation_scripts =
1470            crate::features::chrome_common::convert_to_trie_automation_scripts(&automation_scripts);
1471        self
1472    }
1473
1474    /// Set a crawl budget per path with levels support /a/b/c or for all paths with "*". This does nothing without the `budget` flag enabled.
1475    pub fn with_budget(&mut self, budget: Option<hashbrown::HashMap<&str, u32>>) -> &mut Self {
1476        self.budget = match budget {
1477            Some(budget) => {
1478                let mut crawl_budget: hashbrown::HashMap<
1479                    case_insensitive_string::CaseInsensitiveString,
1480                    u32,
1481                > = hashbrown::HashMap::new();
1482
1483                for b in budget.into_iter() {
1484                    crawl_budget.insert(
1485                        case_insensitive_string::CaseInsensitiveString::from(b.0),
1486                        b.1,
1487                    );
1488                }
1489
1490                Some(crawl_budget)
1491            }
1492            _ => None,
1493        };
1494        self
1495    }
1496
1497    /// Group external domains to treat the crawl as one. If None is passed this will clear all prior domains.
1498    pub fn with_external_domains<'a, 'b>(
1499        &mut self,
1500        external_domains: Option<impl Iterator<Item = String> + 'a>,
1501    ) -> &mut Self {
1502        match external_domains {
1503            Some(external_domains) => {
1504                self.external_domains_caseless = external_domains
1505                    .into_iter()
1506                    .filter_map(|d| {
1507                        if d == "*" {
1508                            Some("*".into())
1509                        } else {
1510                            let host = get_domain_from_url(&d);
1511
1512                            if !host.is_empty() {
1513                                Some(host.into())
1514                            } else {
1515                                None
1516                            }
1517                        }
1518                    })
1519                    .collect::<hashbrown::HashSet<case_insensitive_string::CaseInsensitiveString>>()
1520                    .into();
1521            }
1522            _ => self.external_domains_caseless = Default::default(),
1523        }
1524
1525        self
1526    }
1527
1528    /// Dangerously accept invalid certificates - this should be used as a last resort.
1529    pub fn with_danger_accept_invalid_certs(&mut self, accept_invalid_certs: bool) -> &mut Self {
1530        self.accept_invalid_certs = accept_invalid_certs;
1531        self
1532    }
1533
1534    /// Normalize the content de-duplicating trailing slash pages and other pages that can be duplicated. This may initially show the link in your links_visited or subscription calls but, the following links will not be crawled.
1535    pub fn with_normalize(&mut self, normalize: bool) -> &mut Self {
1536        self.normalize = normalize;
1537        self
1538    }
1539
1540    #[cfg(not(feature = "disk"))]
1541    /// Store all the links found on the disk to share the state. This does nothing without the `disk` flag enabled.
1542    pub fn with_shared_state(&mut self, _shared: bool) -> &mut Self {
1543        self
1544    }
1545
1546    /// Store all the links found on the disk to share the state. This does nothing without the `disk` flag enabled.
1547    #[cfg(feature = "disk")]
1548    pub fn with_shared_state(&mut self, shared: bool) -> &mut Self {
1549        self.shared = shared;
1550        self
1551    }
1552
1553    #[cfg(not(feature = "chrome"))]
1554    /// Overrides default host system timezone with the specified one. This does nothing without the `chrome` flag enabled.
1555    pub fn with_timezone_id(&mut self, _timezone_id: Option<String>) -> &mut Self {
1556        self
1557    }
1558
1559    #[cfg(feature = "chrome")]
1560    /// Overrides default host system timezone with the specified one. This does nothing without the `chrome` flag enabled.
1561    pub fn with_timezone_id(&mut self, timezone_id: Option<String>) -> &mut Self {
1562        self.timezone_id = timezone_id.map(|timezone_id| timezone_id.into());
1563        self
1564    }
1565
1566    #[cfg(not(feature = "chrome"))]
1567    /// Overrides default host system locale with the specified one. This does nothing without the `chrome` flag enabled.
1568    pub fn with_locale(&mut self, _locale: Option<String>) -> &mut Self {
1569        self
1570    }
1571
1572    #[cfg(feature = "chrome")]
1573    /// Overrides default host system locale with the specified one. This does nothing without the `chrome` flag enabled.
1574    pub fn with_locale(&mut self, locale: Option<String>) -> &mut Self {
1575        self.locale = locale.map(|locale| locale.into());
1576        self
1577    }
1578
    #[cfg(feature = "chrome")]
    /// Configure tracking of the events made via chrome.
    ///
    /// Stores the given tracker configuration in `track_events`; `None` clears it.
    /// Only compiled with the `chrome` feature enabled.
    pub fn with_event_tracker(&mut self, track_events: Option<ChromeEventTracker>) -> &mut Self {
        self.track_events = track_events;
        self
    }
1585
    /// Set the chrome screenshot configuration.
    ///
    /// This variant is compiled when the `chrome` feature is disabled: the argument
    /// is ignored and the configuration is returned unchanged.
    #[cfg(not(feature = "chrome"))]
    pub fn with_screenshot(&mut self, _screenshot_config: Option<ScreenShotConfig>) -> &mut Self {
        self
    }
1591
    /// Set the chrome screenshot configuration.
    ///
    /// Stores the given configuration in `screenshot`; `None` clears it. This
    /// variant is only compiled with the `chrome` feature enabled.
    #[cfg(feature = "chrome")]
    pub fn with_screenshot(&mut self, screenshot_config: Option<ScreenShotConfig>) -> &mut Self {
        self.screenshot = screenshot_config;
        self
    }
1598
    /// Set the maximum number of bytes to collect per page.
    ///
    /// `None` removes the limit. The value is always stored, but it has no effect
    /// unless the `chrome` feature is enabled.
    pub fn with_max_page_bytes(&mut self, max_page_bytes: Option<f64>) -> &mut Self {
        self.max_page_bytes = max_page_bytes;
        self
    }
1604
    /// Set the maximum number of bytes to collect for the browser context.
    ///
    /// `None` removes the limit. The value is always stored, but it has no effect
    /// unless the `chrome` feature is enabled.
    pub fn with_max_bytes_allowed(&mut self, max_bytes_allowed: Option<u64>) -> &mut Self {
        self.max_bytes_allowed = max_bytes_allowed;
        self
    }
1610
    /// Block assets from loading from the network.
    ///
    /// The flag is stored in the `only_html` field.
    pub fn with_block_assets(&mut self, only_html: bool) -> &mut Self {
        self.only_html = only_html;
        self
    }
1616
    /// Choose whether the headers are modified to mimic a real browser.
    pub fn with_modify_headers(&mut self, modify_headers: bool) -> &mut Self {
        self.modify_headers = modify_headers;
        self
    }
1622
    /// Choose whether the HTTP client headers are modified to mimic a real browser.
    pub fn with_modify_http_client_headers(
        &mut self,
        modify_http_client_headers: bool,
    ) -> &mut Self {
        self.modify_http_client_headers = modify_http_client_headers;
        self
    }
1631
    /// Set the cache policy.
    ///
    /// `None` clears any previously configured policy.
    pub fn with_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) -> &mut Self {
        self.cache_policy = cache_policy;
        self
    }
1637
    #[cfg(feature = "webdriver")]
    /// Set the WebDriver configuration.
    ///
    /// The configuration is boxed before being stored; `None` clears it. This
    /// variant is only compiled with the `webdriver` feature enabled.
    pub fn with_webdriver_config(
        &mut self,
        webdriver_config: Option<WebDriverConfig>,
    ) -> &mut Self {
        self.webdriver_config = webdriver_config.map(Box::new);
        self
    }
1647
    #[cfg(not(feature = "webdriver"))]
    /// Set the WebDriver configuration.
    ///
    /// This variant is compiled when the `webdriver` feature is disabled: the
    /// argument is ignored and the configuration is returned unchanged.
    pub fn with_webdriver_config(
        &mut self,
        _webdriver_config: Option<WebDriverConfig>,
    ) -> &mut Self {
        self
    }
1656
1657    /// Get the cache option to use for the run. This does nothing without the 'cache_request' feature.
1658    #[cfg(any(feature = "cache_request", feature = "chrome_remote_cache"))]
1659    pub(crate) fn get_cache_options(&self) -> Option<crate::utils::CacheOptions> {
1660        use crate::utils::CacheOptions;
1661        if !self.cache {
1662            return None;
1663        }
1664        let auth_token = self
1665            .headers
1666            .as_ref()
1667            .and_then(|headers| {
1668                headers
1669                    .0
1670                    .get("authorization")
1671                    .or_else(|| headers.0.get("Authorization"))
1672            })
1673            .map(|s| s.to_owned());
1674
1675        let skip_browser = self.cache_skip_browser;
1676
1677        match auth_token {
1678            Some(token) if !token.is_empty() => {
1679                if let Ok(token_str) = token.to_str() {
1680                    if skip_browser {
1681                        Some(CacheOptions::SkipBrowserAuthorized(token_str.into()))
1682                    } else {
1683                        Some(CacheOptions::Authorized(token_str.into()))
1684                    }
1685                } else if skip_browser {
1686                    Some(CacheOptions::SkipBrowser)
1687                } else {
1688                    Some(CacheOptions::Yes)
1689                }
1690            }
1691            _ => {
1692                if skip_browser {
1693                    Some(CacheOptions::SkipBrowser)
1694                } else {
1695                    Some(CacheOptions::Yes)
1696                }
1697            }
1698        }
1699    }
1700
    /// Get the cache option to use for the run.
    ///
    /// This variant is compiled when `chrome` is enabled but neither `cache_request`
    /// nor `chrome_remote_cache` is, so it always returns `None`.
    #[cfg(all(
        feature = "chrome",
        not(any(feature = "cache_request", feature = "chrome_remote_cache"))
    ))]
    pub(crate) fn get_cache_options(&self) -> Option<crate::utils::CacheOptions> {
        None
    }
1709
    /// Get the cache option to use for the run when the chrome and cache features are disabled.
    ///
    /// This variant is compiled when none of `cache_request`, `chrome_remote_cache`
    /// and `chrome` is enabled, so it always returns `None`.
    #[cfg(not(any(
        feature = "cache_request",
        feature = "chrome_remote_cache",
        feature = "chrome"
    )))]
    // NOTE(review): presumably nothing calls this in this feature set, hence the
    // lint suppression; confirm against the callers.
    #[allow(dead_code)]
    pub(crate) fn get_cache_options(&self) -> Option<crate::utils::CacheOptions> {
        None
    }
1720
1721    /// Build the website configuration when using with_builder.
1722    pub fn build(&self) -> Self {
1723        self.to_owned()
1724    }
1725
    #[cfg(feature = "search")]
    /// Configure web search integration.
    ///
    /// The configuration is boxed before being stored; `None` clears it. This
    /// variant is only compiled with the `search` feature enabled.
    pub fn with_search_config(&mut self, search_config: Option<SearchConfig>) -> &mut Self {
        self.search_config = search_config.map(Box::new);
        self
    }
1732
    #[cfg(not(feature = "search"))]
    /// Configure web search integration.
    ///
    /// This variant is compiled when the `search` feature is disabled: it accepts a
    /// placeholder argument, ignores it and returns the configuration unchanged.
    pub fn with_search_config(&mut self, _search_config: Option<()>) -> &mut Self {
        self
    }
1738
1739    /// Set a [spider.cloud](https://spider.cloud) API key (Proxy mode).
1740    #[cfg(feature = "spider_cloud")]
1741    pub fn with_spider_cloud(&mut self, api_key: &str) -> &mut Self {
1742        self.spider_cloud = Some(Box::new(SpiderCloudConfig::new(api_key)));
1743        self
1744    }
1745
    /// Set a [spider.cloud](https://spider.cloud) API key.
    ///
    /// This variant is compiled when the `spider_cloud` feature is disabled: the key
    /// is ignored and the configuration is returned unchanged.
    #[cfg(not(feature = "spider_cloud"))]
    pub fn with_spider_cloud(&mut self, _api_key: &str) -> &mut Self {
        self
    }
1751
    /// Set a [spider.cloud](https://spider.cloud) config.
    ///
    /// The config is boxed and replaces any previously stored one. This variant is
    /// only compiled with the `spider_cloud` feature enabled.
    #[cfg(feature = "spider_cloud")]
    pub fn with_spider_cloud_config(&mut self, config: SpiderCloudConfig) -> &mut Self {
        self.spider_cloud = Some(Box::new(config));
        self
    }
1758
    /// Set a [spider.cloud](https://spider.cloud) config.
    ///
    /// This variant is compiled when the `spider_cloud` feature is disabled: it
    /// accepts a unit placeholder and returns the configuration unchanged.
    #[cfg(not(feature = "spider_cloud"))]
    pub fn with_spider_cloud_config(&mut self, _config: ()) -> &mut Self {
        self
    }
1764
    /// Set the hedged request (work-stealing) configuration.
    ///
    /// Replaces any previously stored hedge configuration. This variant is only
    /// compiled with the `hedge` feature enabled.
    #[cfg(feature = "hedge")]
    pub fn with_hedge(&mut self, config: crate::utils::hedge::HedgeConfig) -> &mut Self {
        self.hedge = Some(config);
        self
    }
1771
    /// Set the hedged request configuration.
    ///
    /// This variant is compiled when the `hedge` feature is disabled: it accepts a
    /// unit placeholder and returns the configuration unchanged.
    #[cfg(not(feature = "hedge"))]
    pub fn with_hedge(&mut self, _config: ()) -> &mut Self {
        self
    }
1777}
1778
/// Search provider configuration for web search integration.
///
/// Only available with the `search` feature enabled.
#[cfg(feature = "search")]
#[derive(Debug, Clone, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct SearchConfig {
    /// The search provider to use.
    pub provider: SearchProviderType,
    /// API key for the search provider.
    pub api_key: String,
    /// Custom API URL (overrides the default endpoint for the provider).
    pub api_url: Option<String>,
    /// Default search options, if any.
    pub default_options: Option<SearchOptions>,
}
1793
#[cfg(feature = "search")]
impl SearchConfig {
    /// Create a new search configuration for `provider` using `api_key`.
    ///
    /// No custom endpoint and no default search options are set.
    pub fn new(provider: SearchProviderType, api_key: impl Into<String>) -> Self {
        let api_key = api_key.into();

        Self {
            provider,
            api_key,
            api_url: None,
            default_options: None,
        }
    }

    /// Use a custom API endpoint for this provider.
    pub fn with_api_url(self, url: impl Into<String>) -> Self {
        Self {
            api_url: Some(url.into()),
            ..self
        }
    }

    /// Set the default search options.
    pub fn with_default_options(self, options: SearchOptions) -> Self {
        Self {
            default_options: Some(options),
            ..self
        }
    }

    /// Check whether this configuration is valid and search is enabled.
    ///
    /// Returns `true` if a custom API URL is configured or the API key is
    /// non-empty.
    pub fn is_enabled(&self) -> bool {
        self.api_url.is_some() || !self.api_key.is_empty()
    }
}
1825
/// Available search providers.
///
/// The default provider is [`SearchProviderType::Serper`]. Only available with
/// the `search` feature enabled.
#[cfg(feature = "search")]
#[derive(Debug, Clone, Default, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum SearchProviderType {
    /// Serper.dev - Google SERP API (high quality).
    #[default]
    Serper,
    /// Brave Search API (privacy-focused).
    Brave,
    /// Microsoft Bing Web Search API.
    Bing,
    /// Tavily AI Search (optimized for LLMs).
    Tavily,
}
1841
1842// ─── Spider Cloud ───────────────────────────────────────────────────────────
1843
/// Integration mode for [spider.cloud](https://spider.cloud).
///
/// The default mode is [`SpiderCloudMode::Proxy`]. Only available with the
/// `spider_cloud` feature enabled.
#[cfg(feature = "spider_cloud")]
#[derive(Debug, Clone, Default, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum SpiderCloudMode {
    /// Route all HTTP requests through `proxy.spider.cloud`.
    /// This is the simplest mode — the existing fetch pipeline works
    /// unmodified and traffic goes through the proxy transparently.
    #[default]
    Proxy,
    /// Use the spider.cloud `POST /crawl` API (with `limit: 1`) for each page.
    /// Best for simple scraping needs.
    Api,
    /// Use the spider.cloud `POST /unblocker` API for anti-bot bypass.
    /// Best for hard-to-get pages behind advanced bot protection.
    Unblocker,
    /// Direct fetch first; fall back to the spider.cloud API on
    /// 403 / 429 / 503 or connection errors.
    Fallback,
    /// Intelligent mode: proxy by default, automatically falling back to
    /// `/unblocker` when it detects bot protection (403, 429, 503, CAPTCHA
    /// pages, Cloudflare challenges, empty bodies on HTML pages, etc.).
    /// This is the recommended mode for production use.
    Smart,
}
1869
/// Configuration for spider.cloud integration.
///
/// Spider Cloud provides anti-bot bypass, proxy rotation, and high-throughput
/// data collection. Sign up at <https://spider.cloud> to obtain an API key.
///
/// When deserializing with `serde`, `mode`, `api_url`, `proxy_url` and
/// `return_format` fall back to their defaults if they are missing.
#[cfg(feature = "spider_cloud")]
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct SpiderCloudConfig {
    /// API key / secret. Sign up at <https://spider.cloud> to get one.
    pub api_key: String,
    /// Integration mode.
    #[cfg_attr(feature = "serde", serde(default))]
    pub mode: SpiderCloudMode,
    /// API base URL (default: `https://api.spider.cloud`).
    #[cfg_attr(
        feature = "serde",
        serde(default = "SpiderCloudConfig::default_api_url")
    )]
    pub api_url: String,
    /// Proxy URL (default: `https://proxy.spider.cloud`).
    #[cfg_attr(
        feature = "serde",
        serde(default = "SpiderCloudConfig::default_proxy_url")
    )]
    pub proxy_url: String,
    /// Return format for API mode (default: `"raw"` to get the original HTML).
    #[cfg_attr(
        feature = "serde",
        serde(default = "SpiderCloudConfig::default_return_format")
    )]
    pub return_format: String,
    /// Extra params forwarded in API mode (e.g. `stealth`, `fingerprint`, `cache`).
    /// Omitted from serialized output when `None`.
    #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
    pub extra_params: Option<hashbrown::HashMap<String, serde_json::Value>>,
}
1905
#[cfg(feature = "spider_cloud")]
impl Default for SpiderCloudConfig {
    /// Build a config with an empty API key, the default mode, the default
    /// endpoints and return format, and no extra params.
    fn default() -> Self {
        Self {
            api_key: String::new(),
            mode: SpiderCloudMode::default(),
            // These helpers are also used as the serde field defaults on the struct.
            api_url: Self::default_api_url(),
            proxy_url: Self::default_proxy_url(),
            return_format: Self::default_return_format(),
            extra_params: None,
        }
    }
}
1919
#[cfg(feature = "spider_cloud")]
impl SpiderCloudConfig {
    /// Create a new config for `api_key`, with every other field at its default
    /// (Proxy mode).
    pub fn new(api_key: impl Into<String>) -> Self {
        Self {
            api_key: api_key.into(),
            ..Self::default()
        }
    }

    /// Set the integration mode.
    pub fn with_mode(self, mode: SpiderCloudMode) -> Self {
        Self { mode, ..self }
    }

    /// Set a custom API base URL.
    pub fn with_api_url(self, url: impl Into<String>) -> Self {
        Self {
            api_url: url.into(),
            ..self
        }
    }

    /// Set a custom proxy URL.
    pub fn with_proxy_url(self, url: impl Into<String>) -> Self {
        Self {
            proxy_url: url.into(),
            ..self
        }
    }

    /// Set the return format for API mode.
    pub fn with_return_format(self, fmt: impl Into<String>) -> Self {
        Self {
            return_format: fmt.into(),
            ..self
        }
    }

    /// Set extra params for API mode.
    pub fn with_extra_params(
        self,
        params: hashbrown::HashMap<String, serde_json::Value>,
    ) -> Self {
        Self {
            extra_params: Some(params),
            ..self
        }
    }

    /// Determine if a response should trigger a spider.cloud API fallback.
    ///
    /// Only the `Fallback` and `Smart` modes ever fall back; `Proxy`, `Api` and
    /// `Unblocker` always return `false`.
    ///
    /// Status triggers (both modes):
    /// - HTTP 403 (Forbidden) — typically bot protection
    /// - HTTP 429 (Too Many Requests) — rate limiting
    /// - any HTTP status of 500 or above, which includes 503 and the
    ///   Cloudflare 520-530 range
    ///
    /// Content triggers (`Smart` mode only, and only when a body is supplied):
    /// - an empty body
    /// - known CAPTCHA / challenge page markers in the first 4 KiB of the body
    pub fn should_fallback(&self, status_code: u16, body: Option<&[u8]>) -> bool {
        let inspect_body = match self.mode {
            // Already using the API, or proxy-only: never fall back.
            SpiderCloudMode::Api | SpiderCloudMode::Unblocker | SpiderCloudMode::Proxy => {
                return false;
            }
            SpiderCloudMode::Fallback => false,
            SpiderCloudMode::Smart => true,
        };

        if status_code >= 500 || matches!(status_code, 403 | 429) {
            return true;
        }

        inspect_body && body.map_or(false, Self::body_looks_blocked)
    }

    /// Check whether a response body is empty or carries a known bot-protection marker.
    ///
    /// Only the first 4 KiB are inspected, compared case-insensitively.
    fn body_looks_blocked(body: &[u8]) -> bool {
        if body.is_empty() {
            return true;
        }

        let head = &body[..body.len().min(4096)];
        let text = String::from_utf8_lossy(head).to_lowercase();
        let has = |marker: &str| text.contains(marker);

        // Cloudflare challenge
        has("cf-browser-verification")
            || (has("cloudflare") && has("challenge-platform"))
            // Generic CAPTCHA / bot detection markers
            || (has("captcha") && has("challenge"))
            || has("please verify you are a human")
            || (has("access denied") && has("automated"))
            || has("bot detection")
            // Distil Networks / Imperva / Akamai patterns
            || has("distil_r_captcha")
            || has("_imperva")
            || (has("akamai") && has("bot manager"))
    }

    /// Get the fallback API route for this config.
    ///
    /// - `Smart` and `Unblocker` modes → `unblocker` (best for bot-protected pages)
    /// - `Proxy`, `Api` and `Fallback` modes → `crawl` (general purpose)
    pub fn fallback_route(&self) -> &'static str {
        match self.mode {
            SpiderCloudMode::Smart | SpiderCloudMode::Unblocker => "unblocker",
            SpiderCloudMode::Proxy | SpiderCloudMode::Api | SpiderCloudMode::Fallback => "crawl",
        }
    }

    /// Whether this mode uses the proxy transport layer.
    ///
    /// True for every mode except `Api` and `Unblocker`.
    pub fn uses_proxy(&self) -> bool {
        !matches!(
            self.mode,
            SpiderCloudMode::Api | SpiderCloudMode::Unblocker
        )
    }

    /// Default API base URL.
    fn default_api_url() -> String {
        String::from("https://api.spider.cloud")
    }

    /// Default proxy URL.
    fn default_proxy_url() -> String {
        String::from("https://proxy.spider.cloud")
    }

    /// Default return format for API mode.
    fn default_return_format() -> String {
        String::from("raw")
    }
}
2066
/// Unit tests for the configuration types defined in this file.
#[cfg(test)]
mod tests {
    use super::*;

    /// A default `Configuration` has the checked flags off, no delay, and none of
    /// the checked optional settings populated.
    #[test]
    fn test_configuration_defaults() {
        let config = Configuration::default();
        assert!(!config.respect_robots_txt);
        assert!(!config.subdomains);
        assert!(!config.tld);
        assert_eq!(config.delay, 0);
        assert!(config.user_agent.is_none());
        assert!(config.blacklist_url.is_none());
        assert!(config.whitelist_url.is_none());
        assert!(config.proxies.is_none());
        assert!(!config.http2_prior_knowledge);
    }

    /// `RedirectPolicy` defaults to `Loose` and its variants compare as distinct.
    #[test]
    fn test_redirect_policy_variants() {
        assert_eq!(RedirectPolicy::default(), RedirectPolicy::Loose);
        let strict = RedirectPolicy::Strict;
        let none = RedirectPolicy::None;
        assert_ne!(strict, RedirectPolicy::Loose);
        assert_ne!(none, RedirectPolicy::Loose);
        assert_ne!(strict, none);
    }

    /// `ProxyIgnore` defaults to `No` and its variants compare as distinct.
    #[test]
    fn test_proxy_ignore_variants() {
        assert_eq!(ProxyIgnore::default(), ProxyIgnore::No);
        let chrome = ProxyIgnore::Chrome;
        let http = ProxyIgnore::Http;
        assert_ne!(chrome, ProxyIgnore::No);
        assert_ne!(http, ProxyIgnore::No);
        assert_ne!(chrome, http);
    }

    /// A `RequestProxy` built from a struct literal keeps the values it was given.
    #[test]
    fn test_request_proxy_construction() {
        let proxy = RequestProxy {
            addr: "http://proxy.example.com:8080".to_string(),
            ignore: ProxyIgnore::No,
        };
        assert_eq!(proxy.addr, "http://proxy.example.com:8080");
        assert_eq!(proxy.ignore, ProxyIgnore::No);
    }

    /// A default `RequestProxy` has an empty address and ignores nothing.
    #[test]
    fn test_request_proxy_default() {
        let proxy = RequestProxy::default();
        assert!(proxy.addr.is_empty());
        assert_eq!(proxy.ignore, ProxyIgnore::No);
    }

    /// The blacklist field can be assigned directly and keeps every entry.
    #[test]
    fn test_configuration_blacklist_setup() {
        let mut config = Configuration::default();
        config.blacklist_url = Some(vec![
            "https://example.com/private".into(),
            "https://example.com/admin".into(),
        ]);
        assert_eq!(config.blacklist_url.as_ref().unwrap().len(), 2);
    }

    /// The whitelist field can be assigned directly and keeps every entry.
    #[test]
    fn test_configuration_whitelist_setup() {
        let mut config = Configuration::default();
        config.whitelist_url = Some(vec!["https://example.com/public".into()]);
        assert_eq!(config.whitelist_url.as_ref().unwrap().len(), 1);
    }

    /// External domains are matched without regard to case.
    #[test]
    fn test_configuration_external_domains() {
        let mut config = Configuration::default();
        config.external_domains_caseless = Arc::new(
            [
                case_insensitive_string::CaseInsensitiveString::from("Example.Com"),
                case_insensitive_string::CaseInsensitiveString::from("OTHER.org"),
            ]
            .into_iter()
            .collect(),
        );
        assert_eq!(config.external_domains_caseless.len(), 2);
        // Inserted as "Example.Com", looked up in lowercase.
        assert!(config.external_domains_caseless.contains(
            &case_insensitive_string::CaseInsensitiveString::from("example.com")
        ));
    }

    /// A budget map assigned to the configuration can be read back by key.
    #[test]
    fn test_configuration_budget() {
        let mut config = Configuration::default();
        let mut budget = hashbrown::HashMap::new();
        budget.insert(
            case_insensitive_string::CaseInsensitiveString::from("/path"),
            100u32,
        );
        config.budget = Some(budget);
        assert!(config.budget.is_some());
        assert_eq!(
            config.budget.as_ref().unwrap().get(
                &case_insensitive_string::CaseInsensitiveString::from("/path")
            ),
            Some(&100u32)
        );
    }

    /// Without the `regex` feature, a default `AllowListSet` is empty.
    #[cfg(not(feature = "regex"))]
    #[test]
    fn test_allow_list_set_default() {
        let allow_list = AllowListSet::default();
        assert!(allow_list.0.is_empty());
    }

    /// Building the remote multimodal engine keeps the separately configured
    /// vision and text models as well as the vision routing mode.
    #[cfg(feature = "agent")]
    #[test]
    fn test_build_remote_multimodal_engine_preserves_dual_models() {
        use crate::features::automation::{
            ModelEndpoint, RemoteMultimodalConfigs, VisionRouteMode,
        };

        let mut config = Configuration::default();
        let mm = RemoteMultimodalConfigs::new(
            "https://api.example.com/v1/chat/completions",
            "primary-model",
        )
        .with_vision_model(ModelEndpoint::new("vision-model").with_api_key("vision-key"))
        .with_text_model(
            ModelEndpoint::new("text-model")
                .with_api_url("https://text.example.com/v1/chat/completions")
                .with_api_key("text-key"),
        )
        .with_vision_route_mode(VisionRouteMode::TextFirst);
        config.remote_multimodal = Some(Box::new(mm));

        let engine = config
            .build_remote_multimodal_engine()
            .expect("engine should be built");

        assert_eq!(
            engine.vision_model.as_ref().map(|m| m.model_name.as_str()),
            Some("vision-model")
        );
        assert_eq!(
            engine.text_model.as_ref().map(|m| m.model_name.as_str()),
            Some("text-model")
        );
        assert_eq!(engine.vision_route_mode, VisionRouteMode::TextFirst);
    }
}