spider/
configuration.rs

1use crate::compact_str::CompactString;
2use crate::features::chrome_common::RequestInterceptConfiguration;
3pub use crate::features::chrome_common::{
4    AuthChallengeResponse, AuthChallengeResponseResponse, AutomationScripts, AutomationScriptsMap,
5    CaptureScreenshotFormat, CaptureScreenshotParams, ClipViewport, ExecutionScripts,
6    ExecutionScriptsMap, ScreenShotConfig, ScreenshotParams, Viewport, WaitFor, WaitForDelay,
7    WaitForIdleNetwork, WaitForSelector, WebAutomation,
8};
9pub use crate::features::gemini_common::GeminiConfigs;
10pub use crate::features::openai_common::GPTConfigs;
11#[cfg(feature = "search")]
12pub use crate::features::search::{
13    SearchError, SearchOptions, SearchResult, SearchResults, TimeRange,
14};
15pub use crate::features::webdriver_common::{WebDriverBrowser, WebDriverConfig};
16use crate::utils::get_domain_from_url;
17use crate::utils::BasicCachePolicy;
18use crate::website::CronType;
19use reqwest::header::{AsHeaderName, HeaderMap, HeaderName, HeaderValue, IntoHeaderName};
20use std::net::IpAddr;
21use std::sync::Arc;
22use std::time::Duration;
23
24#[cfg(feature = "chrome")]
25pub use spider_fingerprint::Fingerprint;
26
/// Report whether `key` is blank or one of the known placeholder API key strings.
///
/// Surrounding whitespace is ignored and the placeholder comparison is ASCII
/// case-insensitive.
pub fn is_placeholder_api_key(key: &str) -> bool {
    const PLACEHOLDERS: [&str; 4] = ["YOUR_API_KEY", "YOUR-API-KEY", "API_KEY", "API-KEY"];

    let candidate = key.trim();
    if candidate.is_empty() {
        return true;
    }

    PLACEHOLDERS
        .iter()
        .any(|placeholder| candidate.eq_ignore_ascii_case(placeholder))
}
36
/// How redirects are handled for a request.
///
/// With the `serde` feature, each variant additionally accepts its capitalized,
/// lowercase, and uppercase spelling as an alias.
#[derive(Clone, Debug, Default, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum RedirectPolicy {
    /// A loose policy that allows all requests up to the redirect limit (the default).
    #[default]
    #[cfg_attr(feature = "serde", serde(alias = "Loose", alias = "loose", alias = "LOOSE"))]
    Loose,
    /// A strict policy only allowing requests that match the domain set for crawling.
    #[cfg_attr(feature = "serde", serde(alias = "Strict", alias = "strict", alias = "STRICT"))]
    Strict,
    /// Prevent all redirects.
    #[cfg_attr(feature = "serde", serde(alias = "None", alias = "none", alias = "NONE"))]
    None,
}
61
#[cfg(not(feature = "regex"))]
/// Allow list of plain path strings, used when the `regex` feature is disabled.
pub type AllowList = Vec<CompactString>;

#[cfg(feature = "regex")]
/// Allow list backed by a boxed regex set, used when the `regex` feature is enabled.
pub type AllowList = Box<regex::RegexSet>;
69
/// Newtype over [`AllowList`] holding either a whitelist or a blacklist.
///
/// `PartialEq` and `Eq` are only derived when the `regex` feature is disabled.
#[derive(Debug, Default, Clone)]
#[cfg_attr(not(feature = "regex"), derive(PartialEq, Eq))]
pub struct AllowListSet(pub AllowList);
74
#[cfg(feature = "chrome")]
/// Flags selecting which events made via chrome are tracked.
///
/// Only compiled with the `chrome` feature. The derived `Default` leaves every
/// flag `false`.
#[derive(Debug, PartialEq, Eq, Clone, Default)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct ChromeEventTracker {
    /// Track the responses.
    pub responses: bool,
    /// Track the requests.
    pub requests: bool,
    /// Track the changes between web automation.
    pub automation: bool,
}
87
#[cfg(feature = "chrome")]
impl ChromeEventTracker {
    /// Build a tracker from the request and response tracking flags.
    ///
    /// Automation tracking is always switched on by this constructor.
    pub fn new(requests: bool, responses: bool) -> Self {
        Self {
            responses,
            requests,
            automation: true,
        }
    }
}
99
#[cfg(feature = "sitemap")]
#[derive(Debug, Default)]
/// Records which sitemap entries were added to the whitelist.
pub struct SitemapWhitelistChanges {
    /// Whether the default sitemap.xml whitelist entry was added.
    pub added_default: bool,
    /// Whether the custom whitelist path was added.
    pub added_custom: bool,
}
109
#[cfg(feature = "sitemap")]
impl SitemapWhitelistChanges {
    /// Returns `true` when either the default or the custom entry was added.
    pub(crate) fn modified(&self) -> bool {
        let Self {
            added_default,
            added_custom,
        } = self;

        *added_default || *added_custom
    }
}
117
/// Selects which request type, if any, ignores a configured proxy.
#[derive(Clone, Debug, Default, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum ProxyIgnore {
    /// Chrome proxy.
    Chrome,
    /// HTTP proxy.
    Http,
    /// Do not ignore the proxy (the default).
    #[default]
    No,
}
130
/// The networking proxy to use.
///
/// The derived `Default` is an empty address with [`ProxyIgnore::No`].
#[derive(Debug, Default, Clone, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct RequestProxy {
    /// The proxy address.
    pub addr: String,
    /// Which request type, if any, should ignore this proxy.
    pub ignore: ProxyIgnore,
}
140
/// The protocol used to communicate with a backend.
///
/// Only compiled with the `parallel_backends` feature.
#[cfg(feature = "parallel_backends")]
#[derive(Debug, Clone, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum BackendProtocol {
    /// Chrome DevTools Protocol over WebSocket.
    Cdp,
    /// WebDriver (W3C) over HTTP.
    WebDriver,
}
151
/// The engine type for a parallel crawl backend.
///
/// Only compiled with the `parallel_backends` feature. Defaults to [`BackendEngine::Cdp`].
#[cfg(feature = "parallel_backends")]
#[derive(Debug, Default, Clone, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum BackendEngine {
    #[default]
    /// CDP backend — communicates via the Chrome DevTools Protocol.
    Cdp,
    /// Servo — communicates via the WebDriver protocol.
    Servo,
    /// A custom backend. Set `protocol` on [`BackendEndpoint`] to tell
    /// spider whether to use CDP or WebDriver to communicate with it.
    Custom,
}
166
/// A parallel crawl backend endpoint.
///
/// Each backend can run either **remotely** (connect to a running instance via
/// `endpoint`) or **locally** (spider manages the engine process via
/// `binary_path`). Set `endpoint` for remote mode, `binary_path` for local.
///
/// With the `serde` feature, fields missing from the input fall back to their
/// `Default` values (`serde(default)`).
#[cfg(feature = "parallel_backends")]
#[derive(Debug, Default, Clone, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "serde", serde(default))]
pub struct BackendEndpoint {
    /// The browser engine to use.
    pub engine: BackendEngine,
    /// Remote endpoint URL. For CDP backends: a WebSocket URL
    /// (e.g. `"ws://127.0.0.1:9222"`). For Servo: a WebDriver HTTP URL
    /// (e.g. `"http://localhost:4444"`). When set, the engine is assumed to
    /// be already running at this address.
    pub endpoint: Option<String>,
    /// Path to the engine binary for local mode. When set (and `endpoint` is
    /// `None`), spider will spawn and manage the engine process. Uses PATH
    /// lookup if this is an empty string.
    pub binary_path: Option<String>,
    /// Explicit protocol override. When `None`, inferred from `engine`:
    /// `Cdp` → CDP, `Servo` → WebDriver, `Custom` → **required**.
    /// For custom backends, set this to tell spider how to communicate.
    pub protocol: Option<BackendProtocol>,
    /// Per-backend proxy address. When set, this backend routes its outbound
    /// requests through the given proxy (e.g. `"socks5://proxy1:1080"`,
    /// `"http://proxy2:8080"`). Overrides the global `ProxyRotator` for this
    /// backend. For CDP backends, creates an isolated browser context with
    /// the proxy. For WebDriver backends, sets the proxy capability.
    pub proxy: Option<String>,
}
199
/// Configuration for parallel crawl backends.
///
/// When enabled, races alternative browser engines (CDP, Servo) alongside
/// the primary crawl path. The best HTML response wins.
///
/// The per-field defaults quoted below are the values set by this type's
/// `Default` implementation.
#[cfg(feature = "parallel_backends")]
#[derive(Debug, Clone, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "serde", serde(default))]
pub struct ParallelBackendsConfig {
    /// Alternative backends to race against the primary crawl.
    pub backends: Vec<BackendEndpoint>,
    /// Grace period (ms) after first response to wait for better results.
    /// Allows slower backends to finish if they produce higher quality HTML.
    /// Default: 500.
    pub grace_period_ms: u64,
    /// Master switch. Default: `true` (enabled when config is present).
    pub enabled: bool,
    /// Quality score threshold (0–100). If the first response scores at or
    /// above this value, accept it immediately without waiting for the grace
    /// period. Default: 80.
    pub fast_accept_threshold: u16,
    /// Maximum consecutive errors before auto-disabling a backend for
    /// the remainder of the crawl. Default: 10.
    pub max_consecutive_errors: u16,
    /// Timeout (ms) for the initial TCP/WebSocket connection to a backend.
    /// Separate from `request_timeout` so that down backends fail fast
    /// without affecting navigation/fetch timeouts. Default: 5000 (5s).
    pub connect_timeout_ms: u64,
    /// Skip backend racing when the primary response has a binary
    /// `Content-Type` (image/*, audio/*, video/*, font/*, application/pdf,
    /// etc.). There is no HTML quality variance for binary resources.
    /// Default: `true`.
    pub skip_binary_content_types: bool,
    /// Maximum concurrent backend sessions across all URLs. Prevents memory
    /// spikes on large crawls. `0` means unlimited. Default: 8.
    pub max_concurrent_sessions: usize,
    /// Additional URL extensions to skip backend racing for, on top of the
    /// built-in asset list (images, fonts, videos, etc.). Case-insensitive.
    /// Example: `["xml", "rss"]`. Default: empty.
    pub skip_extensions: Vec<CompactString>,
    /// Maximum aggregate HTML bytes held by in-flight backend responses across
    /// all concurrent races. When this cap is reached, new backend fetches are
    /// skipped (primary-only) until existing responses are consumed or dropped.
    /// Works without the `balance` feature. `0` means unlimited.
    /// Default: 256 MiB (268_435_456).
    pub max_backend_bytes_in_flight: usize,
    /// Hard deadline (ms) for an entire backend fetch (connect + navigate +
    /// extract). If a backend exceeds this, the task is cancelled and returns
    /// `None`. Prevents a single stalled backend from blocking the primary
    /// Chrome result during the grace window. `0` means no outer timeout
    /// (individual phase timeouts still apply). Default: 30_000 (30s).
    pub backend_timeout_ms: u64,
}
253
#[cfg(feature = "parallel_backends")]
impl Default for ParallelBackendsConfig {
    /// Racing enabled with no backends configured and the documented limits
    /// and timeouts applied.
    fn default() -> Self {
        // 256 MiB cap on aggregate in-flight backend HTML bytes.
        const MAX_BYTES_IN_FLIGHT: usize = 256 * 1024 * 1024;

        Self {
            enabled: true,
            backends: Vec::new(),
            skip_extensions: Vec::new(),
            skip_binary_content_types: true,
            grace_period_ms: 500,
            connect_timeout_ms: 5000,
            backend_timeout_ms: 30_000,
            fast_accept_threshold: 80,
            max_consecutive_errors: 10,
            max_concurrent_sessions: 8,
            max_backend_bytes_in_flight: MAX_BYTES_IN_FLIGHT,
        }
    }
}
272
/// User-configurable antibot detection patterns. Any match triggers `AntiBotTech::Custom`.
///
/// The derived `Default` leaves all three pattern lists empty.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "serde", serde(default))]
pub struct CustomAntibotPatterns {
    /// Body substring patterns (matched against response bodies < 30KB).
    pub body: Vec<CompactString>,
    /// URL substring patterns.
    pub url: Vec<CompactString>,
    /// Header keys whose presence triggers antibot detection.
    pub header_keys: Vec<CompactString>,
}
285
286/// Structure to configure `Website` crawler
287/// ```rust
288/// use spider::website::Website;
289/// let mut website: Website = Website::new("https://choosealicense.com");
290/// website.configuration.blacklist_url.insert(Default::default()).push("https://choosealicense.com/licenses/".to_string().into());
291/// website.configuration.respect_robots_txt = true;
292/// website.configuration.subdomains = true;
293/// website.configuration.tld = true;
294/// ```
295#[derive(Debug, Default, Clone)]
296#[cfg_attr(
297    all(
298        not(feature = "regex"),
299        not(feature = "openai"),
300        not(feature = "cache_openai"),
301        not(feature = "gemini"),
302        not(feature = "cache_gemini")
303    ),
304    derive(PartialEq)
305)]
306#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
307#[cfg_attr(feature = "serde", serde(default))]
308pub struct Configuration {
309    /// Respect robots.txt file and not scrape not allowed files. This may slow down crawls if robots.txt file has a delay included.
310    pub respect_robots_txt: bool,
311    /// Allow sub-domains.
312    pub subdomains: bool,
313    /// Allow all tlds for domain.
314    pub tld: bool,
315    /// The max timeout for the crawl.
316    pub crawl_timeout: Option<Duration>,
317    /// Preserve the HTTP host header from being included.
318    pub preserve_host_header: bool,
319    /// List of pages to not crawl. [optional: regex pattern matching]
320    pub blacklist_url: Option<Vec<CompactString>>,
321    /// List of pages to only crawl. [optional: regex pattern matching]
322    pub whitelist_url: Option<Vec<CompactString>>,
323    /// User-Agent for request.
324    pub user_agent: Option<Box<CompactString>>,
325    /// Polite crawling delay in milli seconds.
326    pub delay: u64,
327    /// Request max timeout per page. By default the request times out in 15s. Set to None to disable.
328    pub request_timeout: Option<Duration>,
329    /// Use HTTP2 for connection. Enable if you know the website has http2 support.
330    pub http2_prior_knowledge: bool,
331    /// Use proxy list for performing network request.
332    pub proxies: Option<Vec<RequestProxy>>,
333    /// Headers to include with request.
334    pub headers: Option<Box<SerializableHeaderMap>>,
335    #[cfg(feature = "sitemap")]
336    /// Include a sitemap in response of the crawl.
337    pub sitemap_url: Option<Box<CompactString>>,
338    #[cfg(feature = "sitemap")]
339    /// Prevent including the sitemap links with the crawl.
340    pub ignore_sitemap: bool,
341    /// The max redirections allowed for request.
342    pub redirect_limit: usize,
343    /// The redirect policy type to use.
344    pub redirect_policy: RedirectPolicy,
345    #[cfg(feature = "cookies")]
346    /// Cookie string to use for network requests ex: "foo=bar; Domain=blog.spider"
347    pub cookie_str: String,
348    #[cfg(feature = "wreq")]
349    /// The type of request emulation. This does nothing without the flag `sync` enabled.
350    pub emulation: Option<wreq_util::Emulation>,
351    #[cfg(feature = "cron")]
352    /// Cron string to perform crawls - use <https://crontab.guru/> to help generate a valid cron for needs.
353    pub cron_str: String,
354    #[cfg(feature = "cron")]
355    /// The type of cron to run either crawl or scrape.
356    pub cron_type: CronType,
357    /// The max depth to crawl for a website. Defaults to 25 to help prevent infinite recursion.
358    pub depth: usize,
359    /// The depth to crawl pertaining to the root.
360    pub depth_distance: usize,
361    /// Use stealth mode for requests.
362    pub stealth_mode: spider_fingerprint::configs::Tier,
363    /// Configure the viewport for chrome and viewport headers.
364    pub viewport: Option<Viewport>,
365    /// Crawl budget for the paths. This helps prevent crawling extra pages and limiting the amount.
366    pub budget: Option<hashbrown::HashMap<case_insensitive_string::CaseInsensitiveString, u32>>,
367    /// If wild card budgeting is found for the website.
368    pub wild_card_budgeting: bool,
369    /// External domains to include case-insensitive.
370    pub external_domains_caseless:
371        Arc<hashbrown::HashSet<case_insensitive_string::CaseInsensitiveString>>,
372    /// Collect all the resources found on the page.
373    pub full_resources: bool,
374    /// Dangerously accept invalid certficates.
375    pub accept_invalid_certs: bool,
376    /// The auth challenge response. The 'chrome_intercept' flag is also required in order to intercept the response.
377    pub auth_challenge_response: Option<AuthChallengeResponse>,
378    /// The OpenAI configs to use to help drive the chrome browser. This does nothing without the 'openai' flag.
379    pub openai_config: Option<Box<GPTConfigs>>,
380    /// The Gemini configs to use to help drive the chrome browser. This does nothing without the 'gemini' flag.
381    pub gemini_config: Option<Box<GeminiConfigs>>,
382    /// Remote multimodal automation config (vision + LLM-driven steps).
383    /// Requires the `agent` feature for full functionality, uses stub type otherwise.
384    pub remote_multimodal: Option<Box<crate::features::automation::RemoteMultimodalConfigs>>,
385    /// Use a shared queue strategy when crawling. This can scale workloads evenly that do not need priority.
386    pub shared_queue: bool,
387    /// Return the page links in the subscription channels. This does nothing without the flag `sync` enabled.
388    pub return_page_links: bool,
389    /// Retry count to attempt to swap proxies etc.
390    pub retry: u8,
391    /// Custom antibot detection patterns. When set, these are matched in addition
392    /// to the built-in patterns. Any match triggers `AntiBotTech::Custom`.
393    pub custom_antibot: Option<CustomAntibotPatterns>,
394    /// Skip spawning a control thread that can pause, start, and shutdown the crawl.
395    pub no_control_thread: bool,
396    /// The blacklist urls.
397    blacklist: AllowListSet,
398    /// The whitelist urls.
399    whitelist: AllowListSet,
400    /// Crawl budget for the paths. This helps prevent crawling extra pages and limiting the amount.
401    pub(crate) inner_budget:
402        Option<hashbrown::HashMap<case_insensitive_string::CaseInsensitiveString, u32>>,
403    /// Expect only to handle HTML to save on resources. This mainly only blocks the crawling and returning of resources from the server.
404    pub only_html: bool,
405    /// The concurrency limits to apply.
406    pub concurrency_limit: Option<usize>,
407    /// Normalize the html de-deplucating the content.
408    pub normalize: bool,
409    /// Share the state of the crawl requires the 'disk' feature flag.
410    pub shared: bool,
411    /// Modify the headers to act like a real-browser
412    pub modify_headers: bool,
413    /// Modify the HTTP client headers only to act like a real-browser
414    pub modify_http_client_headers: bool,
415    /// Cache the page following HTTP caching rules.
416    #[cfg(any(
417        feature = "cache_request",
418        feature = "chrome",
419        feature = "chrome_remote_cache"
420    ))]
421    pub cache: bool,
422    /// Skip browser rendering entirely if cached response exists.
423    /// When enabled, returns cached HTML directly without launching Chrome.
424    #[cfg(any(
425        feature = "cache_request",
426        feature = "chrome",
427        feature = "chrome_remote_cache"
428    ))]
429    pub cache_skip_browser: bool,
430    /// Namespace mixed into every cache key so logically distinct variants
431    /// (country, proxy pool, tenant, A/B bucket, device profile, …) never
432    /// collide on the same cached bytes. Free-form — spider treats it as an
433    /// opaque partition string. `None` uses the default (empty) namespace.
434    /// Always present (zero cost when unset); its effect is gated by whichever
435    /// cache feature is active.
436    pub cache_namespace: Option<Box<String>>,
437    #[cfg(feature = "chrome")]
438    /// Enable or disable service workers. Enabled by default.
439    pub service_worker_enabled: bool,
440    #[cfg(feature = "chrome")]
441    /// Overrides default host system timezone with the specified one.
442    #[cfg(feature = "chrome")]
443    pub timezone_id: Option<Box<String>>,
444    /// Overrides default host system locale with the specified one.
445    #[cfg(feature = "chrome")]
446    pub locale: Option<Box<String>>,
447    /// Set a custom script to eval on each new document.
448    #[cfg(feature = "chrome")]
449    pub evaluate_on_new_document: Option<Box<String>>,
450    #[cfg(feature = "chrome")]
451    /// Dismiss dialogs.
452    pub dismiss_dialogs: Option<bool>,
453    #[cfg(feature = "chrome")]
454    /// Wait for options for the page.
455    pub wait_for: Option<WaitFor>,
456    #[cfg(feature = "chrome")]
457    /// Take a screenshot of the page.
458    pub screenshot: Option<ScreenShotConfig>,
459    #[cfg(feature = "chrome")]
460    /// Track the events made via chrome.
461    pub track_events: Option<ChromeEventTracker>,
462    #[cfg(feature = "chrome")]
463    /// Setup fingerprint ID on each document. This does nothing without the flag `chrome` enabled.
464    pub fingerprint: Fingerprint,
465    #[cfg(feature = "chrome")]
466    /// The chrome connection url. Useful for targeting different headless instances. Defaults to using the env CHROME_URL.
467    pub chrome_connection_url: Option<String>,
468    #[cfg(feature = "chrome")]
469    /// Multiple remote Chrome connection URLs for failover. When a connection
470    /// fails after retries, the next URL is tried automatically. Requires the
471    /// `chrome` feature. When set, takes priority over `chrome_connection_url`.
472    pub chrome_connection_urls: Option<Vec<String>>,
473    /// Scripts to execute for individual pages, the full path of the url is required for an exact match. This is useful for running one off JS on pages like performing custom login actions.
474    #[cfg(feature = "chrome")]
475    pub execution_scripts: Option<ExecutionScripts>,
476    /// Web automation scripts to run up to a duration of 60 seconds.
477    #[cfg(feature = "chrome")]
478    pub automation_scripts: Option<AutomationScripts>,
479    /// Setup network interception for request. This does nothing without the flag `chrome_intercept` enabled.
480    #[cfg(feature = "chrome")]
481    pub chrome_intercept: RequestInterceptConfiguration,
482    /// The referer to use.
483    pub referer: Option<String>,
484    /// Determine the max bytes per page.
485    pub max_page_bytes: Option<f64>,
486    /// Determine the max bytes per browser context.
487    pub max_bytes_allowed: Option<u64>,
488    #[cfg(feature = "chrome")]
489    /// Disables log domain, prevents further log entries from being reported to the client. This does nothing without the flag `chrome` enabled.
490    pub disable_log: bool,
491    #[cfg(feature = "chrome")]
492    /// Automatic locale and timezone handling via third party. This does nothing without the flag `chrome` enabled.
493    pub auto_geolocation: bool,
494    /// The cache policy to use.
495    pub cache_policy: Option<BasicCachePolicy>,
496    #[cfg(feature = "chrome")]
497    /// Enables bypassing CSP. This does nothing without the flag `chrome` enabled.
498    pub bypass_csp: bool,
499    #[cfg(feature = "chrome")]
500    /// Disables JavaScript execution on the page. This does nothing without the flag `chrome` enabled.
501    pub disable_javascript: bool,
502    /// Bind the connections only on the network interface.
503    pub network_interface: Option<String>,
504    /// Bind to a local IP Address.
505    pub local_address: Option<IpAddr>,
506    /// The default http connect timeout
507    pub default_http_connect_timeout: Option<Duration>,
508    /// The default http read timeout
509    pub default_http_read_timeout: Option<Duration>,
510    #[cfg(feature = "webdriver")]
511    /// WebDriver configuration for browser automation. This does nothing without the `webdriver` flag enabled.
512    pub webdriver_config: Option<Box<WebDriverConfig>>,
513    #[cfg(feature = "search")]
514    /// Search provider configuration for web search integration. This does nothing without the `search` flag enabled.
515    pub search_config: Option<Box<SearchConfig>>,
516    #[cfg(feature = "spider_cloud")]
517    /// Spider Cloud config. See <https://spider.cloud>.
518    pub spider_cloud: Option<Box<SpiderCloudConfig>>,
519    #[cfg(all(feature = "spider_cloud", feature = "chrome"))]
520    /// Spider Browser Cloud config for remote CDP via `wss://browser.spider.cloud`.
521    pub spider_browser: Option<Box<SpiderBrowserConfig>>,
522    #[cfg(feature = "hedge")]
523    /// Hedged request configuration for work-stealing on slow requests.
524    /// When enabled, fires a duplicate request on a different proxy after a delay.
525    pub hedge: Option<crate::utils::hedge::HedgeConfig>,
526    #[cfg(feature = "auto_throttle")]
527    /// Latency-based auto-throttle configuration. When enabled, dynamically
528    /// adjusts per-domain crawl delay based on measured server response time.
529    pub auto_throttle: Option<crate::utils::auto_throttle::AutoThrottleConfig>,
530    #[cfg(feature = "etag_cache")]
531    /// Enable ETag / conditional request caching. When true, stores ETag and
532    /// Last-Modified headers from responses and sends If-None-Match /
533    /// If-Modified-Since on subsequent requests to the same URL, allowing
534    /// servers to respond with lightweight 304 Not Modified.
535    pub etag_cache: bool,
536    #[cfg(feature = "warc")]
537    /// WARC output configuration. When set, the crawl writes a WARC 1.1 file
538    /// containing all fetched pages as `response` records.
539    pub warc: Option<crate::utils::warc::WarcConfig>,
540    #[cfg(feature = "parallel_backends")]
541    /// Parallel crawl backend configuration. Race CDP / Servo backends alongside
542    /// the primary crawl path. Requires the `parallel_backends` feature.
543    pub parallel_backends: Option<ParallelBackendsConfig>,
544}
545
#[derive(Default, Debug, Clone, PartialEq, Eq)]
/// Newtype around [`HeaderMap`] that this module gives manual `serde`
/// implementations when the `serde` feature is enabled.
pub struct SerializableHeaderMap(pub HeaderMap);
549
550impl SerializableHeaderMap {
551    /// Innter HeaderMap.
552    pub fn inner(&self) -> &HeaderMap {
553        &self.0
554    }
555    /// Returns true if the map contains a value for the specified key.
556    pub fn contains_key<K>(&self, key: K) -> bool
557    where
558        K: AsHeaderName,
559    {
560        self.0.contains_key(key)
561    }
562    /// Inserts a key-value pair into the map.
563    pub fn insert<K>(
564        &mut self,
565        key: K,
566        val: reqwest::header::HeaderValue,
567    ) -> Option<reqwest::header::HeaderValue>
568    where
569        K: IntoHeaderName,
570    {
571        self.0.insert(key, val)
572    }
573    /// Extend a `HeaderMap` with the contents of another `HeaderMap`.
574    pub fn extend<I>(&mut self, iter: I)
575    where
576        I: IntoIterator<Item = (Option<HeaderName>, HeaderValue)>,
577    {
578        self.0.extend(iter);
579    }
580}
581
582/// Get a cloned copy of the `Referer` header as a `String` (if it exists and is valid UTF-8).
583pub fn get_referer(header_map: &Option<Box<SerializableHeaderMap>>) -> Option<String> {
584    match header_map {
585        Some(header_map) => {
586            header_map
587                .0
588                .get(crate::client::header::REFERER) // Retrieves the "Referer" HeaderValue if it exists
589                .and_then(|value| value.to_str().ok()) // &str from HeaderValue
590                .map(String::from) // Convert &str to String (owned)
591        }
592        _ => None,
593    }
594}
595
596impl From<HeaderMap> for SerializableHeaderMap {
597    fn from(header_map: HeaderMap) -> Self {
598        SerializableHeaderMap(header_map)
599    }
600}
601
#[cfg(feature = "serde")]
impl serde::Serialize for SerializableHeaderMap {
    /// Serialize the headers as a string-to-string map ordered by header name.
    ///
    /// Each name keeps a single value in the output, and values for which
    /// `to_str` fails are written as empty strings.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        let mut flattened = std::collections::BTreeMap::<String, String>::new();
        for (name, value) in self.0.iter() {
            let text = value.to_str().unwrap_or("");
            flattened.insert(name.to_string(), text.to_string());
        }

        serde::Serialize::serialize(&flattened, serializer)
    }
}
616
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for SerializableHeaderMap {
    /// Deserialize a string-to-string map into a header map.
    ///
    /// Fails with a custom serde error when a key is not a valid header name
    /// or a value is not a valid header value.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        use reqwest::header::{HeaderName, HeaderValue};
        use std::collections::BTreeMap;

        let raw = <BTreeMap<String, String> as serde::Deserialize>::deserialize(deserializer)?;
        let mut parsed = HeaderMap::with_capacity(raw.len());

        for (name, text) in &raw {
            let header_name =
                HeaderName::from_bytes(name.as_bytes()).map_err(serde::de::Error::custom)?;
            let header_value = HeaderValue::from_str(text).map_err(serde::de::Error::custom)?;
            parsed.insert(header_name, header_value);
        }

        Ok(SerializableHeaderMap(parsed))
    }
}
635
#[cfg(feature = "serde")]
impl serde::Serialize for AllowListSet {
    /// Serialize the list as a sequence of strings.
    ///
    /// Without the `regex` feature the stored list is serialized directly; with
    /// it, the regex set's source patterns are written out.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        #[cfg(not(feature = "regex"))]
        {
            serde::Serialize::serialize(&self.0, serializer)
        }

        #[cfg(feature = "regex")]
        {
            serializer.collect_seq(self.0.patterns())
        }
    }
}
657
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for AllowListSet {
    /// Deserialize a sequence of strings into an allow list.
    ///
    /// With the `regex` feature the strings are compiled into a `RegexSet`, and
    /// a compile failure is reported as a custom serde error.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        #[cfg(not(feature = "regex"))]
        {
            <Vec<CompactString> as serde::Deserialize>::deserialize(deserializer).map(AllowListSet)
        }

        #[cfg(feature = "regex")]
        {
            let sources = <Vec<String> as serde::Deserialize>::deserialize(deserializer)?;
            match regex::RegexSet::new(&sources) {
                Ok(compiled) => Ok(AllowListSet(Box::new(compiled))),
                Err(err) => Err(serde::de::Error::custom(err)),
            }
        }
    }
}
678
/// Pick a spoofed user agent from `ua_generator`.
///
/// A chrome user agent is returned when `chrome` is `true`, otherwise a general one.
#[cfg(feature = "ua_generator")]
pub fn get_ua(chrome: bool) -> &'static str {
    if !chrome {
        return ua_generator::ua::spoof_ua();
    }

    ua_generator::ua::spoof_chrome_ua()
}
688
689/// Get the user agent via cargo package + version.
690#[cfg(not(feature = "ua_generator"))]
691pub fn get_ua(_chrome: bool) -> &'static str {
692    use std::env;
693
694    lazy_static! {
695        static ref AGENT: &'static str =
696            concat!(env!("CARGO_PKG_NAME"), '/', env!("CARGO_PKG_VERSION"));
697    };
698
699    AGENT.as_ref()
700}
701
702impl Configuration {
703    /// Represents crawl configuration for a website.
704    #[cfg(not(feature = "chrome"))]
705    pub fn new() -> Self {
706        Self {
707            delay: 0,
708            depth: 25,
709            redirect_limit: 7,
710            request_timeout: Some(Duration::from_secs(120)),
711            only_html: true,
712            modify_headers: true,
713            ..Default::default()
714        }
715    }
716
/// Build the baseline crawl configuration used when the `chrome` feature is enabled.
///
/// On top of `Default::default()` this sets no delay, a depth of 25, a redirect limit of 7,
/// a 120 second request timeout, a user agent from `get_ua(true)`, `Fingerprint::Basic`,
/// and a `chrome_intercept` built from the compile-time `chrome_intercept` feature flag.
/// `only_html`, `cache`, `modify_headers` and `service_worker_enabled` are turned on while
/// `auto_geolocation` stays off.
#[cfg(feature = "chrome")]
pub fn new() -> Self {
    // `cfg!` resolves at compile time to whether the `chrome_intercept` feature is active.
    let chrome_intercept = RequestInterceptConfiguration::new(cfg!(feature = "chrome_intercept"));

    Self {
        chrome_intercept,
        user_agent: Some(Box::new(get_ua(true).into())),
        request_timeout: Some(Duration::from_secs(120)),
        redirect_limit: 7,
        depth: 25,
        delay: 0,
        fingerprint: Fingerprint::Basic,
        only_html: true,
        cache: true,
        modify_headers: true,
        service_worker_enabled: true,
        auto_geolocation: false,
        ..Default::default()
    }
}
738
/// Assemble a `RemoteMultimodalEngine` from the stored `remote_multimodal` configs.
/// Requires the `agent` feature.
///
/// Returns `None` when no remote multimodal configuration is set. A semaphore is attached
/// only when `concurrency_limit` is greater than zero, and a model router is installed only
/// when the model pool holds at least three entries.
#[cfg(feature = "agent")]
pub fn build_remote_multimodal_engine(
    &self,
) -> Option<crate::features::automation::RemoteMultimodalEngine> {
    use crate::features::automation::{auto_policy, ModelRouter, RemoteMultimodalEngine};

    let configs = self.remote_multimodal.as_ref()?;

    // A missing or zero limit leaves the engine without a semaphore.
    let semaphore = match configs.concurrency_limit {
        Some(permits) if permits > 0 => {
            Some(std::sync::Arc::new(tokio::sync::Semaphore::new(permits)))
        }
        _ => None,
    };

    // `mut` is only exercised by the feature-gated skill registry below.
    #[allow(unused_mut)]
    let mut engine = RemoteMultimodalEngine::new(
        configs.api_url.clone(),
        configs.model_name.clone(),
        configs.system_prompt.clone(),
    )
    .with_api_key(configs.api_key.as_deref())
    .with_system_prompt_extra(configs.system_prompt_extra.as_deref())
    .with_user_message_extra(configs.user_message_extra.as_deref())
    .with_remote_multimodal_config(configs.cfg.clone())
    .with_prompt_url_gate(configs.prompt_url_gate.clone())
    .with_vision_model(configs.vision_model.clone())
    .with_text_model(configs.text_model.clone())
    .with_vision_route_mode(configs.vision_route_mode)
    .with_chrome_ai(configs.use_chrome_ai)
    .with_semaphore(semaphore)
    .to_owned();

    #[cfg(feature = "agent_skills")]
    if let Some(ref registry) = configs.skill_registry {
        engine.with_skill_registry(Some(registry.clone()));
    }

    // Build per-round complexity router from model pool (3+ models required)
    if configs.model_pool.len() >= 3 {
        let names: Vec<&str> = configs
            .model_pool
            .iter()
            .map(|endpoint| endpoint.model_name.as_str())
            .collect();
        engine.model_router = Some(ModelRouter::with_policy(auto_policy(&names)));
    }
    engine.model_pool = configs.model_pool.clone();

    Some(engine)
}
788
789    /// Determine if the agent should be set to a Chrome Agent.
790    #[cfg(not(feature = "chrome"))]
791    pub(crate) fn only_chrome_agent(&self) -> bool {
792        false
793    }
794
/// Report whether the agent has to be a Chrome agent.
///
/// True when any of these holds: a Chrome connection URL is set, a `wait_for` is configured,
/// request interception is enabled, `stealth_mode.stealth()` is true, or
/// `fingerprint.valid()` is true.
#[cfg(feature = "chrome")]
pub(crate) fn only_chrome_agent(&self) -> bool {
    let remote_or_waiting = self.chrome_connection_url.is_some() || self.wait_for.is_some();

    remote_or_waiting
        || self.chrome_intercept.enabled
        || self.stealth_mode.stealth()
        || self.fingerprint.valid()
}
804
#[cfg(feature = "regex")]
/// Compile `blacklist_url` into a regex set.
///
/// Falls back to the default set when no blacklist is configured or when the patterns fail
/// to compile.
pub fn get_blacklist(&self) -> Box<regex::RegexSet> {
    self.blacklist_url
        .as_ref()
        .and_then(|patterns| regex::RegexSet::new(&**patterns).ok())
        .map(Box::new)
        .unwrap_or_default()
}
816
817    #[cfg(not(feature = "regex"))]
818    /// Handle the blacklist options.
819    pub fn get_blacklist(&self) -> AllowList {
820        match &self.blacklist_url {
821            Some(blacklist) => blacklist.to_owned(),
822            _ => Default::default(),
823        }
824    }
825
826    /// Set the blacklist
827    pub(crate) fn set_blacklist(&mut self) {
828        self.blacklist = AllowListSet(self.get_blacklist());
829    }
830
831    /// Set the whitelist
832    pub fn set_whitelist(&mut self) {
833        self.whitelist = AllowListSet(self.get_whitelist());
834    }
835
836    /// Configure the allow list.
837    pub fn configure_allowlist(&mut self) {
838        self.set_whitelist();
839        self.set_blacklist();
840    }
841
842    /// Get the blacklist compiled.
843    pub fn get_blacklist_compiled(&self) -> &AllowList {
844        &self.blacklist.0
845    }
846
847    /// Setup the budget for crawling.
848    pub fn configure_budget(&mut self) {
849        self.inner_budget.clone_from(&self.budget);
850    }
851
852    /// Get the whitelist compiled.
853    pub fn get_whitelist_compiled(&self) -> &AllowList {
854        &self.whitelist.0
855    }
856
#[cfg(feature = "regex")]
/// Compile `whitelist_url` into a regex set.
///
/// Falls back to the default set when no whitelist is configured or when the patterns fail
/// to compile.
pub fn get_whitelist(&self) -> Box<regex::RegexSet> {
    self.whitelist_url
        .as_ref()
        .and_then(|patterns| regex::RegexSet::new(&**patterns).ok())
        .map(Box::new)
        .unwrap_or_default()
}
868
869    #[cfg(not(feature = "regex"))]
870    /// Handle the whitelist options.
871    pub fn get_whitelist(&self) -> AllowList {
872        match &self.whitelist_url {
873            Some(whitelist) => whitelist.to_owned(),
874            _ => Default::default(),
875        }
876    }
877
#[cfg(feature = "sitemap")]
/// Add sitemap paths to the whitelist and track what was added.
///
/// Nothing is added when the whitelist is unset or empty. Otherwise `sitemap.xml` and the
/// custom `sitemap_url` (when set) are pushed unless already present. The returned
/// `SitemapWhitelistChanges` flags each push so `remove_sitemap_from_whitelist` can undo it.
pub fn add_sitemap_to_whitelist(&mut self) -> SitemapWhitelistChanges {
    let mut changes = SitemapWhitelistChanges::default();

    // An unset or empty whitelist is left untouched, whatever `ignore_sitemap` says.
    let list = match self.whitelist_url.as_mut() {
        Some(list) if !list.is_empty() => list,
        _ => return changes,
    };

    let default_path = CompactString::from("sitemap.xml");

    if !list.contains(&default_path) {
        list.push(default_path);
        changes.added_default = true;
    }

    if let Some(custom) = &self.sitemap_url {
        if !list.contains(custom) {
            // Clone the inner string directly instead of cloning the box and unboxing it.
            list.push((**custom).clone());
            changes.added_custom = true;
        }
    }

    changes
}
909
#[cfg(feature = "sitemap")]
/// Undo the whitelist additions recorded by `add_sitemap_to_whitelist`.
///
/// Removes the first `sitemap.xml` entry when `changes.added_default` is set and the first
/// entry equal to `sitemap_url` when `changes.added_custom` is set. A whitelist that ends up
/// empty is reset to `None`.
pub fn remove_sitemap_from_whitelist(&mut self, changes: SitemapWhitelistChanges) {
    let list = match self.whitelist_url.as_mut() {
        Some(list) => list,
        None => return,
    };

    if changes.added_default {
        let default_path = CompactString::from("sitemap.xml");
        if let Some(index) = list.iter().position(|entry| *entry == default_path) {
            list.remove(index);
        }
    }

    if changes.added_custom {
        if let Some(custom) = &self.sitemap_url {
            if let Some(index) = list.iter().position(|entry| *entry == **custom) {
                list.remove(index);
            }
        }
    }

    if list.is_empty() {
        self.whitelist_url = None;
    }
}
932
933    /// Respect robots.txt file.
934    pub fn with_respect_robots_txt(&mut self, respect_robots_txt: bool) -> &mut Self {
935        self.respect_robots_txt = respect_robots_txt;
936        self
937    }
938
939    /// Include subdomains detection.
940    pub fn with_subdomains(&mut self, subdomains: bool) -> &mut Self {
941        self.subdomains = subdomains;
942        self
943    }
944
/// Toggle the CSP bypass by storing `enabled` in `bypass_csp`.
/// Without the `chrome` feature a no-op variant of this method is compiled instead.
#[cfg(feature = "chrome")]
pub fn with_csp_bypass(&mut self, enabled: bool) -> &mut Self {
    self.bypass_csp = enabled;
    self
}
951
952    /// Bypass CSP protection detection. This does nothing without the feat flag `chrome` enabled.
953    #[cfg(not(feature = "chrome"))]
954    pub fn with_csp_bypass(&mut self, _enabled: bool) -> &mut Self {
955        self
956    }
957
/// Disable JavaScript execution on the page by storing `disabled` in `disable_javascript`.
/// Without the `chrome` feature a no-op variant of this method is compiled instead.
#[cfg(feature = "chrome")]
pub fn with_disable_javascript(&mut self, disabled: bool) -> &mut Self {
    self.disable_javascript = disabled;
    self
}
964
965    /// Disable JavaScript execution on the page. This does nothing without the feat flag `chrome` enabled.
966    #[cfg(not(feature = "chrome"))]
967    pub fn with_disable_javascript(&mut self, _disabled: bool) -> &mut Self {
968        self
969    }
970
971    /// Bind the connections only on the network interface.
972    pub fn with_network_interface(&mut self, network_interface: Option<String>) -> &mut Self {
973        self.network_interface = network_interface;
974        self
975    }
976
977    /// Bind to a local IP Address.
978    pub fn with_local_address(&mut self, local_address: Option<IpAddr>) -> &mut Self {
979        self.local_address = local_address;
980        self
981    }
982
983    /// Include tld detection.
984    pub fn with_tld(&mut self, tld: bool) -> &mut Self {
985        self.tld = tld;
986        self
987    }
988
989    /// The max duration for the crawl. This is useful when websites use a robots.txt with long durations and throttle the timeout removing the full concurrency.
990    pub fn with_crawl_timeout(&mut self, crawl_timeout: Option<Duration>) -> &mut Self {
991        self.crawl_timeout = crawl_timeout;
992        self
993    }
994
995    /// Delay between request as ms.
996    pub fn with_delay(&mut self, delay: u64) -> &mut Self {
997        self.delay = delay;
998        self
999    }
1000
1001    /// Only use HTTP/2.
1002    pub fn with_http2_prior_knowledge(&mut self, http2_prior_knowledge: bool) -> &mut Self {
1003        self.http2_prior_knowledge = http2_prior_knowledge;
1004        self
1005    }
1006
1007    /// Max time to wait for request. By default request times out in 15s. Set to None to disable.
1008    pub fn with_request_timeout(&mut self, request_timeout: Option<Duration>) -> &mut Self {
1009        match request_timeout {
1010            Some(timeout) => self.request_timeout = Some(timeout),
1011            _ => self.request_timeout = None,
1012        };
1013
1014        self
1015    }
1016
#[cfg(feature = "sitemap")]
/// Set the sitemap url. Without the `sitemap` feature a no-op variant of this method is
/// compiled instead.
///
/// `Some(url)` is stored as a `CompactString` in `sitemap_url`; `None` clears it.
pub fn with_sitemap(&mut self, sitemap_url: Option<&str>) -> &mut Self {
    // `CompactString::new` accepts the `&str` directly, so no intermediate `String` is needed.
    self.sitemap_url = sitemap_url.map(|url| CompactString::new(url).into());
    self
}
1028
1029    #[cfg(not(feature = "sitemap"))]
1030    /// Set the sitemap url. This does nothing without the `sitemap` feature flag.
1031    pub fn with_sitemap(&mut self, _sitemap_url: Option<&str>) -> &mut Self {
1032        self
1033    }
1034
#[cfg(feature = "sitemap")]
/// Ignore the sitemap when crawling by storing the flag in `ignore_sitemap`.
/// Without the `sitemap` feature a no-op variant of this method is compiled instead.
pub fn with_ignore_sitemap(&mut self, ignore_sitemap: bool) -> &mut Self {
    self.ignore_sitemap = ignore_sitemap;
    self
}
1041
1042    #[cfg(not(feature = "sitemap"))]
1043    /// Ignore the sitemap when crawling. This method does nothing if the `sitemap` is not enabled.
1044    pub fn with_ignore_sitemap(&mut self, _ignore_sitemap: bool) -> &mut Self {
1045        self
1046    }
1047
1048    /// Add user agent to request.
1049    pub fn with_user_agent(&mut self, user_agent: Option<&str>) -> &mut Self {
1050        match user_agent {
1051            Some(agent) => self.user_agent = Some(CompactString::new(agent).into()),
1052            _ => self.user_agent = None,
1053        };
1054        self
1055    }
1056
1057    /// Preserve the HOST header.
1058    pub fn with_preserve_host_header(&mut self, preserve: bool) -> &mut Self {
1059        self.preserve_host_header = preserve;
1060        self
1061    }
1062
/// Use a remote multimodal model to drive browser automation.
/// Requires the `agent` feature.
///
/// The configs are boxed and stored in `remote_multimodal`; `None` clears them.
#[cfg(feature = "agent")]
pub fn with_remote_multimodal(
    &mut self,
    remote_multimodal: Option<crate::features::automation::RemoteMultimodalConfigs>,
) -> &mut Self {
    let boxed = remote_multimodal.map(Box::new);
    self.remote_multimodal = boxed;
    self
}
1073
1074    /// Use a remote multimodal model to drive browser automation.
1075    /// When the `agent` feature is not enabled, this uses a stub type.
1076    #[cfg(not(feature = "agent"))]
1077    pub fn with_remote_multimodal(
1078        &mut self,
1079        remote_multimodal: Option<crate::features::automation::RemoteMultimodalConfigs>,
1080    ) -> &mut Self {
1081        self.remote_multimodal = remote_multimodal.map(Box::new);
1082        self
1083    }
1084
1085    #[cfg(not(feature = "openai"))]
1086    /// The OpenAI configs to use to drive the browser. This method does nothing if the `openai` is not enabled.
1087    pub fn with_openai(&mut self, _openai_config: Option<GPTConfigs>) -> &mut Self {
1088        self
1089    }
1090
/// The OpenAI configs to use to drive the browser.
/// The configs are boxed and stored in `openai_config`; `None` clears them.
/// Without the `openai` feature a no-op variant of this method is compiled instead.
#[cfg(feature = "openai")]
pub fn with_openai(&mut self, openai_config: Option<GPTConfigs>) -> &mut Self {
    self.openai_config = openai_config.map(Box::new);
    self
}
1100
1101    #[cfg(not(feature = "gemini"))]
1102    /// The Gemini configs to use to drive the browser. This method does nothing if the `gemini` is not enabled.
1103    pub fn with_gemini(&mut self, _gemini_config: Option<GeminiConfigs>) -> &mut Self {
1104        self
1105    }
1106
/// The Gemini configs to use to drive the browser.
/// The configs are boxed and stored in `gemini_config`; `None` clears them.
/// Without the `gemini` feature a no-op variant of this method is compiled instead.
#[cfg(feature = "gemini")]
pub fn with_gemini(&mut self, gemini_config: Option<GeminiConfigs>) -> &mut Self {
    self.gemini_config = gemini_config.map(Box::new);
    self
}
1116
#[cfg(feature = "cookies")]
/// Set the cookie string to use in requests by converting it into `cookie_str`.
/// Without the `cookies` feature a no-op variant of this method is compiled instead.
pub fn with_cookies(&mut self, cookie_str: &str) -> &mut Self {
    self.cookie_str = cookie_str.into();
    self
}
1123
1124    #[cfg(not(feature = "cookies"))]
1125    /// Cookie string to use in request. This does nothing without the `cookies` flag enabled.
1126    pub fn with_cookies(&mut self, _cookie_str: &str) -> &mut Self {
1127        self
1128    }
1129
#[cfg(feature = "chrome")]
/// Toggle fingerprinting for requests: `true` selects `Fingerprint::Basic` and `false`
/// selects `Fingerprint::None`.
/// Without the `chrome` feature a no-op variant of this method is compiled instead.
pub fn with_fingerprint(&mut self, fingerprint: bool) -> &mut Self {
    self.fingerprint = if fingerprint {
        Fingerprint::Basic
    } else {
        Fingerprint::None
    };
    self
}
1140
#[cfg(feature = "chrome")]
/// Set the exact `Fingerprint` value to use for requests.
/// Only available with the `chrome` feature enabled.
pub fn with_fingerprint_advanced(&mut self, fingerprint: Fingerprint) -> &mut Self {
    self.fingerprint = fingerprint;
    self
}
1147
1148    #[cfg(not(feature = "chrome"))]
1149    /// Set custom fingerprint ID for request. This does nothing without the `chrome` flag enabled.
1150    pub fn with_fingerprint(&mut self, _fingerprint: bool) -> &mut Self {
1151        self
1152    }
1153
1154    /// Use proxies for request.
1155    pub fn with_proxies(&mut self, proxies: Option<Vec<String>>) -> &mut Self {
1156        self.proxies = proxies.map(|p| {
1157            p.iter()
1158                .map(|addr| RequestProxy {
1159                    addr: addr.to_owned(),
1160                    ..Default::default()
1161                })
1162                .collect::<Vec<RequestProxy>>()
1163        });
1164        self
1165    }
1166
1167    /// Use proxies for request with control between chrome and http.
1168    pub fn with_proxies_direct(&mut self, proxies: Option<Vec<RequestProxy>>) -> &mut Self {
1169        self.proxies = proxies;
1170        self
1171    }
1172
1173    /// Use a shared semaphore to evenly handle workloads. The default is false.
1174    pub fn with_shared_queue(&mut self, shared_queue: bool) -> &mut Self {
1175        self.shared_queue = shared_queue;
1176        self
1177    }
1178
1179    /// Add blacklist urls to ignore.
1180    pub fn with_blacklist_url<T>(&mut self, blacklist_url: Option<Vec<T>>) -> &mut Self
1181    where
1182        Vec<CompactString>: From<Vec<T>>,
1183    {
1184        match blacklist_url {
1185            Some(p) => self.blacklist_url = Some(p.into()),
1186            _ => self.blacklist_url = None,
1187        };
1188        self
1189    }
1190
1191    /// Add whitelist urls to allow.
1192    pub fn with_whitelist_url<T>(&mut self, whitelist_url: Option<Vec<T>>) -> &mut Self
1193    where
1194        Vec<CompactString>: From<Vec<T>>,
1195    {
1196        match whitelist_url {
1197            Some(p) => self.whitelist_url = Some(p.into()),
1198            _ => self.whitelist_url = None,
1199        };
1200        self
1201    }
1202
1203    /// Return the links found on the page in the channel subscriptions. This method does nothing if the `decentralized` is enabled.
1204    pub fn with_return_page_links(&mut self, return_page_links: bool) -> &mut Self {
1205        self.return_page_links = return_page_links;
1206        self
1207    }
1208
1209    /// Set HTTP headers for request using [reqwest::header::HeaderMap](https://docs.rs/reqwest/latest/reqwest/header/struct.HeaderMap.html).
1210    pub fn with_headers(&mut self, headers: Option<reqwest::header::HeaderMap>) -> &mut Self {
1211        match headers {
1212            Some(m) => self.headers = Some(SerializableHeaderMap::from(m).into()),
1213            _ => self.headers = None,
1214        };
1215        self
1216    }
1217
1218    /// Set the max redirects allowed for request.
1219    pub fn with_redirect_limit(&mut self, redirect_limit: usize) -> &mut Self {
1220        self.redirect_limit = redirect_limit;
1221        self
1222    }
1223
1224    /// Set the redirect policy to use.
1225    pub fn with_redirect_policy(&mut self, policy: RedirectPolicy) -> &mut Self {
1226        self.redirect_policy = policy;
1227        self
1228    }
1229
1230    /// Add a referer (mis-spelling) to the request.
1231    pub fn with_referer(&mut self, referer: Option<String>) -> &mut Self {
1232        self.referer = referer;
1233        self
1234    }
1235
1236    /// Add a referer to the request.
1237    pub fn with_referrer(&mut self, referer: Option<String>) -> &mut Self {
1238        self.referer = referer;
1239        self
1240    }
1241
1242    /// Determine whether to collect all the resources found on pages.
1243    pub fn with_full_resources(&mut self, full_resources: bool) -> &mut Self {
1244        self.full_resources = full_resources;
1245        self
1246    }
1247
/// Determine whether to dismiss dialogs. This method does nothing if the `chrome` feature
/// is not enabled.
///
/// With `chrome` enabled the choice is stored as `Some(dismiss_dialogs)` in `dismiss_dialogs`.
#[cfg(feature = "chrome")]
pub fn with_dismiss_dialogs(&mut self, dismiss_dialogs: bool) -> &mut Self {
    self.dismiss_dialogs = Some(dismiss_dialogs);
    self
}
1254
1255    /// Determine whether to dismiss dialogs. This method does nothing if the `chrome` is enabled.
1256    #[cfg(not(feature = "chrome"))]
1257    pub fn with_dismiss_dialogs(&mut self, _dismiss_dialogs: bool) -> &mut Self {
1258        self
1259    }
1260
/// Set the request emulation. Only available with the `wreq` feature enabled.
/// The value replaces `emulation`; `None` clears it.
#[cfg(feature = "wreq")]
pub fn with_emulation(&mut self, emulation: Option<wreq_util::Emulation>) -> &mut Self {
    self.emulation = emulation;
    self
}
1267
#[cfg(feature = "cron")]
/// Setup cron jobs to run by storing the cron expression and the `CronType`.
/// Without the `cron` feature a no-op variant of this method is compiled instead.
pub fn with_cron(&mut self, cron_str: &str, cron_type: CronType) -> &mut Self {
    self.cron_type = cron_type;
    self.cron_str = cron_str.into();
    self
}
1275
1276    #[cfg(not(feature = "cron"))]
1277    /// Setup cron jobs to run. This does nothing without the `cron` flag enabled.
1278    pub fn with_cron(&mut self, _cron_str: &str, _cron_type: CronType) -> &mut Self {
1279        self
1280    }
1281
1282    /// Set a crawl page limit. If the value is 0 there is no limit.
1283    pub fn with_limit(&mut self, limit: u32) -> &mut Self {
1284        self.with_budget(Some(hashbrown::HashMap::from([("*", limit)])));
1285        self
1286    }
1287
1288    /// Set the concurrency limits. If you set the value to None to use the default limits using the system CPU cors * n.
1289    pub fn with_concurrency_limit(&mut self, limit: Option<usize>) -> &mut Self {
1290        self.concurrency_limit = limit;
1291        self
1292    }
1293
#[cfg(feature = "chrome")]
/// Set the authentication challenge response.
/// The value replaces `auth_challenge_response`; `None` clears it.
/// Without the `chrome` feature a no-op variant of this method is compiled instead.
pub fn with_auth_challenge_response(
    &mut self,
    auth_challenge_response: Option<AuthChallengeResponse>,
) -> &mut Self {
    self.auth_challenge_response = auth_challenge_response;
    self
}
1303
#[cfg(feature = "chrome")]
/// Set a custom script to evaluate on new document creation.
/// The value replaces `evaluate_on_new_document`; `None` clears it.
/// Without the `chrome` feature a no-op variant of this method is compiled instead.
pub fn with_evaluate_on_new_document(
    &mut self,
    evaluate_on_new_document: Option<Box<String>>,
) -> &mut Self {
    self.evaluate_on_new_document = evaluate_on_new_document;
    self
}
1313
1314    #[cfg(not(feature = "chrome"))]
1315    /// Set a custom script to evaluate on new document creation. This does nothing without the feat flag `chrome` enabled.
1316    pub fn with_evaluate_on_new_document(
1317        &mut self,
1318        _evaluate_on_new_document: Option<Box<String>>,
1319    ) -> &mut Self {
1320        self
1321    }
1322
1323    #[cfg(not(feature = "chrome"))]
1324    /// Set the authentiation challenge response. This does nothing without the feat flag `chrome` enabled.
1325    pub fn with_auth_challenge_response(
1326        &mut self,
1327        _auth_challenge_response: Option<AuthChallengeResponse>,
1328    ) -> &mut Self {
1329        self
1330    }
1331
1332    /// Set a crawl depth limit. If the value is 0 there is no limit.
1333    pub fn with_depth(&mut self, depth: usize) -> &mut Self {
1334        self.depth = depth;
1335        self
1336    }
1337
#[cfg(any(feature = "cache_request", feature = "chrome_remote_cache"))]
/// Cache the page following HTTP rules by storing the flag in `cache`.
/// Without the `cache_request` or `chrome_remote_cache` feature a no-op variant of this
/// method is compiled instead.
pub fn with_caching(&mut self, cache: bool) -> &mut Self {
    self.cache = cache;
    self
}
1344
1345    #[cfg(not(any(feature = "cache_request", feature = "chrome_remote_cache")))]
1346    /// Cache the page following HTTP rules. This method does nothing if the `cache` feature is not enabled.
1347    pub fn with_caching(&mut self, _cache: bool) -> &mut Self {
1348        self
1349    }
1350
#[cfg(any(feature = "cache_request", feature = "chrome_remote_cache"))]
/// Skip browser rendering entirely if a cached response exists.
/// When enabled together with caching, cached HTML is returned directly without launching
/// Chrome, which helps performance when only the cached content is needed.
/// Stores the flag in `cache_skip_browser`.
pub fn with_cache_skip_browser(&mut self, skip: bool) -> &mut Self {
    self.cache_skip_browser = skip;
    self
}
1359
1360    #[cfg(not(any(feature = "cache_request", feature = "chrome_remote_cache")))]
1361    /// Skip browser rendering entirely if cached response exists.
1362    /// This method does nothing if the cache features are not enabled.
1363    pub fn with_cache_skip_browser(&mut self, _skip: bool) -> &mut Self {
1364        self
1365    }
1366
1367    /// Partition the cache by an opaque namespace so logically distinct
1368    /// variants of the same URL (country, proxy pool, tenant, A/B bucket,
1369    /// device profile, …) never collide on the same cached bytes.
1370    /// `None` uses the default (empty) namespace. Has no observable effect
1371    /// when no cache feature is active, but the configuration is always
1372    /// settable regardless of feature flags.
1373    pub fn with_cache_namespace<S: Into<String>>(&mut self, namespace: Option<S>) -> &mut Self {
1374        self.cache_namespace = namespace.map(|s| Box::new(s.into()));
1375        self
1376    }
1377
1378    /// Borrowed access to the cache namespace (`None` = default partition).
1379    #[inline]
1380    pub(crate) fn cache_namespace_str(&self) -> Option<&str> {
1381        self.cache_namespace.as_ref().map(|s| s.as_str())
1382    }
1383
#[cfg(feature = "chrome")]
/// Enable or disable Service Workers by storing the flag in `service_worker_enabled`.
/// Without the `chrome` feature a no-op variant of this method is compiled instead.
pub fn with_service_worker_enabled(&mut self, enabled: bool) -> &mut Self {
    self.service_worker_enabled = enabled;
    self
}
1390
1391    #[cfg(not(feature = "chrome"))]
1392    /// Enable or disable Service Workers. This method does nothing if the `chrome` feature is not enabled.
1393    pub fn with_service_worker_enabled(&mut self, _enabled: bool) -> &mut Self {
1394        self
1395    }
1396
1397    /// Automatically setup geo-location configurations when using a proxy. This method does nothing if the `chrome` feature is not enabled.
1398    #[cfg(not(feature = "chrome"))]
1399    pub fn with_auto_geolocation(&mut self, _enabled: bool) -> &mut Self {
1400        self
1401    }
1402
/// Automatically setup geo-location configurations when using a proxy.
/// Stores the flag in `auto_geolocation`. Without the `chrome` feature a no-op variant of
/// this method is compiled instead.
#[cfg(feature = "chrome")]
pub fn with_auto_geolocation(&mut self, enabled: bool) -> &mut Self {
    self.auto_geolocation = enabled;
    self
}
1409
1410    /// Set the retry limit for request. Set the value to 0 for no retries. The default is 0.
1411    pub fn with_retry(&mut self, retry: u8) -> &mut Self {
1412        self.retry = retry;
1413        self
1414    }
1415
1416    /// The default http connect timeout.
1417    pub fn with_default_http_connect_timeout(
1418        &mut self,
1419        default_http_connect_timeout: Option<Duration>,
1420    ) -> &mut Self {
1421        self.default_http_connect_timeout = default_http_connect_timeout;
1422        self
1423    }
1424
1425    /// The default http read timeout.
1426    pub fn with_default_http_read_timeout(
1427        &mut self,
1428        default_http_read_timeout: Option<Duration>,
1429    ) -> &mut Self {
1430        self.default_http_read_timeout = default_http_read_timeout;
1431        self
1432    }
1433
1434    /// Skip setting up a control thread for pause, start, and shutdown programmatic handling. This does nothing without the 'control' flag enabled.
1435    pub fn with_no_control_thread(&mut self, no_control_thread: bool) -> &mut Self {
1436        self.no_control_thread = no_control_thread;
1437        self
1438    }
1439
1440    /// Configures the viewport of the browser, which defaults to 800x600. This method does nothing if the 'chrome' feature is not enabled.
1441    pub fn with_viewport(&mut self, viewport: Option<crate::configuration::Viewport>) -> &mut Self {
1442        self.viewport = viewport.map(|vp| vp);
1443        self
1444    }
1445
#[cfg(feature = "chrome")]
/// Toggle stealth mode for requests: `true` selects `Tier::Basic` and `false` selects
/// `Tier::None`.
/// Without the `chrome` feature a no-op variant of this method is compiled instead.
pub fn with_stealth(&mut self, stealth_mode: bool) -> &mut Self {
    self.stealth_mode = if stealth_mode {
        spider_fingerprint::configs::Tier::Basic
    } else {
        spider_fingerprint::configs::Tier::None
    };
    self
}
1456
#[cfg(feature = "chrome")]
/// Set the exact stealth `Tier` to use for requests.
/// Only available with the `chrome` feature enabled.
pub fn with_stealth_advanced(
    &mut self,
    stealth_mode: spider_fingerprint::configs::Tier,
) -> &mut Self {
    self.stealth_mode = stealth_mode;
    self
}
1466
1467    #[cfg(not(feature = "chrome"))]
1468    /// Use stealth mode for the request. This does nothing without the `chrome` flag enabled.
1469    pub fn with_stealth(&mut self, _stealth_mode: bool) -> &mut Self {
1470        self
1471    }
1472
#[cfg(feature = "chrome")]
/// Wait for network requests to be idle within a time frame period (500ms no network
/// connections).
/// Writes `idle_network` on the existing `wait_for`, creating a default `WaitFor` first when
/// none is set. Without the `chrome` feature a no-op variant is compiled instead.
pub fn with_wait_for_idle_network(
    &mut self,
    wait_for_idle_network: Option<WaitForIdleNetwork>,
) -> &mut Self {
    self.wait_for
        .get_or_insert_with(WaitFor::default)
        .idle_network = wait_for_idle_network;
    self
}
1489
#[cfg(feature = "chrome")]
/// Wait for network requests with a max timeout.
/// Writes `idle_network0` on the existing `wait_for`, creating a default `WaitFor` first
/// when none is set. Without the `chrome` feature a no-op variant is compiled instead.
pub fn with_wait_for_idle_network0(
    &mut self,
    wait_for_idle_network0: Option<WaitForIdleNetwork>,
) -> &mut Self {
    self.wait_for
        .get_or_insert_with(WaitFor::default)
        .idle_network0 = wait_for_idle_network0;
    self
}
1506
#[cfg(feature = "chrome")]
/// Wait for the network to be almost idle with a max timeout.
/// Writes `almost_idle_network0` on the existing `wait_for`, creating a default `WaitFor`
/// first when none is set. Without the `chrome` feature a no-op variant is compiled instead.
pub fn with_wait_for_almost_idle_network0(
    &mut self,
    wait_for_almost_idle_network0: Option<WaitForIdleNetwork>,
) -> &mut Self {
    self.wait_for
        .get_or_insert_with(WaitFor::default)
        .almost_idle_network0 = wait_for_almost_idle_network0;
    self
}
1523
1524    #[cfg(not(feature = "chrome"))]
1525    /// Wait for network to be almost idle with a max timeout. This does nothing without the `chrome` flag enabled.
1526    pub fn with_wait_for_almost_idle_network0(
1527        &mut self,
1528        _wait_for_almost_idle_network0: Option<WaitForIdleNetwork>,
1529    ) -> &mut Self {
1530        self
1531    }
1532
1533    #[cfg(not(feature = "chrome"))]
1534    /// Wait for network request with a max timeout. This does nothing without the `chrome` flag enabled.
1535    pub fn with_wait_for_idle_network0(
1536        &mut self,
1537        _wait_for_idle_network0: Option<WaitForIdleNetwork>,
1538    ) -> &mut Self {
1539        self
1540    }
1541
1542    #[cfg(not(feature = "chrome"))]
1543    /// Wait for idle network request. This method does nothing if the `chrome` feature is not enabled.
1544    pub fn with_wait_for_idle_network(
1545        &mut self,
1546        _wait_for_idle_network: Option<WaitForIdleNetwork>,
1547    ) -> &mut Self {
1548        self
1549    }
1550
1551    #[cfg(feature = "chrome")]
1552    /// Wait for idle dom mutations for target element. This method does nothing if the [chrome] feature is not enabled.
1553    pub fn with_wait_for_idle_dom(
1554        &mut self,
1555        wait_for_idle_dom: Option<WaitForSelector>,
1556    ) -> &mut Self {
1557        match self.wait_for.as_mut() {
1558            Some(wait_for) => wait_for.dom = wait_for_idle_dom,
1559            _ => {
1560                let mut wait_for = WaitFor::default();
1561                wait_for.dom = wait_for_idle_dom;
1562                self.wait_for = Some(wait_for);
1563            }
1564        }
1565        self
1566    }
1567
1568    #[cfg(not(feature = "chrome"))]
1569    /// Wait for idle dom mutations for target element. This method does nothing if the `chrome` feature is not enabled.
1570    pub fn with_wait_for_idle_dom(
1571        &mut self,
1572        _wait_for_idle_dom: Option<WaitForSelector>,
1573    ) -> &mut Self {
1574        self
1575    }
1576
1577    #[cfg(feature = "chrome")]
1578    /// Wait for a selector. This method does nothing if the `chrome` feature is not enabled.
1579    pub fn with_wait_for_selector(
1580        &mut self,
1581        wait_for_selector: Option<WaitForSelector>,
1582    ) -> &mut Self {
1583        match self.wait_for.as_mut() {
1584            Some(wait_for) => wait_for.selector = wait_for_selector,
1585            _ => {
1586                let mut wait_for = WaitFor::default();
1587                wait_for.selector = wait_for_selector;
1588                self.wait_for = Some(wait_for);
1589            }
1590        }
1591        self
1592    }
1593
1594    #[cfg(not(feature = "chrome"))]
1595    /// Wait for a selector. This method does nothing if the `chrome` feature is not enabled.
1596    pub fn with_wait_for_selector(
1597        &mut self,
1598        _wait_for_selector: Option<WaitForSelector>,
1599    ) -> &mut Self {
1600        self
1601    }
1602
1603    #[cfg(feature = "chrome")]
1604    /// Wait for with delay. Should only be used for testing. This method does nothing if the 'chrome' feature is not enabled.
1605    pub fn with_wait_for_delay(&mut self, wait_for_delay: Option<WaitForDelay>) -> &mut Self {
1606        match self.wait_for.as_mut() {
1607            Some(wait_for) => wait_for.delay = wait_for_delay,
1608            _ => {
1609                let mut wait_for = WaitFor::default();
1610                wait_for.delay = wait_for_delay;
1611                self.wait_for = Some(wait_for);
1612            }
1613        }
1614        self
1615    }
1616
1617    #[cfg(not(feature = "chrome"))]
1618    /// Wait for with delay. Should only be used for testing. This method does nothing if the 'chrome' feature is not enabled.
1619    pub fn with_wait_for_delay(&mut self, _wait_for_delay: Option<WaitForDelay>) -> &mut Self {
1620        self
1621    }
1622
1623    #[cfg(feature = "chrome_intercept")]
1624    /// Use request intercept for the request to only allow content that matches the host. If the content is from a 3rd party it needs to be part of our include list. This method does nothing if the `chrome_intercept` is not enabled.
1625    pub fn with_chrome_intercept(
1626        &mut self,
1627        chrome_intercept: RequestInterceptConfiguration,
1628        url: &Option<Box<url::Url>>,
1629    ) -> &mut Self {
1630        self.chrome_intercept = chrome_intercept;
1631        self.chrome_intercept.setup_intercept_manager(url);
1632        self
1633    }
1634
1635    #[cfg(not(feature = "chrome_intercept"))]
1636    /// Use request intercept for the request to only allow content required for the page that matches the host. If the content is from a 3rd party it needs to be part of our include list. This method does nothing if the `chrome_intercept` is not enabled.
1637    pub fn with_chrome_intercept(
1638        &mut self,
1639        _chrome_intercept: RequestInterceptConfiguration,
1640        _url: &Option<Box<url::Url>>,
1641    ) -> &mut Self {
1642        self
1643    }
1644
1645    #[cfg(feature = "chrome")]
1646    /// Set the connection url for the chrome instance. This method does nothing if the `chrome` is not enabled.
1647    pub fn with_chrome_connection(&mut self, chrome_connection_url: Option<String>) -> &mut Self {
1648        self.chrome_connection_url = chrome_connection_url;
1649        self
1650    }
1651
1652    #[cfg(not(feature = "chrome"))]
1653    /// Set the connection url for the chrome instance. This method does nothing if the `chrome` is not enabled.
1654    pub fn with_chrome_connection(&mut self, _chrome_connection_url: Option<String>) -> &mut Self {
1655        self
1656    }
1657
1658    #[cfg(feature = "chrome")]
1659    /// Set multiple remote Chrome connection URLs for failover. When a
1660    /// connection fails after retries, the next URL is tried. Takes
1661    /// priority over `chrome_connection_url` when set.
1662    pub fn with_chrome_connections(&mut self, urls: Vec<String>) -> &mut Self {
1663        self.chrome_connection_urls = if urls.is_empty() { None } else { Some(urls) };
1664        self
1665    }
1666
1667    #[cfg(not(feature = "chrome"))]
1668    /// Set multiple remote Chrome connection URLs. This method does nothing if the `chrome` is not enabled.
1669    pub fn with_chrome_connections(&mut self, _urls: Vec<String>) -> &mut Self {
1670        self
1671    }
1672
1673    #[cfg(not(feature = "chrome"))]
1674    /// Set JS to run on certain pages. This method does nothing if the `chrome` is not enabled.
1675    pub fn with_execution_scripts(
1676        &mut self,
1677        _execution_scripts: Option<ExecutionScriptsMap>,
1678    ) -> &mut Self {
1679        self
1680    }
1681
1682    #[cfg(feature = "chrome")]
1683    /// Set JS to run on certain pages. This method does nothing if the `chrome` is not enabled.
1684    pub fn with_execution_scripts(
1685        &mut self,
1686        execution_scripts: Option<ExecutionScriptsMap>,
1687    ) -> &mut Self {
1688        self.execution_scripts =
1689            crate::features::chrome_common::convert_to_trie_execution_scripts(&execution_scripts);
1690        self
1691    }
1692
1693    #[cfg(not(feature = "chrome"))]
1694    /// Run web automated actions on certain pages. This method does nothing if the `chrome` is not enabled.
1695    pub fn with_automation_scripts(
1696        &mut self,
1697        _automation_scripts: Option<AutomationScriptsMap>,
1698    ) -> &mut Self {
1699        self
1700    }
1701
1702    #[cfg(feature = "chrome")]
1703    /// Run web automated actions on certain pages. This method does nothing if the `chrome` is not enabled.
1704    pub fn with_automation_scripts(
1705        &mut self,
1706        automation_scripts: Option<AutomationScriptsMap>,
1707    ) -> &mut Self {
1708        self.automation_scripts =
1709            crate::features::chrome_common::convert_to_trie_automation_scripts(&automation_scripts);
1710        self
1711    }
1712
1713    /// Set a crawl budget per path with levels support /a/b/c or for all paths with "*". This does nothing without the `budget` flag enabled.
1714    pub fn with_budget(&mut self, budget: Option<hashbrown::HashMap<&str, u32>>) -> &mut Self {
1715        self.budget = match budget {
1716            Some(budget) => {
1717                let mut crawl_budget: hashbrown::HashMap<
1718                    case_insensitive_string::CaseInsensitiveString,
1719                    u32,
1720                > = hashbrown::HashMap::new();
1721
1722                for b in budget.into_iter() {
1723                    crawl_budget.insert(
1724                        case_insensitive_string::CaseInsensitiveString::from(b.0),
1725                        b.1,
1726                    );
1727                }
1728
1729                Some(crawl_budget)
1730            }
1731            _ => None,
1732        };
1733        self
1734    }
1735
1736    /// Group external domains to treat the crawl as one. If None is passed this will clear all prior domains.
1737    pub fn with_external_domains<'a, 'b>(
1738        &mut self,
1739        external_domains: Option<impl Iterator<Item = String> + 'a>,
1740    ) -> &mut Self {
1741        match external_domains {
1742            Some(external_domains) => {
1743                self.external_domains_caseless = external_domains
1744                    .into_iter()
1745                    .filter_map(|d| {
1746                        if d == "*" {
1747                            Some("*".into())
1748                        } else {
1749                            let host = get_domain_from_url(&d);
1750
1751                            if !host.is_empty() {
1752                                Some(host.into())
1753                            } else {
1754                                None
1755                            }
1756                        }
1757                    })
1758                    .collect::<hashbrown::HashSet<case_insensitive_string::CaseInsensitiveString>>()
1759                    .into();
1760            }
1761            _ => self.external_domains_caseless = Default::default(),
1762        }
1763
1764        self
1765    }
1766
1767    /// Dangerously accept invalid certificates - this should be used as a last resort.
1768    pub fn with_danger_accept_invalid_certs(&mut self, accept_invalid_certs: bool) -> &mut Self {
1769        self.accept_invalid_certs = accept_invalid_certs;
1770        self
1771    }
1772
1773    /// Normalize the content de-duplicating trailing slash pages and other pages that can be duplicated. This may initially show the link in your links_visited or subscription calls but, the following links will not be crawled.
1774    pub fn with_normalize(&mut self, normalize: bool) -> &mut Self {
1775        self.normalize = normalize;
1776        self
1777    }
1778
1779    #[cfg(not(feature = "disk"))]
1780    /// Store all the links found on the disk to share the state. This does nothing without the `disk` flag enabled.
1781    pub fn with_shared_state(&mut self, _shared: bool) -> &mut Self {
1782        self
1783    }
1784
1785    /// Store all the links found on the disk to share the state. This does nothing without the `disk` flag enabled.
1786    #[cfg(feature = "disk")]
1787    pub fn with_shared_state(&mut self, shared: bool) -> &mut Self {
1788        self.shared = shared;
1789        self
1790    }
1791
1792    #[cfg(not(feature = "chrome"))]
1793    /// Overrides default host system timezone with the specified one. This does nothing without the `chrome` flag enabled.
1794    pub fn with_timezone_id(&mut self, _timezone_id: Option<String>) -> &mut Self {
1795        self
1796    }
1797
1798    #[cfg(feature = "chrome")]
1799    /// Overrides default host system timezone with the specified one. This does nothing without the `chrome` flag enabled.
1800    pub fn with_timezone_id(&mut self, timezone_id: Option<String>) -> &mut Self {
1801        self.timezone_id = timezone_id.map(|timezone_id| timezone_id.into());
1802        self
1803    }
1804
1805    #[cfg(not(feature = "chrome"))]
1806    /// Overrides default host system locale with the specified one. This does nothing without the `chrome` flag enabled.
1807    pub fn with_locale(&mut self, _locale: Option<String>) -> &mut Self {
1808        self
1809    }
1810
1811    #[cfg(feature = "chrome")]
1812    /// Overrides default host system locale with the specified one. This does nothing without the `chrome` flag enabled.
1813    pub fn with_locale(&mut self, locale: Option<String>) -> &mut Self {
1814        self.locale = locale.map(|locale| locale.into());
1815        self
1816    }
1817
1818    #[cfg(feature = "chrome")]
1819    /// Track the events made via chrome.
1820    pub fn with_event_tracker(&mut self, track_events: Option<ChromeEventTracker>) -> &mut Self {
1821        self.track_events = track_events;
1822        self
1823    }
1824
1825    /// Set the chrome screenshot configuration. This does nothing without the `chrome` flag enabled.
1826    #[cfg(not(feature = "chrome"))]
1827    pub fn with_screenshot(&mut self, _screenshot_config: Option<ScreenShotConfig>) -> &mut Self {
1828        self
1829    }
1830
1831    /// Set the chrome screenshot configuration. This does nothing without the `chrome` flag enabled.
1832    #[cfg(feature = "chrome")]
1833    pub fn with_screenshot(&mut self, screenshot_config: Option<ScreenShotConfig>) -> &mut Self {
1834        self.screenshot = screenshot_config;
1835        self
1836    }
1837
1838    /// Set the max amount of bytes to collect per page. This method does nothing if the `chrome` is not enabled.
1839    pub fn with_max_page_bytes(&mut self, max_page_bytes: Option<f64>) -> &mut Self {
1840        self.max_page_bytes = max_page_bytes;
1841        self
1842    }
1843
1844    /// Set the max amount of bytes to collected for the browser context. This method does nothing if the `chrome` is not enabled.
1845    pub fn with_max_bytes_allowed(&mut self, max_bytes_allowed: Option<u64>) -> &mut Self {
1846        self.max_bytes_allowed = max_bytes_allowed;
1847        self
1848    }
1849
1850    /// Block assets from loading from the network.
1851    pub fn with_block_assets(&mut self, only_html: bool) -> &mut Self {
1852        self.only_html = only_html;
1853        self
1854    }
1855
1856    /// Modify the headers to mimic a real browser.
1857    pub fn with_modify_headers(&mut self, modify_headers: bool) -> &mut Self {
1858        self.modify_headers = modify_headers;
1859        self
1860    }
1861
1862    /// Modify the HTTP client headers to mimic a real browser.
1863    pub fn with_modify_http_client_headers(
1864        &mut self,
1865        modify_http_client_headers: bool,
1866    ) -> &mut Self {
1867        self.modify_http_client_headers = modify_http_client_headers;
1868        self
1869    }
1870
1871    /// Set the cache policy.
1872    pub fn with_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) -> &mut Self {
1873        self.cache_policy = cache_policy;
1874        self
1875    }
1876
1877    #[cfg(feature = "webdriver")]
1878    /// Set the WebDriver configuration. This does nothing without the `webdriver` flag enabled.
1879    pub fn with_webdriver_config(
1880        &mut self,
1881        webdriver_config: Option<WebDriverConfig>,
1882    ) -> &mut Self {
1883        self.webdriver_config = webdriver_config.map(Box::new);
1884        self
1885    }
1886
1887    #[cfg(not(feature = "webdriver"))]
1888    /// Set the WebDriver configuration. This does nothing without the `webdriver` flag enabled.
1889    pub fn with_webdriver_config(
1890        &mut self,
1891        _webdriver_config: Option<WebDriverConfig>,
1892    ) -> &mut Self {
1893        self
1894    }
1895
1896    /// Get the cache option to use for the run. This does nothing without the 'cache_request' feature.
1897    #[cfg(any(feature = "cache_request", feature = "chrome_remote_cache"))]
1898    pub(crate) fn get_cache_options(&self) -> Option<crate::utils::CacheOptions> {
1899        use crate::utils::CacheOptions;
1900        if !self.cache {
1901            return None;
1902        }
1903        let auth_token = self
1904            .headers
1905            .as_ref()
1906            .and_then(|headers| {
1907                headers
1908                    .0
1909                    .get("authorization")
1910                    .or_else(|| headers.0.get("Authorization"))
1911            })
1912            .map(|s| s.to_owned());
1913
1914        // When using in-memory cache (cache_mem), auto-enable skip_browser
1915        // since the cached HTML was already rendered by a prior Chrome crawl
1916        // and re-rendering through Chrome is redundant. The browser only
1917        // launches when the cache has no hit for the requested page.
1918        #[cfg(feature = "cache_mem")]
1919        let skip_browser = true;
1920        #[cfg(not(feature = "cache_mem"))]
1921        let skip_browser = self.cache_skip_browser;
1922
1923        match auth_token {
1924            Some(token) if !token.is_empty() => {
1925                if let Ok(token_str) = token.to_str() {
1926                    if skip_browser {
1927                        Some(CacheOptions::SkipBrowserAuthorized(token_str.into()))
1928                    } else {
1929                        Some(CacheOptions::Authorized(token_str.into()))
1930                    }
1931                } else if skip_browser {
1932                    Some(CacheOptions::SkipBrowser)
1933                } else {
1934                    Some(CacheOptions::Yes)
1935                }
1936            }
1937            _ => {
1938                if skip_browser {
1939                    Some(CacheOptions::SkipBrowser)
1940                } else {
1941                    Some(CacheOptions::Yes)
1942                }
1943            }
1944        }
1945    }
1946
1947    /// Get the cache option to use for the run. This does nothing without the 'cache_request' feature.
1948    #[cfg(all(
1949        feature = "chrome",
1950        not(any(feature = "cache_request", feature = "chrome_remote_cache"))
1951    ))]
1952    pub(crate) fn get_cache_options(&self) -> Option<crate::utils::CacheOptions> {
1953        None
1954    }
1955
1956    /// Get the cache option to use for the run when chrome/cache features are disabled.
1957    #[cfg(not(any(
1958        feature = "cache_request",
1959        feature = "chrome_remote_cache",
1960        feature = "chrome"
1961    )))]
1962    #[allow(dead_code)]
1963    pub(crate) fn get_cache_options(&self) -> Option<crate::utils::CacheOptions> {
1964        None
1965    }
1966
1967    /// Build the website configuration when using with_builder.
1968    pub fn build(&self) -> Self {
1969        self.to_owned()
1970    }
1971
1972    #[cfg(feature = "search")]
1973    /// Configure web search integration. This does nothing without the `search` flag enabled.
1974    pub fn with_search_config(&mut self, search_config: Option<SearchConfig>) -> &mut Self {
1975        self.search_config = search_config.map(Box::new);
1976        self
1977    }
1978
1979    #[cfg(not(feature = "search"))]
1980    /// Configure web search integration. This does nothing without the `search` flag enabled.
1981    pub fn with_search_config(&mut self, _search_config: Option<()>) -> &mut Self {
1982        self
1983    }
1984
1985    /// Set a [spider.cloud](https://spider.cloud) API key (Proxy mode).
1986    #[cfg(feature = "spider_cloud")]
1987    pub fn with_spider_cloud(&mut self, api_key: &str) -> &mut Self {
1988        if is_placeholder_api_key(api_key) {
1989            log::warn!("Spider Cloud API key looks like a placeholder — skipping. Get a real key at https://spider.cloud");
1990            return self;
1991        }
1992        self.spider_cloud = Some(Box::new(SpiderCloudConfig::new(api_key)));
1993        self
1994    }
1995
1996    /// Set a [spider.cloud](https://spider.cloud) API key (no-op without `spider_cloud` feature).
1997    #[cfg(not(feature = "spider_cloud"))]
1998    pub fn with_spider_cloud(&mut self, _api_key: &str) -> &mut Self {
1999        self
2000    }
2001
2002    /// Set a [spider.cloud](https://spider.cloud) config.
2003    #[cfg(feature = "spider_cloud")]
2004    pub fn with_spider_cloud_config(&mut self, config: SpiderCloudConfig) -> &mut Self {
2005        self.spider_cloud = Some(Box::new(config));
2006        self
2007    }
2008
2009    /// Set a [spider.cloud](https://spider.cloud) config (no-op without `spider_cloud` feature).
2010    #[cfg(not(feature = "spider_cloud"))]
2011    pub fn with_spider_cloud_config(&mut self, _config: ()) -> &mut Self {
2012        self
2013    }
2014
2015    /// Connect to [Spider Browser Cloud](https://spider.cloud/docs/api#browser)
2016    /// via CDP over WebSocket using an API key.
2017    ///
2018    /// Sets `chrome_connection_url` to `wss://browser.spider.cloud/v1/browser?token=API_KEY`.
2019    #[cfg(all(feature = "spider_cloud", feature = "chrome"))]
2020    pub fn with_spider_browser(&mut self, api_key: &str) -> &mut Self {
2021        if is_placeholder_api_key(api_key) {
2022            log::warn!("Spider Browser Cloud API key looks like a placeholder — skipping. Get a real key at https://spider.cloud");
2023            return self;
2024        }
2025        let cfg = SpiderBrowserConfig::new(api_key);
2026        self.chrome_connection_url = Some(cfg.connection_url());
2027        self.spider_browser = Some(Box::new(cfg));
2028        self
2029    }
2030
2031    /// Connect to Spider Browser Cloud (no-op without `spider_cloud` + `chrome` features).
2032    #[cfg(not(all(feature = "spider_cloud", feature = "chrome")))]
2033    pub fn with_spider_browser(&mut self, _api_key: &str) -> &mut Self {
2034        self
2035    }
2036
2037    /// Connect to [Spider Browser Cloud](https://spider.cloud/docs/api#browser)
2038    /// with full configuration (stealth, country, browser type, etc.).
2039    #[cfg(all(feature = "spider_cloud", feature = "chrome"))]
2040    pub fn with_spider_browser_config(&mut self, config: SpiderBrowserConfig) -> &mut Self {
2041        self.chrome_connection_url = Some(config.connection_url());
2042        self.spider_browser = Some(Box::new(config));
2043        self
2044    }
2045
2046    /// Connect to Spider Browser Cloud with config (no-op without features).
2047    #[cfg(not(all(feature = "spider_cloud", feature = "chrome")))]
2048    pub fn with_spider_browser_config(&mut self, _config: ()) -> &mut Self {
2049        self
2050    }
2051
2052    /// Set the hedged request (work-stealing) configuration.
2053    #[cfg(feature = "hedge")]
2054    pub fn with_hedge(&mut self, config: crate::utils::hedge::HedgeConfig) -> &mut Self {
2055        self.hedge = Some(config);
2056        self
2057    }
2058
2059    /// Set the hedged request configuration (no-op without `hedge` feature).
2060    #[cfg(not(feature = "hedge"))]
2061    pub fn with_hedge(&mut self, _config: ()) -> &mut Self {
2062        self
2063    }
2064
2065    #[cfg(feature = "auto_throttle")]
2066    /// Set the auto-throttle configuration for latency-based adaptive delay.
2067    pub fn with_auto_throttle(
2068        &mut self,
2069        config: crate::utils::auto_throttle::AutoThrottleConfig,
2070    ) -> &mut Self {
2071        self.auto_throttle = Some(config);
2072        self
2073    }
2074
2075    /// Set the auto-throttle configuration (no-op without `auto_throttle` feature).
2076    #[cfg(not(feature = "auto_throttle"))]
2077    pub fn with_auto_throttle(&mut self, _config: ()) -> &mut Self {
2078        self
2079    }
2080
2081    #[cfg(feature = "etag_cache")]
2082    /// Enable or disable ETag / conditional request caching for bandwidth-efficient re-crawls.
2083    pub fn with_etag_cache(&mut self, enabled: bool) -> &mut Self {
2084        self.etag_cache = enabled;
2085        self
2086    }
2087
2088    /// Enable or disable ETag caching (no-op without `etag_cache` feature).
2089    #[cfg(not(feature = "etag_cache"))]
2090    pub fn with_etag_cache(&mut self, _enabled: bool) -> &mut Self {
2091        self
2092    }
2093
2094    #[cfg(feature = "warc")]
2095    /// Configure WARC output for writing a web archive file during crawl.
2096    pub fn with_warc(&mut self, config: crate::utils::warc::WarcConfig) -> &mut Self {
2097        self.warc = Some(config);
2098        self
2099    }
2100
2101    /// Configure WARC output (no-op without `warc` feature).
2102    #[cfg(not(feature = "warc"))]
2103    pub fn with_warc(&mut self, _config: ()) -> &mut Self {
2104        self
2105    }
2106}
2107
/// Search provider configuration for web search integration.
///
/// Only compiled with the `search` feature. `Serialize` / `Deserialize` are derived when the
/// `serde` feature is enabled as well.
#[cfg(feature = "search")]
#[derive(Debug, Clone, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct SearchConfig {
    /// The search provider to use.
    pub provider: SearchProviderType,
    /// API key for the search provider. An empty key with no `api_url` makes
    /// `is_enabled` return `false`.
    pub api_key: String,
    /// Custom API URL (overrides default endpoint for the provider).
    pub api_url: Option<String>,
    /// Default search options; `None` until set through `with_default_options`.
    pub default_options: Option<SearchOptions>,
}
2122
#[cfg(feature = "search")]
impl SearchConfig {
    /// Create a search configuration for `provider` with the given API key.
    ///
    /// The custom API URL and the default options start out unset.
    pub fn new(provider: SearchProviderType, api_key: impl Into<String>) -> Self {
        let api_key = api_key.into();

        Self {
            provider,
            api_key,
            api_url: None,
            default_options: None,
        }
    }

    /// Use a custom API endpoint for this provider.
    pub fn with_api_url(self, url: impl Into<String>) -> Self {
        Self {
            api_url: Some(url.into()),
            ..self
        }
    }

    /// Set default search options.
    pub fn with_default_options(self, options: SearchOptions) -> Self {
        Self {
            default_options: Some(options),
            ..self
        }
    }

    /// Check if this configuration is valid and search is enabled.
    ///
    /// Returns `true` if an API key is set or a custom API URL is configured, and `false`
    /// only when the key is empty and no URL is present.
    pub fn is_enabled(&self) -> bool {
        !(self.api_key.is_empty() && self.api_url.is_none())
    }
}
2154
/// Available search providers.
///
/// Only compiled with the `search` feature. `Serper` is the `Default` variant.
#[cfg(feature = "search")]
#[derive(Debug, Clone, Default, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum SearchProviderType {
    /// Serper.dev - Google SERP API (high quality). This is the default provider.
    #[default]
    Serper,
    /// Brave Search API (privacy-focused).
    Brave,
    /// Microsoft Bing Web Search API.
    Bing,
    /// Tavily AI Search (optimized for LLMs).
    Tavily,
}
2170
2171// ─── Spider Cloud ───────────────────────────────────────────────────────────
2172
/// Integration mode for [spider.cloud](https://spider.cloud).
///
/// Only compiled with the `spider_cloud` feature. `Proxy` is the `Default` variant.
#[cfg(feature = "spider_cloud")]
#[derive(Debug, Clone, Default, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum SpiderCloudMode {
    /// Route all HTTP requests through `proxy.spider.cloud`.
    /// This is the simplest mode — the existing fetch pipeline works
    /// unmodified, traffic goes through the proxy transparently.
    #[default]
    Proxy,
    /// Use the spider.cloud `POST /crawl` API (with `limit: 1`) for each page.
    /// Best for simple scraping needs.
    Api,
    /// Use the spider.cloud `POST /unblocker` API for anti-bot bypass.
    /// Best for hard-to-get pages behind advanced bot protection.
    Unblocker,
    /// Direct fetch first; fall back to spider.cloud API on
    /// 403 / 429 / 503 or connection errors.
    Fallback,
    /// Intelligent mode: proxy by default, automatically falls back to
    /// `/unblocker` when it detects bot protection (403, 429, 503, CAPTCHA
    /// pages, Cloudflare challenges, empty bodies on HTML pages, etc.).
    /// This is the recommended mode for production use.
    Smart,
}
2198
/// Return format for Spider Cloud API responses.
///
/// Only compiled with the `spider_cloud` feature. With `serde` enabled each variant is
/// renamed to its lowercase name, the same strings `as_str` returns.
#[cfg(feature = "spider_cloud")]
#[derive(Debug, Clone, Default, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum SpiderCloudReturnFormat {
    /// Original HTML (default).
    #[default]
    #[cfg_attr(feature = "serde", serde(rename = "raw"))]
    Raw,
    /// Clean markdown — ideal for LLM pipelines.
    #[cfg_attr(feature = "serde", serde(rename = "markdown"))]
    Markdown,
    /// CommonMark-flavored markdown.
    #[cfg_attr(feature = "serde", serde(rename = "commonmark"))]
    CommonMark,
    /// Plain text with markup stripped.
    #[cfg_attr(feature = "serde", serde(rename = "text"))]
    Text,
    /// Raw bytes (no encoding conversion).
    #[cfg_attr(feature = "serde", serde(rename = "bytes"))]
    Bytes,
}
2221
#[cfg(feature = "spider_cloud")]
impl SpiderCloudReturnFormat {
    /// The API wire value sent to spider.cloud: the lowercase name of the variant.
    pub fn as_str(&self) -> &'static str {
        match *self {
            Self::Raw => "raw",
            Self::Markdown => "markdown",
            Self::CommonMark => "commonmark",
            Self::Text => "text",
            Self::Bytes => "bytes",
        }
    }
}
2235
#[cfg(feature = "spider_cloud")]
impl std::fmt::Display for SpiderCloudReturnFormat {
    /// Write the wire value returned by `as_str`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let wire_value = self.as_str();
        f.write_str(wire_value)
    }
}
2242
#[cfg(feature = "spider_cloud")]
impl From<&str> for SpiderCloudReturnFormat {
    /// Parse a return format name, ignoring ASCII case.
    ///
    /// Recognizes `markdown`, `commonmark`, `text` and `bytes` in any casing (for example
    /// `"MarkDown"` or `"Commonmark"`). Every other value, including `"raw"` and the empty
    /// string, maps to `SpiderCloudReturnFormat::Raw`.
    fn from(s: &str) -> Self {
        // Previously only three fixed casings per name were matched, so any other casing
        // silently became `Raw`.
        if s.eq_ignore_ascii_case("markdown") {
            Self::Markdown
        } else if s.eq_ignore_ascii_case("commonmark") {
            Self::CommonMark
        } else if s.eq_ignore_ascii_case("text") {
            Self::Text
        } else if s.eq_ignore_ascii_case("bytes") {
            Self::Bytes
        } else {
            Self::Raw
        }
    }
}
2255
/// Configuration for spider.cloud integration.
///
/// Spider Cloud provides anti-bot bypass, proxy rotation, and high-throughput
/// data collection. Sign up at <https://spider.cloud> to obtain an API key.
///
/// Only compiled with the `spider_cloud` feature. With `serde` enabled, the fields marked
/// `serde(default)` below fall back to their defaults when missing from the input.
#[cfg(feature = "spider_cloud")]
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct SpiderCloudConfig {
    /// API key / secret. Sign up at <https://spider.cloud> to get one.
    pub api_key: String,
    /// Integration mode. Falls back to `SpiderCloudMode::default()` when deserializing
    /// without this field.
    #[cfg_attr(feature = "serde", serde(default))]
    pub mode: SpiderCloudMode,
    /// API base URL (default: `https://api.spider.cloud`).
    /// The serde default is supplied by `SpiderCloudConfig::default_api_url`.
    #[cfg_attr(
        feature = "serde",
        serde(default = "SpiderCloudConfig::default_api_url")
    )]
    pub api_url: String,
    /// Proxy URL (default: `https://proxy.spider.cloud`).
    /// The serde default is supplied by `SpiderCloudConfig::default_proxy_url`.
    #[cfg_attr(
        feature = "serde",
        serde(default = "SpiderCloudConfig::default_proxy_url")
    )]
    pub proxy_url: String,
    /// Return format for API responses (default: [`SpiderCloudReturnFormat::Raw`]).
    #[cfg_attr(feature = "serde", serde(default))]
    pub return_format: SpiderCloudReturnFormat,
    /// Request multiple return formats in a single crawl.
    ///
    /// When set, the API returns `content` as an object keyed by format
    /// (e.g. `{"markdown": "...", "raw": "..."}`). The primary `return_format`
    /// is stored in [`Page::get_content`](crate::page::Page::get_content) and
    /// the extras are accessible via [`Page::get_content_for`](crate::page::Page::get_content_for).
    ///
    /// Omitted from serialized output when `None`.
    #[cfg_attr(
        feature = "serde",
        serde(default, skip_serializing_if = "Option::is_none")
    )]
    pub return_formats: Option<Vec<SpiderCloudReturnFormat>>,
    /// Extra params forwarded in API mode (e.g. `stealth`, `fingerprint`, `cache`).
    /// Omitted from serialized output when `None`.
    #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
    pub extra_params: Option<hashbrown::HashMap<String, serde_json::Value>>,
}
2299
#[cfg(feature = "spider_cloud")]
impl Default for SpiderCloudConfig {
    /// Builds a config with an empty API key, the default mode and return
    /// format, the stock API / proxy URLs, and no optional settings.
    fn default() -> Self {
        let api_url = Self::default_api_url();
        let proxy_url = Self::default_proxy_url();

        Self {
            api_key: String::new(),
            mode: Default::default(),
            api_url,
            proxy_url,
            return_format: Default::default(),
            return_formats: None,
            extra_params: None,
        }
    }
}
2314
#[cfg(feature = "spider_cloud")]
impl SpiderCloudConfig {
    /// Create a new config for `api_key`, leaving every other field at its
    /// [`Default`] value (Proxy mode).
    pub fn new(api_key: impl Into<String>) -> Self {
        Self {
            api_key: api_key.into(),
            ..Default::default()
        }
    }

    /// Set the integration mode.
    pub fn with_mode(mut self, mode: SpiderCloudMode) -> Self {
        self.mode = mode;
        self
    }

    /// Set a custom API base URL.
    pub fn with_api_url(mut self, url: impl Into<String>) -> Self {
        self.api_url = url.into();
        self
    }

    /// Set a custom proxy URL.
    pub fn with_proxy_url(mut self, url: impl Into<String>) -> Self {
        self.proxy_url = url.into();
        self
    }

    /// Set the return format for API responses.
    ///
    /// Accepts `SpiderCloudReturnFormat` directly or a string like `"markdown"`:
    /// ```ignore
    /// config.with_return_format(SpiderCloudReturnFormat::Markdown)
    /// config.with_return_format("markdown")
    /// ```
    pub fn with_return_format(mut self, fmt: impl Into<SpiderCloudReturnFormat>) -> Self {
        self.return_format = fmt.into();
        self
    }

    /// Request multiple return formats in a single crawl.
    ///
    /// Duplicates are dropped while keeping first-seen order. The first format
    /// becomes the primary content (accessible via
    /// [`Page::get_content`](crate::page::Page::get_content)), and all formats are
    /// accessible via [`Page::get_content_for`](crate::page::Page::get_content_for).
    ///
    /// An empty list clears `return_formats` (sets it to `None`) and leaves the
    /// current primary `return_format` untouched.
    ///
    /// ```ignore
    /// config.with_return_formats(vec![
    ///     SpiderCloudReturnFormat::Markdown,
    ///     SpiderCloudReturnFormat::Raw,
    /// ])
    /// ```
    pub fn with_return_formats(mut self, formats: Vec<SpiderCloudReturnFormat>) -> Self {
        // Deduplicate while preserving order; the list is tiny, so a linear
        // `contains` is cheaper than building a set.
        let mut unique: Vec<SpiderCloudReturnFormat> = Vec::with_capacity(formats.len());
        for format in formats {
            if !unique.contains(&format) {
                unique.push(format);
            }
        }

        if let Some(primary) = unique.first() {
            self.return_format = primary.clone();
        }

        // Never store `Some(vec![])`: an empty request means "no multi-format".
        self.return_formats = if unique.is_empty() {
            None
        } else {
            Some(unique)
        };
        self
    }

    /// Check if multiple return formats are requested.
    pub fn has_multiple_formats(&self) -> bool {
        self.return_formats
            .as_ref()
            .is_some_and(|formats| formats.len() > 1)
    }

    /// Set extra params for API mode.
    pub fn with_extra_params(
        mut self,
        params: hashbrown::HashMap<String, serde_json::Value>,
    ) -> Self {
        self.extra_params = Some(params);
        self
    }

    /// Determine if a response should trigger a spider.cloud API fallback.
    ///
    /// Only the `Fallback` and `Smart` modes ever fall back; `Api`, `Unblocker`
    /// and `Proxy` always return `false`.
    ///
    /// Status triggers (`Fallback` and `Smart`):
    /// - HTTP 403 (Forbidden) — typically bot protection
    /// - HTTP 429 (Too Many Requests) — rate limiting
    /// - HTTP 5xx and above (server errors, including 503 and the Cloudflare
    ///   520-530 range)
    ///
    /// Content triggers (`Smart` only, and only when `body` is `Some`):
    /// - Empty body
    /// - Known CAPTCHA / challenge page markers in the first 4 KB of the body
    pub fn should_fallback(&self, status_code: u16, body: Option<&[u8]>) -> bool {
        match self.mode {
            // Already using the API, or proxy-only: nothing to fall back to.
            SpiderCloudMode::Api | SpiderCloudMode::Unblocker | SpiderCloudMode::Proxy => false,
            SpiderCloudMode::Fallback => Self::is_blocking_status(status_code),
            SpiderCloudMode::Smart => {
                Self::is_blocking_status(status_code)
                    || body.is_some_and(Self::body_looks_blocked)
            }
        }
    }

    /// Whether `status_code` alone signals a blocked or failed request:
    /// 403, 429, or anything from 500 upwards.
    fn is_blocking_status(status_code: u16) -> bool {
        matches!(status_code, 403 | 429) || status_code >= 500
    }

    /// Whether `body` is empty or its first 4 KB contains a known
    /// bot-protection / CAPTCHA marker (matched case-insensitively).
    fn body_looks_blocked(body: &[u8]) -> bool {
        // Each group matches only when ALL of its markers are present.
        const MARKER_GROUPS: &[&[&str]] = &[
            // Cloudflare challenge
            &["cf-browser-verification"],
            &["cloudflare", "challenge-platform"],
            // Generic CAPTCHA / bot detection markers
            &["captcha", "challenge"],
            &["please verify you are a human"],
            &["access denied", "automated"],
            &["bot detection"],
            // Distil Networks / Imperva / Akamai patterns
            &["distil_r_captcha"],
            &["_imperva"],
            &["akamai", "bot manager"],
        ];

        // Empty body when we expected HTML.
        if body.is_empty() {
            return true;
        }

        // Only the first 4 KB is inspected, for performance. The cut may split
        // a multi-byte character; lossy decoding tolerates that.
        let head = &body[..body.len().min(4096)];
        let lower = String::from_utf8_lossy(head).to_lowercase();

        MARKER_GROUPS
            .iter()
            .any(|group| group.iter().all(|&marker| lower.contains(marker)))
    }

    /// Get the fallback API route for this config.
    ///
    /// The returned route has no leading slash:
    /// - `Smart` and `Unblocker` modes → `unblocker` (best for bot-protected pages)
    /// - `Api`, `Proxy` and `Fallback` modes → `crawl` (general purpose)
    pub fn fallback_route(&self) -> &'static str {
        match self.mode {
            SpiderCloudMode::Smart | SpiderCloudMode::Unblocker => "unblocker",
            SpiderCloudMode::Api | SpiderCloudMode::Proxy | SpiderCloudMode::Fallback => "crawl",
        }
    }

    /// Whether this mode uses the proxy transport layer
    /// (`Proxy`, `Fallback` and `Smart` do).
    pub fn uses_proxy(&self) -> bool {
        matches!(
            self.mode,
            SpiderCloudMode::Proxy | SpiderCloudMode::Fallback | SpiderCloudMode::Smart
        )
    }

    /// Default value for `api_url`; also used as the serde default.
    fn default_api_url() -> String {
        "https://api.spider.cloud".to_string()
    }

    /// Default value for `proxy_url`; also used as the serde default.
    fn default_proxy_url() -> String {
        "https://proxy.spider.cloud".to_string()
    }
}
2495
2496// ─── Spider Browser Cloud ────────────────────────────────────────────────────
2497
/// Settings for [Spider Browser Cloud](https://spider.cloud/docs/api#browser).
///
/// The crawler attaches to a remote Chromium instance via CDP over WebSocket
/// at `wss://browser.spider.cloud/v1/browser`, authenticating with a
/// `?token=API_KEY` query parameter.
///
/// `stealth`, `browser` and `country` are optional query parameters.
#[cfg(all(feature = "spider_cloud", feature = "chrome"))]
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct SpiderBrowserConfig {
    /// Secret API key issued by <https://spider.cloud>.
    pub api_key: String,

    /// Base WebSocket URL; defaults to `wss://browser.spider.cloud/v1/browser`.
    #[cfg_attr(feature = "serde", serde(default = "SpiderBrowserConfig::default_wss_url"))]
    pub wss_url: String,

    /// Turns on stealth mode (anti-fingerprinting); sent as the `stealth=true` query param.
    #[cfg_attr(feature = "serde", serde(default))]
    pub stealth: bool,

    /// Requested browser type (e.g. `"chrome"`, `"firefox"`); sent as `browser=<value>`.
    #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
    pub browser: Option<String>,

    /// Geo-targeting country code (e.g. `"us"`, `"gb"`); sent as `country=<value>`.
    #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
    pub country: Option<String>,

    /// Further `(key, value)` query parameters appended to the WSS URL.
    #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
    pub extra_params: Option<Vec<(String, String)>>,
}
2530
#[cfg(all(feature = "spider_cloud", feature = "chrome"))]
impl Default for SpiderBrowserConfig {
    /// Builds a config with an empty API key, the stock WSS URL, stealth
    /// disabled, and no browser, country, or extra parameters.
    fn default() -> Self {
        let wss_url = Self::default_wss_url();

        Self {
            api_key: String::new(),
            wss_url,
            stealth: false,
            browser: None,
            country: None,
            extra_params: None,
        }
    }
}
2544
#[cfg(all(feature = "spider_cloud", feature = "chrome"))]
impl SpiderBrowserConfig {
    /// Create a new config with the given API key; every other field takes its
    /// [`Default`] value.
    pub fn new(api_key: impl Into<String>) -> Self {
        Self {
            api_key: api_key.into(),
            ..Default::default()
        }
    }

    /// Set a custom WSS base URL.
    pub fn with_wss_url(mut self, url: impl Into<String>) -> Self {
        self.wss_url = url.into();
        self
    }

    /// Enable or disable stealth mode.
    pub fn with_stealth(mut self, stealth: bool) -> Self {
        self.stealth = stealth;
        self
    }

    /// Set the browser type to request.
    pub fn with_browser(mut self, browser: impl Into<String>) -> Self {
        self.browser = Some(browser.into());
        self
    }

    /// Set the country code for geo-targeting.
    pub fn with_country(mut self, country: impl Into<String>) -> Self {
        self.country = Some(country.into());
        self
    }

    /// Add extra query parameters (replaces any previously set list).
    ///
    /// Pass raw, un-encoded keys and values; [`Self::connection_url`]
    /// percent-encodes them.
    pub fn with_extra_params(mut self, params: Vec<(String, String)>) -> Self {
        self.extra_params = Some(params);
        self
    }

    /// Build the full WSS connection URL with authentication and options.
    ///
    /// Returns a URL like:
    /// `wss://browser.spider.cloud/v1/browser?token=KEY&stealth=true&country=us`
    ///
    /// Parameters are emitted in a fixed order: `token`, `stealth` (only when
    /// enabled), `browser`, `country`, then the extra parameters in list order.
    /// The API key, browser, country and extra keys/values are percent-encoded
    /// so characters such as `&`, `=`, `#` or spaces cannot corrupt the query
    /// string. The base `wss_url` is used verbatim.
    pub fn connection_url(&self) -> String {
        let mut url = self.wss_url.clone();

        // Open the query string, or continue one already present in the base
        // URL without doubling a trailing separator.
        if !url.contains('?') {
            url.push('?');
        } else if !(url.ends_with('?') || url.ends_with('&')) {
            url.push('&');
        }

        url.push_str("token=");
        Self::push_encoded(&mut url, &self.api_key);

        if self.stealth {
            url.push_str("&stealth=true");
        }
        if let Some(browser) = self.browser.as_deref() {
            url.push_str("&browser=");
            Self::push_encoded(&mut url, browser);
        }
        if let Some(country) = self.country.as_deref() {
            url.push_str("&country=");
            Self::push_encoded(&mut url, country);
        }
        for (key, value) in self.extra_params.iter().flatten() {
            url.push('&');
            Self::push_encoded(&mut url, key);
            url.push('=');
            Self::push_encoded(&mut url, value);
        }

        url
    }

    /// Append `raw` to `out`, percent-encoding every byte outside the RFC 3986
    /// unreserved set (`A-Z a-z 0-9 - . _ ~`).
    fn push_encoded(out: &mut String, raw: &str) {
        const HEX: &[u8; 16] = b"0123456789ABCDEF";

        for &byte in raw.as_bytes() {
            if byte.is_ascii_alphanumeric() || matches!(byte, b'-' | b'.' | b'_' | b'~') {
                out.push(char::from(byte));
            } else {
                out.push('%');
                out.push(char::from(HEX[usize::from(byte >> 4)]));
                out.push(char::from(HEX[usize::from(byte & 0x0F)]));
            }
        }
    }

    /// Default value for `wss_url`; also used as the serde default.
    fn default_wss_url() -> String {
        "wss://browser.spider.cloud/v1/browser".to_string()
    }
}
2628
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly defaulted `Configuration` has the checked switches off and
    /// none of the checked optional settings populated.
    #[test]
    fn test_configuration_defaults() {
        let defaults = Configuration::default();

        // Boolean switches start disabled.
        assert!(!defaults.respect_robots_txt);
        assert!(!defaults.subdomains);
        assert!(!defaults.tld);
        assert!(!defaults.http2_prior_knowledge);

        // No delay and no optional values.
        assert_eq!(defaults.delay, 0);
        assert!(defaults.user_agent.is_none());
        assert!(defaults.blacklist_url.is_none());
        assert!(defaults.whitelist_url.is_none());
        assert!(defaults.proxies.is_none());
    }

    /// `Loose` is the default redirect policy and all three variants differ.
    #[test]
    fn test_redirect_policy_variants() {
        assert_eq!(RedirectPolicy::default(), RedirectPolicy::Loose);

        let (strict, none) = (RedirectPolicy::Strict, RedirectPolicy::None);
        assert_ne!(strict, RedirectPolicy::Loose);
        assert_ne!(none, RedirectPolicy::Loose);
        assert_ne!(strict, none);
    }

    /// `No` is the default proxy-ignore setting and all three variants differ.
    #[test]
    fn test_proxy_ignore_variants() {
        assert_eq!(ProxyIgnore::default(), ProxyIgnore::No);

        let (chrome, http) = (ProxyIgnore::Chrome, ProxyIgnore::Http);
        assert_ne!(chrome, ProxyIgnore::No);
        assert_ne!(http, ProxyIgnore::No);
        assert_ne!(chrome, http);
    }

    /// A `RequestProxy` built from a literal keeps the values it was given.
    #[test]
    fn test_request_proxy_construction() {
        let address = "http://proxy.example.com:8080";
        let proxy = RequestProxy {
            addr: address.to_string(),
            ignore: ProxyIgnore::No,
        };

        assert_eq!(proxy.addr, address);
        assert_eq!(proxy.ignore, ProxyIgnore::No);
    }

    /// The default `RequestProxy` has no address and ignores nothing.
    #[test]
    fn test_request_proxy_default() {
        let proxy = RequestProxy::default();

        assert_eq!(proxy.ignore, ProxyIgnore::No);
        assert!(proxy.addr.is_empty());
    }

    /// Assigning a blacklist stores every entry.
    #[test]
    fn test_configuration_blacklist_setup() {
        let mut config = Configuration::default();
        let blocked = vec![
            "https://example.com/private".into(),
            "https://example.com/admin".into(),
        ];
        config.blacklist_url = Some(blocked);

        let stored = config.blacklist_url.as_ref().unwrap();
        assert_eq!(stored.len(), 2);
    }

    /// Assigning a whitelist stores every entry.
    #[test]
    fn test_configuration_whitelist_setup() {
        let mut config = Configuration::default();
        let allowed = vec!["https://example.com/public".into()];
        config.whitelist_url = Some(allowed);

        let stored = config.whitelist_url.as_ref().unwrap();
        assert_eq!(stored.len(), 1);
    }

    /// External domains are stored and looked up case-insensitively.
    #[test]
    fn test_configuration_external_domains() {
        let domains = [
            case_insensitive_string::CaseInsensitiveString::from("Example.Com"),
            case_insensitive_string::CaseInsensitiveString::from("OTHER.org"),
        ];

        let mut config = Configuration::default();
        config.external_domains_caseless = Arc::new(domains.into_iter().collect());

        assert_eq!(config.external_domains_caseless.len(), 2);

        // Lookup uses a different casing than the stored entry.
        let probe = case_insensitive_string::CaseInsensitiveString::from("example.com");
        assert!(config.external_domains_caseless.contains(&probe));
    }

    /// A budget map assigned to the configuration can be read back by path.
    #[test]
    fn test_configuration_budget() {
        // Builds a fresh key each time; one for insertion, one for lookup.
        let path_key = || case_insensitive_string::CaseInsensitiveString::from("/path");

        let mut limits = hashbrown::HashMap::new();
        limits.insert(path_key(), 100u32);

        let mut config = Configuration::default();
        config.budget = Some(limits);

        assert!(config.budget.is_some());
        let lookup = path_key();
        assert_eq!(
            config.budget.as_ref().unwrap().get(&lookup),
            Some(&100u32)
        );
    }

    /// Without the `regex` feature the default allow list is empty.
    #[cfg(not(feature = "regex"))]
    #[test]
    fn test_allow_list_set_default() {
        let empty = AllowListSet::default();
        assert!(empty.0.is_empty());
    }

    /// The built engine keeps both the vision and text model endpoints and the
    /// configured routing mode.
    #[cfg(feature = "agent")]
    #[test]
    fn test_build_remote_multimodal_engine_preserves_dual_models() {
        use crate::features::automation::{
            ModelEndpoint, RemoteMultimodalConfigs, VisionRouteMode,
        };

        let vision_endpoint = ModelEndpoint::new("vision-model").with_api_key("vision-key");
        let text_endpoint = ModelEndpoint::new("text-model")
            .with_api_url("https://text.example.com/v1/chat/completions")
            .with_api_key("text-key");

        let multimodal = RemoteMultimodalConfigs::new(
            "https://api.example.com/v1/chat/completions",
            "primary-model",
        )
        .with_vision_model(vision_endpoint)
        .with_text_model(text_endpoint)
        .with_vision_route_mode(VisionRouteMode::TextFirst);

        let mut config = Configuration::default();
        config.remote_multimodal = Some(Box::new(multimodal));

        let engine = config
            .build_remote_multimodal_engine()
            .expect("engine should be built");

        let vision_name = engine.vision_model.as_ref().map(|m| m.model_name.as_str());
        let text_name = engine.text_model.as_ref().map(|m| m.model_name.as_str());

        assert_eq!(vision_name, Some("vision-model"));
        assert_eq!(text_name, Some("text-model"));
        assert_eq!(engine.vision_route_mode, VisionRouteMode::TextFirst);
    }

    /// `SpiderBrowserConfig::new` stores the key and leaves the rest at defaults.
    #[cfg(all(feature = "spider_cloud", feature = "chrome"))]
    #[test]
    fn test_spider_browser_config_defaults() {
        let browser_cfg = SpiderBrowserConfig::new("test-key");

        assert_eq!(browser_cfg.api_key, "test-key");
        assert_eq!(browser_cfg.wss_url, "wss://browser.spider.cloud/v1/browser");
        assert!(!browser_cfg.stealth);
        assert!(browser_cfg.browser.is_none());
        assert!(browser_cfg.country.is_none());
        assert!(browser_cfg.extra_params.is_none());
    }

    /// With only a key set, the URL carries just the `token` parameter.
    #[cfg(all(feature = "spider_cloud", feature = "chrome"))]
    #[test]
    fn test_spider_browser_connection_url_basic() {
        let url = SpiderBrowserConfig::new("sk-abc123").connection_url();

        assert_eq!(url, "wss://browser.spider.cloud/v1/browser?token=sk-abc123");
    }

    /// Every option appears in the URL, in the documented order.
    #[cfg(all(feature = "spider_cloud", feature = "chrome"))]
    #[test]
    fn test_spider_browser_connection_url_full() {
        let extras = vec![("timeout".into(), "30000".into())];
        let url = SpiderBrowserConfig::new("sk-abc123")
            .with_stealth(true)
            .with_browser("chrome")
            .with_country("us")
            .with_extra_params(extras)
            .connection_url();

        assert_eq!(
            url,
            "wss://browser.spider.cloud/v1/browser?token=sk-abc123&stealth=true&browser=chrome&country=us&timeout=30000"
        );
    }

    /// A custom WSS base URL replaces the default host.
    #[cfg(all(feature = "spider_cloud", feature = "chrome"))]
    #[test]
    fn test_spider_browser_connection_url_custom_wss() {
        let url = SpiderBrowserConfig::new("key")
            .with_wss_url("wss://custom.browser.example.com/v1/browser")
            .connection_url();

        assert_eq!(url, "wss://custom.browser.example.com/v1/browser?token=key");
    }

    /// `with_spider_browser` records the config and derives the Chrome
    /// connection URL from it.
    #[cfg(all(feature = "spider_cloud", feature = "chrome"))]
    #[test]
    fn test_with_spider_browser_sets_chrome_connection() {
        let mut config = Configuration::default();
        config.with_spider_browser("my-api-key");

        assert!(config.spider_browser.is_some());
        assert_eq!(
            config.chrome_connection_url.as_deref(),
            Some("wss://browser.spider.cloud/v1/browser?token=my-api-key")
        );
    }

    /// Stealth and country options flow through to the Chrome connection URL.
    #[cfg(all(feature = "spider_cloud", feature = "chrome"))]
    #[test]
    fn test_with_spider_browser_config_stealth() {
        let stealthy = SpiderBrowserConfig::new("key")
            .with_stealth(true)
            .with_country("gb");

        let mut config = Configuration::default();
        config.with_spider_browser_config(stealthy);

        assert_eq!(
            config.chrome_connection_url.as_deref(),
            Some("wss://browser.spider.cloud/v1/browser?token=key&stealth=true&country=gb")
        );
    }
}
spider/configuration.rs

spider/
configuration.rs