spider/
configuration.rs

1use crate::compact_str::CompactString;
2use crate::features::chrome_common::RequestInterceptConfiguration;
3pub use crate::features::chrome_common::{
4    AuthChallengeResponse, AuthChallengeResponseResponse, AutomationScripts, AutomationScriptsMap,
5    CaptureScreenshotFormat, CaptureScreenshotParams, ClipViewport, ExecutionScripts,
6    ExecutionScriptsMap, ScreenShotConfig, ScreenshotParams, Viewport, WaitFor, WaitForDelay,
7    WaitForIdleNetwork, WaitForSelector, WebAutomation,
8};
9pub use crate::features::gemini_common::GeminiConfigs;
10pub use crate::features::openai_common::GPTConfigs;
11#[cfg(feature = "search")]
12pub use crate::features::search::{
13    SearchError, SearchOptions, SearchResult, SearchResults, TimeRange,
14};
15pub use crate::features::webdriver_common::{WebDriverBrowser, WebDriverConfig};
16use crate::utils::get_domain_from_url;
17use crate::utils::BasicCachePolicy;
18use crate::website::CronType;
19use reqwest::header::{AsHeaderName, HeaderMap, HeaderName, HeaderValue, IntoHeaderName};
20use std::net::IpAddr;
21use std::sync::Arc;
22use std::time::Duration;
23
24#[cfg(feature = "chrome")]
25pub use spider_fingerprint::Fingerprint;
26
/// Check if an API key is a placeholder or empty.
///
/// The key is trimmed of surrounding whitespace first. It is treated as a
/// placeholder when nothing remains, or when the remainder matches one of
/// `YOUR_API_KEY`, `YOUR-API-KEY`, `API_KEY` or `API-KEY`, ignoring ASCII
/// case.
pub fn is_placeholder_api_key(key: &str) -> bool {
    /// Well-known dummy values that must never be treated as real keys.
    const PLACEHOLDERS: [&str; 4] = ["YOUR_API_KEY", "YOUR-API-KEY", "API_KEY", "API-KEY"];

    let candidate = key.trim();

    candidate.is_empty()
        || PLACEHOLDERS
            .iter()
            .any(|placeholder| candidate.eq_ignore_ascii_case(placeholder))
}
36
/// Redirect policy configuration for request.
///
/// The default is [`RedirectPolicy::Loose`]. When the `serde` feature is
/// enabled, each variant also accepts its lowercase and uppercase spelling
/// as an alias during deserialization.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum RedirectPolicy {
    #[default]
    #[cfg_attr(
        feature = "serde",
        serde(alias = "Loose", alias = "loose", alias = "LOOSE",)
    )]
    /// A loose policy that allows all request up to the redirect limit.
    Loose,
    #[cfg_attr(
        feature = "serde",
        serde(alias = "Strict", alias = "strict", alias = "STRICT",)
    )]
    /// A strict policy only allowing request that match the domain set for crawling.
    Strict,
    #[cfg_attr(
        feature = "serde",
        serde(alias = "None", alias = "none", alias = "NONE",)
    )]
    /// Prevent all redirects.
    None,
}
61
62#[cfg(not(feature = "regex"))]
63/// Allow list normal matching paths.
64pub type AllowList = Vec<CompactString>;
65
66#[cfg(feature = "regex")]
67/// Allow list regex.
68pub type AllowList = Box<regex::RegexSet>;
69
70/// Whitelist or Blacklist
71#[derive(Debug, Default, Clone)]
72#[cfg_attr(not(feature = "regex"), derive(PartialEq, Eq))]
73pub struct AllowListSet(pub AllowList);
74
#[cfg(feature = "chrome")]
/// Track the events made via chrome.
///
/// Each flag independently enables one category of tracking; the derived
/// [`Default`] leaves all of them disabled.
#[derive(Debug, PartialEq, Eq, Clone, Default)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct ChromeEventTracker {
    /// Track the responses.
    pub responses: bool,
    /// Track the requests.
    pub requests: bool,
    /// Track the changes between web automation.
    pub automation: bool,
}
87
#[cfg(feature = "chrome")]
impl ChromeEventTracker {
    /// Create a new chrome event tracker.
    ///
    /// Note the argument order is `requests` first, then `responses`, which
    /// is the reverse of the field declaration order. Automation tracking is
    /// always enabled by this constructor; use a struct literal (or
    /// [`Default`]) to build a tracker with it disabled.
    pub fn new(requests: bool, responses: bool) -> Self {
        Self {
            responses,
            requests,
            automation: true,
        }
    }
}
99
#[cfg(feature = "sitemap")]
#[derive(Debug, Default)]
/// Records which sitemap entries were added to the whitelist.
///
/// Both flags default to `false`, meaning the whitelist was left untouched.
pub struct SitemapWhitelistChanges {
    /// Added the default sitemap.xml whitelist.
    pub added_default: bool,
    /// Added the custom whitelist path.
    pub added_custom: bool,
}
109
#[cfg(feature = "sitemap")]
impl SitemapWhitelistChanges {
    /// Was the whitelist modified?
    ///
    /// Returns `true` when either the default or the custom sitemap entry
    /// was added.
    pub(crate) fn modified(&self) -> bool {
        self.added_default || self.added_custom
    }
}
117
/// Determine which request type, if any, should bypass a proxy.
///
/// The default is [`ProxyIgnore::No`], i.e. the proxy is never ignored.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum ProxyIgnore {
    /// Chrome proxy.
    Chrome,
    /// HTTP proxy.
    Http,
    #[default]
    /// Do not ignore
    No,
}

/// The networking proxy to use.
///
/// The derived [`Default`] yields an empty address with
/// [`ProxyIgnore::No`].
#[derive(Debug, Default, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct RequestProxy {
    /// The proxy address.
    pub addr: String,
    /// Ignore the proxy when running a request type.
    pub ignore: ProxyIgnore,
}
140
141/// Categorical "kind" a request can be routed under.
142///
143/// Carries no policy and no business semantics — what each kind *means*
144/// (when to route there, what proxies it should use) is entirely up to
145/// the consumer. Spider only stores the mapping and uses the kind as a
146/// lookup key.
147///
148/// Used in two places:
149/// * [`Configuration::proxies_by_kind`] — the optional sidecar map of
150///   `kind → Vec<RequestProxy>`, attached to the configuration without
151///   touching `RequestProxy` itself.
152/// * [`crate::proxy_strategy::ProxyStrategy::route`] — the per-request
153///   decision a strategy returns to pick which proxy list to use.
154///
155/// Returning [`ProxyKind::Default`] (or any kind not present in the
156/// sidecar map) keeps the existing fast path — no secondary client is
157/// built, no allocation, no behavior change.
158#[derive(Debug, Clone, PartialEq, Eq, Hash)]
159#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
160pub enum ProxyKind {
161    /// The default kind. Routes through the primary proxy list
162    /// ([`Configuration::proxies`]).
163    Default,
164    /// Media-asset request (image, video, audio, font, archive,
165    /// document). Pure technical classification — see
166    /// [`crate::utils::media_asset::is_media_asset_url`] for the helper
167    /// most strategies will pair with this kind.
168    MediaAsset,
169    /// Free-form, consumer-defined kind. Opaque to spider.
170    Custom(CompactString),
171}
172
173impl Default for ProxyKind {
174    #[inline]
175    fn default() -> Self {
176        ProxyKind::Default
177    }
178}
179
/// The protocol used to communicate with a backend.
#[cfg(feature = "parallel_backends")]
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum BackendProtocol {
    /// Chrome DevTools Protocol over WebSocket.
    Cdp,
    /// WebDriver (W3C) over HTTP.
    WebDriver,
}
190
/// The engine type for a parallel crawl backend.
///
/// The default is [`BackendEngine::Cdp`].
#[cfg(feature = "parallel_backends")]
#[derive(Debug, Default, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum BackendEngine {
    #[default]
    /// CDP backend — communicates via the Chrome DevTools Protocol.
    Cdp,
    /// Servo — communicates via WebDriver protocol.
    Servo,
    /// A custom backend. Set `protocol` on [`BackendEndpoint`] to tell
    /// spider whether to use CDP or WebDriver to communicate with it.
    Custom,
}
205
/// A parallel crawl backend endpoint.
///
/// Each backend can run either **remotely** (connect to a running instance via
/// `endpoint`) or **locally** (spider manages the engine process via
/// `binary_path`). Set `endpoint` for remote mode, `binary_path` for local.
///
/// With the `serde` feature, missing fields fall back to their
/// [`Default`] values when deserializing.
#[cfg(feature = "parallel_backends")]
#[derive(Debug, Default, Clone, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "serde", serde(default))]
pub struct BackendEndpoint {
    /// The browser engine to use.
    pub engine: BackendEngine,
    /// Remote endpoint URL. For CDP backends: a WebSocket URL
    /// (e.g. `"ws://127.0.0.1:9222"`). For Servo: a WebDriver HTTP URL
    /// (e.g. `"http://localhost:4444"`). When set, the engine is assumed to
    /// be already running at this address.
    pub endpoint: Option<String>,
    /// Path to the engine binary for local mode. When set (and `endpoint` is
    /// `None`), spider will spawn and manage the engine process. Uses PATH
    /// lookup if empty string.
    pub binary_path: Option<String>,
    /// Explicit protocol override. When `None`, inferred from `engine`:
    /// `Cdp` → CDP, `Servo` → WebDriver, `Custom` → **required**.
    /// For custom backends, set this to tell spider how to communicate.
    pub protocol: Option<BackendProtocol>,
    /// Per-backend proxy address. When set, this backend routes its outbound
    /// requests through the given proxy (e.g. `"socks5://proxy1:1080"`,
    /// `"http://proxy2:8080"`). Overrides the global `ProxyRotator` for this
    /// backend. For CDP backends, creates an isolated browser context with
    /// the proxy. For WebDriver backends, sets the proxy capability.
    pub proxy: Option<String>,
}
238
/// Configuration for parallel crawl backends.
///
/// When enabled, races alternative browser engines (CDP, Servo) alongside
/// the primary crawl path. The best HTML response wins.
///
/// With the `serde` feature, missing fields fall back to the values of the
/// type's [`Default`] implementation when deserializing.
#[cfg(feature = "parallel_backends")]
#[derive(Debug, Clone, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "serde", serde(default))]
pub struct ParallelBackendsConfig {
    /// Alternative backends to race against the primary crawl.
    pub backends: Vec<BackendEndpoint>,
    /// Grace period (ms) after first response to wait for better results.
    /// Allows slower backends to finish if they produce higher quality HTML.
    /// Default: 500.
    pub grace_period_ms: u64,
    /// Master switch. Default: `true` (enabled when config is present).
    pub enabled: bool,
    /// Quality score threshold (0–100). If the first response scores at or
    /// above this value, accept it immediately without waiting for the grace
    /// period. Default: 80.
    pub fast_accept_threshold: u16,
    /// Maximum consecutive errors before auto-disabling a backend for
    /// the remainder of the crawl. Default: 10.
    pub max_consecutive_errors: u16,
    /// Timeout (ms) for the initial TCP/WebSocket connection to a backend.
    /// Separate from `request_timeout` so that down backends fail fast
    /// without affecting navigation/fetch timeouts. Default: 5000 (5s).
    pub connect_timeout_ms: u64,
    /// Skip backend racing when the primary response has a binary
    /// `Content-Type` (image/*, audio/*, video/*, font/*, application/pdf,
    /// etc.). There is no HTML quality variance for binary resources.
    /// Default: `true`.
    pub skip_binary_content_types: bool,
    /// Maximum concurrent backend sessions across all URLs. Prevents memory
    /// spikes on large crawls. `0` means unlimited. Default: 8.
    pub max_concurrent_sessions: usize,
    /// Additional URL extensions to skip backend racing for, on top of the
    /// built-in asset list (images, fonts, videos, etc.). Case-insensitive.
    /// Example: `["xml", "rss"]`.
    pub skip_extensions: Vec<CompactString>,
    /// Maximum aggregate HTML bytes held by in-flight backend responses across
    /// all concurrent races. When this cap is reached, new backend fetches are
    /// skipped (primary-only) until existing responses are consumed or dropped.
    /// Works without the `balance` feature. `0` means unlimited.
    /// Default: 256 MiB (268_435_456).
    pub max_backend_bytes_in_flight: usize,
    /// Hard deadline (ms) for an entire backend fetch (connect + navigate +
    /// extract). If a backend exceeds this, the task is cancelled and returns
    /// `None`. Prevents a single stalled backend from blocking the primary
    /// Chrome result during the grace window. `0` means no outer timeout
    /// (individual phase timeouts still apply). Default: 30_000 (30s).
    pub backend_timeout_ms: u64,
}
292
#[cfg(feature = "parallel_backends")]
impl Default for ParallelBackendsConfig {
    /// Build the default configuration: racing enabled, no backends
    /// registered, and the limits documented on each field.
    fn default() -> Self {
        Self {
            backends: Vec::new(),
            grace_period_ms: 500,
            enabled: true,
            fast_accept_threshold: 80,
            max_consecutive_errors: 10,
            connect_timeout_ms: 5_000,
            skip_binary_content_types: true,
            max_concurrent_sessions: 8,
            skip_extensions: Vec::new(),
            // 256 MiB, expressed in bytes.
            max_backend_bytes_in_flight: 256 * 1024 * 1024,
            backend_timeout_ms: 30_000,
        }
    }
}
311
312/// User-configurable antibot detection patterns. Any match triggers `AntiBotTech::Custom`.
313#[derive(Debug, Default, Clone, PartialEq, Eq)]
314#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
315#[cfg_attr(feature = "serde", serde(default))]
316pub struct CustomAntibotPatterns {
317    /// Body substring patterns (matched against response bodies < 30KB).
318    pub body: Vec<CompactString>,
319    /// URL substring patterns.
320    pub url: Vec<CompactString>,
321    /// Header keys whose presence triggers antibot detection.
322    pub header_keys: Vec<CompactString>,
323}
324
325/// Structure to configure `Website` crawler
326/// ```rust
327/// use spider::website::Website;
328/// let mut website: Website = Website::new("https://choosealicense.com");
329/// website.configuration.blacklist_url.insert(Default::default()).push("https://choosealicense.com/licenses/".to_string().into());
330/// website.configuration.respect_robots_txt = true;
331/// website.configuration.subdomains = true;
332/// website.configuration.tld = true;
333/// ```
334#[derive(Debug, Default, Clone)]
335#[cfg_attr(
336    all(
337        not(feature = "regex"),
338        not(feature = "openai"),
339        not(feature = "cache_openai"),
340        not(feature = "gemini"),
341        not(feature = "cache_gemini")
342    ),
343    derive(PartialEq)
344)]
345#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
346#[cfg_attr(feature = "serde", serde(default))]
347pub struct Configuration {
348    /// Respect robots.txt file and not scrape not allowed files. This may slow down crawls if robots.txt file has a delay included.
349    pub respect_robots_txt: bool,
350    /// Allow sub-domains.
351    pub subdomains: bool,
352    /// Allow all tlds for domain.
353    pub tld: bool,
354    /// The max timeout for the crawl.
355    pub crawl_timeout: Option<Duration>,
356    /// Preserve the HTTP host header from being included.
357    pub preserve_host_header: bool,
358    /// List of pages to not crawl. [optional: regex pattern matching]
359    pub blacklist_url: Option<Vec<CompactString>>,
360    /// List of pages to only crawl. [optional: regex pattern matching]
361    pub whitelist_url: Option<Vec<CompactString>>,
362    /// User-Agent for request.
363    pub user_agent: Option<Box<CompactString>>,
364    /// Polite crawling delay in milli seconds.
365    pub delay: u64,
366    /// Request max timeout per page. By default the request times out in 15s. Set to None to disable.
367    pub request_timeout: Option<Duration>,
368    /// Use HTTP2 for connection. Enable if you know the website has http2 support.
369    pub http2_prior_knowledge: bool,
370    /// Use proxy list for performing network request.
371    pub proxies: Option<Vec<RequestProxy>>,
372    /// Optional sidecar map of alternative proxy lists keyed by
373    /// [`ProxyKind`].
374    ///
375    /// Lets a [`crate::proxy_strategy::ProxyStrategy`] route a request
376    /// through a non-default proxy set without touching `proxies` or
377    /// `RequestProxy` itself. When `None` (the default) or when the
378    /// strategy returns a kind that has no entry here, requests fall
379    /// through to `proxies` and the existing fast path — no behavior
380    /// change.
381    ///
382    /// Lookup is by enum equality / hash; the [`ProxyKind::Custom`]
383    /// variant lets consumers introduce their own kinds without an
384    /// upstream change. Spider never writes to this map after
385    /// configuration; runtime lazy state lives on the `Website`.
386    pub proxies_by_kind: Option<hashbrown::HashMap<ProxyKind, Vec<RequestProxy>>>,
387    /// Headers to include with request.
388    pub headers: Option<Box<SerializableHeaderMap>>,
389    #[cfg(feature = "sitemap")]
390    /// Include a sitemap in response of the crawl.
391    pub sitemap_url: Option<Box<CompactString>>,
392    #[cfg(feature = "sitemap")]
393    /// Prevent including the sitemap links with the crawl.
394    pub ignore_sitemap: bool,
395    /// The max redirections allowed for request.
396    pub redirect_limit: usize,
397    /// The redirect policy type to use.
398    pub redirect_policy: RedirectPolicy,
399    /// Whether `redirect_limit` was explicitly set by the caller.
400    ///
401    /// Set to `true` by `with_redirect_limit()` and by the external-config loader
402    /// when `redirect_limit` is provided. Chrome-path enforcement reads this flag
403    /// so it only caps redirects when the user opted in — preserving prior
404    /// behavior on pages whose navigation chains exceed the HTTP default of 7.
405    #[cfg_attr(feature = "serde", serde(skip))]
406    pub redirect_limit_set: bool,
407    /// Cap on main-frame cross-document navigations during a single Chrome
408    /// `goto` (requires the `chrome` feature — no effect on the HTTP path).
409    ///
410    /// Defends against JS / meta-refresh / HTTP-Refresh-header loops that
411    /// bypass the HTTP redirect cap because each hop is a fresh document
412    /// rather than a 3xx redirect. `None` disables the guard (default) so
413    /// prior behavior is preserved; `Some(n)` aborts the navigation with a
414    /// `net::ERR_TOO_MANY_NAVIGATIONS` error once the main frame has
415    /// navigated more than `n` times since `goto`.
416    pub max_main_frame_navigations: Option<u32>,
417    #[cfg(feature = "cookies")]
418    /// Cookie string to use for network requests ex: "foo=bar; Domain=blog.spider"
419    pub cookie_str: String,
420    #[cfg(feature = "wreq")]
421    /// The type of request emulation. This does nothing without the flag `sync` enabled.
422    pub emulation: Option<wreq_util::Emulation>,
423    #[cfg(feature = "cron")]
424    /// Cron string to perform crawls - use <https://crontab.guru/> to help generate a valid cron for needs.
425    pub cron_str: String,
426    #[cfg(feature = "cron")]
427    /// The type of cron to run either crawl or scrape.
428    pub cron_type: CronType,
429    /// The max depth to crawl for a website. Defaults to 25 to help prevent infinite recursion.
430    pub depth: usize,
431    /// The depth to crawl pertaining to the root.
432    pub depth_distance: usize,
433    /// Use stealth mode for requests.
434    pub stealth_mode: spider_fingerprint::configs::Tier,
435    /// Configure the viewport for chrome and viewport headers.
436    pub viewport: Option<Viewport>,
437    /// Crawl budget for the paths. This helps prevent crawling extra pages and limiting the amount.
438    pub budget: Option<hashbrown::HashMap<case_insensitive_string::CaseInsensitiveString, u32>>,
439    /// If wild card budgeting is found for the website.
440    pub wild_card_budgeting: bool,
441    /// External domains to include case-insensitive.
442    pub external_domains_caseless:
443        Arc<hashbrown::HashSet<case_insensitive_string::CaseInsensitiveString>>,
444    /// Collect all the resources found on the page.
445    pub full_resources: bool,
446    /// Dangerously accept invalid certficates.
447    pub accept_invalid_certs: bool,
448    /// The auth challenge response. The 'chrome_intercept' flag is also required in order to intercept the response.
449    pub auth_challenge_response: Option<AuthChallengeResponse>,
450    /// The OpenAI configs to use to help drive the chrome browser. This does nothing without the 'openai' flag.
451    pub openai_config: Option<Box<GPTConfigs>>,
452    /// The Gemini configs to use to help drive the chrome browser. This does nothing without the 'gemini' flag.
453    pub gemini_config: Option<Box<GeminiConfigs>>,
454    /// Remote multimodal automation config (vision + LLM-driven steps).
455    /// Requires the `agent` feature for full functionality, uses stub type otherwise.
456    pub remote_multimodal: Option<Box<crate::features::automation::RemoteMultimodalConfigs>>,
457    /// Use a shared queue strategy when crawling. This can scale workloads evenly that do not need priority.
458    pub shared_queue: bool,
459    /// Return the page links in the subscription channels. This does nothing without the flag `sync` enabled.
460    pub return_page_links: bool,
461    /// Retry count to attempt to swap proxies etc.
462    pub retry: u8,
463    /// Custom antibot detection patterns. When set, these are matched in addition
464    /// to the built-in patterns. Any match triggers `AntiBotTech::Custom`.
465    pub custom_antibot: Option<CustomAntibotPatterns>,
466    /// Skip spawning a control thread that can pause, start, and shutdown the crawl.
467    pub no_control_thread: bool,
468    /// The blacklist urls.
469    blacklist: AllowListSet,
470    /// The whitelist urls.
471    whitelist: AllowListSet,
472    /// Crawl budget for the paths. This helps prevent crawling extra pages and limiting the amount.
473    pub(crate) inner_budget:
474        Option<hashbrown::HashMap<case_insensitive_string::CaseInsensitiveString, u32>>,
475    /// Expect only to handle HTML to save on resources. This mainly only blocks the crawling and returning of resources from the server.
476    pub only_html: bool,
477    /// The concurrency limits to apply.
478    pub concurrency_limit: Option<usize>,
479    /// Normalize the html de-deplucating the content.
480    pub normalize: bool,
481    /// Share the state of the crawl requires the 'disk' feature flag.
482    pub shared: bool,
483    /// Modify the headers to act like a real-browser
484    pub modify_headers: bool,
485    /// Modify the HTTP client headers only to act like a real-browser
486    pub modify_http_client_headers: bool,
487    /// Cache the page following HTTP caching rules.
488    #[cfg(any(
489        feature = "cache_request",
490        feature = "chrome",
491        feature = "chrome_remote_cache"
492    ))]
493    pub cache: bool,
494    /// Skip browser rendering entirely if cached response exists.
495    /// When enabled, returns cached HTML directly without launching Chrome.
496    #[cfg(any(
497        feature = "cache_request",
498        feature = "chrome",
499        feature = "chrome_remote_cache"
500    ))]
501    pub cache_skip_browser: bool,
502    /// Namespace mixed into every cache key so logically distinct variants
503    /// (country, proxy pool, tenant, A/B bucket, device profile, …) never
504    /// collide on the same cached bytes. Free-form — spider treats it as an
505    /// opaque partition string. `None` uses the default (empty) namespace.
506    /// Always present (zero cost when unset); its effect is gated by whichever
507    /// cache feature is active.
508    pub cache_namespace: Option<Box<String>>,
509    /// Read-only mode for the remote Chrome cache. When enabled the local
510    /// cache + per-session cache still serve hits, but no responses are ever
511    /// uploaded to the remote `hybrid_cache_server` (neither via
512    /// `spider_remote_cache` enqueue nor via chromey's CDP listener). Intended
513    /// for deployments where an upstream proxy is the sole writer and spider
514    /// should only consume the cache. Default `false` preserves the existing
515    /// write-through behavior.
516    #[cfg(feature = "chrome_remote_cache")]
517    pub chrome_remote_cache_read_only: bool,
518    /// Publish fresh HTTP (skip_browser) responses to the shared remote
519    /// cache worker. When enabled, successful HTTP fetches made through
520    /// the skip_browser path are enqueued into `spider_remote_cache` so
521    /// they become available for later cache lookups. Independent of
522    /// chrome-path dumps — you can have chrome dumps off (via
523    /// `chrome_remote_cache_read_only = true`) while still publishing
524    /// from the HTTP path. Default `false` is a no-op.
525    #[cfg(feature = "chrome_remote_cache")]
526    pub remote_cache_skip_browser: bool,
527    /// Restrict chrome remote-cache dumps to the **main (initial)
528    /// document only**. When enabled, `cache_chrome_response` continues
529    /// to publish the navigated document body for each request
530    /// (whatever MIME type — HTML, JSON, XML, plain text, …), but the
531    /// per-response CDP listener (`spawn_cache_listener`) runs in
532    /// `dump_readonly` mode — populating the local + per-session cache
533    /// for sub-resources (CSS/JS/manifests) without uploading them to
534    /// the remote server. Orthogonal to `chrome_remote_cache_read_only`:
535    /// read-only suppresses *all* chrome dumps, this knob only
536    /// suppresses the asset/sub-resource path. Default `false`
537    /// preserves the existing dump-everything behavior.
538    #[cfg(feature = "chrome_remote_cache")]
539    pub chrome_remote_cache_main_doc_only: bool,
540    #[cfg(feature = "chrome")]
541    /// Enable or disable service workers. Enabled by default.
542    pub service_worker_enabled: bool,
543    #[cfg(feature = "chrome")]
544    /// Overrides default host system timezone with the specified one.
545    #[cfg(feature = "chrome")]
546    pub timezone_id: Option<Box<String>>,
547    /// Overrides default host system locale with the specified one.
548    #[cfg(feature = "chrome")]
549    pub locale: Option<Box<String>>,
550    /// Set a custom script to eval on each new document.
551    #[cfg(feature = "chrome")]
552    pub evaluate_on_new_document: Option<Box<String>>,
553    #[cfg(feature = "chrome")]
554    /// Dismiss dialogs.
555    pub dismiss_dialogs: Option<bool>,
556    #[cfg(feature = "chrome")]
557    /// Wait for options for the page.
558    pub wait_for: Option<WaitFor>,
559    #[cfg(feature = "chrome")]
560    /// Take a screenshot of the page.
561    pub screenshot: Option<ScreenShotConfig>,
562    #[cfg(feature = "chrome")]
563    /// Track the events made via chrome.
564    pub track_events: Option<ChromeEventTracker>,
565    #[cfg(feature = "chrome")]
566    /// Setup fingerprint ID on each document. This does nothing without the flag `chrome` enabled.
567    pub fingerprint: Fingerprint,
568    #[cfg(feature = "chrome")]
569    /// The chrome connection url. Useful for targeting different headless instances. Defaults to using the env CHROME_URL.
570    pub chrome_connection_url: Option<String>,
571    #[cfg(feature = "chrome")]
572    /// Multiple remote Chrome connection URLs for failover. When a connection
573    /// fails after retries, the next URL is tried automatically. Requires the
574    /// `chrome` feature. When set, takes priority over `chrome_connection_url`.
575    pub chrome_connection_urls: Option<Vec<String>>,
576    #[cfg(feature = "chrome")]
577    #[cfg_attr(feature = "serde", serde(skip))]
578    /// Lazy, lock-free chrome failover instance reused across every
579    /// `setup_browser_configuration` call. Built on first use, reset by
580    /// `with_chrome_connections`. Internal cache; not part of public API.
581    pub(crate) chrome_failover: crate::features::chrome::LazyChromeFailover,
582    #[cfg(feature = "chrome")]
583    /// First-byte watchdog for Chrome navigations. When set, fires if no
584    /// `Network.dataReceived` (or `Network.responseReceived`) event arrives
585    /// within this duration after the listener attaches. On fire the page
586    /// is force-stopped and (when a `browser_dead` flag is plumbed through
587    /// `ChromeFetchParams`) it is flipped so the website-level retry loop
588    /// can rotate the backend. `None` (default) disables the watchdog and
589    /// the legacy chunk-idle timeout (`SPIDER_CHUNK_IDLE_TIMEOUT_SECS`,
590    /// default 30s) is the only stall guard.
591    pub chrome_first_byte_timeout: Option<Duration>,
592    #[cfg(feature = "chrome")]
593    /// Per-fetch jitter window applied on top of `chrome_first_byte_timeout`.
594    /// When `Some(j)`, each fetch picks `actual_timeout = base + rand(0..j)`
595    /// so concurrent fetches don't all expire at exactly the same moment
596    /// (avoids thundering-herd backend rotation when a backend goes dark).
597    /// `None` (default) means no jitter — every fetch uses the configured
598    /// base timeout exactly. Ignored when `chrome_first_byte_timeout` is
599    /// `None` (no watchdog to jitter).
600    pub chrome_first_byte_timeout_jitter: Option<Duration>,
601    /// First-byte watchdog for HTTP fetches. When `Some(d)`, each
602    /// `client.get(url).send().await` is wrapped in
603    /// `tokio::time::timeout(base + rand(0..jitter))`. On timeout the
604    /// in-flight connect / TLS / header future is dropped (cancels the
605    /// request) and a synthetic `524 GATEWAY_TIMEOUT` response is built
606    /// so the existing retry path rotates to the next proxy. Covers the
607    /// gap between `connect_timeout` (TCP/TLS handshake) and
608    /// `chunk_idle_timeout` (per-chunk idle while streaming) where a
609    /// proxy can accept the connection but never produce headers.
610    /// `None` (default) disables the watchdog — `request_timeout` and
611    /// `chunk_idle_timeout` remain the only stall guards.
612    pub http_first_byte_timeout: Option<Duration>,
613    /// Per-fetch jitter window applied on top of
614    /// `http_first_byte_timeout`. Same semantics as
615    /// `chrome_first_byte_timeout_jitter`. `None` (default) means no
616    /// jitter; ignored when the base is `None`.
617    pub http_first_byte_timeout_jitter: Option<Duration>,
618    /// Scripts to execute for individual pages, the full path of the url is required for an exact match. This is useful for running one off JS on pages like performing custom login actions.
619    #[cfg(feature = "chrome")]
620    pub execution_scripts: Option<ExecutionScripts>,
621    /// Web automation scripts to run up to a duration of 60 seconds.
622    #[cfg(feature = "chrome")]
623    pub automation_scripts: Option<AutomationScripts>,
624    /// Setup network interception for request. This does nothing without the flag `chrome_intercept` enabled.
625    #[cfg(feature = "chrome")]
626    pub chrome_intercept: RequestInterceptConfiguration,
627    /// The referer to use.
628    pub referer: Option<String>,
629    /// Determine the max bytes per page.
630    pub max_page_bytes: Option<f64>,
631    /// Determine the max bytes per browser context.
632    pub max_bytes_allowed: Option<u64>,
633    #[cfg(feature = "chrome")]
634    /// Disables log domain, prevents further log entries from being reported to the client. This does nothing without the flag `chrome` enabled.
635    pub disable_log: bool,
636    #[cfg(feature = "chrome")]
637    /// Automatic locale and timezone handling via third party. This does nothing without the flag `chrome` enabled.
638    pub auto_geolocation: bool,
639    /// The cache policy to use.
640    pub cache_policy: Option<BasicCachePolicy>,
641    #[cfg(feature = "chrome")]
642    /// Enables bypassing CSP. This does nothing without the flag `chrome` enabled.
643    pub bypass_csp: bool,
644    #[cfg(feature = "chrome")]
645    /// Disables JavaScript execution on the page. This does nothing without the flag `chrome` enabled.
646    pub disable_javascript: bool,
647    /// Bind the connections only on the network interface.
648    pub network_interface: Option<String>,
649    /// Bind to a local IP Address.
650    pub local_address: Option<IpAddr>,
651    /// The default http connect timeout
652    pub default_http_connect_timeout: Option<Duration>,
653    /// The default http read timeout
654    pub default_http_read_timeout: Option<Duration>,
655    #[cfg(feature = "webdriver")]
656    /// WebDriver configuration for browser automation. This does nothing without the `webdriver` flag enabled.
657    pub webdriver_config: Option<Box<WebDriverConfig>>,
658    #[cfg(feature = "search")]
659    /// Search provider configuration for web search integration. This does nothing without the `search` flag enabled.
660    pub search_config: Option<Box<SearchConfig>>,
661    #[cfg(feature = "spider_cloud")]
662    /// Spider Cloud config. See <https://spider.cloud>.
663    pub spider_cloud: Option<Box<SpiderCloudConfig>>,
664    #[cfg(all(feature = "spider_cloud", feature = "chrome"))]
665    /// Spider Browser Cloud config for remote CDP via `wss://browser.spider.cloud`.
666    pub spider_browser: Option<Box<SpiderBrowserConfig>>,
667    #[cfg(feature = "hedge")]
668    /// Hedged request configuration for work-stealing on slow requests.
669    /// When enabled, fires a duplicate request on a different proxy after a delay.
670    pub hedge: Option<crate::utils::hedge::HedgeConfig>,
671    #[cfg(feature = "auto_throttle")]
672    /// Latency-based auto-throttle configuration. When enabled, dynamically
673    /// adjusts per-domain crawl delay based on measured server response time.
674    pub auto_throttle: Option<crate::utils::auto_throttle::AutoThrottleConfig>,
675    #[cfg(feature = "etag_cache")]
676    /// Enable ETag / conditional request caching. When true, stores ETag and
677    /// Last-Modified headers from responses and sends If-None-Match /
678    /// If-Modified-Since on subsequent requests to the same URL, allowing
679    /// servers to respond with lightweight 304 Not Modified.
680    pub etag_cache: bool,
681    #[cfg(feature = "warc")]
682    /// WARC output configuration. When set, the crawl writes a WARC 1.1 file
683    /// containing all fetched pages as `response` records.
684    pub warc: Option<crate::utils::warc::WarcConfig>,
685    #[cfg(feature = "parallel_backends")]
686    /// Parallel crawl backend configuration. Race CDP / Servo backends alongside
687    /// the primary crawl path. Requires the `parallel_backends` feature.
688    pub parallel_backends: Option<ParallelBackendsConfig>,
689    #[cfg(feature = "decentralized")]
690    /// Per-`Website` remote Spider worker URLs used for crawl requests. When
691    /// `None`, falls back to the process-wide `SPIDER_WORKER` env var (or its
692    /// default), preserving pre-2.51.x behavior. When `Some`, overrides the
693    /// global pool for this `Website` only.
694    pub worker_connection_urls: Option<Vec<String>>,
695    #[cfg(feature = "decentralized")]
696    /// Per-`Website` remote Spider worker URLs used for scrape requests. When
697    /// `None`, falls back to the process-wide `SPIDER_WORKER_SCRAPER` env var
698    /// (or its default), preserving pre-2.51.x behavior. When `Some`,
699    /// overrides the global pool for this `Website` only.
700    pub scraper_worker_connection_urls: Option<Vec<String>>,
701}
702
703#[derive(Default, Debug, Clone, PartialEq, Eq)]
704/// Serializable HTTP headers.
705pub struct SerializableHeaderMap(pub HeaderMap);
706
707impl SerializableHeaderMap {
708    /// Innter HeaderMap.
709    pub fn inner(&self) -> &HeaderMap {
710        &self.0
711    }
712    /// Returns true if the map contains a value for the specified key.
713    pub fn contains_key<K>(&self, key: K) -> bool
714    where
715        K: AsHeaderName,
716    {
717        self.0.contains_key(key)
718    }
719    /// Inserts a key-value pair into the map.
720    pub fn insert<K>(
721        &mut self,
722        key: K,
723        val: reqwest::header::HeaderValue,
724    ) -> Option<reqwest::header::HeaderValue>
725    where
726        K: IntoHeaderName,
727    {
728        self.0.insert(key, val)
729    }
730    /// Extend a `HeaderMap` with the contents of another `HeaderMap`.
731    pub fn extend<I>(&mut self, iter: I)
732    where
733        I: IntoIterator<Item = (Option<HeaderName>, HeaderValue)>,
734    {
735        self.0.extend(iter);
736    }
737}
738
739/// Get a cloned copy of the `Referer` header as a `String` (if it exists and is valid UTF-8).
740pub fn get_referer(header_map: &Option<Box<SerializableHeaderMap>>) -> Option<String> {
741    match header_map {
742        Some(header_map) => {
743            header_map
744                .0
745                .get(crate::client::header::REFERER) // Retrieves the "Referer" HeaderValue if it exists
746                .and_then(|value| value.to_str().ok()) // &str from HeaderValue
747                .map(String::from) // Convert &str to String (owned)
748        }
749        _ => None,
750    }
751}
752
753impl From<HeaderMap> for SerializableHeaderMap {
754    fn from(header_map: HeaderMap) -> Self {
755        SerializableHeaderMap(header_map)
756    }
757}
758
#[cfg(feature = "serde")]
impl serde::Serialize for SerializableHeaderMap {
    /// Serialize the headers as a map of header name to header value strings.
    ///
    /// Entries are gathered into a `BTreeMap`, so each name appears once and
    /// a later entry with the same name replaces an earlier one.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        let mut entries = std::collections::BTreeMap::<String, String>::new();

        for (name, value) in self.0.iter() {
            // A value that `to_str` rejects is written out as an empty string.
            let text = value.to_str().unwrap_or("");
            entries.insert(name.to_string(), text.to_string());
        }

        serde::Serialize::serialize(&entries, serializer)
    }
}
773
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for SerializableHeaderMap {
    /// Deserialize from a map of header name to header value strings.
    ///
    /// Fails with a custom serde error when a key is not a valid header name
    /// or a value is not a valid header value.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        let raw = <std::collections::BTreeMap<String, String> as serde::Deserialize>::deserialize(
            deserializer,
        )?;
        let mut parsed = HeaderMap::with_capacity(raw.len());

        for (name, text) in raw {
            let name = HeaderName::from_bytes(name.as_bytes()).map_err(serde::de::Error::custom)?;
            let text = HeaderValue::from_str(&text).map_err(serde::de::Error::custom)?;
            // `insert` (not `append`): a repeated header name keeps only the last value.
            parsed.insert(name, text);
        }

        Ok(Self(parsed))
    }
}
792
#[cfg(feature = "serde")]
impl serde::Serialize for AllowListSet {
    /// Serialize the allow list as a sequence of strings.
    ///
    /// Without the `regex` feature the inner list is serialized directly;
    /// with it, the pattern strings of the compiled set are serialized.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        #[cfg(not(feature = "regex"))]
        {
            serde::Serialize::serialize(&self.0, serializer)
        }

        #[cfg(feature = "regex")]
        {
            let patterns: Vec<&String> = self.0.patterns().iter().collect();
            serde::Serialize::serialize(&patterns, serializer)
        }
    }
}
814
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for AllowListSet {
    /// Deserialize the allow list from a sequence of strings.
    ///
    /// With the `regex` feature the strings are compiled into a `RegexSet`;
    /// a pattern that fails to compile becomes a custom serde error.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        #[cfg(not(feature = "regex"))]
        {
            <Vec<CompactString> as serde::Deserialize>::deserialize(deserializer).map(AllowListSet)
        }

        #[cfg(feature = "regex")]
        {
            let patterns = <Vec<String> as serde::Deserialize>::deserialize(deserializer)?;

            regex::RegexSet::new(&patterns)
                .map(|compiled| AllowListSet(Box::new(compiled)))
                .map_err(serde::de::Error::custom)
        }
    }
}
835
/// Pick a spoofed user agent via `ua_generator`.
///
/// `chrome == true` asks for a Chrome user agent (`spoof_chrome_ua`);
/// otherwise any spoofed user agent is returned (`spoof_ua`).
#[cfg(feature = "ua_generator")]
pub fn get_ua(chrome: bool) -> &'static str {
    if !chrome {
        return ua_generator::ua::spoof_ua();
    }

    ua_generator::ua::spoof_chrome_ua()
}
845
846/// Get the user agent via cargo package + version.
847#[cfg(not(feature = "ua_generator"))]
848pub fn get_ua(_chrome: bool) -> &'static str {
849    use std::env;
850
851    lazy_static! {
852        static ref AGENT: &'static str =
853            concat!(env!("CARGO_PKG_NAME"), '/', env!("CARGO_PKG_VERSION"));
854    };
855
856    AGENT.as_ref()
857}
858
859impl Configuration {
860    /// Represents crawl configuration for a website.
861    #[cfg(not(feature = "chrome"))]
862    pub fn new() -> Self {
863        Self {
864            delay: 0,
865            depth: 25,
866            redirect_limit: 7,
867            request_timeout: Some(Duration::from_secs(120)),
868            only_html: true,
869            modify_headers: true,
870            ..Default::default()
871        }
872    }
873
874    /// Represents crawl configuration for a website.
875    #[cfg(feature = "chrome")]
876    pub fn new() -> Self {
877        Self {
878            delay: 0,
879            depth: 25,
880            redirect_limit: 7,
881            request_timeout: Some(Duration::from_secs(120)),
882            chrome_intercept: RequestInterceptConfiguration::new(cfg!(
883                feature = "chrome_intercept"
884            )),
885            user_agent: Some(Box::new(get_ua(true).into())),
886            only_html: true,
887            cache: true,
888            modify_headers: true,
889            service_worker_enabled: true,
890            fingerprint: Fingerprint::Basic,
891            auto_geolocation: false,
892            ..Default::default()
893        }
894    }
895
896    /// Build a `RemoteMultimodalEngine` from `RemoteMultimodalConfigs`.
897    /// Requires the `agent` feature.
898    #[cfg(feature = "agent")]
899    pub fn build_remote_multimodal_engine(
900        &self,
901    ) -> Option<crate::features::automation::RemoteMultimodalEngine> {
902        let cfgs = self.remote_multimodal.as_ref()?;
903        let sem = cfgs
904            .concurrency_limit
905            .filter(|&n| n > 0)
906            .map(|n| std::sync::Arc::new(tokio::sync::Semaphore::new(n)));
907
908        #[allow(unused_mut)]
909        let mut engine = crate::features::automation::RemoteMultimodalEngine::new(
910            cfgs.api_url.clone(),
911            cfgs.model_name.clone(),
912            cfgs.system_prompt.clone(),
913        )
914        .with_api_key(cfgs.api_key.as_deref())
915        .with_system_prompt_extra(cfgs.system_prompt_extra.as_deref())
916        .with_user_message_extra(cfgs.user_message_extra.as_deref())
917        .with_remote_multimodal_config(cfgs.cfg.clone())
918        .with_prompt_url_gate(cfgs.prompt_url_gate.clone())
919        .with_vision_model(cfgs.vision_model.clone())
920        .with_text_model(cfgs.text_model.clone())
921        .with_vision_route_mode(cfgs.vision_route_mode)
922        .with_chrome_ai(cfgs.use_chrome_ai)
923        .with_semaphore(sem)
924        .to_owned();
925
926        #[cfg(feature = "agent_skills")]
927        if let Some(ref registry) = cfgs.skill_registry {
928            engine.with_skill_registry(Some(registry.clone()));
929        }
930
931        // Build per-round complexity router from model pool (3+ models required)
932        let model_pool = cfgs.model_pool.clone();
933        if model_pool.len() >= 3 {
934            let model_names: Vec<&str> =
935                model_pool.iter().map(|ep| ep.model_name.as_str()).collect();
936            let policy = crate::features::automation::auto_policy(&model_names);
937            engine.model_router = Some(crate::features::automation::ModelRouter::with_policy(
938                policy,
939            ));
940        }
941        engine.model_pool = model_pool;
942
943        Some(engine)
944    }
945
946    /// Determine if the agent should be set to a Chrome Agent.
947    #[cfg(not(feature = "chrome"))]
948    pub(crate) fn only_chrome_agent(&self) -> bool {
949        false
950    }
951
952    /// Determine if the agent should be set to a Chrome Agent.
953    #[cfg(feature = "chrome")]
954    pub(crate) fn only_chrome_agent(&self) -> bool {
955        self.chrome_connection_url.is_some()
956            || self.wait_for.is_some()
957            || self.chrome_intercept.enabled
958            || self.stealth_mode.stealth()
959            || self.fingerprint.valid()
960    }
961
962    #[cfg(feature = "regex")]
963    /// Compile the regex for the blacklist.
964    pub fn get_blacklist(&self) -> Box<regex::RegexSet> {
965        match &self.blacklist_url {
966            Some(blacklist) => match regex::RegexSet::new(&**blacklist) {
967                Ok(s) => Box::new(s),
968                _ => Default::default(),
969            },
970            _ => Default::default(),
971        }
972    }
973
974    #[cfg(not(feature = "regex"))]
975    /// Handle the blacklist options.
976    pub fn get_blacklist(&self) -> AllowList {
977        match &self.blacklist_url {
978            Some(blacklist) => blacklist.to_owned(),
979            _ => Default::default(),
980        }
981    }
982
983    /// Set the blacklist
984    pub(crate) fn set_blacklist(&mut self) {
985        self.blacklist = AllowListSet(self.get_blacklist());
986    }
987
988    /// Set the whitelist
989    pub fn set_whitelist(&mut self) {
990        self.whitelist = AllowListSet(self.get_whitelist());
991    }
992
993    /// Configure the allow list.
994    pub fn configure_allowlist(&mut self) {
995        self.set_whitelist();
996        self.set_blacklist();
997    }
998
999    /// Get the blacklist compiled.
1000    pub fn get_blacklist_compiled(&self) -> &AllowList {
1001        &self.blacklist.0
1002    }
1003
1004    /// Setup the budget for crawling.
1005    pub fn configure_budget(&mut self) {
1006        self.inner_budget.clone_from(&self.budget);
1007    }
1008
1009    /// Get the whitelist compiled.
1010    pub fn get_whitelist_compiled(&self) -> &AllowList {
1011        &self.whitelist.0
1012    }
1013
1014    #[cfg(feature = "regex")]
1015    /// Compile the regex for the whitelist.
1016    pub fn get_whitelist(&self) -> Box<regex::RegexSet> {
1017        match &self.whitelist_url {
1018            Some(whitelist) => match regex::RegexSet::new(&**whitelist) {
1019                Ok(s) => Box::new(s),
1020                _ => Default::default(),
1021            },
1022            _ => Default::default(),
1023        }
1024    }
1025
1026    #[cfg(not(feature = "regex"))]
1027    /// Handle the whitelist options.
1028    pub fn get_whitelist(&self) -> AllowList {
1029        match &self.whitelist_url {
1030            Some(whitelist) => whitelist.to_owned(),
1031            _ => Default::default(),
1032        }
1033    }
1034
1035    #[cfg(feature = "sitemap")]
1036    /// Add sitemap paths to the whitelist and track what was added.
1037    pub fn add_sitemap_to_whitelist(&mut self) -> SitemapWhitelistChanges {
1038        let mut changes = SitemapWhitelistChanges::default();
1039
1040        if self.ignore_sitemap && self.whitelist_url.is_none() {
1041            return changes;
1042        }
1043
1044        if let Some(list) = self.whitelist_url.as_mut() {
1045            if list.is_empty() {
1046                return changes;
1047            }
1048
1049            let default = CompactString::from("sitemap.xml");
1050
1051            if !list.contains(&default) {
1052                list.push(default);
1053                changes.added_default = true;
1054            }
1055
1056            if let Some(custom) = &self.sitemap_url {
1057                if !list.contains(custom) {
1058                    // Clone the inner CompactString directly; `*custom.clone()`
1059                    // would allocate a new Box only to deref-move out of it.
1060                    list.push((**custom).clone());
1061                    changes.added_custom = true;
1062                }
1063            }
1064        }
1065
1066        changes
1067    }
1068
1069    #[cfg(feature = "sitemap")]
1070    /// Revert any changes made to the whitelist by `add_sitemap_to_whitelist`.
1071    pub fn remove_sitemap_from_whitelist(&mut self, changes: SitemapWhitelistChanges) {
1072        if let Some(list) = self.whitelist_url.as_mut() {
1073            if changes.added_default {
1074                let default = CompactString::from("sitemap.xml");
1075                if let Some(pos) = list.iter().position(|s| s == default) {
1076                    list.remove(pos);
1077                }
1078            }
1079            if changes.added_custom {
1080                if let Some(custom) = &self.sitemap_url {
1081                    if let Some(pos) = list.iter().position(|s| *s == **custom) {
1082                        list.remove(pos);
1083                    }
1084                }
1085            }
1086            if list.is_empty() {
1087                self.whitelist_url = None;
1088            }
1089        }
1090    }
1091
1092    /// Respect robots.txt file.
1093    pub fn with_respect_robots_txt(&mut self, respect_robots_txt: bool) -> &mut Self {
1094        self.respect_robots_txt = respect_robots_txt;
1095        self
1096    }
1097
1098    /// Include subdomains detection.
1099    pub fn with_subdomains(&mut self, subdomains: bool) -> &mut Self {
1100        self.subdomains = subdomains;
1101        self
1102    }
1103
1104    /// Bypass CSP protection detection. This does nothing without the feat flag `chrome` enabled.
1105    #[cfg(feature = "chrome")]
1106    pub fn with_csp_bypass(&mut self, enabled: bool) -> &mut Self {
1107        self.bypass_csp = enabled;
1108        self
1109    }
1110
1111    /// Bypass CSP protection detection. This does nothing without the feat flag `chrome` enabled.
1112    #[cfg(not(feature = "chrome"))]
1113    pub fn with_csp_bypass(&mut self, _enabled: bool) -> &mut Self {
1114        self
1115    }
1116
1117    /// Disable JavaScript execution on the page. This does nothing without the feat flag `chrome` enabled.
1118    #[cfg(feature = "chrome")]
1119    pub fn with_disable_javascript(&mut self, disabled: bool) -> &mut Self {
1120        self.disable_javascript = disabled;
1121        self
1122    }
1123
1124    /// Disable JavaScript execution on the page. This does nothing without the feat flag `chrome` enabled.
1125    #[cfg(not(feature = "chrome"))]
1126    pub fn with_disable_javascript(&mut self, _disabled: bool) -> &mut Self {
1127        self
1128    }
1129
1130    /// Bind the connections only on the network interface.
1131    pub fn with_network_interface(&mut self, network_interface: Option<String>) -> &mut Self {
1132        self.network_interface = network_interface;
1133        self
1134    }
1135
1136    /// Bind to a local IP Address.
1137    pub fn with_local_address(&mut self, local_address: Option<IpAddr>) -> &mut Self {
1138        self.local_address = local_address;
1139        self
1140    }
1141
1142    /// Include tld detection.
1143    pub fn with_tld(&mut self, tld: bool) -> &mut Self {
1144        self.tld = tld;
1145        self
1146    }
1147
1148    /// The max duration for the crawl. This is useful when websites use a robots.txt with long durations and throttle the timeout removing the full concurrency.
1149    pub fn with_crawl_timeout(&mut self, crawl_timeout: Option<Duration>) -> &mut Self {
1150        self.crawl_timeout = crawl_timeout;
1151        self
1152    }
1153
1154    /// Delay between request as ms.
1155    pub fn with_delay(&mut self, delay: u64) -> &mut Self {
1156        self.delay = delay;
1157        self
1158    }
1159
1160    /// Only use HTTP/2.
1161    pub fn with_http2_prior_knowledge(&mut self, http2_prior_knowledge: bool) -> &mut Self {
1162        self.http2_prior_knowledge = http2_prior_knowledge;
1163        self
1164    }
1165
1166    /// Max time to wait for request. By default request times out in 15s. Set to None to disable.
1167    pub fn with_request_timeout(&mut self, request_timeout: Option<Duration>) -> &mut Self {
1168        match request_timeout {
1169            Some(timeout) => self.request_timeout = Some(timeout),
1170            _ => self.request_timeout = None,
1171        };
1172
1173        self
1174    }
1175
1176    #[cfg(feature = "sitemap")]
1177    /// Set the sitemap url. This does nothing without the `sitemap` feature flag.
1178    pub fn with_sitemap(&mut self, sitemap_url: Option<&str>) -> &mut Self {
1179        match sitemap_url {
1180            Some(sitemap_url) => {
1181                self.sitemap_url = Some(CompactString::new(sitemap_url.to_string()).into())
1182            }
1183            _ => self.sitemap_url = None,
1184        };
1185        self
1186    }
1187
1188    #[cfg(not(feature = "sitemap"))]
1189    /// Set the sitemap url. This does nothing without the `sitemap` feature flag.
1190    pub fn with_sitemap(&mut self, _sitemap_url: Option<&str>) -> &mut Self {
1191        self
1192    }
1193
1194    #[cfg(feature = "sitemap")]
1195    /// Ignore the sitemap when crawling. This method does nothing if the `sitemap` is not enabled.
1196    pub fn with_ignore_sitemap(&mut self, ignore_sitemap: bool) -> &mut Self {
1197        self.ignore_sitemap = ignore_sitemap;
1198        self
1199    }
1200
1201    #[cfg(not(feature = "sitemap"))]
1202    /// Ignore the sitemap when crawling. This method does nothing if the `sitemap` is not enabled.
1203    pub fn with_ignore_sitemap(&mut self, _ignore_sitemap: bool) -> &mut Self {
1204        self
1205    }
1206
1207    /// Add user agent to request.
1208    pub fn with_user_agent(&mut self, user_agent: Option<&str>) -> &mut Self {
1209        match user_agent {
1210            Some(agent) => self.user_agent = Some(CompactString::new(agent).into()),
1211            _ => self.user_agent = None,
1212        };
1213        self
1214    }
1215
1216    /// Preserve the HOST header.
1217    pub fn with_preserve_host_header(&mut self, preserve: bool) -> &mut Self {
1218        self.preserve_host_header = preserve;
1219        self
1220    }
1221
1222    /// Use a remote multimodal model to drive browser automation.
1223    /// Requires the `agent` feature.
1224    #[cfg(feature = "agent")]
1225    pub fn with_remote_multimodal(
1226        &mut self,
1227        remote_multimodal: Option<crate::features::automation::RemoteMultimodalConfigs>,
1228    ) -> &mut Self {
1229        self.remote_multimodal = remote_multimodal.map(Box::new);
1230        self
1231    }
1232
1233    /// Use a remote multimodal model to drive browser automation.
1234    /// When the `agent` feature is not enabled, this uses a stub type.
1235    #[cfg(not(feature = "agent"))]
1236    pub fn with_remote_multimodal(
1237        &mut self,
1238        remote_multimodal: Option<crate::features::automation::RemoteMultimodalConfigs>,
1239    ) -> &mut Self {
1240        self.remote_multimodal = remote_multimodal.map(Box::new);
1241        self
1242    }
1243
1244    #[cfg(not(feature = "openai"))]
1245    /// The OpenAI configs to use to drive the browser. This method does nothing if the `openai` is not enabled.
1246    pub fn with_openai(&mut self, _openai_config: Option<GPTConfigs>) -> &mut Self {
1247        self
1248    }
1249
1250    /// The OpenAI configs to use to drive the browser. This method does nothing if the `openai` is not enabled.
1251    #[cfg(feature = "openai")]
1252    pub fn with_openai(&mut self, openai_config: Option<GPTConfigs>) -> &mut Self {
1253        match openai_config {
1254            Some(openai_config) => self.openai_config = Some(Box::new(openai_config)),
1255            _ => self.openai_config = None,
1256        };
1257        self
1258    }
1259
1260    #[cfg(not(feature = "gemini"))]
1261    /// The Gemini configs to use to drive the browser. This method does nothing if the `gemini` is not enabled.
1262    pub fn with_gemini(&mut self, _gemini_config: Option<GeminiConfigs>) -> &mut Self {
1263        self
1264    }
1265
1266    /// The Gemini configs to use to drive the browser. This method does nothing if the `gemini` is not enabled.
1267    #[cfg(feature = "gemini")]
1268    pub fn with_gemini(&mut self, gemini_config: Option<GeminiConfigs>) -> &mut Self {
1269        match gemini_config {
1270            Some(gemini_config) => self.gemini_config = Some(Box::new(gemini_config)),
1271            _ => self.gemini_config = None,
1272        };
1273        self
1274    }
1275
1276    #[cfg(feature = "cookies")]
1277    /// Cookie string to use in request. This does nothing without the `cookies` flag enabled.
1278    pub fn with_cookies(&mut self, cookie_str: &str) -> &mut Self {
1279        self.cookie_str = cookie_str.into();
1280        self
1281    }
1282
1283    #[cfg(not(feature = "cookies"))]
1284    /// Cookie string to use in request. This does nothing without the `cookies` flag enabled.
1285    pub fn with_cookies(&mut self, _cookie_str: &str) -> &mut Self {
1286        self
1287    }
1288
1289    #[cfg(feature = "chrome")]
1290    /// Set custom fingerprint ID for request. This does nothing without the `chrome` flag enabled.
1291    pub fn with_fingerprint(&mut self, fingerprint: bool) -> &mut Self {
1292        if fingerprint {
1293            self.fingerprint = Fingerprint::Basic;
1294        } else {
1295            self.fingerprint = Fingerprint::None;
1296        }
1297        self
1298    }
1299
1300    #[cfg(feature = "chrome")]
1301    /// Set custom fingerprint ID for request. This does nothing without the `chrome` flag enabled.
1302    pub fn with_fingerprint_advanced(&mut self, fingerprint: Fingerprint) -> &mut Self {
1303        self.fingerprint = fingerprint;
1304        self
1305    }
1306
1307    #[cfg(not(feature = "chrome"))]
1308    /// Set custom fingerprint ID for request. This does nothing without the `chrome` flag enabled.
1309    pub fn with_fingerprint(&mut self, _fingerprint: bool) -> &mut Self {
1310        self
1311    }
1312
1313    /// Use proxies for request.
1314    pub fn with_proxies(&mut self, proxies: Option<Vec<String>>) -> &mut Self {
1315        self.proxies = proxies.map(|p| {
1316            p.iter()
1317                .map(|addr| RequestProxy {
1318                    addr: addr.to_owned(),
1319                    ..Default::default()
1320                })
1321                .collect::<Vec<RequestProxy>>()
1322        });
1323        self
1324    }
1325
1326    /// Use proxies for request with control between chrome and http.
1327    pub fn with_proxies_direct(&mut self, proxies: Option<Vec<RequestProxy>>) -> &mut Self {
1328        self.proxies = proxies;
1329        self
1330    }
1331
1332    /// Set the proxy override list for a specific [`ProxyKind`].
1333    ///
1334    /// Lazily registers a sidecar mapping that a
1335    /// [`crate::proxy_strategy::ProxyStrategy`] can route requests
1336    /// through. Pass `None` for `proxies` to remove a previously-set
1337    /// kind. Setting a kind to `Some(empty_vec)` is allowed and means
1338    /// "route here but with no proxy" — the secondary client built for
1339    /// this kind will be unproxied.
1340    ///
1341    /// Has no effect on the primary [`Configuration::proxies`] list or
1342    /// on requests that route to [`ProxyKind::Default`].
1343    pub fn with_proxies_for_kind(
1344        &mut self,
1345        kind: ProxyKind,
1346        proxies: Option<Vec<RequestProxy>>,
1347    ) -> &mut Self {
1348        match (proxies, self.proxies_by_kind.as_mut()) {
1349            (Some(p), Some(map)) => {
1350                map.insert(kind, p);
1351            }
1352            (Some(p), None) => {
1353                let mut map = hashbrown::HashMap::new();
1354                map.insert(kind, p);
1355                self.proxies_by_kind = Some(map);
1356            }
1357            (None, Some(map)) => {
1358                map.remove(&kind);
1359                if map.is_empty() {
1360                    self.proxies_by_kind = None;
1361                }
1362            }
1363            (None, None) => {}
1364        }
1365        self
1366    }
1367
1368    /// Use a shared semaphore to evenly handle workloads. The default is false.
1369    pub fn with_shared_queue(&mut self, shared_queue: bool) -> &mut Self {
1370        self.shared_queue = shared_queue;
1371        self
1372    }
1373
1374    /// Add blacklist urls to ignore.
1375    pub fn with_blacklist_url<T>(&mut self, blacklist_url: Option<Vec<T>>) -> &mut Self
1376    where
1377        Vec<CompactString>: From<Vec<T>>,
1378    {
1379        match blacklist_url {
1380            Some(p) => self.blacklist_url = Some(p.into()),
1381            _ => self.blacklist_url = None,
1382        };
1383        self
1384    }
1385
1386    /// Add whitelist urls to allow.
1387    pub fn with_whitelist_url<T>(&mut self, whitelist_url: Option<Vec<T>>) -> &mut Self
1388    where
1389        Vec<CompactString>: From<Vec<T>>,
1390    {
1391        match whitelist_url {
1392            Some(p) => self.whitelist_url = Some(p.into()),
1393            _ => self.whitelist_url = None,
1394        };
1395        self
1396    }
1397
1398    /// Return the links found on the page in the channel subscriptions. This method does nothing if the `decentralized` is enabled.
1399    pub fn with_return_page_links(&mut self, return_page_links: bool) -> &mut Self {
1400        self.return_page_links = return_page_links;
1401        self
1402    }
1403
1404    /// Set HTTP headers for request using [reqwest::header::HeaderMap](https://docs.rs/reqwest/latest/reqwest/header/struct.HeaderMap.html).
1405    pub fn with_headers(&mut self, headers: Option<reqwest::header::HeaderMap>) -> &mut Self {
1406        match headers {
1407            Some(m) => self.headers = Some(SerializableHeaderMap::from(m).into()),
1408            _ => self.headers = None,
1409        };
1410        self
1411    }
1412
1413    /// Set the max redirects allowed for request.
1414    ///
1415    /// Calling this method opts in to redirect-cap enforcement on both the HTTP
1416    /// and Chrome paths. Without it, Chrome defers to Chromium's internal
1417    /// ~20-hop cap to preserve prior behavior.
1418    pub fn with_redirect_limit(&mut self, redirect_limit: usize) -> &mut Self {
1419        self.redirect_limit = redirect_limit;
1420        self.redirect_limit_set = true;
1421        self
1422    }
1423
1424    /// Cap the number of main-frame cross-document navigations per Chrome
1425    /// `goto()` call. `None` disables the guard.
1426    ///
1427    /// This is the JS / meta-refresh counterpart to `with_redirect_limit` —
1428    /// the HTTP redirect cap cannot catch loops implemented via
1429    /// `location.href`, `<meta http-equiv="refresh">`, or `Refresh:` headers,
1430    /// because each hop is a fresh document rather than a 3xx redirect.
1431    pub fn with_max_main_frame_navigations(&mut self, cap: Option<u32>) -> &mut Self {
1432        self.max_main_frame_navigations = cap;
1433        self
1434    }
1435
1436    /// Set the redirect policy to use.
1437    pub fn with_redirect_policy(&mut self, policy: RedirectPolicy) -> &mut Self {
1438        self.redirect_policy = policy;
1439        self
1440    }
1441
1442    /// Add a referer (mis-spelling) to the request.
1443    pub fn with_referer(&mut self, referer: Option<String>) -> &mut Self {
1444        self.referer = referer;
1445        self
1446    }
1447
1448    /// Add a referer to the request.
1449    pub fn with_referrer(&mut self, referer: Option<String>) -> &mut Self {
1450        self.referer = referer;
1451        self
1452    }
1453
1454    /// Determine whether to collect all the resources found on pages.
1455    pub fn with_full_resources(&mut self, full_resources: bool) -> &mut Self {
1456        self.full_resources = full_resources;
1457        self
1458    }
1459
1460    /// Determine whether to dismiss dialogs. This method does nothing if the `chrome` is enabled.
1461    #[cfg(feature = "chrome")]
1462    pub fn with_dismiss_dialogs(&mut self, dismiss_dialogs: bool) -> &mut Self {
1463        self.dismiss_dialogs = Some(dismiss_dialogs);
1464        self
1465    }
1466
1467    /// Determine whether to dismiss dialogs. This method does nothing if the `chrome` is enabled.
1468    #[cfg(not(feature = "chrome"))]
1469    pub fn with_dismiss_dialogs(&mut self, _dismiss_dialogs: bool) -> &mut Self {
1470        self
1471    }
1472
1473    /// Set the request emuluation. This method does nothing if the `wreq` flag is not enabled.
1474    #[cfg(feature = "wreq")]
1475    pub fn with_emulation(&mut self, emulation: Option<wreq_util::Emulation>) -> &mut Self {
1476        self.emulation = emulation;
1477        self
1478    }
1479
1480    #[cfg(feature = "cron")]
1481    /// Setup cron jobs to run. This does nothing without the `cron` flag enabled.
1482    pub fn with_cron(&mut self, cron_str: &str, cron_type: CronType) -> &mut Self {
1483        self.cron_str = cron_str.into();
1484        self.cron_type = cron_type;
1485        self
1486    }
1487
1488    #[cfg(not(feature = "cron"))]
1489    /// Setup cron jobs to run. This does nothing without the `cron` flag enabled.
1490    pub fn with_cron(&mut self, _cron_str: &str, _cron_type: CronType) -> &mut Self {
1491        self
1492    }
1493
1494    /// Set a crawl page limit. If the value is 0 there is no limit.
1495    pub fn with_limit(&mut self, limit: u32) -> &mut Self {
1496        self.with_budget(Some(hashbrown::HashMap::from([("*", limit)])));
1497        self
1498    }
1499
1500    /// Set the concurrency limits. If you set the value to None to use the default limits using the system CPU cors * n.
1501    pub fn with_concurrency_limit(&mut self, limit: Option<usize>) -> &mut Self {
1502        self.concurrency_limit = limit;
1503        self
1504    }
1505
1506    #[cfg(feature = "chrome")]
1507    /// Set the authentiation challenge response. This does nothing without the feat flag `chrome` enabled.
1508    pub fn with_auth_challenge_response(
1509        &mut self,
1510        auth_challenge_response: Option<AuthChallengeResponse>,
1511    ) -> &mut Self {
1512        self.auth_challenge_response = auth_challenge_response;
1513        self
1514    }
1515
1516    #[cfg(feature = "chrome")]
1517    /// Set a custom script to evaluate on new document creation. This does nothing without the feat flag `chrome` enabled.
1518    pub fn with_evaluate_on_new_document(
1519        &mut self,
1520        evaluate_on_new_document: Option<Box<String>>,
1521    ) -> &mut Self {
1522        self.evaluate_on_new_document = evaluate_on_new_document;
1523        self
1524    }
1525
1526    #[cfg(not(feature = "chrome"))]
1527    /// Set a custom script to evaluate on new document creation. This does nothing without the feat flag `chrome` enabled.
1528    pub fn with_evaluate_on_new_document(
1529        &mut self,
1530        _evaluate_on_new_document: Option<Box<String>>,
1531    ) -> &mut Self {
1532        self
1533    }
1534
1535    #[cfg(not(feature = "chrome"))]
1536    /// Set the authentiation challenge response. This does nothing without the feat flag `chrome` enabled.
1537    pub fn with_auth_challenge_response(
1538        &mut self,
1539        _auth_challenge_response: Option<AuthChallengeResponse>,
1540    ) -> &mut Self {
1541        self
1542    }
1543
1544    /// Set a crawl depth limit. If the value is 0 there is no limit.
1545    pub fn with_depth(&mut self, depth: usize) -> &mut Self {
1546        self.depth = depth;
1547        self
1548    }
1549
1550    #[cfg(any(feature = "cache_request", feature = "chrome_remote_cache"))]
1551    /// Cache the page following HTTP rules. This method does nothing if the `cache` feature is not enabled.
1552    pub fn with_caching(&mut self, cache: bool) -> &mut Self {
1553        self.cache = cache;
1554        self
1555    }
1556
1557    #[cfg(not(any(feature = "cache_request", feature = "chrome_remote_cache")))]
1558    /// Cache the page following HTTP rules. This method does nothing if the `cache` feature is not enabled.
1559    pub fn with_caching(&mut self, _cache: bool) -> &mut Self {
1560        self
1561    }
1562
1563    #[cfg(any(feature = "cache_request", feature = "chrome_remote_cache"))]
1564    /// Skip browser rendering entirely if cached response exists.
1565    /// When enabled with caching, returns cached HTML directly without launching Chrome.
1566    /// This is useful for performance when you only need the cached content.
1567    pub fn with_cache_skip_browser(&mut self, skip: bool) -> &mut Self {
1568        self.cache_skip_browser = skip;
1569        self
1570    }
1571
1572    #[cfg(not(any(feature = "cache_request", feature = "chrome_remote_cache")))]
1573    /// Skip browser rendering entirely if cached response exists.
1574    /// This method does nothing if the cache features are not enabled.
1575    pub fn with_cache_skip_browser(&mut self, _skip: bool) -> &mut Self {
1576        self
1577    }
1578
1579    /// Partition the cache by an opaque namespace so logically distinct
1580    /// variants of the same URL (country, proxy pool, tenant, A/B bucket,
1581    /// device profile, …) never collide on the same cached bytes.
1582    /// `None` uses the default (empty) namespace. Has no observable effect
1583    /// when no cache feature is active, but the configuration is always
1584    /// settable regardless of feature flags.
1585    pub fn with_cache_namespace<S: Into<String>>(&mut self, namespace: Option<S>) -> &mut Self {
1586        self.cache_namespace = namespace.map(|s| Box::new(s.into()));
1587        self
1588    }
1589
1590    /// Borrowed access to the cache namespace (`None` = default partition).
1591    /// Used by chrome / cache feature paths; the lib build without those
1592    /// flags has no callers, hence `#[allow(dead_code)]`.
1593    #[inline]
1594    #[allow(dead_code)]
1595    pub(crate) fn cache_namespace_str(&self) -> Option<&str> {
1596        self.cache_namespace.as_ref().map(|s| s.as_str())
1597    }
1598
1599    /// Enable read-only mode for the remote Chrome cache. When `true`, the
1600    /// local cache + per-session cache continue to serve hits but no
1601    /// responses are uploaded to the remote `hybrid_cache_server`. Has no
1602    /// observable effect without the `chrome_remote_cache` feature.
1603    #[cfg(feature = "chrome_remote_cache")]
1604    pub fn with_chrome_remote_cache_read_only(&mut self, read_only: bool) -> &mut Self {
1605        self.chrome_remote_cache_read_only = read_only;
1606        self
1607    }
1608
1609    /// Enable read-only mode for the remote Chrome cache. This method does
1610    /// nothing without the `chrome_remote_cache` feature.
1611    #[cfg(not(feature = "chrome_remote_cache"))]
1612    pub fn with_chrome_remote_cache_read_only(&mut self, _read_only: bool) -> &mut Self {
1613        self
1614    }
1615
1616    /// Whether the remote Chrome cache is in read-only mode. Always `false`
1617    /// without the `chrome_remote_cache` feature. Only consumed by the
1618    /// chrome-cache wiring at `Configuration::chrome_fetch_params`, so the
1619    /// default lib build flags this as unused.
1620    #[inline]
1621    #[allow(dead_code)]
1622    pub(crate) fn chrome_remote_cache_read_only_enabled(&self) -> bool {
1623        #[cfg(feature = "chrome_remote_cache")]
1624        {
1625            self.chrome_remote_cache_read_only
1626        }
1627        #[cfg(not(feature = "chrome_remote_cache"))]
1628        {
1629            false
1630        }
1631    }
1632
1633    /// Enable publishing of fresh HTTP (skip_browser) responses to the
1634    /// shared remote cache worker. Updates the process-global dump flag
1635    /// on `spider_remote_cache` — the setter is wait-free (single atomic
1636    /// store) and safe to call from any thread. Also opts into the
1637    /// disk-backed overflow spool so bursty crawls don't drop under
1638    /// memory pressure. Has no observable effect without the
1639    /// `chrome_remote_cache` feature.
1640    #[cfg(feature = "chrome_remote_cache")]
1641    pub fn with_remote_cache_skip_browser(&mut self, enabled: bool) -> &mut Self {
1642        self.remote_cache_skip_browser = enabled;
1643        spider_remote_cache::set_skip_browser_dumps_enabled(enabled);
1644        spider_remote_cache::set_spool_enabled(enabled);
1645        self
1646    }
1647
1648    /// Enable publishing of fresh HTTP (skip_browser) responses to the
1649    /// shared remote cache worker. This method does nothing without the
1650    /// `chrome_remote_cache` feature.
1651    #[cfg(not(feature = "chrome_remote_cache"))]
1652    pub fn with_remote_cache_skip_browser(&mut self, _enabled: bool) -> &mut Self {
1653        self
1654    }
1655
1656    /// Whether HTTP (skip_browser) responses should be enqueued to the
1657    /// shared remote cache worker. Always `false` without the
1658    /// `chrome_remote_cache` feature. Mirrors the process-global state
1659    /// held in `spider_remote_cache::skip_browser_dumps_enabled()` —
1660    /// callers that need the runtime toggle should read the global
1661    /// directly for wait-free access from the hot path.
1662    #[inline]
1663    #[allow(dead_code)]
1664    pub(crate) fn remote_cache_skip_browser_enabled(&self) -> bool {
1665        #[cfg(feature = "chrome_remote_cache")]
1666        {
1667            self.remote_cache_skip_browser
1668        }
1669        #[cfg(not(feature = "chrome_remote_cache"))]
1670        {
1671            false
1672        }
1673    }
1674
1675    /// Restrict chrome remote-cache dumps to the main (initial)
1676    /// document only. When `true`, the per-response listener runs in
1677    /// `dump_readonly` mode (local + per-session cache only) so
1678    /// CSS/JS/manifests are **not** uploaded to the remote
1679    /// `hybrid_cache_server`; the navigated document body — whatever
1680    /// MIME type it is — still goes through `cache_chrome_response`.
1681    /// Has no observable effect without the `chrome_remote_cache`
1682    /// feature.
1683    #[cfg(feature = "chrome_remote_cache")]
1684    pub fn with_chrome_remote_cache_main_doc_only(&mut self, enabled: bool) -> &mut Self {
1685        self.chrome_remote_cache_main_doc_only = enabled;
1686        self
1687    }
1688
1689    /// Restrict chrome remote-cache dumps to the main document only.
1690    /// This method does nothing without the `chrome_remote_cache`
1691    /// feature.
1692    #[cfg(not(feature = "chrome_remote_cache"))]
1693    pub fn with_chrome_remote_cache_main_doc_only(&mut self, _enabled: bool) -> &mut Self {
1694        self
1695    }
1696
1697    /// Whether chrome remote-cache dumps should be restricted to the
1698    /// main document. Always `false` without the `chrome_remote_cache`
1699    /// feature. Only consumed by the chrome-cache wiring at
1700    /// `Configuration::chrome_fetch_params`; default lib build is unused.
1701    #[inline]
1702    #[allow(dead_code)]
1703    pub(crate) fn chrome_remote_cache_main_doc_only_enabled(&self) -> bool {
1704        #[cfg(feature = "chrome_remote_cache")]
1705        {
1706            self.chrome_remote_cache_main_doc_only
1707        }
1708        #[cfg(not(feature = "chrome_remote_cache"))]
1709        {
1710            false
1711        }
1712    }
1713
1714    #[cfg(feature = "chrome")]
1715    /// Enable or disable Service Workers. This method does nothing if the `chrome` feature is not enabled.
1716    pub fn with_service_worker_enabled(&mut self, enabled: bool) -> &mut Self {
1717        self.service_worker_enabled = enabled;
1718        self
1719    }
1720
1721    #[cfg(not(feature = "chrome"))]
1722    /// Enable or disable Service Workers. This method does nothing if the `chrome` feature is not enabled.
1723    pub fn with_service_worker_enabled(&mut self, _enabled: bool) -> &mut Self {
1724        self
1725    }
1726
1727    /// Automatically setup geo-location configurations when using a proxy. This method does nothing if the `chrome` feature is not enabled.
1728    #[cfg(not(feature = "chrome"))]
1729    pub fn with_auto_geolocation(&mut self, _enabled: bool) -> &mut Self {
1730        self
1731    }
1732
1733    /// Automatically setup geo-location configurations when using a proxy. This method does nothing if the `chrome` feature is not enabled.
1734    #[cfg(feature = "chrome")]
1735    pub fn with_auto_geolocation(&mut self, enabled: bool) -> &mut Self {
1736        self.auto_geolocation = enabled;
1737        self
1738    }
1739
1740    /// Set the retry limit for request. Set the value to 0 for no retries. The default is 0.
1741    pub fn with_retry(&mut self, retry: u8) -> &mut Self {
1742        self.retry = retry;
1743        self
1744    }
1745
1746    /// The default http connect timeout.
1747    pub fn with_default_http_connect_timeout(
1748        &mut self,
1749        default_http_connect_timeout: Option<Duration>,
1750    ) -> &mut Self {
1751        self.default_http_connect_timeout = default_http_connect_timeout;
1752        self
1753    }
1754
1755    /// The default http read timeout.
1756    pub fn with_default_http_read_timeout(
1757        &mut self,
1758        default_http_read_timeout: Option<Duration>,
1759    ) -> &mut Self {
1760        self.default_http_read_timeout = default_http_read_timeout;
1761        self
1762    }
1763
1764    /// Skip setting up a control thread for pause, start, and shutdown programmatic handling. This does nothing without the 'control' flag enabled.
1765    pub fn with_no_control_thread(&mut self, no_control_thread: bool) -> &mut Self {
1766        self.no_control_thread = no_control_thread;
1767        self
1768    }
1769
1770    /// Configures the viewport of the browser, which defaults to 800x600. This method does nothing if the 'chrome' feature is not enabled.
1771    pub fn with_viewport(&mut self, viewport: Option<crate::configuration::Viewport>) -> &mut Self {
1772        self.viewport = viewport.map(|vp| vp);
1773        self
1774    }
1775
1776    #[cfg(feature = "chrome")]
1777    /// Use stealth mode for the request. This does nothing without the `chrome` flag enabled.
1778    pub fn with_stealth(&mut self, stealth_mode: bool) -> &mut Self {
1779        if stealth_mode {
1780            self.stealth_mode = spider_fingerprint::configs::Tier::Basic;
1781        } else {
1782            self.stealth_mode = spider_fingerprint::configs::Tier::None;
1783        }
1784        self
1785    }
1786
1787    #[cfg(feature = "chrome")]
1788    /// Use stealth mode for the request. This does nothing without the `chrome` flag enabled.
1789    pub fn with_stealth_advanced(
1790        &mut self,
1791        stealth_mode: spider_fingerprint::configs::Tier,
1792    ) -> &mut Self {
1793        self.stealth_mode = stealth_mode;
1794        self
1795    }
1796
1797    #[cfg(not(feature = "chrome"))]
1798    /// Use stealth mode for the request. This does nothing without the `chrome` flag enabled.
1799    pub fn with_stealth(&mut self, _stealth_mode: bool) -> &mut Self {
1800        self
1801    }
1802
1803    #[cfg(feature = "chrome")]
1804    /// Wait for network request to be idle within a time frame period (500ms no network connections). This does nothing without the `chrome` flag enabled.
1805    pub fn with_wait_for_idle_network(
1806        &mut self,
1807        wait_for_idle_network: Option<WaitForIdleNetwork>,
1808    ) -> &mut Self {
1809        match self.wait_for.as_mut() {
1810            Some(wait_for) => wait_for.idle_network = wait_for_idle_network,
1811            _ => {
1812                let mut wait_for = WaitFor::default();
1813                wait_for.idle_network = wait_for_idle_network;
1814                self.wait_for = Some(wait_for);
1815            }
1816        }
1817        self
1818    }
1819
1820    #[cfg(feature = "chrome")]
1821    /// Wait for network request with a max timeout. This does nothing without the `chrome` flag enabled.
1822    pub fn with_wait_for_idle_network0(
1823        &mut self,
1824        wait_for_idle_network0: Option<WaitForIdleNetwork>,
1825    ) -> &mut Self {
1826        match self.wait_for.as_mut() {
1827            Some(wait_for) => wait_for.idle_network0 = wait_for_idle_network0,
1828            _ => {
1829                let mut wait_for = WaitFor::default();
1830                wait_for.idle_network0 = wait_for_idle_network0;
1831                self.wait_for = Some(wait_for);
1832            }
1833        }
1834        self
1835    }
1836
1837    #[cfg(feature = "chrome")]
1838    /// Wait for network to be almost idle with a max timeout. This does nothing without the `chrome` flag enabled.
1839    pub fn with_wait_for_almost_idle_network0(
1840        &mut self,
1841        wait_for_almost_idle_network0: Option<WaitForIdleNetwork>,
1842    ) -> &mut Self {
1843        match self.wait_for.as_mut() {
1844            Some(wait_for) => wait_for.almost_idle_network0 = wait_for_almost_idle_network0,
1845            _ => {
1846                let mut wait_for = WaitFor::default();
1847                wait_for.almost_idle_network0 = wait_for_almost_idle_network0;
1848                self.wait_for = Some(wait_for);
1849            }
1850        }
1851        self
1852    }
1853
1854    #[cfg(not(feature = "chrome"))]
1855    /// Wait for network to be almost idle with a max timeout. This does nothing without the `chrome` flag enabled.
1856    pub fn with_wait_for_almost_idle_network0(
1857        &mut self,
1858        _wait_for_almost_idle_network0: Option<WaitForIdleNetwork>,
1859    ) -> &mut Self {
1860        self
1861    }
1862
1863    #[cfg(not(feature = "chrome"))]
1864    /// Wait for network request with a max timeout. This does nothing without the `chrome` flag enabled.
1865    pub fn with_wait_for_idle_network0(
1866        &mut self,
1867        _wait_for_idle_network0: Option<WaitForIdleNetwork>,
1868    ) -> &mut Self {
1869        self
1870    }
1871
1872    #[cfg(not(feature = "chrome"))]
1873    /// Wait for idle network request. This method does nothing if the `chrome` feature is not enabled.
1874    pub fn with_wait_for_idle_network(
1875        &mut self,
1876        _wait_for_idle_network: Option<WaitForIdleNetwork>,
1877    ) -> &mut Self {
1878        self
1879    }
1880
1881    #[cfg(feature = "chrome")]
1882    /// Wait for idle dom mutations for target element. This method does nothing if the [chrome] feature is not enabled.
1883    pub fn with_wait_for_idle_dom(
1884        &mut self,
1885        wait_for_idle_dom: Option<WaitForSelector>,
1886    ) -> &mut Self {
1887        match self.wait_for.as_mut() {
1888            Some(wait_for) => wait_for.dom = wait_for_idle_dom,
1889            _ => {
1890                let mut wait_for = WaitFor::default();
1891                wait_for.dom = wait_for_idle_dom;
1892                self.wait_for = Some(wait_for);
1893            }
1894        }
1895        self
1896    }
1897
1898    #[cfg(not(feature = "chrome"))]
1899    /// Wait for idle dom mutations for target element. This method does nothing if the `chrome` feature is not enabled.
1900    pub fn with_wait_for_idle_dom(
1901        &mut self,
1902        _wait_for_idle_dom: Option<WaitForSelector>,
1903    ) -> &mut Self {
1904        self
1905    }
1906
1907    #[cfg(feature = "chrome")]
1908    /// Wait for a selector. This method does nothing if the `chrome` feature is not enabled.
1909    pub fn with_wait_for_selector(
1910        &mut self,
1911        wait_for_selector: Option<WaitForSelector>,
1912    ) -> &mut Self {
1913        match self.wait_for.as_mut() {
1914            Some(wait_for) => wait_for.selector = wait_for_selector,
1915            _ => {
1916                let mut wait_for = WaitFor::default();
1917                wait_for.selector = wait_for_selector;
1918                self.wait_for = Some(wait_for);
1919            }
1920        }
1921        self
1922    }
1923
1924    #[cfg(not(feature = "chrome"))]
1925    /// Wait for a selector. This method does nothing if the `chrome` feature is not enabled.
1926    pub fn with_wait_for_selector(
1927        &mut self,
1928        _wait_for_selector: Option<WaitForSelector>,
1929    ) -> &mut Self {
1930        self
1931    }
1932
1933    #[cfg(feature = "chrome")]
1934    /// Wait for with delay. Should only be used for testing. This method does nothing if the 'chrome' feature is not enabled.
1935    pub fn with_wait_for_delay(&mut self, wait_for_delay: Option<WaitForDelay>) -> &mut Self {
1936        match self.wait_for.as_mut() {
1937            Some(wait_for) => wait_for.delay = wait_for_delay,
1938            _ => {
1939                let mut wait_for = WaitFor::default();
1940                wait_for.delay = wait_for_delay;
1941                self.wait_for = Some(wait_for);
1942            }
1943        }
1944        self
1945    }
1946
1947    #[cfg(not(feature = "chrome"))]
1948    /// Wait for with delay. Should only be used for testing. This method does nothing if the 'chrome' feature is not enabled.
1949    pub fn with_wait_for_delay(&mut self, _wait_for_delay: Option<WaitForDelay>) -> &mut Self {
1950        self
1951    }
1952
1953    #[cfg(feature = "chrome_intercept")]
1954    /// Use request intercept for the request to only allow content that matches the host. If the content is from a 3rd party it needs to be part of our include list. This method does nothing if the `chrome_intercept` is not enabled.
1955    pub fn with_chrome_intercept(
1956        &mut self,
1957        chrome_intercept: RequestInterceptConfiguration,
1958        url: &Option<Box<url::Url>>,
1959    ) -> &mut Self {
1960        self.chrome_intercept = chrome_intercept;
1961        self.chrome_intercept.setup_intercept_manager(url);
1962        self
1963    }
1964
1965    #[cfg(not(feature = "chrome_intercept"))]
1966    /// Use request intercept for the request to only allow content required for the page that matches the host. If the content is from a 3rd party it needs to be part of our include list. This method does nothing if the `chrome_intercept` is not enabled.
1967    pub fn with_chrome_intercept(
1968        &mut self,
1969        _chrome_intercept: RequestInterceptConfiguration,
1970        _url: &Option<Box<url::Url>>,
1971    ) -> &mut Self {
1972        self
1973    }
1974
1975    #[cfg(feature = "chrome_intercept")]
1976    /// Push the interception policy (the `chrome_intercept` flags + per-job
1977    /// blacklist/whitelist + page url) to a capable remote rendering engine
1978    /// once per navigation, so it resolves block/allow decisions locally
1979    /// instead of round-tripping every paused request. Enables request
1980    /// interception. No-op against a normal Chrome target (the vendor method is
1981    /// ignored), so it only changes behavior for engines that implement it.
1982    pub fn with_remote_local_policy(&mut self, enabled: bool) -> &mut Self {
1983        if enabled {
1984            self.chrome_intercept.enabled = true;
1985        }
1986        self.chrome_intercept.set_remote_local_policy(enabled);
1987        self
1988    }
1989
1990    #[cfg(not(feature = "chrome_intercept"))]
1991    /// Push the interception policy to a capable remote rendering engine. This
1992    /// method does nothing without the `chrome_intercept` flag.
1993    pub fn with_remote_local_policy(&mut self, _enabled: bool) -> &mut Self {
1994        self
1995    }
1996
1997    #[cfg(feature = "chrome")]
1998    /// Set the connection url for the chrome instance. This method does nothing if the `chrome` is not enabled.
1999    pub fn with_chrome_connection(&mut self, chrome_connection_url: Option<String>) -> &mut Self {
2000        self.chrome_connection_url = chrome_connection_url;
2001        self
2002    }
2003
2004    #[cfg(not(feature = "chrome"))]
2005    /// Set the connection url for the chrome instance. This method does nothing if the `chrome` is not enabled.
2006    pub fn with_chrome_connection(&mut self, _chrome_connection_url: Option<String>) -> &mut Self {
2007        self
2008    }
2009
2010    #[cfg(feature = "chrome")]
2011    /// Set multiple remote Chrome connection URLs for failover. When a
2012    /// connection fails after retries, the next URL is tried. Takes
2013    /// priority over `chrome_connection_url` when set.
2014    ///
2015    /// A single-URL vec routes through `chrome_connection_url` so the
2016    /// normal single-endpoint path (10 retries w/ backoff) is used
2017    /// instead of the failover path (3 retries, no other endpoint to try).
2018    pub fn with_chrome_connections(&mut self, urls: Vec<String>) -> &mut Self {
2019        match urls.len() {
2020            0 => {
2021                self.chrome_connection_urls = None;
2022            }
2023            1 => {
2024                self.chrome_connection_url = urls.into_iter().next();
2025                self.chrome_connection_urls = None;
2026            }
2027            _ => {
2028                self.chrome_connection_urls = Some(urls);
2029            }
2030        }
2031        // Drop any previously-built failover so the next setup call reflects
2032        // the new URL list. Outstanding readers keep the old Arc alive until
2033        // they release it; no leak.
2034        self.chrome_failover = crate::features::chrome::LazyChromeFailover::default();
2035        self
2036    }
2037
2038    #[cfg(not(feature = "chrome"))]
2039    /// Set multiple remote Chrome connection URLs. This method does nothing if the `chrome` is not enabled.
2040    pub fn with_chrome_connections(&mut self, _urls: Vec<String>) -> &mut Self {
2041        self
2042    }
2043
2044    #[cfg(feature = "decentralized")]
2045    /// Set the Spider worker URL for crawl requests. `None` clears the
2046    /// per-website override so this `Website` falls back to the process-wide
2047    /// `SPIDER_WORKER` env var (default `http://127.0.0.1:3030`). `Some` with
2048    /// a non-empty URL routes crawl traffic through that worker; `Some` with
2049    /// an empty/whitespace URL disables the crawl worker pool for this
2050    /// `Website` without affecting any other `Website` in the process.
2051    pub fn with_worker_connection(&mut self, worker_connection_url: Option<String>) -> &mut Self {
2052        self.worker_connection_urls = worker_connection_url.map(|url| {
2053            let url = url.trim();
2054            if url.is_empty() {
2055                Vec::new()
2056            } else {
2057                vec![url.to_string()]
2058            }
2059        });
2060        self
2061    }
2062
2063    #[cfg(not(feature = "decentralized"))]
2064    /// Set the Spider worker URL for crawl requests. This method does nothing
2065    /// if the `decentralized` feature is not enabled.
2066    pub fn with_worker_connection(&mut self, _worker_connection_url: Option<String>) -> &mut Self {
2067        self
2068    }
2069
2070    #[cfg(feature = "decentralized")]
2071    /// Set multiple Spider worker URLs for crawl requests. Empty/whitespace
2072    /// entries are dropped. An empty resulting list disables the crawl worker
2073    /// pool for this `Website` only.
2074    pub fn with_worker_connections(&mut self, urls: Vec<String>) -> &mut Self {
2075        self.worker_connection_urls = Some(
2076            urls.into_iter()
2077                .map(|url| url.trim().to_string())
2078                .filter(|url| !url.is_empty())
2079                .collect(),
2080        );
2081        self
2082    }
2083
2084    #[cfg(not(feature = "decentralized"))]
2085    /// Set multiple Spider worker URLs for crawl requests. This method does
2086    /// nothing if the `decentralized` feature is not enabled.
2087    pub fn with_worker_connections(&mut self, _urls: Vec<String>) -> &mut Self {
2088        self
2089    }
2090
2091    #[cfg(feature = "decentralized")]
2092    /// Set the Spider scraper worker URL for scrape requests. `None` clears
2093    /// the per-website override so this `Website` falls back to the
2094    /// process-wide `SPIDER_WORKER_SCRAPER` env var (default
2095    /// `http://127.0.0.1:3031`). `Some` with an empty/whitespace URL disables
2096    /// the scraper worker pool for this `Website` only.
2097    pub fn with_scraper_worker_connection(
2098        &mut self,
2099        scraper_worker_connection_url: Option<String>,
2100    ) -> &mut Self {
2101        self.scraper_worker_connection_urls = scraper_worker_connection_url.map(|url| {
2102            let url = url.trim();
2103            if url.is_empty() {
2104                Vec::new()
2105            } else {
2106                vec![url.to_string()]
2107            }
2108        });
2109        self
2110    }
2111
2112    #[cfg(not(feature = "decentralized"))]
2113    /// Set the Spider scraper worker URL for scrape requests. This method
2114    /// does nothing if the `decentralized` feature is not enabled.
2115    pub fn with_scraper_worker_connection(
2116        &mut self,
2117        _scraper_worker_connection_url: Option<String>,
2118    ) -> &mut Self {
2119        self
2120    }
2121
2122    #[cfg(feature = "decentralized")]
2123    /// Set multiple Spider scraper worker URLs for scrape requests.
2124    /// Empty/whitespace entries are dropped. An empty resulting list disables
2125    /// the scraper worker pool for this `Website` only.
2126    pub fn with_scraper_worker_connections(&mut self, urls: Vec<String>) -> &mut Self {
2127        self.scraper_worker_connection_urls = Some(
2128            urls.into_iter()
2129                .map(|url| url.trim().to_string())
2130                .filter(|url| !url.is_empty())
2131                .collect(),
2132        );
2133        self
2134    }
2135
2136    #[cfg(not(feature = "decentralized"))]
2137    /// Set multiple Spider scraper worker URLs for scrape requests. This
2138    /// method does nothing if the `decentralized` feature is not enabled.
2139    pub fn with_scraper_worker_connections(&mut self, _urls: Vec<String>) -> &mut Self {
2140        self
2141    }
2142
2143    #[cfg(feature = "chrome")]
2144    /// Set the first-byte watchdog timeout for Chrome navigations. `None`
2145    /// disables it; `Some(d)` fires after `d` of silence on both
2146    /// `Network.responseReceived` and `Network.dataReceived` and force-stops
2147    /// the page so the caller can rotate to a different Chrome backend.
2148    pub fn with_chrome_first_byte_timeout(&mut self, timeout: Option<Duration>) -> &mut Self {
2149        self.chrome_first_byte_timeout = timeout;
2150        self
2151    }
2152
2153    #[cfg(not(feature = "chrome"))]
2154    /// Set the first-byte watchdog timeout for Chrome navigations. This method does nothing if the `chrome` is not enabled.
2155    pub fn with_chrome_first_byte_timeout(&mut self, _timeout: Option<Duration>) -> &mut Self {
2156        self
2157    }
2158
2159    #[cfg(feature = "chrome")]
2160    /// Set the per-fetch jitter window for the first-byte watchdog. `None`
2161    /// disables jitter; `Some(j)` randomizes each fetch's timeout uniformly
2162    /// in `[base, base + j)`. Ignored when the base timeout is `None`.
2163    pub fn with_chrome_first_byte_timeout_jitter(&mut self, jitter: Option<Duration>) -> &mut Self {
2164        self.chrome_first_byte_timeout_jitter = jitter;
2165        self
2166    }
2167
2168    #[cfg(not(feature = "chrome"))]
2169    /// Set the first-byte watchdog jitter window. This method does nothing if the `chrome` is not enabled.
2170    pub fn with_chrome_first_byte_timeout_jitter(
2171        &mut self,
2172        _jitter: Option<Duration>,
2173    ) -> &mut Self {
2174        self
2175    }
2176
2177    /// Set the first-byte watchdog timeout for HTTP fetches. `None`
2178    /// disables it; `Some(d)` wraps each `client.get(url).send()` in
2179    /// `tokio::time::timeout(d + rand(0..jitter))` and returns a
2180    /// synthetic `524 GATEWAY_TIMEOUT` response on fire so the retry
2181    /// path rotates the proxy. Covers stalls between TCP connect and
2182    /// the first byte of the response — distinct from
2183    /// `connect_timeout` (handshake-only) and `chunk_idle_timeout`
2184    /// (body-streaming idle).
2185    pub fn with_http_first_byte_timeout(&mut self, timeout: Option<Duration>) -> &mut Self {
2186        self.http_first_byte_timeout = timeout;
2187        self
2188    }
2189
2190    /// Set the per-fetch jitter window for the HTTP first-byte
2191    /// watchdog. Same semantics as
2192    /// `with_chrome_first_byte_timeout_jitter`.
2193    pub fn with_http_first_byte_timeout_jitter(&mut self, jitter: Option<Duration>) -> &mut Self {
2194        self.http_first_byte_timeout_jitter = jitter;
2195        self
2196    }
2197
2198    #[cfg(not(feature = "chrome"))]
2199    /// Set JS to run on certain pages. This method does nothing if the `chrome` is not enabled.
2200    pub fn with_execution_scripts(
2201        &mut self,
2202        _execution_scripts: Option<ExecutionScriptsMap>,
2203    ) -> &mut Self {
2204        self
2205    }
2206
#[cfg(feature = "chrome")]
/// Register JS to run on certain pages.
///
/// The supplied map is converted with `convert_to_trie_execution_scripts`
/// and the result replaces any previously stored execution scripts. Only
/// compiled in when the `chrome` feature is enabled.
pub fn with_execution_scripts(
    &mut self,
    execution_scripts: Option<ExecutionScriptsMap>,
) -> &mut Self {
    let scripts =
        crate::features::chrome_common::convert_to_trie_execution_scripts(&execution_scripts);
    self.execution_scripts = scripts;
    self
}
2217
2218    #[cfg(not(feature = "chrome"))]
2219    /// Run web automated actions on certain pages. This method does nothing if the `chrome` is not enabled.
2220    pub fn with_automation_scripts(
2221        &mut self,
2222        _automation_scripts: Option<AutomationScriptsMap>,
2223    ) -> &mut Self {
2224        self
2225    }
2226
#[cfg(feature = "chrome")]
/// Register automated web actions to run on certain pages.
///
/// The supplied map is converted with `convert_to_trie_automation_scripts`
/// and the result replaces any previously stored automation scripts. Only
/// compiled in when the `chrome` feature is enabled.
pub fn with_automation_scripts(
    &mut self,
    automation_scripts: Option<AutomationScriptsMap>,
) -> &mut Self {
    let scripts =
        crate::features::chrome_common::convert_to_trie_automation_scripts(&automation_scripts);
    self.automation_scripts = scripts;
    self
}
2237
2238    /// Set a crawl budget per path with levels support /a/b/c or for all paths with "*". This does nothing without the `budget` flag enabled.
2239    pub fn with_budget(&mut self, budget: Option<hashbrown::HashMap<&str, u32>>) -> &mut Self {
2240        self.budget = match budget {
2241            Some(budget) => {
2242                let mut crawl_budget: hashbrown::HashMap<
2243                    case_insensitive_string::CaseInsensitiveString,
2244                    u32,
2245                > = hashbrown::HashMap::new();
2246
2247                for b in budget.into_iter() {
2248                    crawl_budget.insert(
2249                        case_insensitive_string::CaseInsensitiveString::from(b.0),
2250                        b.1,
2251                    );
2252                }
2253
2254                Some(crawl_budget)
2255            }
2256            _ => None,
2257        };
2258        self
2259    }
2260
2261    /// Group external domains to treat the crawl as one. If None is passed this will clear all prior domains.
2262    pub fn with_external_domains<'a, 'b>(
2263        &mut self,
2264        external_domains: Option<impl Iterator<Item = String> + 'a>,
2265    ) -> &mut Self {
2266        match external_domains {
2267            Some(external_domains) => {
2268                self.external_domains_caseless = external_domains
2269                    .into_iter()
2270                    .filter_map(|d| {
2271                        if d == "*" {
2272                            Some("*".into())
2273                        } else {
2274                            let host = get_domain_from_url(&d);
2275
2276                            if !host.is_empty() {
2277                                Some(host.into())
2278                            } else {
2279                                None
2280                            }
2281                        }
2282                    })
2283                    .collect::<hashbrown::HashSet<case_insensitive_string::CaseInsensitiveString>>()
2284                    .into();
2285            }
2286            _ => self.external_domains_caseless = Default::default(),
2287        }
2288
2289        self
2290    }
2291
2292    /// Dangerously accept invalid certificates - this should be used as a last resort.
2293    pub fn with_danger_accept_invalid_certs(&mut self, accept_invalid_certs: bool) -> &mut Self {
2294        self.accept_invalid_certs = accept_invalid_certs;
2295        self
2296    }
2297
2298    /// Normalize the content de-duplicating trailing slash pages and other pages that can be duplicated. This may initially show the link in your links_visited or subscription calls but, the following links will not be crawled.
2299    pub fn with_normalize(&mut self, normalize: bool) -> &mut Self {
2300        self.normalize = normalize;
2301        self
2302    }
2303
2304    #[cfg(not(feature = "disk"))]
2305    /// Store all the links found on the disk to share the state. This does nothing without the `disk` flag enabled.
2306    pub fn with_shared_state(&mut self, _shared: bool) -> &mut Self {
2307        self
2308    }
2309
/// Toggle storing all found links on disk so the state can be shared.
/// Only compiled in when the `disk` feature is enabled.
#[cfg(feature = "disk")]
pub fn with_shared_state(&mut self, shared: bool) -> &mut Self {
    self.shared = shared;
    self
}
2316
2317    #[cfg(not(feature = "chrome"))]
2318    /// Overrides default host system timezone with the specified one. This does nothing without the `chrome` flag enabled.
2319    pub fn with_timezone_id(&mut self, _timezone_id: Option<String>) -> &mut Self {
2320        self
2321    }
2322
#[cfg(feature = "chrome")]
/// Override the host system's default timezone with the given identifier.
/// `None` clears the override. Only compiled in when the `chrome` feature
/// is enabled.
pub fn with_timezone_id(&mut self, timezone_id: Option<String>) -> &mut Self {
    self.timezone_id = timezone_id.map(Into::into);
    self
}
2329
2330    #[cfg(not(feature = "chrome"))]
2331    /// Overrides default host system locale with the specified one. This does nothing without the `chrome` flag enabled.
2332    pub fn with_locale(&mut self, _locale: Option<String>) -> &mut Self {
2333        self
2334    }
2335
#[cfg(feature = "chrome")]
/// Override the host system's default locale with the given one. `None`
/// clears the override. Only compiled in when the `chrome` feature is
/// enabled.
pub fn with_locale(&mut self, locale: Option<String>) -> &mut Self {
    self.locale = locale.map(Into::into);
    self
}
2342
#[cfg(feature = "chrome")]
/// Choose which events made via Chrome are tracked. `None` clears the
/// tracker setting.
pub fn with_event_tracker(&mut self, track_events: Option<ChromeEventTracker>) -> &mut Self {
    self.track_events = track_events;
    self
}
2349
2350    /// Set the chrome screenshot configuration. This does nothing without the `chrome` flag enabled.
2351    #[cfg(not(feature = "chrome"))]
2352    pub fn with_screenshot(&mut self, _screenshot_config: Option<ScreenShotConfig>) -> &mut Self {
2353        self
2354    }
2355
/// Store the Chrome screenshot configuration. `None` clears it. Only
/// compiled in when the `chrome` feature is enabled.
#[cfg(feature = "chrome")]
pub fn with_screenshot(&mut self, screenshot_config: Option<ScreenShotConfig>) -> &mut Self {
    self.screenshot = screenshot_config;
    self
}
2362
2363    /// Set the max amount of bytes to collect per page. This method does nothing if the `chrome` is not enabled.
2364    pub fn with_max_page_bytes(&mut self, max_page_bytes: Option<f64>) -> &mut Self {
2365        self.max_page_bytes = max_page_bytes;
2366        self
2367    }
2368
2369    /// Set the max amount of bytes to collected for the browser context. This method does nothing if the `chrome` is not enabled.
2370    pub fn with_max_bytes_allowed(&mut self, max_bytes_allowed: Option<u64>) -> &mut Self {
2371        self.max_bytes_allowed = max_bytes_allowed;
2372        self
2373    }
2374
2375    /// Block assets from loading from the network.
2376    pub fn with_block_assets(&mut self, only_html: bool) -> &mut Self {
2377        self.only_html = only_html;
2378        self
2379    }
2380
2381    /// Modify the headers to mimic a real browser.
2382    pub fn with_modify_headers(&mut self, modify_headers: bool) -> &mut Self {
2383        self.modify_headers = modify_headers;
2384        self
2385    }
2386
2387    /// Modify the HTTP client headers to mimic a real browser.
2388    pub fn with_modify_http_client_headers(
2389        &mut self,
2390        modify_http_client_headers: bool,
2391    ) -> &mut Self {
2392        self.modify_http_client_headers = modify_http_client_headers;
2393        self
2394    }
2395
2396    /// Set the cache policy.
2397    pub fn with_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) -> &mut Self {
2398        self.cache_policy = cache_policy;
2399        self
2400    }
2401
#[cfg(feature = "webdriver")]
/// Store the WebDriver configuration, boxing it for storage. `None` clears
/// it. Only compiled in when the `webdriver` feature is enabled.
pub fn with_webdriver_config(
    &mut self,
    webdriver_config: Option<WebDriverConfig>,
) -> &mut Self {
    self.webdriver_config = webdriver_config.map(|config| Box::new(config));
    self
}
2411
2412    #[cfg(not(feature = "webdriver"))]
2413    /// Set the WebDriver configuration. This does nothing without the `webdriver` flag enabled.
2414    pub fn with_webdriver_config(
2415        &mut self,
2416        _webdriver_config: Option<WebDriverConfig>,
2417    ) -> &mut Self {
2418        self
2419    }
2420
2421    /// Resolve the HTTP first-byte watchdog args.
2422    ///
2423    /// Returns the configured `http_first_byte_timeout` + `_jitter`
2424    /// whenever the timeout field is `Some(_)` — caller opted in by
2425    /// setting the field, so honor it regardless of proxy count.
2426    ///
2427    /// Previously this was gated on `balance` feature + ≥2 HTTP-eligible
2428    /// proxies, on the premise that the watchdog firing without a
2429    /// rotation target was wasted. That was wrong for the
2430    /// proxy-shrouded NXDOMAIN case: a single proxy still returns an
2431    /// upstream-DNS-shaped 5xx after ~15-22s, and reqwest's `.timeout()`
2432    /// is not enforced through the proxy CONNECT tunnel for that phase.
2433    /// The watchdog is the only knob that fires reliably, and a fast
2434    /// 524 surfaced to the caller is strictly better than waiting for
2435    /// the proxy's internal DNS deadline — even when no rotation
2436    /// target exists.
2437    ///
2438    /// When the timeout field is `None`, returns `(None, None)` —
2439    /// pure passthrough, no overhead. Setting the field on `Configuration`
2440    /// is the opt-in.
2441    #[inline]
2442    pub fn auto_http_first_byte_args(&self) -> (Option<Duration>, Option<Duration>) {
2443        match self.http_first_byte_timeout {
2444            Some(_) => (
2445                self.http_first_byte_timeout,
2446                self.http_first_byte_timeout_jitter,
2447            ),
2448            None => (None, None),
2449        }
2450    }
2451
/// Assemble the borrowed bundle of parameters for a Chrome fetch.
///
/// Reference fields borrow straight from `self`, so nothing is copied.
/// Construct it once at the top of a call chain and hand `&` down through
/// the layers to keep the hot path inlineable.
#[cfg(feature = "chrome")]
#[inline]
pub fn chrome_fetch_params(&self) -> crate::utils::ChromeFetchParams<'_> {
    let remote_cache_read_only = self.chrome_remote_cache_read_only_enabled();
    let remote_cache_main_doc_only = self.chrome_remote_cache_main_doc_only_enabled();
    // Auto-populate the endpoint URL for every chrome / smart crawl path so
    // the first-byte watchdog can mark the right backend bad without each
    // call site threading a `BrowserController` through its signature.
    // The most recent successful multi-URL failover endpoint wins; the
    // single-URL config is the fallback. `None` for local launches.
    let chrome_endpoint_url = self
        .chrome_failover
        .last_connected_url()
        .or(self.chrome_connection_url.as_deref());

    crate::utils::ChromeFetchParams {
        wait_for: &self.wait_for,
        screenshot: &self.screenshot,
        openai_config: &self.openai_config,
        execution_scripts: &self.execution_scripts,
        automation_scripts: &self.automation_scripts,
        viewport: &self.viewport,
        request_timeout: &self.request_timeout,
        track_events: &self.track_events,
        cache_policy: &self.cache_policy,
        remote_multimodal: &self.remote_multimodal,
        remote_cache_read_only,
        remote_cache_main_doc_only,
        first_byte_timeout: &self.chrome_first_byte_timeout,
        first_byte_timeout_jitter: &self.chrome_first_byte_timeout_jitter,
        browser_dead: None,
        chrome_failover: Some(&self.chrome_failover),
        chrome_endpoint_url,
    }
}
2489
/// Resolve the cache option to use for the run.
///
/// Returns `None` when caching is disabled. Otherwise the result depends on
/// two inputs: whether a non-empty, string-representable `authorization`
/// header is configured, and whether the browser should be skipped.
/// Compiled in with the `cache_request` or `chrome_remote_cache` feature.
#[cfg(any(feature = "cache_request", feature = "chrome_remote_cache"))]
pub(crate) fn get_cache_options(&self) -> Option<crate::utils::CacheOptions> {
    use crate::utils::CacheOptions;

    if !self.cache {
        return None;
    }

    let auth_header = self
        .headers
        .as_ref()
        .and_then(|headers| {
            headers
                .0
                .get("authorization")
                .or_else(|| headers.0.get("Authorization"))
        })
        .cloned();

    // When using in-memory cache (cache_mem), auto-enable skip_browser
    // since the cached HTML was already rendered by a prior Chrome crawl
    // and re-rendering through Chrome is redundant. The browser only
    // launches when the cache has no hit for the requested page.
    #[cfg(feature = "cache_mem")]
    let skip_browser = true;
    #[cfg(not(feature = "cache_mem"))]
    let skip_browser = self.cache_skip_browser;

    // An empty header, or one that cannot be rendered as a string, is
    // treated the same as having no authorization at all.
    let bearer = auth_header
        .as_ref()
        .filter(|value| !value.is_empty())
        .and_then(|value| value.to_str().ok());

    Some(match (bearer, skip_browser) {
        (Some(token_str), true) => CacheOptions::SkipBrowserAuthorized(token_str.into()),
        (Some(token_str), false) => CacheOptions::Authorized(token_str.into()),
        (None, true) => CacheOptions::SkipBrowser,
        (None, false) => CacheOptions::Yes,
    })
}
2540
/// Cache option stub for builds that have `chrome` but neither
/// `cache_request` nor `chrome_remote_cache`: always `None`.
#[cfg(all(
    feature = "chrome",
    not(any(feature = "cache_request", feature = "chrome_remote_cache"))
))]
pub(crate) fn get_cache_options(&self) -> Option<crate::utils::CacheOptions> {
    None
}
2549
2550    /// Get the cache option to use for the run when chrome/cache features are disabled.
2551    #[cfg(not(any(
2552        feature = "cache_request",
2553        feature = "chrome_remote_cache",
2554        feature = "chrome"
2555    )))]
2556    #[allow(dead_code)]
2557    pub(crate) fn get_cache_options(&self) -> Option<crate::utils::CacheOptions> {
2558        None
2559    }
2560
2561    /// Build the website configuration when using with_builder.
2562    pub fn build(&self) -> Self {
2563        self.to_owned()
2564    }
2565
#[cfg(feature = "search")]
/// Store the web search integration settings, boxing them for storage.
/// `None` clears them. Only compiled in when the `search` feature is
/// enabled.
pub fn with_search_config(&mut self, search_config: Option<SearchConfig>) -> &mut Self {
    self.search_config = search_config.map(|config| Box::new(config));
    self
}
2572
2573    #[cfg(not(feature = "search"))]
2574    /// Configure web search integration. This does nothing without the `search` flag enabled.
2575    pub fn with_search_config(&mut self, _search_config: Option<()>) -> &mut Self {
2576        self
2577    }
2578
/// Enable [spider.cloud](https://spider.cloud) with the given API key,
/// using the default config (Proxy mode).
///
/// Keys that `is_placeholder_api_key` rejects are skipped with a warning
/// and leave the configuration unchanged.
#[cfg(feature = "spider_cloud")]
pub fn with_spider_cloud(&mut self, api_key: &str) -> &mut Self {
    if !is_placeholder_api_key(api_key) {
        self.spider_cloud = Some(Box::new(SpiderCloudConfig::new(api_key)));
    } else {
        log::warn!("Spider Cloud API key looks like a placeholder — skipping. Get a real key at https://spider.cloud");
    }
    self
}
2589
2590    /// Set a [spider.cloud](https://spider.cloud) API key (no-op without `spider_cloud` feature).
2591    #[cfg(not(feature = "spider_cloud"))]
2592    pub fn with_spider_cloud(&mut self, _api_key: &str) -> &mut Self {
2593        self
2594    }
2595
/// Store a complete [spider.cloud](https://spider.cloud) config, replacing
/// any previously set one. The config is stored as given, without a
/// placeholder-key check.
#[cfg(feature = "spider_cloud")]
pub fn with_spider_cloud_config(&mut self, config: SpiderCloudConfig) -> &mut Self {
    let boxed = Box::new(config);
    self.spider_cloud = Some(boxed);
    self
}
2602
2603    /// Set a [spider.cloud](https://spider.cloud) config (no-op without `spider_cloud` feature).
2604    #[cfg(not(feature = "spider_cloud"))]
2605    pub fn with_spider_cloud_config(&mut self, _config: ()) -> &mut Self {
2606        self
2607    }
2608
/// Connect to [Spider Browser Cloud](https://spider.cloud/docs/api#browser)
/// via CDP over WebSocket, authenticating with an API key.
///
/// Sets `chrome_connection_url` to
/// `wss://browser.spider.cloud/v1/browser?token=API_KEY` and stores the
/// browser config. Keys that `is_placeholder_api_key` rejects are skipped
/// with a warning and leave the configuration unchanged.
#[cfg(all(feature = "spider_cloud", feature = "chrome"))]
pub fn with_spider_browser(&mut self, api_key: &str) -> &mut Self {
    if is_placeholder_api_key(api_key) {
        log::warn!("Spider Browser Cloud API key looks like a placeholder — skipping. Get a real key at https://spider.cloud");
    } else {
        let browser_config = SpiderBrowserConfig::new(api_key);
        self.chrome_connection_url = Some(browser_config.connection_url());
        self.spider_browser = Some(Box::new(browser_config));
    }
    self
}
2624
2625    /// Connect to Spider Browser Cloud (no-op without `spider_cloud` + `chrome` features).
2626    #[cfg(not(all(feature = "spider_cloud", feature = "chrome")))]
2627    pub fn with_spider_browser(&mut self, _api_key: &str) -> &mut Self {
2628        self
2629    }
2630
/// Connect to [Spider Browser Cloud](https://spider.cloud/docs/api#browser)
/// using a full configuration (stealth, country, browser type, etc.).
///
/// The config's connection URL becomes `chrome_connection_url`, and the
/// config itself is stored.
#[cfg(all(feature = "spider_cloud", feature = "chrome"))]
pub fn with_spider_browser_config(&mut self, config: SpiderBrowserConfig) -> &mut Self {
    let connection_url = config.connection_url();
    self.chrome_connection_url = Some(connection_url);
    self.spider_browser = Some(Box::new(config));
    self
}
2639
2640    /// Connect to Spider Browser Cloud with config (no-op without features).
2641    #[cfg(not(all(feature = "spider_cloud", feature = "chrome")))]
2642    pub fn with_spider_browser_config(&mut self, _config: ()) -> &mut Self {
2643        self
2644    }
2645
/// Store the hedged request (work-stealing) configuration, replacing any
/// previous one.
#[cfg(feature = "hedge")]
pub fn with_hedge(&mut self, config: crate::utils::hedge::HedgeConfig) -> &mut Self {
    self.hedge = Some(config);
    self
}
2652
2653    /// Set the hedged request configuration (no-op without `hedge` feature).
2654    #[cfg(not(feature = "hedge"))]
2655    pub fn with_hedge(&mut self, _config: ()) -> &mut Self {
2656        self
2657    }
2658
#[cfg(feature = "auto_throttle")]
/// Store the auto-throttle configuration used for latency-based adaptive
/// delay, replacing any previous one.
pub fn with_auto_throttle(
    &mut self,
    config: crate::utils::auto_throttle::AutoThrottleConfig,
) -> &mut Self {
    self.auto_throttle = Some(config);
    self
}
2668
2669    /// Set the auto-throttle configuration (no-op without `auto_throttle` feature).
2670    #[cfg(not(feature = "auto_throttle"))]
2671    pub fn with_auto_throttle(&mut self, _config: ()) -> &mut Self {
2672        self
2673    }
2674
#[cfg(feature = "etag_cache")]
/// Turn ETag / conditional request caching on or off, for
/// bandwidth-efficient re-crawls.
pub fn with_etag_cache(&mut self, enabled: bool) -> &mut Self {
    self.etag_cache = enabled;
    self
}
2681
2682    /// Enable or disable ETag caching (no-op without `etag_cache` feature).
2683    #[cfg(not(feature = "etag_cache"))]
2684    pub fn with_etag_cache(&mut self, _enabled: bool) -> &mut Self {
2685        self
2686    }
2687
#[cfg(feature = "warc")]
/// Store the WARC output configuration used to write a web archive file
/// during the crawl, replacing any previous one.
pub fn with_warc(&mut self, config: crate::utils::warc::WarcConfig) -> &mut Self {
    self.warc = Some(config);
    self
}
2694
2695    /// Configure WARC output (no-op without `warc` feature).
2696    #[cfg(not(feature = "warc"))]
2697    pub fn with_warc(&mut self, _config: ()) -> &mut Self {
2698        self
2699    }
2700}
2701
/// Settings for the web search integration: which provider to query and how
/// to authenticate against it.
#[cfg(feature = "search")]
#[derive(Debug, Clone, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct SearchConfig {
    /// Search provider to send queries to.
    pub provider: SearchProviderType,
    /// API key used with the provider.
    pub api_key: String,
    /// Optional custom API URL; when set it overrides the provider's default endpoint.
    pub api_url: Option<String>,
    /// Optional search options applied by default.
    pub default_options: Option<SearchOptions>,
}
2716
#[cfg(feature = "search")]
impl SearchConfig {
    /// Build a search configuration for `provider` with the given API key.
    /// No custom URL or default options are set.
    pub fn new(provider: SearchProviderType, api_key: impl Into<String>) -> Self {
        let api_key = api_key.into();
        Self {
            provider,
            api_key,
            api_url: None,
            default_options: None,
        }
    }

    /// Point this provider at a custom API endpoint.
    pub fn with_api_url(self, url: impl Into<String>) -> Self {
        Self {
            api_url: Some(url.into()),
            ..self
        }
    }

    /// Attach the search options to use by default.
    pub fn with_default_options(self, options: SearchOptions) -> Self {
        Self {
            default_options: Some(options),
            ..self
        }
    }

    /// Report whether search can be used with this configuration.
    ///
    /// True when the API key is non-empty or a custom API URL is set.
    pub fn is_enabled(&self) -> bool {
        let has_key = !self.api_key.is_empty();
        has_key || self.api_url.is_some()
    }
}
2748
/// The search providers that can back the web search integration.
#[cfg(feature = "search")]
#[derive(Debug, Clone, Default, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum SearchProviderType {
    /// Serper.dev, a Google SERP API (high quality). This is the default.
    #[default]
    Serper,
    /// Brave Search API (privacy-focused).
    Brave,
    /// Microsoft Bing Web Search API.
    Bing,
    /// Tavily AI Search (optimized for LLMs).
    Tavily,
}
2764
2765// ─── Spider Cloud ───────────────────────────────────────────────────────────
2766
/// How the crawler integrates with [spider.cloud](https://spider.cloud).
#[cfg(feature = "spider_cloud")]
#[derive(Debug, Clone, Default, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum SpiderCloudMode {
    /// Send every HTTP request through `proxy.spider.cloud`.
    /// The simplest mode and the default: the existing fetch pipeline runs
    /// unmodified and traffic passes through the proxy transparently.
    #[default]
    Proxy,
    /// Fetch each page through the spider.cloud `POST /crawl` API (with
    /// `limit: 1`). Best for simple scraping needs.
    Api,
    /// Fetch through the spider.cloud `POST /unblocker` API for anti-bot
    /// bypass. Best for hard-to-get pages behind advanced bot protection.
    Unblocker,
    /// Fetch directly first, then fall back to the spider.cloud API on
    /// 403 / 429 / 503 or connection errors.
    Fallback,
    /// Intelligent mode: proxy by default, with an automatic fallback to
    /// `/unblocker` when bot protection is detected (403, 429, 503, CAPTCHA
    /// pages, Cloudflare challenges, empty bodies on HTML pages, etc.).
    /// Recommended for production use.
    Smart,
}
2792
/// The content format requested from the Spider Cloud API.
#[cfg(feature = "spider_cloud")]
#[derive(Debug, Clone, Default, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum SpiderCloudReturnFormat {
    /// The original HTML. This is the default.
    #[default]
    #[cfg_attr(feature = "serde", serde(rename = "raw"))]
    Raw,
    /// Clean markdown, ideal for LLM pipelines.
    #[cfg_attr(feature = "serde", serde(rename = "markdown"))]
    Markdown,
    /// CommonMark-flavored markdown.
    #[cfg_attr(feature = "serde", serde(rename = "commonmark"))]
    CommonMark,
    /// Plain text with the markup stripped.
    #[cfg_attr(feature = "serde", serde(rename = "text"))]
    Text,
    /// Raw bytes (no encoding conversion).
    #[cfg_attr(feature = "serde", serde(rename = "bytes"))]
    Bytes,
}
2815
#[cfg(feature = "spider_cloud")]
impl SpiderCloudReturnFormat {
    /// Wire value sent to the spider.cloud API for this format.
    pub fn as_str(&self) -> &'static str {
        match *self {
            Self::Bytes => "bytes",
            Self::Text => "text",
            Self::CommonMark => "commonmark",
            Self::Markdown => "markdown",
            Self::Raw => "raw",
        }
    }
}
2829
#[cfg(feature = "spider_cloud")]
impl std::fmt::Display for SpiderCloudReturnFormat {
    /// Writes the same text that `as_str` returns.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let wire_value = self.as_str();
        f.write_str(wire_value)
    }
}
2836
#[cfg(feature = "spider_cloud")]
impl From<&str> for SpiderCloudReturnFormat {
    /// Parse a format name, ignoring ASCII case.
    ///
    /// `"markdown"`, `"commonmark"`, `"text"` and `"bytes"` are recognised in
    /// any ASCII casing (for example `"MarkDown"` or `"TEXT"`), not just the
    /// lower, capitalised and upper spellings. Every other input, including
    /// `"raw"`, maps to `Raw`.
    fn from(s: &str) -> Self {
        if s.eq_ignore_ascii_case("markdown") {
            Self::Markdown
        } else if s.eq_ignore_ascii_case("commonmark") {
            Self::CommonMark
        } else if s.eq_ignore_ascii_case("text") {
            Self::Text
        } else if s.eq_ignore_ascii_case("bytes") {
            Self::Bytes
        } else {
            // Unknown names fall back to the default format.
            Self::Raw
        }
    }
}
2849
/// Settings for the spider.cloud integration.
///
/// Spider Cloud offers anti-bot bypass, proxy rotation, and high-throughput
/// data collection. An API key can be obtained by signing up at
/// <https://spider.cloud>.
#[cfg(feature = "spider_cloud")]
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct SpiderCloudConfig {
    /// API key / secret, obtained by signing up at <https://spider.cloud>.
    pub api_key: String,
    /// Which integration mode to use.
    #[cfg_attr(feature = "serde", serde(default))]
    pub mode: SpiderCloudMode,
    /// Base URL of the API (default: `https://api.spider.cloud`).
    #[cfg_attr(
        feature = "serde",
        serde(default = "SpiderCloudConfig::default_api_url")
    )]
    pub api_url: String,
    /// URL of the proxy (default: `https://proxy.spider.cloud`).
    #[cfg_attr(
        feature = "serde",
        serde(default = "SpiderCloudConfig::default_proxy_url")
    )]
    pub proxy_url: String,
    /// Format requested for API responses (default: [`SpiderCloudReturnFormat::Raw`]).
    #[cfg_attr(feature = "serde", serde(default))]
    pub return_format: SpiderCloudReturnFormat,
    /// Several return formats to request in a single crawl.
    ///
    /// When set, the API returns `content` as an object keyed by format
    /// (e.g. `{"markdown": "...", "raw": "..."}`). The primary `return_format`
    /// is stored in [`Page::get_content`](crate::page::Page::get_content) and
    /// the extras are reachable via [`Page::get_content_for`](crate::page::Page::get_content_for).
    #[cfg_attr(
        feature = "serde",
        serde(default, skip_serializing_if = "Option::is_none")
    )]
    pub return_formats: Option<Vec<SpiderCloudReturnFormat>>,
    /// Additional params forwarded in API mode (e.g. `stealth`, `fingerprint`, `cache`).
    #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
    pub extra_params: Option<hashbrown::HashMap<String, serde_json::Value>>,
}
2893
#[cfg(feature = "spider_cloud")]
impl Default for SpiderCloudConfig {
    /// An empty API key, the default mode and return format, the default API
    /// and proxy URLs, and no extra formats or params.
    fn default() -> Self {
        let api_url = Self::default_api_url();
        let proxy_url = Self::default_proxy_url();
        Self {
            api_key: String::new(),
            mode: Default::default(),
            api_url,
            proxy_url,
            return_format: Default::default(),
            return_formats: None,
            extra_params: None,
        }
    }
}
2908
#[cfg(feature = "spider_cloud")]
impl SpiderCloudConfig {
    /// Create a config for `api_key`, leaving every other field at its
    /// [`Default`] value.
    pub fn new(api_key: impl Into<String>) -> Self {
        Self {
            api_key: api_key.into(),
            ..Default::default()
        }
    }

    /// Set the integration mode.
    pub fn with_mode(mut self, mode: SpiderCloudMode) -> Self {
        self.mode = mode;
        self
    }

    /// Set a custom API base URL.
    pub fn with_api_url(mut self, url: impl Into<String>) -> Self {
        self.api_url = url.into();
        self
    }

    /// Set a custom proxy URL.
    pub fn with_proxy_url(mut self, url: impl Into<String>) -> Self {
        self.proxy_url = url.into();
        self
    }

    /// Set the return format for API responses.
    ///
    /// Accepts any value convertible into [`SpiderCloudReturnFormat`]:
    /// ```ignore
    /// config.with_return_format(SpiderCloudReturnFormat::Markdown)
    /// config.with_return_format("markdown")
    /// ```
    pub fn with_return_format(mut self, fmt: impl Into<SpiderCloudReturnFormat>) -> Self {
        self.return_format = fmt.into();
        self
    }

    /// Request multiple return formats in a single crawl.
    ///
    /// Duplicates are dropped while keeping first-occurrence order. The first
    /// remaining format also becomes `return_format`, i.e. the primary content
    /// (accessible via [`Page::get_content`](crate::page::Page::get_content));
    /// all formats are accessible via
    /// [`Page::get_content_for`](crate::page::Page::get_content_for).
    ///
    /// Passing an empty list leaves `return_format` untouched and stores an
    /// empty list in `return_formats`.
    ///
    /// ```ignore
    /// config.with_return_formats(vec![
    ///     SpiderCloudReturnFormat::Markdown,
    ///     SpiderCloudReturnFormat::Raw,
    /// ])
    /// ```
    pub fn with_return_formats(mut self, formats: Vec<SpiderCloudReturnFormat>) -> Self {
        // Order-preserving dedup. `contains` is a linear scan, which only needs
        // `PartialEq` and is fine for the short lists this receives.
        let mut unique: Vec<SpiderCloudReturnFormat> = Vec::with_capacity(formats.len());
        for format in formats {
            if !unique.contains(&format) {
                unique.push(format);
            }
        }
        if let Some(primary) = unique.first() {
            self.return_format = primary.clone();
        }
        self.return_formats = Some(unique);
        self
    }

    /// Check if more than one return format is requested.
    pub fn has_multiple_formats(&self) -> bool {
        self.return_formats
            .as_ref()
            .is_some_and(|formats| formats.len() > 1)
    }

    /// Set extra params for API mode, replacing any previously set map.
    pub fn with_extra_params(
        mut self,
        params: hashbrown::HashMap<String, serde_json::Value>,
    ) -> Self {
        self.extra_params = Some(params);
        self
    }

    /// Determine if a response should trigger a spider.cloud API fallback.
    ///
    /// Only the `Fallback` and `Smart` modes ever fall back; `Api`,
    /// `Unblocker` and `Proxy` always return `false`.
    ///
    /// Status triggers (`Fallback` and `Smart`):
    /// - HTTP 403 (Forbidden)
    /// - HTTP 429 (Too Many Requests)
    /// - any status `>= 500` (this includes 503 and the 520-530 range)
    ///
    /// Content triggers (`Smart` only, and only when `body` is `Some`):
    /// - an empty body
    /// - known CAPTCHA / challenge-page markers within the first 4 KiB
    pub fn should_fallback(&self, status_code: u16, body: Option<&[u8]>) -> bool {
        // Exhaustive on purpose: a newly added mode must pick its policy here.
        let inspect_body = match self.mode {
            // Already using the API, or proxy-only: never fall back.
            SpiderCloudMode::Api | SpiderCloudMode::Unblocker | SpiderCloudMode::Proxy => {
                return false;
            }
            SpiderCloudMode::Fallback => false,
            SpiderCloudMode::Smart => true,
        };

        if matches!(status_code, 403 | 429) || status_code >= 500 {
            return true;
        }

        match body {
            Some(bytes) if inspect_body => {
                bytes.is_empty() || Self::body_has_challenge_markers(bytes)
            }
            _ => false,
        }
    }

    /// Scan the start of `body` for bot-protection / CAPTCHA markers.
    ///
    /// Only the first 4096 bytes are decoded (lossily) and lowercased, so the
    /// cost stays bounded regardless of the page size.
    fn body_has_challenge_markers(body: &[u8]) -> bool {
        const SCAN_LIMIT: usize = 4096;

        let head = &body[..body.len().min(SCAN_LIMIT)];
        let lower = String::from_utf8_lossy(head).to_lowercase();
        let has = |needle: &str| lower.contains(needle);

        // Cloudflare challenge
        let cloudflare =
            has("cf-browser-verification") || (has("cloudflare") && has("challenge-platform"));

        // Generic CAPTCHA / bot detection markers
        let generic = (has("captcha") && has("challenge"))
            || has("please verify you are a human")
            || (has("access denied") && has("automated"))
            || has("bot detection");

        // Distil Networks / Imperva / Akamai patterns
        let vendor = has("distil_r_captcha")
            || has("_imperva")
            || (has("akamai") && has("bot manager"));

        cloudflare || generic || vendor
    }

    /// Get the fallback API route segment for this config (no leading slash).
    ///
    /// - `Smart` and `Unblocker` modes → `unblocker`
    /// - `Api`, `Proxy` and `Fallback` modes → `crawl`
    pub fn fallback_route(&self) -> &'static str {
        match self.mode {
            SpiderCloudMode::Smart | SpiderCloudMode::Unblocker => "unblocker",
            SpiderCloudMode::Api | SpiderCloudMode::Proxy | SpiderCloudMode::Fallback => "crawl",
        }
    }

    /// Whether this mode uses the proxy transport layer
    /// (`Proxy`, `Fallback` or `Smart`).
    pub fn uses_proxy(&self) -> bool {
        matches!(
            self.mode,
            SpiderCloudMode::Proxy | SpiderCloudMode::Fallback | SpiderCloudMode::Smart
        )
    }

    /// Default API base URL.
    fn default_api_url() -> String {
        "https://api.spider.cloud".to_string()
    }

    /// Default proxy URL.
    fn default_proxy_url() -> String {
        "https://proxy.spider.cloud".to_string()
    }
}
3089
3090// ─── Spider Browser Cloud ────────────────────────────────────────────────────
3091
/// Settings for [Spider Browser Cloud](https://spider.cloud/docs/api#browser).
///
/// Describes a remote Chromium session reached via CDP over WebSocket. The
/// default endpoint is `wss://browser.spider.cloud/v1/browser`, and the API
/// key travels in the `?token=API_KEY` query parameter.
///
/// Optional query parameters: `stealth`, `browser`, `country`.
#[cfg(all(feature = "spider_cloud", feature = "chrome"))]
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct SpiderBrowserConfig {
    /// API key / secret; sent as the `token` query parameter.
    /// Sign up at <https://spider.cloud> to get one.
    pub api_key: String,
    /// Base WebSocket URL. Defaults to `wss://browser.spider.cloud/v1/browser`.
    #[cfg_attr(feature = "serde", serde(default = "SpiderBrowserConfig::default_wss_url"))]
    pub wss_url: String,
    /// Stealth (anti-fingerprinting) switch. When `true`, `stealth=true` is sent.
    #[cfg_attr(feature = "serde", serde(default))]
    pub stealth: bool,
    /// Requested browser type (e.g. `"chrome"`, `"firefox"`), sent as `browser=<value>`.
    #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
    pub browser: Option<String>,
    /// Geo-targeting country code (e.g. `"us"`, `"gb"`), sent as `country=<value>`.
    #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
    pub country: Option<String>,
    /// Additional `(key, value)` query parameters appended to the WSS URL.
    #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
    pub extra_params: Option<Vec<(String, String)>>,
}
3124
#[cfg(all(feature = "spider_cloud", feature = "chrome"))]
impl Default for SpiderBrowserConfig {
    /// Build a config with an empty API key, the built-in WSS endpoint,
    /// stealth disabled, and no browser, country or extra parameters.
    fn default() -> Self {
        let wss_url = Self::default_wss_url();

        SpiderBrowserConfig {
            api_key: String::default(),
            wss_url,
            stealth: bool::default(),
            browser: Option::None,
            country: Option::None,
            extra_params: Option::None,
        }
    }
}
3138
#[cfg(all(feature = "spider_cloud", feature = "chrome"))]
impl SpiderBrowserConfig {
    /// Create a new config with the given API key; every other field keeps
    /// its [`Default`] value.
    pub fn new(api_key: impl Into<String>) -> Self {
        Self {
            api_key: api_key.into(),
            ..Default::default()
        }
    }

    /// Set a custom WSS base URL.
    pub fn with_wss_url(mut self, url: impl Into<String>) -> Self {
        self.wss_url = url.into();
        self
    }

    /// Enable or disable stealth mode.
    pub fn with_stealth(mut self, stealth: bool) -> Self {
        self.stealth = stealth;
        self
    }

    /// Set the browser type to request.
    pub fn with_browser(mut self, browser: impl Into<String>) -> Self {
        self.browser = Some(browser.into());
        self
    }

    /// Set the country code for geo-targeting.
    pub fn with_country(mut self, country: impl Into<String>) -> Self {
        self.country = Some(country.into());
        self
    }

    /// Set the extra query parameters, replacing any previously set list.
    ///
    /// Pass raw (unencoded) keys and values;
    /// [`connection_url`](Self::connection_url) percent-encodes them.
    pub fn with_extra_params(mut self, params: Vec<(String, String)>) -> Self {
        self.extra_params = Some(params);
        self
    }

    /// Build the full WSS connection URL with authentication and options.
    ///
    /// Parameters are appended in a fixed order: `token`, then `stealth=true`
    /// (only when enabled), `browser`, `country`, and finally the extra
    /// parameters in list order. Every key and value supplied through this
    /// config is percent-encoded, so characters such as `&`, `=`, `+`, `#` or
    /// spaces cannot break the query string or inject parameters. `wss_url`
    /// itself is used verbatim.
    ///
    /// Returns a URL like:
    /// `wss://browser.spider.cloud/v1/browser?token=KEY&stealth=true&country=us`
    pub fn connection_url(&self) -> String {
        let mut url = self.wss_url.clone();

        // Start or continue the query string. A base URL that already ends in
        // `?` or `&` needs no separator; adding one would create an empty parameter.
        if !url.contains('?') {
            url.push('?');
        } else if !url.ends_with('?') && !url.ends_with('&') {
            url.push('&');
        }
        url.push_str("token=");
        Self::push_query_component(&mut url, &self.api_key);

        if self.stealth {
            url.push_str("&stealth=true");
        }
        if let Some(browser) = self.browser.as_deref() {
            url.push_str("&browser=");
            Self::push_query_component(&mut url, browser);
        }
        if let Some(country) = self.country.as_deref() {
            url.push_str("&country=");
            Self::push_query_component(&mut url, country);
        }
        for (key, value) in self.extra_params.iter().flatten() {
            url.push('&');
            Self::push_query_component(&mut url, key);
            url.push('=');
            Self::push_query_component(&mut url, value);
        }

        url
    }

    /// Append `raw` to `out`, percent-encoding every byte that is not an
    /// RFC 3986 "unreserved" character (`A-Z a-z 0-9 - . _ ~`).
    fn push_query_component(out: &mut String, raw: &str) {
        const HEX: &[u8; 16] = b"0123456789ABCDEF";

        for &byte in raw.as_bytes() {
            match byte {
                b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'-' | b'.' | b'_' | b'~' => {
                    out.push(char::from(byte));
                }
                _ => {
                    out.push('%');
                    out.push(char::from(HEX[usize::from(byte >> 4)]));
                    out.push(char::from(HEX[usize::from(byte & 0x0F)]));
                }
            }
        }
    }

    /// Default WebSocket base URL.
    fn default_wss_url() -> String {
        "wss://browser.spider.cloud/v1/browser".to_string()
    }
}
3222
#[cfg(test)]
mod tests {
    use super::*;

    /// A default `Configuration` has the inspected switches off and the
    /// inspected optional settings unset.
    #[test]
    fn test_configuration_defaults() {
        let defaults = Configuration::default();

        // Boolean switches start disabled.
        assert!(!defaults.respect_robots_txt);
        assert!(!defaults.subdomains);
        assert!(!defaults.tld);
        assert!(!defaults.http2_prior_knowledge);

        // No delay, and nothing optional is populated.
        assert_eq!(defaults.delay, 0);
        assert!(defaults.user_agent.is_none());
        assert!(defaults.blacklist_url.is_none());
        assert!(defaults.whitelist_url.is_none());
        assert!(defaults.proxies.is_none());
    }

    /// `Loose` is the default redirect policy and all three variants differ.
    #[test]
    fn test_redirect_policy_variants() {
        let (strict, none) = (RedirectPolicy::Strict, RedirectPolicy::None);

        assert_eq!(RedirectPolicy::default(), RedirectPolicy::Loose);
        assert_ne!(strict, RedirectPolicy::Loose);
        assert_ne!(none, RedirectPolicy::Loose);
        assert_ne!(strict, none);
    }

    /// The redirect-limit flag stays off until `with_redirect_limit` is called.
    #[test]
    fn test_redirect_limit_is_opt_in_for_chrome_path() {
        // Untouched config: the flag must not be set.
        let untouched = Configuration::default();
        assert!(
            !untouched.redirect_limit_set,
            "Configuration::default() must not claim the redirect_limit was set"
        );

        // Calling the builder sets the flag and stores the cap.
        let mut limited = Configuration::default();
        limited.with_redirect_limit(3);
        assert!(limited.redirect_limit_set);
        assert_eq!(limited.redirect_limit, 3);
    }

    /// `No` is the default proxy-ignore setting and all three variants differ.
    #[test]
    fn test_proxy_ignore_variants() {
        let (chrome, http) = (ProxyIgnore::Chrome, ProxyIgnore::Http);

        assert_eq!(ProxyIgnore::default(), ProxyIgnore::No);
        assert_ne!(chrome, ProxyIgnore::No);
        assert_ne!(http, ProxyIgnore::No);
        assert_ne!(chrome, http);
    }

    /// A `RequestProxy` built field-by-field keeps the values it was given.
    #[test]
    fn test_request_proxy_construction() {
        let built = RequestProxy {
            addr: String::from("http://proxy.example.com:8080"),
            ignore: ProxyIgnore::No,
        };

        assert_eq!(built.addr, "http://proxy.example.com:8080");
        assert_eq!(built.ignore, ProxyIgnore::No);
    }

    /// The default `RequestProxy` has an empty address and ignores nothing.
    #[test]
    fn test_request_proxy_default() {
        let fallback = RequestProxy::default();

        assert!(fallback.addr.is_empty());
        assert_eq!(fallback.ignore, ProxyIgnore::No);
    }

    /// Two blacklist entries are stored as two entries.
    #[test]
    fn test_configuration_blacklist_setup() {
        let mut cfg = Configuration::default();
        cfg.blacklist_url = Some(vec![
            "https://example.com/private".into(),
            "https://example.com/admin".into(),
        ]);

        let stored = cfg.blacklist_url.as_ref().map(|entries| entries.len());
        assert_eq!(stored, Some(2));
    }

    /// A single whitelist entry is stored as one entry.
    #[test]
    fn test_configuration_whitelist_setup() {
        let mut cfg = Configuration::default();
        cfg.whitelist_url = Some(vec!["https://example.com/public".into()]);

        let stored = cfg.whitelist_url.as_ref().map(|entries| entries.len());
        assert_eq!(stored, Some(1));
    }

    /// External domains are looked up case-insensitively.
    #[test]
    fn test_configuration_external_domains() {
        use case_insensitive_string::CaseInsensitiveString;

        let mut cfg = Configuration::default();
        cfg.external_domains_caseless = Arc::new(
            [
                CaseInsensitiveString::from("Example.Com"),
                CaseInsensitiveString::from("OTHER.org"),
            ]
            .into_iter()
            .collect(),
        );

        assert_eq!(cfg.external_domains_caseless.len(), 2);

        // Inserted as "Example.Com", probed in lowercase.
        let probe = CaseInsensitiveString::from("example.com");
        assert!(cfg.external_domains_caseless.contains(&probe));
    }

    /// A budget entry can be stored and read back by path.
    #[test]
    fn test_configuration_budget() {
        use case_insensitive_string::CaseInsensitiveString;

        let mut limits = hashbrown::HashMap::new();
        limits.insert(CaseInsensitiveString::from("/path"), 100u32);

        let mut cfg = Configuration::default();
        cfg.budget = Some(limits);
        assert!(cfg.budget.is_some());

        let looked_up = cfg
            .budget
            .as_ref()
            .unwrap()
            .get(&CaseInsensitiveString::from("/path"))
            .copied();
        assert_eq!(looked_up, Some(100u32));
    }

    /// Without the `regex` feature, the default allow list is empty.
    #[cfg(not(feature = "regex"))]
    #[test]
    fn test_allow_list_set_default() {
        let empty = AllowListSet::default();
        assert!(empty.0.is_empty());
    }

    /// Building the engine keeps the vision model, the text model and the
    /// routing mode that were configured.
    #[cfg(feature = "agent")]
    #[test]
    fn test_build_remote_multimodal_engine_preserves_dual_models() {
        use crate::features::automation::{
            ModelEndpoint, RemoteMultimodalConfigs, VisionRouteMode,
        };

        let vision = ModelEndpoint::new("vision-model").with_api_key("vision-key");
        let text = ModelEndpoint::new("text-model")
            .with_api_url("https://text.example.com/v1/chat/completions")
            .with_api_key("text-key");

        let multimodal = RemoteMultimodalConfigs::new(
            "https://api.example.com/v1/chat/completions",
            "primary-model",
        )
        .with_vision_model(vision)
        .with_text_model(text)
        .with_vision_route_mode(VisionRouteMode::TextFirst);

        let mut cfg = Configuration::default();
        cfg.remote_multimodal = Some(Box::new(multimodal));

        let engine = cfg
            .build_remote_multimodal_engine()
            .expect("engine should be built");

        let vision_name = engine.vision_model.as_ref().map(|m| m.model_name.as_str());
        let text_name = engine.text_model.as_ref().map(|m| m.model_name.as_str());

        assert_eq!(vision_name, Some("vision-model"));
        assert_eq!(text_name, Some("text-model"));
        assert_eq!(engine.vision_route_mode, VisionRouteMode::TextFirst);
    }

    /// `SpiderBrowserConfig::new` stores the key and leaves the rest at defaults.
    #[cfg(all(feature = "spider_cloud", feature = "chrome"))]
    #[test]
    fn test_spider_browser_config_defaults() {
        let browser_cfg = SpiderBrowserConfig::new("test-key");

        assert_eq!(browser_cfg.api_key, "test-key");
        assert_eq!(browser_cfg.wss_url, "wss://browser.spider.cloud/v1/browser");
        assert!(!browser_cfg.stealth);
        assert!(browser_cfg.browser.is_none());
        assert!(browser_cfg.country.is_none());
        assert!(browser_cfg.extra_params.is_none());
    }

    /// With only a key, the URL is the default endpoint plus `token`.
    #[cfg(all(feature = "spider_cloud", feature = "chrome"))]
    #[test]
    fn test_spider_browser_connection_url_basic() {
        let url = SpiderBrowserConfig::new("sk-abc123").connection_url();

        assert_eq!(url, "wss://browser.spider.cloud/v1/browser?token=sk-abc123");
    }

    /// Every option shows up in the URL, in a fixed order.
    #[cfg(all(feature = "spider_cloud", feature = "chrome"))]
    #[test]
    fn test_spider_browser_connection_url_full() {
        let extras = vec![("timeout".into(), "30000".into())];
        let url = SpiderBrowserConfig::new("sk-abc123")
            .with_stealth(true)
            .with_browser("chrome")
            .with_country("us")
            .with_extra_params(extras)
            .connection_url();

        assert_eq!(
            url,
            "wss://browser.spider.cloud/v1/browser?token=sk-abc123&stealth=true&browser=chrome&country=us&timeout=30000"
        );
    }

    /// A custom WSS base URL replaces the default endpoint.
    #[cfg(all(feature = "spider_cloud", feature = "chrome"))]
    #[test]
    fn test_spider_browser_connection_url_custom_wss() {
        let url = SpiderBrowserConfig::new("key")
            .with_wss_url("wss://custom.browser.example.com/v1/browser")
            .connection_url();

        assert_eq!(url, "wss://custom.browser.example.com/v1/browser?token=key");
    }

    /// `with_spider_browser` sets the Chrome connection URL and stores the browser config.
    #[cfg(all(feature = "spider_cloud", feature = "chrome"))]
    #[test]
    fn test_with_spider_browser_sets_chrome_connection() {
        let mut cfg = Configuration::default();
        cfg.with_spider_browser("my-api-key");

        let expected = "wss://browser.spider.cloud/v1/browser?token=my-api-key";
        assert_eq!(cfg.chrome_connection_url.as_deref(), Some(expected));
        assert!(cfg.spider_browser.is_some());
    }

    /// Stealth and country options reach the Chrome connection URL.
    #[cfg(all(feature = "spider_cloud", feature = "chrome"))]
    #[test]
    fn test_with_spider_browser_config_stealth() {
        let stealthy = SpiderBrowserConfig::new("key")
            .with_stealth(true)
            .with_country("gb");

        let mut cfg = Configuration::default();
        cfg.with_spider_browser_config(stealthy);

        let expected = "wss://browser.spider.cloud/v1/browser?token=key&stealth=true&country=gb";
        assert_eq!(cfg.chrome_connection_url.as_deref(), Some(expected));
    }
}
spider/configuration.rs

spider/
configuration.rs