Skip to main content

Configuration

Struct Configuration 

Source
pub struct Configuration {
Show 92 fields pub respect_robots_txt: bool, pub subdomains: bool, pub tld: bool, pub crawl_timeout: Option<Duration>, pub preserve_host_header: bool, pub blacklist_url: Option<Vec<CompactString>>, pub whitelist_url: Option<Vec<CompactString>>, pub user_agent: Option<Box<CompactString>>, pub delay: u64, pub request_timeout: Option<Duration>, pub http2_prior_knowledge: bool, pub proxies: Option<Vec<RequestProxy>>, pub proxies_by_kind: Option<HashMap<ProxyKind, Vec<RequestProxy>>>, pub headers: Option<Box<SerializableHeaderMap>>, pub sitemap_url: Option<Box<CompactString>>, pub ignore_sitemap: bool, pub redirect_limit: usize, pub redirect_policy: RedirectPolicy, pub redirect_limit_set: bool, pub max_main_frame_navigations: Option<u32>, pub cookie_str: String, pub emulation: Option<Emulation>, pub cron_str: String, pub cron_type: CronType, pub depth: usize, pub depth_distance: usize, pub stealth_mode: Tier, pub viewport: Option<Viewport>, pub budget: Option<HashMap<CaseInsensitiveString, u32>>, pub wild_card_budgeting: bool, pub external_domains_caseless: Arc<HashSet<CaseInsensitiveString>>, pub full_resources: bool, pub accept_invalid_certs: bool, pub auth_challenge_response: Option<AuthChallengeResponse>, pub openai_config: Option<Box<GPTConfigs>>, pub gemini_config: Option<Box<GeminiConfigs>>, pub remote_multimodal: Option<Box<RemoteMultimodalConfigs>>, pub shared_queue: bool, pub return_page_links: bool, pub retry: u8, pub custom_antibot: Option<CustomAntibotPatterns>, pub no_control_thread: bool, pub only_html: bool, pub concurrency_limit: Option<usize>, pub normalize: bool, pub shared: bool, pub modify_headers: bool, pub modify_http_client_headers: bool, pub cache: bool, pub cache_skip_browser: bool, pub cache_namespace: Option<Box<String>>, pub service_worker_enabled: bool, pub timezone_id: Option<Box<String>>, pub locale: Option<Box<String>>, pub evaluate_on_new_document: Option<Box<String>>, pub dismiss_dialogs: Option<bool>, pub wait_for: Option<WaitFor>, 
pub screenshot: Option<ScreenShotConfig>, pub track_events: Option<ChromeEventTracker>, pub fingerprint: Fingerprint, pub chrome_connection_url: Option<String>, pub chrome_connection_urls: Option<Vec<String>>, pub chrome_first_byte_timeout: Option<Duration>, pub chrome_first_byte_timeout_jitter: Option<Duration>, pub http_first_byte_timeout: Option<Duration>, pub http_first_byte_timeout_jitter: Option<Duration>, pub execution_scripts: Option<ExecutionScripts>, pub automation_scripts: Option<AutomationScripts>, pub chrome_intercept: RequestInterceptConfiguration, pub referer: Option<String>, pub max_page_bytes: Option<f64>, pub max_bytes_allowed: Option<u64>, pub disable_log: bool, pub auto_geolocation: bool, pub cache_policy: Option<BasicCachePolicy>, pub bypass_csp: bool, pub disable_javascript: bool, pub network_interface: Option<String>, pub local_address: Option<IpAddr>, pub default_http_connect_timeout: Option<Duration>, pub default_http_read_timeout: Option<Duration>, pub webdriver_config: Option<Box<WebDriverConfig>>, pub search_config: Option<Box<SearchConfig>>, pub spider_cloud: Option<Box<SpiderCloudConfig>>, pub spider_browser: Option<Box<SpiderBrowserConfig>>, pub hedge: Option<HedgeConfig>, pub auto_throttle: Option<AutoThrottleConfig>, pub etag_cache: bool, pub warc: Option<WarcConfig>, pub parallel_backends: Option<ParallelBackendsConfig>, pub worker_connection_urls: Option<Vec<String>>, pub scraper_worker_connection_urls: Option<Vec<String>>, /* private fields */
}
Expand description

Structure to configure Website crawler

use spider::website::Website;
let mut website: Website = Website::new("https://choosealicense.com");
website.configuration.blacklist_url.insert(Default::default()).push("https://choosealicense.com/licenses/".to_string().into());
website.configuration.respect_robots_txt = true;
website.configuration.subdomains = true;
website.configuration.tld = true;

Fields§

§respect_robots_txt: bool

Respect the robots.txt file and do not scrape disallowed files. This may slow down crawls if the robots.txt file includes a delay.

§subdomains: bool

Allow sub-domains.

§tld: bool

Allow all tlds for domain.

§crawl_timeout: Option<Duration>

The max timeout for the crawl.

§preserve_host_header: bool

Preserve the HTTP host header from being included.

§blacklist_url: Option<Vec<CompactString>>

List of pages to not crawl. [optional: regex pattern matching]

§whitelist_url: Option<Vec<CompactString>>

List of pages to only crawl. [optional: regex pattern matching]

§user_agent: Option<Box<CompactString>>

User-Agent for request.

§delay: u64

Polite crawling delay in milliseconds.

§request_timeout: Option<Duration>

Request max timeout per page. By default the request times out in 15s. Set to None to disable.

§http2_prior_knowledge: bool

Use HTTP2 for connection. Enable if you know the website has http2 support.

§proxies: Option<Vec<RequestProxy>>

Use proxy list for performing network request.

§proxies_by_kind: Option<HashMap<ProxyKind, Vec<RequestProxy>>>

Optional sidecar map of alternative proxy lists keyed by ProxyKind.

Lets a crate::proxy_strategy::ProxyStrategy route a request through a non-default proxy set without touching proxies or RequestProxy itself. When None (the default) or when the strategy returns a kind that has no entry here, requests fall through to proxies and the existing fast path — no behavior change.

Lookup is by enum equality / hash; the ProxyKind::Custom variant lets consumers introduce their own kinds without an upstream change. Spider never writes to this map after configuration; runtime lazy state lives on the Website.

§headers: Option<Box<SerializableHeaderMap>>

Headers to include with request.

§sitemap_url: Option<Box<CompactString>>
Available on crate feature sitemap only.

Include a sitemap in response of the crawl.

§ignore_sitemap: bool
Available on crate feature sitemap only.

Prevent including the sitemap links with the crawl.

§redirect_limit: usize

The max redirections allowed for request.

§redirect_policy: RedirectPolicy

The redirect policy type to use.

§redirect_limit_set: bool

Whether redirect_limit was explicitly set by the caller.

Set to true by with_redirect_limit() and by the external-config loader when redirect_limit is provided. Chrome-path enforcement reads this flag so it only caps redirects when the user opted in — preserving prior behavior on pages whose navigation chains exceed the HTTP default of 7.

§max_main_frame_navigations: Option<u32>

Cap on main-frame cross-document navigations during a single Chrome goto (requires the chrome feature — no effect on the HTTP path).

Defends against JS / meta-refresh / HTTP-Refresh-header loops that bypass the HTTP redirect cap because each hop is a fresh document rather than a 3xx redirect. None disables the guard (default) so prior behavior is preserved; Some(n) aborts the navigation with a net::ERR_TOO_MANY_NAVIGATIONS error once the main frame has navigated more than n times since goto.

§cookie_str: String
Available on crate feature cookies only.

Cookie string to use for network requests ex: “foo=bar; Domain=blog.spider”

§emulation: Option<Emulation>
Available on crate feature wreq only.

The type of request emulation. This does nothing without the flag sync enabled.

§cron_str: String
Available on crate feature cron only.

Cron string to perform crawls - use https://crontab.guru/ to help generate a valid cron for your needs.

§cron_type: CronType
Available on crate feature cron only.

The type of cron to run either crawl or scrape.

§depth: usize

The max depth to crawl for a website. Defaults to 25 to help prevent infinite recursion.

§depth_distance: usize

The depth to crawl pertaining to the root.

§stealth_mode: Tier

Use stealth mode for requests.

§viewport: Option<Viewport>

Configure the viewport for chrome and viewport headers.

§budget: Option<HashMap<CaseInsensitiveString, u32>>

Crawl budget for the paths. This helps prevent crawling extra pages and limiting the amount.

§wild_card_budgeting: bool

If wild card budgeting is found for the website.

§external_domains_caseless: Arc<HashSet<CaseInsensitiveString>>

External domains to include case-insensitive.

§full_resources: bool

Collect all the resources found on the page.

§accept_invalid_certs: bool

Dangerously accept invalid certificates.

§auth_challenge_response: Option<AuthChallengeResponse>

The auth challenge response. The ‘chrome_intercept’ flag is also required in order to intercept the response.

§openai_config: Option<Box<GPTConfigs>>

The OpenAI configs to use to help drive the chrome browser. This does nothing without the ‘openai’ flag.

§gemini_config: Option<Box<GeminiConfigs>>

The Gemini configs to use to help drive the chrome browser. This does nothing without the ‘gemini’ flag.

§remote_multimodal: Option<Box<RemoteMultimodalConfigs>>

Remote multimodal automation config (vision + LLM-driven steps). Requires the agent feature for full functionality, uses stub type otherwise.

§shared_queue: bool

Use a shared queue strategy when crawling. This can scale workloads evenly that do not need priority.

§return_page_links: bool

Return the page links in the subscription channels. This does nothing without the flag sync enabled.

§retry: u8

Retry count to attempt to swap proxies etc.

§custom_antibot: Option<CustomAntibotPatterns>

Custom antibot detection patterns. When set, these are matched in addition to the built-in patterns. Any match triggers AntiBotTech::Custom.

§no_control_thread: bool

Skip spawning a control thread that can pause, start, and shutdown the crawl.

§only_html: bool

Expect only to handle HTML to save on resources. This mainly only blocks the crawling and returning of resources from the server.

§concurrency_limit: Option<usize>

The concurrency limits to apply.

§normalize: bool

Normalize the HTML by de-duplicating the content.

§shared: bool

Share the state of the crawl. Requires the ‘disk’ feature flag.

§modify_headers: bool

Modify the headers to act like a real-browser

§modify_http_client_headers: bool

Modify the HTTP client headers only to act like a real-browser

§cache: bool
Available on crate features cache_request or chrome_remote_cache or chrome only.

Cache the page following HTTP caching rules.

§cache_skip_browser: bool
Available on crate features cache_request or chrome_remote_cache or chrome only.

Skip browser rendering entirely if cached response exists. When enabled, returns cached HTML directly without launching Chrome.

§cache_namespace: Option<Box<String>>

Namespace mixed into every cache key so logically distinct variants (country, proxy pool, tenant, A/B bucket, device profile, …) never collide on the same cached bytes. Free-form — spider treats it as an opaque partition string. None uses the default (empty) namespace. Always present (zero cost when unset); its effect is gated by whichever cache feature is active.

§service_worker_enabled: bool
Available on crate feature chrome only.

Enable or disable service workers. Enabled by default.

§timezone_id: Option<Box<String>>
Available on crate feature chrome only.

Overrides default host system timezone with the specified one.

§locale: Option<Box<String>>
Available on crate feature chrome only.

Overrides default host system locale with the specified one.

§evaluate_on_new_document: Option<Box<String>>
Available on crate feature chrome only.

Set a custom script to eval on each new document.

§dismiss_dialogs: Option<bool>
Available on crate feature chrome only.

Dismiss dialogs.

§wait_for: Option<WaitFor>
Available on crate feature chrome only.

Wait for options for the page.

§screenshot: Option<ScreenShotConfig>
Available on crate feature chrome only.

Take a screenshot of the page.

§track_events: Option<ChromeEventTracker>
Available on crate feature chrome only.

Track the events made via chrome.

§fingerprint: Fingerprint
Available on crate feature chrome only.

Setup fingerprint ID on each document. This does nothing without the flag chrome enabled.

§chrome_connection_url: Option<String>
Available on crate feature chrome only.

The chrome connection url. Useful for targeting different headless instances. Defaults to using the env CHROME_URL.

§chrome_connection_urls: Option<Vec<String>>
Available on crate feature chrome only.

Multiple remote Chrome connection URLs for failover. When a connection fails after retries, the next URL is tried automatically. Requires the chrome feature. When set, takes priority over chrome_connection_url.

§chrome_first_byte_timeout: Option<Duration>
Available on crate feature chrome only.

First-byte watchdog for Chrome navigations. When set, fires if no Network.dataReceived (or Network.responseReceived) event arrives within this duration after the listener attaches. On fire the page is force-stopped and (when a browser_dead flag is plumbed through ChromeFetchParams) it is flipped so the website-level retry loop can rotate the backend. None (default) disables the watchdog and the legacy chunk-idle timeout (SPIDER_CHUNK_IDLE_TIMEOUT_SECS, default 30s) is the only stall guard.

§chrome_first_byte_timeout_jitter: Option<Duration>
Available on crate feature chrome only.

Per-fetch jitter window applied on top of chrome_first_byte_timeout. When Some(j), each fetch picks actual_timeout = base + rand(0..j) so concurrent fetches don’t all expire at exactly the same moment (avoids thundering-herd backend rotation when a backend goes dark). None (default) means no jitter — every fetch uses the configured base timeout exactly. Ignored when chrome_first_byte_timeout is None (no watchdog to jitter).

§http_first_byte_timeout: Option<Duration>

First-byte watchdog for HTTP fetches. When Some(d), each client.get(url).send().await is wrapped in tokio::time::timeout(base + rand(0..jitter)). On timeout the in-flight connect / TLS / header future is dropped (cancels the request) and a synthetic 524 GATEWAY_TIMEOUT response is built so the existing retry path rotates to the next proxy. Covers the gap between connect_timeout (TCP/TLS handshake) and chunk_idle_timeout (per-chunk idle while streaming) where a proxy can accept the connection but never produce headers. None (default) disables the watchdog — request_timeout and chunk_idle_timeout remain the only stall guards.

§http_first_byte_timeout_jitter: Option<Duration>

Per-fetch jitter window applied on top of http_first_byte_timeout. Same semantics as chrome_first_byte_timeout_jitter. None (default) means no jitter; ignored when the base is None.

§execution_scripts: Option<ExecutionScripts>
Available on crate feature chrome only.

Scripts to execute for individual pages, the full path of the url is required for an exact match. This is useful for running one off JS on pages like performing custom login actions.

§automation_scripts: Option<AutomationScripts>
Available on crate feature chrome only.

Web automation scripts to run up to a duration of 60 seconds.

§chrome_intercept: RequestInterceptConfiguration
Available on crate feature chrome only.

Setup network interception for request. This does nothing without the flag chrome_intercept enabled.

§referer: Option<String>

The referer to use.

§max_page_bytes: Option<f64>

Determine the max bytes per page.

§max_bytes_allowed: Option<u64>

Determine the max bytes per browser context.

§disable_log: bool
Available on crate feature chrome only.

Disables log domain, prevents further log entries from being reported to the client. This does nothing without the flag chrome enabled.

§auto_geolocation: bool
Available on crate feature chrome only.

Automatic locale and timezone handling via third party. This does nothing without the flag chrome enabled.

§cache_policy: Option<BasicCachePolicy>

The cache policy to use.

§bypass_csp: bool
Available on crate feature chrome only.

Enables bypassing CSP. This does nothing without the flag chrome enabled.

§disable_javascript: bool
Available on crate feature chrome only.

Disables JavaScript execution on the page. This does nothing without the flag chrome enabled.

§network_interface: Option<String>

Bind the connections only on the network interface.

§local_address: Option<IpAddr>

Bind to a local IP Address.

§default_http_connect_timeout: Option<Duration>

The default http connect timeout

§default_http_read_timeout: Option<Duration>

The default http read timeout

§webdriver_config: Option<Box<WebDriverConfig>>
Available on crate feature webdriver only.

WebDriver configuration for browser automation. This does nothing without the webdriver flag enabled.

§search_config: Option<Box<SearchConfig>>
Available on crate feature search only.

Search provider configuration for web search integration. This does nothing without the search flag enabled.

§spider_cloud: Option<Box<SpiderCloudConfig>>
Available on crate feature spider_cloud only.

Spider Cloud config. See https://spider.cloud.

§spider_browser: Option<Box<SpiderBrowserConfig>>
Available on crate features chrome and spider_cloud only.

Spider Browser Cloud config for remote CDP via wss://browser.spider.cloud.

§hedge: Option<HedgeConfig>
Available on crate feature hedge only.

Hedged request configuration for work-stealing on slow requests. When enabled, fires a duplicate request on a different proxy after a delay.

§auto_throttle: Option<AutoThrottleConfig>
Available on crate feature auto_throttle only.

Latency-based auto-throttle configuration. When enabled, dynamically adjusts per-domain crawl delay based on measured server response time.

§etag_cache: bool
Available on crate feature etag_cache only.

Enable ETag / conditional request caching. When true, stores ETag and Last-Modified headers from responses and sends If-None-Match / If-Modified-Since on subsequent requests to the same URL, allowing servers to respond with lightweight 304 Not Modified.

§warc: Option<WarcConfig>
Available on crate feature warc only.

WARC output configuration. When set, the crawl writes a WARC 1.1 file containing all fetched pages as response records.

§parallel_backends: Option<ParallelBackendsConfig>
Available on crate feature parallel_backends only.

Parallel crawl backend configuration. Race CDP / Servo backends alongside the primary crawl path. Requires the parallel_backends feature.

§worker_connection_urls: Option<Vec<String>>
Available on crate feature decentralized only.

Per-Website remote Spider worker URLs used for crawl requests. When None, falls back to the process-wide SPIDER_WORKER env var (or its default), preserving pre-2.51.x behavior. When Some, overrides the global pool for this Website only.

§scraper_worker_connection_urls: Option<Vec<String>>
Available on crate feature decentralized only.

Per-Website remote Spider worker URLs used for scrape requests. When None, falls back to the process-wide SPIDER_WORKER_SCRAPER env var (or its default), preserving pre-2.51.x behavior. When Some, overrides the global pool for this Website only.

Implementations§

Source§

impl Configuration

Source

pub fn new() -> Self

Available on crate feature chrome only.

Represents crawl configuration for a website.

Source

pub fn build_remote_multimodal_engine(&self) -> Option<RemoteMultimodalEngine>

Available on crate feature agent only.

Build a RemoteMultimodalEngine from RemoteMultimodalConfigs. Requires the agent feature.

Source

pub fn get_blacklist(&self) -> Box<RegexSet>

Available on crate feature regex only.

Compile the regex for the blacklist.

Source

pub fn set_whitelist(&mut self)

Set the whitelist

Source

pub fn configure_allowlist(&mut self)

Configure the allow list.

Source

pub fn get_blacklist_compiled(&self) -> &AllowList

Get the blacklist compiled.

Source

pub fn configure_budget(&mut self)

Setup the budget for crawling.

Source

pub fn get_whitelist_compiled(&self) -> &AllowList

Get the whitelist compiled.

Source

pub fn get_whitelist(&self) -> Box<RegexSet>

Available on crate feature regex only.

Compile the regex for the whitelist.

Source

pub fn add_sitemap_to_whitelist(&mut self) -> SitemapWhitelistChanges

Available on crate feature sitemap only.

Add sitemap paths to the whitelist and track what was added.

Source

pub fn remove_sitemap_from_whitelist( &mut self, changes: SitemapWhitelistChanges, )

Available on crate feature sitemap only.

Revert any changes made to the whitelist by add_sitemap_to_whitelist.

Source

pub fn with_respect_robots_txt(&mut self, respect_robots_txt: bool) -> &mut Self

Respect robots.txt file.

Source

pub fn with_subdomains(&mut self, subdomains: bool) -> &mut Self

Include subdomains detection.

Source

pub fn with_csp_bypass(&mut self, enabled: bool) -> &mut Self

Available on crate feature chrome only.

Bypass CSP protection detection. This does nothing without the feat flag chrome enabled.

Source

pub fn with_disable_javascript(&mut self, disabled: bool) -> &mut Self

Available on crate feature chrome only.

Disable JavaScript execution on the page. This does nothing without the feat flag chrome enabled.

Source

pub fn with_network_interface( &mut self, network_interface: Option<String>, ) -> &mut Self

Bind the connections only on the network interface.

Source

pub fn with_local_address(&mut self, local_address: Option<IpAddr>) -> &mut Self

Bind to a local IP Address.

Source

pub fn with_tld(&mut self, tld: bool) -> &mut Self

Include tld detection.

Source

pub fn with_crawl_timeout( &mut self, crawl_timeout: Option<Duration>, ) -> &mut Self

The max duration for the crawl. This is useful when websites use a robots.txt with long durations and throttle the timeout removing the full concurrency.

Source

pub fn with_delay(&mut self, delay: u64) -> &mut Self

Delay between request as ms.

Source

pub fn with_http2_prior_knowledge( &mut self, http2_prior_knowledge: bool, ) -> &mut Self

Only use HTTP/2.

Source

pub fn with_request_timeout( &mut self, request_timeout: Option<Duration>, ) -> &mut Self

Max time to wait for request. By default request times out in 15s. Set to None to disable.

Source

pub fn with_sitemap(&mut self, sitemap_url: Option<&str>) -> &mut Self

Available on crate feature sitemap only.

Set the sitemap url. This does nothing without the sitemap feature flag.

Source

pub fn with_ignore_sitemap(&mut self, ignore_sitemap: bool) -> &mut Self

Available on crate feature sitemap only.

Ignore the sitemap when crawling. This method does nothing if the sitemap feature is not enabled.

Source

pub fn with_user_agent(&mut self, user_agent: Option<&str>) -> &mut Self

Add user agent to request.

Source

pub fn with_preserve_host_header(&mut self, preserve: bool) -> &mut Self

Preserve the HOST header.

Source

pub fn with_remote_multimodal( &mut self, remote_multimodal: Option<RemoteMultimodalConfigs>, ) -> &mut Self

Available on crate feature agent only.

Use a remote multimodal model to drive browser automation. Requires the agent feature.

Source

pub fn with_openai(&mut self, openai_config: Option<GPTConfigs>) -> &mut Self

Available on crate feature openai only.

The OpenAI configs to use to drive the browser. This method does nothing if the openai feature is not enabled.

Source

pub fn with_gemini(&mut self, gemini_config: Option<GeminiConfigs>) -> &mut Self

Available on crate feature gemini only.

The Gemini configs to use to drive the browser. This method does nothing if the gemini feature is not enabled.

Source

pub fn with_cookies(&mut self, cookie_str: &str) -> &mut Self

Available on crate feature cookies only.

Cookie string to use in request. This does nothing without the cookies flag enabled.

Source

pub fn with_fingerprint(&mut self, fingerprint: bool) -> &mut Self

Available on crate feature chrome only.

Set custom fingerprint ID for request. This does nothing without the chrome flag enabled.

Source

pub fn with_fingerprint_advanced( &mut self, fingerprint: Fingerprint, ) -> &mut Self

Available on crate feature chrome only.

Set custom fingerprint ID for request. This does nothing without the chrome flag enabled.

Source

pub fn with_proxies(&mut self, proxies: Option<Vec<String>>) -> &mut Self

Use proxies for request.

Source

pub fn with_proxies_direct( &mut self, proxies: Option<Vec<RequestProxy>>, ) -> &mut Self

Use proxies for request with control between chrome and http.

Source

pub fn with_proxies_for_kind( &mut self, kind: ProxyKind, proxies: Option<Vec<RequestProxy>>, ) -> &mut Self

Set the proxy override list for a specific ProxyKind.

Lazily registers a sidecar mapping that a crate::proxy_strategy::ProxyStrategy can route requests through. Pass None for proxies to remove a previously-set kind. Setting a kind to Some(empty_vec) is allowed and means “route here but with no proxy” — the secondary client built for this kind will be unproxied.

Has no effect on the primary Configuration::proxies list or on requests that route to ProxyKind::Default.

Source

pub fn with_shared_queue(&mut self, shared_queue: bool) -> &mut Self

Use a shared semaphore to evenly handle workloads. The default is false.

Source

pub fn with_blacklist_url<T>( &mut self, blacklist_url: Option<Vec<T>>, ) -> &mut Self
where Vec<CompactString>: From<Vec<T>>,

Add blacklist urls to ignore.

Source

pub fn with_whitelist_url<T>( &mut self, whitelist_url: Option<Vec<T>>, ) -> &mut Self
where Vec<CompactString>: From<Vec<T>>,

Add whitelist urls to allow.

Return the links found on the page in the channel subscriptions. This method does nothing if the decentralized is enabled.

Source

pub fn with_headers(&mut self, headers: Option<HeaderMap>) -> &mut Self

Set HTTP headers for request using reqwest::header::HeaderMap.

Source

pub fn with_redirect_limit(&mut self, redirect_limit: usize) -> &mut Self

Set the max redirects allowed for request.

Calling this method opts in to redirect-cap enforcement on both the HTTP and Chrome paths. Without it, Chrome defers to Chromium’s internal ~20-hop cap to preserve prior behavior.

Source

pub fn with_max_main_frame_navigations(&mut self, cap: Option<u32>) -> &mut Self

Cap the number of main-frame cross-document navigations per Chrome goto() call. None disables the guard.

This is the JS / meta-refresh counterpart to with_redirect_limit — the HTTP redirect cap cannot catch loops implemented via location.href, <meta http-equiv="refresh">, or Refresh: headers, because each hop is a fresh document rather than a 3xx redirect.

Source

pub fn with_redirect_policy(&mut self, policy: RedirectPolicy) -> &mut Self

Set the redirect policy to use.

Source

pub fn with_referer(&mut self, referer: Option<String>) -> &mut Self

Add a referer (mis-spelling) to the request.

Source

pub fn with_referrer(&mut self, referer: Option<String>) -> &mut Self

Add a referer to the request.

Source

pub fn with_full_resources(&mut self, full_resources: bool) -> &mut Self

Determine whether to collect all the resources found on pages.

Source

pub fn with_dismiss_dialogs(&mut self, dismiss_dialogs: bool) -> &mut Self

Available on crate feature chrome only.

Determine whether to dismiss dialogs. This method does nothing if the chrome feature is not enabled.

Source

pub fn with_emulation(&mut self, emulation: Option<Emulation>) -> &mut Self

Available on crate feature wreq only.

Set the request emulation. This method does nothing if the wreq flag is not enabled.

Source

pub fn with_cron(&mut self, cron_str: &str, cron_type: CronType) -> &mut Self

Available on crate feature cron only.

Setup cron jobs to run. This does nothing without the cron flag enabled.

Source

pub fn with_limit(&mut self, limit: u32) -> &mut Self

Set a crawl page limit. If the value is 0 there is no limit.

Source

pub fn with_concurrency_limit(&mut self, limit: Option<usize>) -> &mut Self

Set the concurrency limits. Set the value to None to use the default limits based on the system CPU cores * n.

Source

pub fn with_auth_challenge_response( &mut self, auth_challenge_response: Option<AuthChallengeResponse>, ) -> &mut Self

Available on crate feature chrome only.

Set the authentication challenge response. This does nothing without the feat flag chrome enabled.

Source

pub fn with_evaluate_on_new_document( &mut self, evaluate_on_new_document: Option<Box<String>>, ) -> &mut Self

Available on crate feature chrome only.

Set a custom script to evaluate on new document creation. This does nothing without the feat flag chrome enabled.

Source

pub fn with_depth(&mut self, depth: usize) -> &mut Self

Set a crawl depth limit. If the value is 0 there is no limit.

Source

pub fn with_caching(&mut self, cache: bool) -> &mut Self

Available on crate features cache_request or chrome_remote_cache only.

Cache the page following HTTP rules. This method does nothing if the cache feature is not enabled.

Source

pub fn with_cache_skip_browser(&mut self, skip: bool) -> &mut Self

Available on crate features cache_request or chrome_remote_cache only.

Skip browser rendering entirely if cached response exists. When enabled with caching, returns cached HTML directly without launching Chrome. This is useful for performance when you only need the cached content.

Source

pub fn with_cache_namespace<S: Into<String>>( &mut self, namespace: Option<S>, ) -> &mut Self

Partition the cache by an opaque namespace so logically distinct variants of the same URL (country, proxy pool, tenant, A/B bucket, device profile, …) never collide on the same cached bytes. None uses the default (empty) namespace. Has no observable effect when no cache feature is active, but the configuration is always settable regardless of feature flags.

Source

pub fn with_chrome_remote_cache_read_only( &mut self, _read_only: bool, ) -> &mut Self

Available on non-crate feature chrome_remote_cache only.

Enable read-only mode for the remote Chrome cache. This method does nothing without the chrome_remote_cache feature.

Source

pub fn with_remote_cache_skip_browser(&mut self, _enabled: bool) -> &mut Self

Available on non-crate feature chrome_remote_cache only.

Enable publishing of fresh HTTP (skip_browser) responses to the shared remote cache worker. This method does nothing without the chrome_remote_cache feature.

Source

pub fn with_chrome_remote_cache_main_doc_only( &mut self, _enabled: bool, ) -> &mut Self

Available on non-crate feature chrome_remote_cache only.

Restrict chrome remote-cache dumps to the main document only. This method does nothing without the chrome_remote_cache feature.

Source

pub fn with_service_worker_enabled(&mut self, enabled: bool) -> &mut Self

Available on crate feature chrome only.

Enable or disable Service Workers. This method does nothing if the chrome feature is not enabled.

Source

pub fn with_auto_geolocation(&mut self, enabled: bool) -> &mut Self

Available on crate feature chrome only.

Automatically setup geo-location configurations when using a proxy. This method does nothing if the chrome feature is not enabled.

Source

pub fn with_retry(&mut self, retry: u8) -> &mut Self

Set the retry limit for request. Set the value to 0 for no retries. The default is 0.

Source

pub fn with_default_http_connect_timeout( &mut self, default_http_connect_timeout: Option<Duration>, ) -> &mut Self

The default http connect timeout.

Source

pub fn with_default_http_read_timeout( &mut self, default_http_read_timeout: Option<Duration>, ) -> &mut Self

The default http read timeout.

Source

pub fn with_no_control_thread(&mut self, no_control_thread: bool) -> &mut Self

Skip setting up a control thread for pause, start, and shutdown programmatic handling. This does nothing without the ‘control’ flag enabled.

Source

pub fn with_viewport(&mut self, viewport: Option<Viewport>) -> &mut Self

Configures the viewport of the browser, which defaults to 800x600. This method does nothing if the ‘chrome’ feature is not enabled.

Source

pub fn with_stealth(&mut self, stealth_mode: bool) -> &mut Self

Available on crate feature chrome only.

Use stealth mode for the request. This does nothing without the chrome flag enabled.

Source

pub fn with_stealth_advanced(&mut self, stealth_mode: Tier) -> &mut Self

Available on crate feature chrome only.

Use stealth mode for the request. This does nothing without the chrome flag enabled.

Source

pub fn with_wait_for_idle_network( &mut self, wait_for_idle_network: Option<WaitForIdleNetwork>, ) -> &mut Self

Available on crate feature chrome only.

Wait for network requests to be idle within a time frame (500ms with no network connections). This does nothing without the chrome flag enabled.

Source

pub fn with_wait_for_idle_network0( &mut self, wait_for_idle_network0: Option<WaitForIdleNetwork>, ) -> &mut Self

Available on crate feature chrome only.

Wait for network request with a max timeout. This does nothing without the chrome flag enabled.

Source

pub fn with_wait_for_almost_idle_network0( &mut self, wait_for_almost_idle_network0: Option<WaitForIdleNetwork>, ) -> &mut Self

Available on crate feature chrome only.

Wait for network to be almost idle with a max timeout. This does nothing without the chrome flag enabled.

Source

pub fn with_wait_for_idle_dom( &mut self, wait_for_idle_dom: Option<WaitForSelector>, ) -> &mut Self

Available on crate feature chrome only.

Wait for idle DOM mutations on the target element. This method does nothing if the chrome feature is not enabled.

Source

pub fn with_wait_for_selector( &mut self, wait_for_selector: Option<WaitForSelector>, ) -> &mut Self

Available on crate feature chrome only.

Wait for a selector. This method does nothing if the chrome feature is not enabled.

Source

pub fn with_wait_for_delay( &mut self, wait_for_delay: Option<WaitForDelay>, ) -> &mut Self

Available on crate feature chrome only.

Wait for a delay. Should only be used for testing. This method does nothing if the ‘chrome’ feature is not enabled.

Source

pub fn with_chrome_intercept( &mut self, chrome_intercept: RequestInterceptConfiguration, url: &Option<Box<Url>>, ) -> &mut Self

Available on crate feature chrome_intercept only.

Use request intercept for the request to only allow content that matches the host. If the content is from a 3rd party it needs to be part of our include list. This method does nothing if the chrome_intercept feature is not enabled.

Source

pub fn with_remote_local_policy(&mut self, enabled: bool) -> &mut Self

Available on crate feature chrome_intercept only.

Push the interception policy (the chrome_intercept flags + per-job blacklist/whitelist + page url) to a capable remote rendering engine once per navigation, so it resolves block/allow decisions locally instead of round-tripping every paused request. Enables request interception. No-op against a normal Chrome target (the vendor method is ignored), so it only changes behavior for engines that implement it.

Source

pub fn with_chrome_connection( &mut self, chrome_connection_url: Option<String>, ) -> &mut Self

Available on crate feature chrome only.

Set the connection url for the chrome instance. This method does nothing if the chrome feature is not enabled.

Source

pub fn with_chrome_connections(&mut self, urls: Vec<String>) -> &mut Self

Available on crate feature chrome only.

Set multiple remote Chrome connection URLs for failover. When a connection fails after retries, the next URL is tried. Takes priority over chrome_connection_url when set.

A single-URL vec routes through chrome_connection_url so the normal single-endpoint path (10 retries w/ backoff) is used instead of the failover path (3 retries, no other endpoint to try).

Source

pub fn with_worker_connection( &mut self, worker_connection_url: Option<String>, ) -> &mut Self

Available on crate feature decentralized only.

Set the Spider worker URL for crawl requests. None clears the per-website override so this Website falls back to the process-wide SPIDER_WORKER env var (default http://127.0.0.1:3030). Some with a non-empty URL routes crawl traffic through that worker; Some with an empty/whitespace URL disables the crawl worker pool for this Website without affecting any other Website in the process.

Source

pub fn with_worker_connections(&mut self, urls: Vec<String>) -> &mut Self

Available on crate feature decentralized only.

Set multiple Spider worker URLs for crawl requests. Empty/whitespace entries are dropped. An empty resulting list disables the crawl worker pool for this Website only.

Source

pub fn with_scraper_worker_connection( &mut self, scraper_worker_connection_url: Option<String>, ) -> &mut Self

Available on crate feature decentralized only.

Set the Spider scraper worker URL for scrape requests. None clears the per-website override so this Website falls back to the process-wide SPIDER_WORKER_SCRAPER env var (default http://127.0.0.1:3031). Some with an empty/whitespace URL disables the scraper worker pool for this Website only.

Source

pub fn with_scraper_worker_connections( &mut self, urls: Vec<String>, ) -> &mut Self

Available on crate feature decentralized only.

Set multiple Spider scraper worker URLs for scrape requests. Empty/whitespace entries are dropped. An empty resulting list disables the scraper worker pool for this Website only.

Source

pub fn with_chrome_first_byte_timeout( &mut self, timeout: Option<Duration>, ) -> &mut Self

Available on crate feature chrome only.

Set the first-byte watchdog timeout for Chrome navigations. None disables it; Some(d) fires after d of silence on both Network.responseReceived and Network.dataReceived and force-stops the page so the caller can rotate to a different Chrome backend.

Source

pub fn with_chrome_first_byte_timeout_jitter( &mut self, jitter: Option<Duration>, ) -> &mut Self

Available on crate feature chrome only.

Set the per-fetch jitter window for the first-byte watchdog. None disables jitter; Some(j) randomizes each fetch’s timeout uniformly in [base, base + j). Ignored when the base timeout is None.

Source

pub fn with_http_first_byte_timeout( &mut self, timeout: Option<Duration>, ) -> &mut Self

Set the first-byte watchdog timeout for HTTP fetches. None disables it; Some(d) wraps each client.get(url).send() in tokio::time::timeout(d + rand(0..jitter)) and returns a synthetic 524 GATEWAY_TIMEOUT response on fire so the retry path rotates the proxy. Covers stalls between TCP connect and the first byte of the response — distinct from connect_timeout (handshake-only) and chunk_idle_timeout (body-streaming idle).

Source

pub fn with_http_first_byte_timeout_jitter( &mut self, jitter: Option<Duration>, ) -> &mut Self

Set the per-fetch jitter window for the HTTP first-byte watchdog. Same semantics as with_chrome_first_byte_timeout_jitter.

Source

pub fn with_execution_scripts( &mut self, execution_scripts: Option<ExecutionScriptsMap>, ) -> &mut Self

Available on crate feature chrome only.

Set JS to run on certain pages. This method does nothing if the chrome feature is not enabled.

Source

pub fn with_automation_scripts( &mut self, automation_scripts: Option<AutomationScriptsMap>, ) -> &mut Self

Available on crate feature chrome only.

Run automated web actions on certain pages. This method does nothing if the chrome feature is not enabled.

Source

pub fn with_budget(&mut self, budget: Option<HashMap<&str, u32>>) -> &mut Self

Set a crawl budget per path, with support for nested path levels such as /a/b/c, or for all paths with “*”. This does nothing without the budget flag enabled.

Source

pub fn with_external_domains<'a, 'b>( &mut self, external_domains: Option<impl Iterator<Item = String> + 'a>, ) -> &mut Self

Group external domains to treat the crawl as one. If None is passed this will clear all prior domains.

Source

pub fn with_danger_accept_invalid_certs( &mut self, accept_invalid_certs: bool, ) -> &mut Self

Dangerously accept invalid certificates - this should be used as a last resort.

Source

pub fn with_normalize(&mut self, normalize: bool) -> &mut Self

Normalize the content, de-duplicating trailing-slash pages and other pages that can be duplicated. The link may initially show up in your links_visited or subscription calls, but the following links will not be crawled.

Source

pub fn with_shared_state(&mut self, shared: bool) -> &mut Self

Available on crate feature disk only.

Store all the links found on the disk to share the state. This does nothing without the disk flag enabled.

Source

pub fn with_timezone_id(&mut self, timezone_id: Option<String>) -> &mut Self

Available on crate feature chrome only.

Overrides default host system timezone with the specified one. This does nothing without the chrome flag enabled.

Source

pub fn with_locale(&mut self, locale: Option<String>) -> &mut Self

Available on crate feature chrome only.

Overrides default host system locale with the specified one. This does nothing without the chrome flag enabled.

Source

pub fn with_event_tracker( &mut self, track_events: Option<ChromeEventTracker>, ) -> &mut Self

Available on crate feature chrome only.

Track the events made via chrome.

Source

pub fn with_screenshot( &mut self, screenshot_config: Option<ScreenShotConfig>, ) -> &mut Self

Available on crate feature chrome only.

Set the chrome screenshot configuration. This does nothing without the chrome flag enabled.

Source

pub fn with_max_page_bytes(&mut self, max_page_bytes: Option<f64>) -> &mut Self

Set the max amount of bytes to collect per page. This method does nothing if the chrome feature is not enabled.

Source

pub fn with_max_bytes_allowed( &mut self, max_bytes_allowed: Option<u64>, ) -> &mut Self

Set the max amount of bytes to collect for the browser context. This method does nothing if the chrome feature is not enabled.

Source

pub fn with_block_assets(&mut self, only_html: bool) -> &mut Self

Block assets from loading from the network.

Source

pub fn with_modify_headers(&mut self, modify_headers: bool) -> &mut Self

Modify the headers to mimic a real browser.

Source

pub fn with_modify_http_client_headers( &mut self, modify_http_client_headers: bool, ) -> &mut Self

Modify the HTTP client headers to mimic a real browser.

Source

pub fn with_cache_policy( &mut self, cache_policy: Option<BasicCachePolicy>, ) -> &mut Self

Set the cache policy.

Source

pub fn with_webdriver_config( &mut self, webdriver_config: Option<WebDriverConfig>, ) -> &mut Self

Available on crate feature webdriver only.

Set the WebDriver configuration. This does nothing without the webdriver flag enabled.

Source

pub fn auto_http_first_byte_args(&self) -> (Option<Duration>, Option<Duration>)

Resolve the HTTP first-byte watchdog args.

Returns the configured http_first_byte_timeout + _jitter whenever the timeout field is Some(_) — caller opted in by setting the field, so honor it regardless of proxy count.

Previously this was gated on balance feature + ≥2 HTTP-eligible proxies, on the premise that the watchdog firing without a rotation target was wasted. That was wrong for the proxy-shrouded NXDOMAIN case: a single proxy still returns an upstream-DNS-shaped 5xx after ~15-22s, and reqwest’s .timeout() is not enforced through the proxy CONNECT tunnel for that phase. The watchdog is the only knob that fires reliably, and a fast 524 surfaced to the caller is strictly better than waiting for the proxy’s internal DNS deadline — even when no rotation target exists.

When the timeout field is None, returns (None, None) — pure passthrough, no overhead. Setting the field on Configuration is the opt-in.

Source

pub fn chrome_fetch_params(&self) -> ChromeFetchParams<'_>

Available on crate feature chrome only.

Build the borrowed chrome fetch parameter bundle.

Zero-copy: all fields borrow directly from self. Build once at the top of a call chain and pass & through the layers to keep the hot path inlineable.

Source

pub fn build(&self) -> Self

Build the website configuration when using with_builder.

Source

pub fn with_search_config( &mut self, search_config: Option<SearchConfig>, ) -> &mut Self

Available on crate feature search only.

Configure web search integration. This does nothing without the search flag enabled.

Source

pub fn with_spider_cloud(&mut self, api_key: &str) -> &mut Self

Available on crate feature spider_cloud only.

Set a spider.cloud API key (Proxy mode).

Source

pub fn with_spider_cloud_config( &mut self, config: SpiderCloudConfig, ) -> &mut Self

Available on crate feature spider_cloud only.

Set a spider.cloud config.

Source

pub fn with_spider_browser(&mut self, api_key: &str) -> &mut Self

Available on crate features chrome and spider_cloud only.

Connect to Spider Browser Cloud via CDP over WebSocket using an API key.

Sets chrome_connection_url to wss://browser.spider.cloud/v1/browser?token=API_KEY.

Source

pub fn with_spider_browser_config( &mut self, config: SpiderBrowserConfig, ) -> &mut Self

Available on crate features chrome and spider_cloud only.

Connect to Spider Browser Cloud with full configuration (stealth, country, browser type, etc.).

Source

pub fn with_hedge(&mut self, config: HedgeConfig) -> &mut Self

Available on crate feature hedge only.

Set the hedged request (work-stealing) configuration.

Source

pub fn with_auto_throttle(&mut self, config: AutoThrottleConfig) -> &mut Self

Available on crate feature auto_throttle only.

Set the auto-throttle configuration for latency-based adaptive delay.

Source

pub fn with_etag_cache(&mut self, enabled: bool) -> &mut Self

Available on crate feature etag_cache only.

Enable or disable ETag / conditional request caching for bandwidth-efficient re-crawls.

Source

pub fn with_warc(&mut self, config: WarcConfig) -> &mut Self

Available on crate feature warc only.

Configure WARC output for writing a web archive file during crawl.

Trait Implementations§

Source§

impl Clone for Configuration

Source§

fn clone(&self) -> Configuration

Returns a duplicate of the value. Read more
1.0.0 (const: unstable) · Source§

fn clone_from(&mut self, source: &Self)

Performs copy-assignment from source. Read more
Source§

impl Debug for Configuration

Source§

fn fmt(&self, f: &mut Formatter<'_>) -> Result

Formats the value using the given formatter. Read more
Source§

impl Default for Configuration

Source§

fn default() -> Configuration

Returns the “default value” for a type. Read more
Source§

impl<'de> Deserialize<'de> for Configuration

Source§

fn deserialize<__D>(__deserializer: __D) -> Result<Self, __D::Error>
where __D: Deserializer<'de>,

Deserialize this value from the given Serde deserializer. Read more
Source§

impl Serialize for Configuration

Source§

fn serialize<__S>(&self, __serializer: __S) -> Result<__S::Ok, __S::Error>
where __S: Serializer,

Serialize this value into the given Serde serializer. Read more

Auto Trait Implementations§

Blanket Implementations§

Source§

impl<T> Any for T
where T: 'static + ?Sized,

Source§

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more
Source§

impl<T> Borrow<T> for T
where T: ?Sized,

Source§

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more
Source§

impl<T> BorrowMut<T> for T
where T: ?Sized,

Source§

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more
Source§

impl<T> CloneToUninit for T
where T: Clone,

Source§

unsafe fn clone_to_uninit(&self, dest: *mut u8)

🔬This is a nightly-only experimental API. (clone_to_uninit)
Performs copy-assignment from self to dest. Read more
Source§

impl<T> DeserializeOwned for T
where T: for<'de> Deserialize<'de>,

Source§

impl<T> DynClone for T
where T: Clone,

Source§

fn __clone_box(&self, _: Private) -> *mut ()

Source§

impl<T> From<T> for T

Source§

fn from(t: T) -> T

Returns the argument unchanged.

Source§

impl<T> Instrument for T

Source§

fn instrument(self, span: Span) -> Instrumented<Self>

Instruments this type with the provided Span, returning an Instrumented wrapper. Read more
Source§

fn in_current_span(self) -> Instrumented<Self>

Instruments this type with the current Span, returning an Instrumented wrapper. Read more
Source§

impl<T, U> Into<U> for T
where U: From<T>,

Source§

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

Source§

impl<T> IntoEither for T

Source§

fn into_either(self, into_left: bool) -> Either<Self, Self>

Converts self into a Left variant of Either<Self, Self> if into_left is true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more
Source§

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more
Source§

impl<T> Pointable for T

Source§

const ALIGN: usize

The alignment of pointer.
Source§

type Init = T

The type for initializers.
Source§

unsafe fn init(init: <T as Pointable>::Init) -> usize

Initializes a with the given initializer. Read more
Source§

unsafe fn deref<'a>(ptr: usize) -> &'a T

Dereferences the given pointer. Read more
Source§

unsafe fn deref_mut<'a>(ptr: usize) -> &'a mut T

Mutably dereferences the given pointer. Read more
Source§

unsafe fn drop(ptr: usize)

Drops the object pointed to by the given pointer. Read more
Source§

impl<T> PolicyExt for T
where T: ?Sized,

Source§

fn and<P, B, E>(self, other: P) -> And<T, P>
where T: Sized + Policy<B, E>, P: Policy<B, E>,

Create a new Policy that returns Action::Follow only if self and other return Action::Follow. Read more
Source§

fn or<P, B, E>(self, other: P) -> Or<T, P>
where T: Sized + Policy<B, E>, P: Policy<B, E>,

Create a new Policy that returns Action::Follow if either self or other returns Action::Follow. Read more
Source§

impl<T> Same for T

Source§

type Output = T

Should always be Self
Source§

impl<T> ToOwned for T
where T: Clone,

Source§

type Owned = T

The resulting type after obtaining ownership.
Source§

fn to_owned(&self) -> T

Creates owned data from borrowed data, usually by cloning. Read more
Source§

fn clone_into(&self, target: &mut T)

Uses borrowed data to replace owned data, usually by cloning. Read more
Source§

impl<T, U> TryFrom<U> for T
where U: Into<T>,

Source§

type Error = Infallible

The type returned in the event of a conversion error.
Source§

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.
Source§

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

Source§

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.
Source§

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.
Source§

impl<V, T> VZip<V> for T
where V: MultiLane<T>,

Source§

fn vzip(self) -> V

Source§

impl<T> WithSubscriber for T

Source§

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self>
where S: Into<Dispatch>,

Attaches the provided Subscriber to this type, returning a WithDispatch wrapper. Read more
Source§

fn with_current_subscriber(self) -> WithDispatch<Self>

Attaches the current default Subscriber to this type, returning a WithDispatch wrapper. Read more