pub struct Configuration {Show 92 fields
pub respect_robots_txt: bool,
pub subdomains: bool,
pub tld: bool,
pub crawl_timeout: Option<Duration>,
pub preserve_host_header: bool,
pub blacklist_url: Option<Vec<CompactString>>,
pub whitelist_url: Option<Vec<CompactString>>,
pub user_agent: Option<Box<CompactString>>,
pub delay: u64,
pub request_timeout: Option<Duration>,
pub http2_prior_knowledge: bool,
pub proxies: Option<Vec<RequestProxy>>,
pub proxies_by_kind: Option<HashMap<ProxyKind, Vec<RequestProxy>>>,
pub headers: Option<Box<SerializableHeaderMap>>,
pub sitemap_url: Option<Box<CompactString>>,
pub ignore_sitemap: bool,
pub redirect_limit: usize,
pub redirect_policy: RedirectPolicy,
pub redirect_limit_set: bool,
pub max_main_frame_navigations: Option<u32>,
pub cookie_str: String,
pub emulation: Option<Emulation>,
pub cron_str: String,
pub cron_type: CronType,
pub depth: usize,
pub depth_distance: usize,
pub stealth_mode: Tier,
pub viewport: Option<Viewport>,
pub budget: Option<HashMap<CaseInsensitiveString, u32>>,
pub wild_card_budgeting: bool,
pub external_domains_caseless: Arc<HashSet<CaseInsensitiveString>>,
pub full_resources: bool,
pub accept_invalid_certs: bool,
pub auth_challenge_response: Option<AuthChallengeResponse>,
pub openai_config: Option<Box<GPTConfigs>>,
pub gemini_config: Option<Box<GeminiConfigs>>,
pub remote_multimodal: Option<Box<RemoteMultimodalConfigs>>,
pub shared_queue: bool,
pub return_page_links: bool,
pub retry: u8,
pub custom_antibot: Option<CustomAntibotPatterns>,
pub no_control_thread: bool,
pub only_html: bool,
pub concurrency_limit: Option<usize>,
pub normalize: bool,
pub shared: bool,
pub modify_headers: bool,
pub modify_http_client_headers: bool,
pub cache: bool,
pub cache_skip_browser: bool,
pub cache_namespace: Option<Box<String>>,
pub service_worker_enabled: bool,
pub timezone_id: Option<Box<String>>,
pub locale: Option<Box<String>>,
pub evaluate_on_new_document: Option<Box<String>>,
pub dismiss_dialogs: Option<bool>,
pub wait_for: Option<WaitFor>,
pub screenshot: Option<ScreenShotConfig>,
pub track_events: Option<ChromeEventTracker>,
pub fingerprint: Fingerprint,
pub chrome_connection_url: Option<String>,
pub chrome_connection_urls: Option<Vec<String>>,
pub chrome_first_byte_timeout: Option<Duration>,
pub chrome_first_byte_timeout_jitter: Option<Duration>,
pub http_first_byte_timeout: Option<Duration>,
pub http_first_byte_timeout_jitter: Option<Duration>,
pub execution_scripts: Option<ExecutionScripts>,
pub automation_scripts: Option<AutomationScripts>,
pub chrome_intercept: RequestInterceptConfiguration,
pub referer: Option<String>,
pub max_page_bytes: Option<f64>,
pub max_bytes_allowed: Option<u64>,
pub disable_log: bool,
pub auto_geolocation: bool,
pub cache_policy: Option<BasicCachePolicy>,
pub bypass_csp: bool,
pub disable_javascript: bool,
pub network_interface: Option<String>,
pub local_address: Option<IpAddr>,
pub default_http_connect_timeout: Option<Duration>,
pub default_http_read_timeout: Option<Duration>,
pub webdriver_config: Option<Box<WebDriverConfig>>,
pub search_config: Option<Box<SearchConfig>>,
pub spider_cloud: Option<Box<SpiderCloudConfig>>,
pub spider_browser: Option<Box<SpiderBrowserConfig>>,
pub hedge: Option<HedgeConfig>,
pub auto_throttle: Option<AutoThrottleConfig>,
pub etag_cache: bool,
pub warc: Option<WarcConfig>,
pub parallel_backends: Option<ParallelBackendsConfig>,
pub worker_connection_urls: Option<Vec<String>>,
pub scraper_worker_connection_urls: Option<Vec<String>>,
/* private fields */
}Expand description
Structure to configure Website crawler
use spider::website::Website;
let mut website: Website = Website::new("https://choosealicense.com");
website.configuration.blacklist_url.insert(Default::default()).push("https://choosealicense.com/licenses/".to_string().into());
website.configuration.respect_robots_txt = true;
website.configuration.subdomains = true;
website.configuration.tld = true;Fields§
§respect_robots_txt: boolRespect the robots.txt file and do not scrape disallowed files. This may slow down crawls if the robots.txt file includes a crawl delay.
subdomains: boolAllow sub-domains.
tld: boolAllow all tlds for domain.
crawl_timeout: Option<Duration>The max timeout for the crawl.
preserve_host_header: boolPreserve the HTTP Host header.
blacklist_url: Option<Vec<CompactString>>List of pages to not crawl. [optional: regex pattern matching]
whitelist_url: Option<Vec<CompactString>>List of pages to only crawl. [optional: regex pattern matching]
user_agent: Option<Box<CompactString>>User-Agent for request.
delay: u64Polite crawling delay in milliseconds.
request_timeout: Option<Duration>Request max timeout per page. By default the request times out in 15s. Set to None to disable.
http2_prior_knowledge: boolUse HTTP2 for connection. Enable if you know the website has http2 support.
proxies: Option<Vec<RequestProxy>>Use proxy list for performing network request.
proxies_by_kind: Option<HashMap<ProxyKind, Vec<RequestProxy>>>Optional sidecar map of alternative proxy lists keyed by
ProxyKind.
Lets a crate::proxy_strategy::ProxyStrategy route a request
through a non-default proxy set without touching proxies or
RequestProxy itself. When None (the default) or when the
strategy returns a kind that has no entry here, requests fall
through to proxies and the existing fast path — no behavior
change.
Lookup is by enum equality / hash; the ProxyKind::Custom
variant lets consumers introduce their own kinds without an
upstream change. Spider never writes to this map after
configuration; runtime lazy state lives on the Website.
headers: Option<Box<SerializableHeaderMap>>Headers to include with request.
sitemap_url: Option<Box<CompactString>>sitemap only.Include a sitemap in response of the crawl.
ignore_sitemap: boolsitemap only.Prevent including the sitemap links with the crawl.
redirect_limit: usizeThe max redirections allowed for request.
redirect_policy: RedirectPolicyThe redirect policy type to use.
redirect_limit_set: boolWhether redirect_limit was explicitly set by the caller.
Set to true by with_redirect_limit() and by the external-config loader
when redirect_limit is provided. Chrome-path enforcement reads this flag
so it only caps redirects when the user opted in — preserving prior
behavior on pages whose navigation chains exceed the HTTP default of 7.
max_main_frame_navigations: Option<u32>Cap on main-frame cross-document navigations during a single Chrome
goto (requires the chrome feature — no effect on the HTTP path).
Defends against JS / meta-refresh / HTTP-Refresh-header loops that
bypass the HTTP redirect cap because each hop is a fresh document
rather than a 3xx redirect. None disables the guard (default) so
prior behavior is preserved; Some(n) aborts the navigation with a
net::ERR_TOO_MANY_NAVIGATIONS error once the main frame has
navigated more than n times since goto.
cookie_str: Stringcookies only.Cookie string to use for network requests ex: “foo=bar; Domain=blog.spider”
emulation: Option<Emulation>wreq only.The type of request emulation. This does nothing without the flag sync enabled.
cron_str: Stringcron only.Cron string to perform crawls - use https://crontab.guru/ to help generate a valid cron for needs.
cron_type: CronTypecron only.The type of cron to run either crawl or scrape.
depth: usizeThe max depth to crawl for a website. Defaults to 25 to help prevent infinite recursion.
depth_distance: usizeThe depth to crawl pertaining to the root.
stealth_mode: TierUse stealth mode for requests.
viewport: Option<Viewport>Configure the viewport for chrome and viewport headers.
budget: Option<HashMap<CaseInsensitiveString, u32>>Crawl budget for the paths. This helps prevent crawling extra pages and limiting the amount.
wild_card_budgeting: boolIf wild card budgeting is found for the website.
external_domains_caseless: Arc<HashSet<CaseInsensitiveString>>External domains to include case-insensitive.
full_resources: boolCollect all the resources found on the page.
accept_invalid_certs: boolDangerously accept invalid certificates.
auth_challenge_response: Option<AuthChallengeResponse>The auth challenge response. The ‘chrome_intercept’ flag is also required in order to intercept the response.
openai_config: Option<Box<GPTConfigs>>The OpenAI configs to use to help drive the chrome browser. This does nothing without the ‘openai’ flag.
gemini_config: Option<Box<GeminiConfigs>>The Gemini configs to use to help drive the chrome browser. This does nothing without the ‘gemini’ flag.
remote_multimodal: Option<Box<RemoteMultimodalConfigs>>Remote multimodal automation config (vision + LLM-driven steps).
Requires the agent feature for full functionality, uses stub type otherwise.
shared_queue: boolUse a shared queue strategy when crawling. This can scale workloads evenly that do not need priority.
return_page_links: boolReturn the page links in the subscription channels. This does nothing without the flag sync enabled.
retry: u8Retry count to attempt to swap proxies etc.
custom_antibot: Option<CustomAntibotPatterns>Custom antibot detection patterns. When set, these are matched in addition
to the built-in patterns. Any match triggers AntiBotTech::Custom.
no_control_thread: boolSkip spawning a control thread that can pause, start, and shutdown the crawl.
only_html: boolExpect only to handle HTML to save on resources. This mainly only blocks the crawling and returning of resources from the server.
concurrency_limit: Option<usize>The concurrency limits to apply.
normalize: boolNormalize the HTML by de-duplicating the content.
shared: boolShare the state of the crawl. Requires the ‘disk’ feature flag.
modify_headers: boolModify the headers to act like a real browser.
modify_http_client_headers: boolModify only the HTTP client headers to act like a real browser.
cache: boolcache_request or chrome_remote_cache or chrome only.Cache the page following HTTP caching rules.
cache_skip_browser: boolcache_request or chrome_remote_cache or chrome only.Skip browser rendering entirely if cached response exists. When enabled, returns cached HTML directly without launching Chrome.
cache_namespace: Option<Box<String>>Namespace mixed into every cache key so logically distinct variants
(country, proxy pool, tenant, A/B bucket, device profile, …) never
collide on the same cached bytes. Free-form — spider treats it as an
opaque partition string. None uses the default (empty) namespace.
Always present (zero cost when unset); its effect is gated by whichever
cache feature is active.
service_worker_enabled: boolchrome only.Enable or disable service workers. Enabled by default.
timezone_id: Option<Box<String>>chrome only.Overrides default host system timezone with the specified one.
locale: Option<Box<String>>chrome only.Overrides default host system locale with the specified one.
evaluate_on_new_document: Option<Box<String>>chrome only.Set a custom script to eval on each new document.
dismiss_dialogs: Option<bool>chrome only.Dismiss dialogs.
wait_for: Option<WaitFor>chrome only.Wait for options for the page.
screenshot: Option<ScreenShotConfig>chrome only.Take a screenshot of the page.
track_events: Option<ChromeEventTracker>chrome only.Track the events made via chrome.
fingerprint: Fingerprintchrome only.Setup fingerprint ID on each document. This does nothing without the flag chrome enabled.
chrome_connection_url: Option<String>chrome only.The chrome connection url. Useful for targeting different headless instances. Defaults to using the env CHROME_URL.
chrome_connection_urls: Option<Vec<String>>chrome only.Multiple remote Chrome connection URLs for failover. When a connection
fails after retries, the next URL is tried automatically. Requires the
chrome feature. When set, takes priority over chrome_connection_url.
chrome_first_byte_timeout: Option<Duration>chrome only.First-byte watchdog for Chrome navigations. When set, fires if no
Network.dataReceived (or Network.responseReceived) event arrives
within this duration after the listener attaches. On fire the page
is force-stopped and (when a browser_dead flag is plumbed through
ChromeFetchParams) it is flipped so the website-level retry loop
can rotate the backend. None (default) disables the watchdog and
the legacy chunk-idle timeout (SPIDER_CHUNK_IDLE_TIMEOUT_SECS,
default 30s) is the only stall guard.
chrome_first_byte_timeout_jitter: Option<Duration>chrome only.Per-fetch jitter window applied on top of chrome_first_byte_timeout.
When Some(j), each fetch picks actual_timeout = base + rand(0..j)
so concurrent fetches don’t all expire at exactly the same moment
(avoids thundering-herd backend rotation when a backend goes dark).
None (default) means no jitter — every fetch uses the configured
base timeout exactly. Ignored when chrome_first_byte_timeout is
None (no watchdog to jitter).
http_first_byte_timeout: Option<Duration>First-byte watchdog for HTTP fetches. When Some(d), each
client.get(url).send().await is wrapped in
tokio::time::timeout(base + rand(0..jitter)). On timeout the
in-flight connect / TLS / header future is dropped (cancels the
request) and a synthetic 524 GATEWAY_TIMEOUT response is built
so the existing retry path rotates to the next proxy. Covers the
gap between connect_timeout (TCP/TLS handshake) and
chunk_idle_timeout (per-chunk idle while streaming) where a
proxy can accept the connection but never produce headers.
None (default) disables the watchdog — request_timeout and
chunk_idle_timeout remain the only stall guards.
http_first_byte_timeout_jitter: Option<Duration>Per-fetch jitter window applied on top of
http_first_byte_timeout. Same semantics as
chrome_first_byte_timeout_jitter. None (default) means no
jitter; ignored when the base is None.
execution_scripts: Option<ExecutionScripts>chrome only.Scripts to execute for individual pages, the full path of the url is required for an exact match. This is useful for running one off JS on pages like performing custom login actions.
automation_scripts: Option<AutomationScripts>chrome only.Web automation scripts to run up to a duration of 60 seconds.
chrome_intercept: RequestInterceptConfigurationchrome only.Setup network interception for request. This does nothing without the flag chrome_intercept enabled.
referer: Option<String>The referer to use.
max_page_bytes: Option<f64>Determine the max bytes per page.
max_bytes_allowed: Option<u64>Determine the max bytes per browser context.
disable_log: boolchrome only.Disables log domain, prevents further log entries from being reported to the client. This does nothing without the flag chrome enabled.
auto_geolocation: boolchrome only.Automatic locale and timezone handling via third party. This does nothing without the flag chrome enabled.
cache_policy: Option<BasicCachePolicy>The cache policy to use.
bypass_csp: boolchrome only.Enables bypassing CSP. This does nothing without the flag chrome enabled.
disable_javascript: boolchrome only.Disables JavaScript execution on the page. This does nothing without the flag chrome enabled.
network_interface: Option<String>Bind the connections only on the network interface.
local_address: Option<IpAddr>Bind to a local IP Address.
default_http_connect_timeout: Option<Duration>The default http connect timeout
default_http_read_timeout: Option<Duration>The default http read timeout
webdriver_config: Option<Box<WebDriverConfig>>webdriver only.WebDriver configuration for browser automation. This does nothing without the webdriver flag enabled.
search_config: Option<Box<SearchConfig>>search only.Search provider configuration for web search integration. This does nothing without the search flag enabled.
spider_cloud: Option<Box<SpiderCloudConfig>>spider_cloud only.Spider Cloud config. See https://spider.cloud.
spider_browser: Option<Box<SpiderBrowserConfig>>chrome and spider_cloud only.Spider Browser Cloud config for remote CDP via wss://browser.spider.cloud.
hedge: Option<HedgeConfig>hedge only.Hedged request configuration for work-stealing on slow requests. When enabled, fires a duplicate request on a different proxy after a delay.
auto_throttle: Option<AutoThrottleConfig>auto_throttle only.Latency-based auto-throttle configuration. When enabled, dynamically adjusts per-domain crawl delay based on measured server response time.
etag_cache: booletag_cache only.Enable ETag / conditional request caching. When true, stores ETag and Last-Modified headers from responses and sends If-None-Match / If-Modified-Since on subsequent requests to the same URL, allowing servers to respond with lightweight 304 Not Modified.
warc: Option<WarcConfig>warc only.WARC output configuration. When set, the crawl writes a WARC 1.1 file
containing all fetched pages as response records.
parallel_backends: Option<ParallelBackendsConfig>parallel_backends only.Parallel crawl backend configuration. Race CDP / Servo backends alongside
the primary crawl path. Requires the parallel_backends feature.
worker_connection_urls: Option<Vec<String>>decentralized only.Per-Website remote Spider worker URLs used for crawl requests. When
None, falls back to the process-wide SPIDER_WORKER env var (or its
default), preserving pre-2.51.x behavior. When Some, overrides the
global pool for this Website only.
scraper_worker_connection_urls: Option<Vec<String>>decentralized only.Per-Website remote Spider worker URLs used for scrape requests. When
None, falls back to the process-wide SPIDER_WORKER_SCRAPER env var
(or its default), preserving pre-2.51.x behavior. When Some,
overrides the global pool for this Website only.
Implementations§
Source§impl Configuration
impl Configuration
Sourcepub fn new() -> Self
Available on crate feature chrome only.
pub fn new() -> Self
chrome only.Represents crawl configuration for a website.
Sourcepub fn build_remote_multimodal_engine(&self) -> Option<RemoteMultimodalEngine>
Available on crate feature agent only.
pub fn build_remote_multimodal_engine(&self) -> Option<RemoteMultimodalEngine>
agent only.Build a RemoteMultimodalEngine from RemoteMultimodalConfigs.
Requires the agent feature.
Sourcepub fn get_blacklist(&self) -> Box<RegexSet>
Available on crate feature regex only.
pub fn get_blacklist(&self) -> Box<RegexSet>
regex only.Compile the regex for the blacklist.
Sourcepub fn set_whitelist(&mut self)
pub fn set_whitelist(&mut self)
Set the whitelist
Sourcepub fn configure_allowlist(&mut self)
pub fn configure_allowlist(&mut self)
Configure the allow list.
Sourcepub fn get_blacklist_compiled(&self) -> &AllowList
pub fn get_blacklist_compiled(&self) -> &AllowList
Get the blacklist compiled.
Sourcepub fn configure_budget(&mut self)
pub fn configure_budget(&mut self)
Setup the budget for crawling.
Sourcepub fn get_whitelist_compiled(&self) -> &AllowList
pub fn get_whitelist_compiled(&self) -> &AllowList
Get the whitelist compiled.
Sourcepub fn get_whitelist(&self) -> Box<RegexSet>
Available on crate feature regex only.
pub fn get_whitelist(&self) -> Box<RegexSet>
regex only.Compile the regex for the whitelist.
Sourcepub fn add_sitemap_to_whitelist(&mut self) -> SitemapWhitelistChanges
Available on crate feature sitemap only.
pub fn add_sitemap_to_whitelist(&mut self) -> SitemapWhitelistChanges
sitemap only.Add sitemap paths to the whitelist and track what was added.
Sourcepub fn remove_sitemap_from_whitelist(
&mut self,
changes: SitemapWhitelistChanges,
)
Available on crate feature sitemap only.
pub fn remove_sitemap_from_whitelist( &mut self, changes: SitemapWhitelistChanges, )
sitemap only.Revert any changes made to the whitelist by add_sitemap_to_whitelist.
Sourcepub fn with_respect_robots_txt(&mut self, respect_robots_txt: bool) -> &mut Self
pub fn with_respect_robots_txt(&mut self, respect_robots_txt: bool) -> &mut Self
Respect robots.txt file.
Sourcepub fn with_subdomains(&mut self, subdomains: bool) -> &mut Self
pub fn with_subdomains(&mut self, subdomains: bool) -> &mut Self
Include subdomains detection.
Sourcepub fn with_csp_bypass(&mut self, enabled: bool) -> &mut Self
Available on crate feature chrome only.
pub fn with_csp_bypass(&mut self, enabled: bool) -> &mut Self
chrome only.Bypass CSP protection detection. This does nothing without the feat flag chrome enabled.
Sourcepub fn with_disable_javascript(&mut self, disabled: bool) -> &mut Self
Available on crate feature chrome only.
pub fn with_disable_javascript(&mut self, disabled: bool) -> &mut Self
chrome only.Disable JavaScript execution on the page. This does nothing without the feat flag chrome enabled.
Sourcepub fn with_network_interface(
&mut self,
network_interface: Option<String>,
) -> &mut Self
pub fn with_network_interface( &mut self, network_interface: Option<String>, ) -> &mut Self
Bind the connections only on the network interface.
Sourcepub fn with_local_address(&mut self, local_address: Option<IpAddr>) -> &mut Self
pub fn with_local_address(&mut self, local_address: Option<IpAddr>) -> &mut Self
Bind to a local IP Address.
Sourcepub fn with_crawl_timeout(
&mut self,
crawl_timeout: Option<Duration>,
) -> &mut Self
pub fn with_crawl_timeout( &mut self, crawl_timeout: Option<Duration>, ) -> &mut Self
The max duration for the crawl. This is useful when websites use a robots.txt with long durations and throttle the timeout removing the full concurrency.
Sourcepub fn with_delay(&mut self, delay: u64) -> &mut Self
pub fn with_delay(&mut self, delay: u64) -> &mut Self
Delay between request as ms.
Sourcepub fn with_http2_prior_knowledge(
&mut self,
http2_prior_knowledge: bool,
) -> &mut Self
pub fn with_http2_prior_knowledge( &mut self, http2_prior_knowledge: bool, ) -> &mut Self
Only use HTTP/2.
Sourcepub fn with_request_timeout(
&mut self,
request_timeout: Option<Duration>,
) -> &mut Self
pub fn with_request_timeout( &mut self, request_timeout: Option<Duration>, ) -> &mut Self
Max time to wait for request. By default request times out in 15s. Set to None to disable.
Sourcepub fn with_sitemap(&mut self, sitemap_url: Option<&str>) -> &mut Self
Available on crate feature sitemap only.
pub fn with_sitemap(&mut self, sitemap_url: Option<&str>) -> &mut Self
sitemap only.Set the sitemap url. This does nothing without the sitemap feature flag.
Sourcepub fn with_ignore_sitemap(&mut self, ignore_sitemap: bool) -> &mut Self
Available on crate feature sitemap only.
pub fn with_ignore_sitemap(&mut self, ignore_sitemap: bool) -> &mut Self
sitemap only.Ignore the sitemap when crawling. This method does nothing if the sitemap feature is not enabled.
Sourcepub fn with_user_agent(&mut self, user_agent: Option<&str>) -> &mut Self
pub fn with_user_agent(&mut self, user_agent: Option<&str>) -> &mut Self
Add user agent to request.
Sourcepub fn with_preserve_host_header(&mut self, preserve: bool) -> &mut Self
pub fn with_preserve_host_header(&mut self, preserve: bool) -> &mut Self
Preserve the HOST header.
Sourcepub fn with_remote_multimodal(
&mut self,
remote_multimodal: Option<RemoteMultimodalConfigs>,
) -> &mut Self
Available on crate feature agent only.
pub fn with_remote_multimodal( &mut self, remote_multimodal: Option<RemoteMultimodalConfigs>, ) -> &mut Self
agent only.Use a remote multimodal model to drive browser automation.
Requires the agent feature.
Sourcepub fn with_openai(&mut self, openai_config: Option<GPTConfigs>) -> &mut Self
Available on crate feature openai only.
pub fn with_openai(&mut self, openai_config: Option<GPTConfigs>) -> &mut Self
openai only.The OpenAI configs to use to drive the browser. This method does nothing if the openai feature is not enabled.
Sourcepub fn with_gemini(&mut self, gemini_config: Option<GeminiConfigs>) -> &mut Self
Available on crate feature gemini only.
pub fn with_gemini(&mut self, gemini_config: Option<GeminiConfigs>) -> &mut Self
gemini only.The Gemini configs to use to drive the browser. This method does nothing if the gemini feature is not enabled.
Available on crate feature cookies only.
cookies only.Cookie string to use in request. This does nothing without the cookies flag enabled.
Sourcepub fn with_fingerprint(&mut self, fingerprint: bool) -> &mut Self
Available on crate feature chrome only.
pub fn with_fingerprint(&mut self, fingerprint: bool) -> &mut Self
chrome only.Set custom fingerprint ID for request. This does nothing without the chrome flag enabled.
Sourcepub fn with_fingerprint_advanced(
&mut self,
fingerprint: Fingerprint,
) -> &mut Self
Available on crate feature chrome only.
pub fn with_fingerprint_advanced( &mut self, fingerprint: Fingerprint, ) -> &mut Self
chrome only.Set custom fingerprint ID for request. This does nothing without the chrome flag enabled.
Sourcepub fn with_proxies(&mut self, proxies: Option<Vec<String>>) -> &mut Self
pub fn with_proxies(&mut self, proxies: Option<Vec<String>>) -> &mut Self
Use proxies for request.
Sourcepub fn with_proxies_direct(
&mut self,
proxies: Option<Vec<RequestProxy>>,
) -> &mut Self
pub fn with_proxies_direct( &mut self, proxies: Option<Vec<RequestProxy>>, ) -> &mut Self
Use proxies for request with control between chrome and http.
Sourcepub fn with_proxies_for_kind(
&mut self,
kind: ProxyKind,
proxies: Option<Vec<RequestProxy>>,
) -> &mut Self
pub fn with_proxies_for_kind( &mut self, kind: ProxyKind, proxies: Option<Vec<RequestProxy>>, ) -> &mut Self
Set the proxy override list for a specific ProxyKind.
Lazily registers a sidecar mapping that a
crate::proxy_strategy::ProxyStrategy can route requests
through. Pass None for proxies to remove a previously-set
kind. Setting a kind to Some(empty_vec) is allowed and means
“route here but with no proxy” — the secondary client built for
this kind will be unproxied.
Has no effect on the primary Configuration::proxies list or
on requests that route to ProxyKind::Default.
Use a shared semaphore to evenly handle workloads. The default is false.
Sourcepub fn with_blacklist_url<T>(
&mut self,
blacklist_url: Option<Vec<T>>,
) -> &mut Self
pub fn with_blacklist_url<T>( &mut self, blacklist_url: Option<Vec<T>>, ) -> &mut Self
Add blacklist urls to ignore.
Sourcepub fn with_whitelist_url<T>(
&mut self,
whitelist_url: Option<Vec<T>>,
) -> &mut Self
pub fn with_whitelist_url<T>( &mut self, whitelist_url: Option<Vec<T>>, ) -> &mut Self
Add whitelist urls to allow.
Sourcepub fn with_return_page_links(&mut self, return_page_links: bool) -> &mut Self
pub fn with_return_page_links(&mut self, return_page_links: bool) -> &mut Self
Return the links found on the page in the channel subscriptions. This method does nothing if the decentralized feature is enabled.
Sourcepub fn with_headers(&mut self, headers: Option<HeaderMap>) -> &mut Self
pub fn with_headers(&mut self, headers: Option<HeaderMap>) -> &mut Self
Set HTTP headers for request using reqwest::header::HeaderMap.
Sourcepub fn with_redirect_limit(&mut self, redirect_limit: usize) -> &mut Self
pub fn with_redirect_limit(&mut self, redirect_limit: usize) -> &mut Self
Set the max redirects allowed for request.
Calling this method opts in to redirect-cap enforcement on both the HTTP and Chrome paths. Without it, Chrome defers to Chromium’s internal ~20-hop cap to preserve prior behavior.
Cap the number of main-frame cross-document navigations per Chrome
goto() call. None disables the guard.
This is the JS / meta-refresh counterpart to with_redirect_limit —
the HTTP redirect cap cannot catch loops implemented via
location.href, <meta http-equiv="refresh">, or Refresh: headers,
because each hop is a fresh document rather than a 3xx redirect.
Sourcepub fn with_redirect_policy(&mut self, policy: RedirectPolicy) -> &mut Self
pub fn with_redirect_policy(&mut self, policy: RedirectPolicy) -> &mut Self
Set the redirect policy to use.
Sourcepub fn with_referer(&mut self, referer: Option<String>) -> &mut Self
pub fn with_referer(&mut self, referer: Option<String>) -> &mut Self
Add a referer (mis-spelling) to the request.
Sourcepub fn with_referrer(&mut self, referer: Option<String>) -> &mut Self
pub fn with_referrer(&mut self, referer: Option<String>) -> &mut Self
Add a referer to the request.
Sourcepub fn with_full_resources(&mut self, full_resources: bool) -> &mut Self
pub fn with_full_resources(&mut self, full_resources: bool) -> &mut Self
Determine whether to collect all the resources found on pages.
Sourcepub fn with_dismiss_dialogs(&mut self, dismiss_dialogs: bool) -> &mut Self
Available on crate feature chrome only.
pub fn with_dismiss_dialogs(&mut self, dismiss_dialogs: bool) -> &mut Self
chrome only.Determine whether to dismiss dialogs. This method does nothing if the chrome feature is not enabled.
Sourcepub fn with_emulation(&mut self, emulation: Option<Emulation>) -> &mut Self
Available on crate feature wreq only.
pub fn with_emulation(&mut self, emulation: Option<Emulation>) -> &mut Self
wreq only.Set the request emulation. This method does nothing if the wreq flag is not enabled.
Sourcepub fn with_cron(&mut self, cron_str: &str, cron_type: CronType) -> &mut Self
Available on crate feature cron only.
pub fn with_cron(&mut self, cron_str: &str, cron_type: CronType) -> &mut Self
cron only.Setup cron jobs to run. This does nothing without the cron flag enabled.
Sourcepub fn with_limit(&mut self, limit: u32) -> &mut Self
pub fn with_limit(&mut self, limit: u32) -> &mut Self
Set a crawl page limit. If the value is 0 there is no limit.
Sourcepub fn with_concurrency_limit(&mut self, limit: Option<usize>) -> &mut Self
pub fn with_concurrency_limit(&mut self, limit: Option<usize>) -> &mut Self
Set the concurrency limit. Set the value to None to use the default limit, which is based on the system CPU cores * n.
Sourcepub fn with_auth_challenge_response(
&mut self,
auth_challenge_response: Option<AuthChallengeResponse>,
) -> &mut Self
Available on crate feature chrome only.
pub fn with_auth_challenge_response( &mut self, auth_challenge_response: Option<AuthChallengeResponse>, ) -> &mut Self
chrome only.Set the authentication challenge response. This does nothing without the feat flag chrome enabled.
Sourcepub fn with_evaluate_on_new_document(
&mut self,
evaluate_on_new_document: Option<Box<String>>,
) -> &mut Self
Available on crate feature chrome only.
pub fn with_evaluate_on_new_document( &mut self, evaluate_on_new_document: Option<Box<String>>, ) -> &mut Self
chrome only.Set a custom script to evaluate on new document creation. This does nothing without the feat flag chrome enabled.
Sourcepub fn with_depth(&mut self, depth: usize) -> &mut Self
pub fn with_depth(&mut self, depth: usize) -> &mut Self
Set a crawl depth limit. If the value is 0 there is no limit.
Sourcepub fn with_caching(&mut self, cache: bool) -> &mut Self
Available on crate features cache_request or chrome_remote_cache only.
pub fn with_caching(&mut self, cache: bool) -> &mut Self
cache_request or chrome_remote_cache only.Cache the page following HTTP rules. This method does nothing if the cache feature is not enabled.
Sourcepub fn with_cache_skip_browser(&mut self, skip: bool) -> &mut Self
Available on crate features cache_request or chrome_remote_cache only.
pub fn with_cache_skip_browser(&mut self, skip: bool) -> &mut Self
cache_request or chrome_remote_cache only.Skip browser rendering entirely if cached response exists. When enabled with caching, returns cached HTML directly without launching Chrome. This is useful for performance when you only need the cached content.
Sourcepub fn with_cache_namespace<S: Into<String>>(
&mut self,
namespace: Option<S>,
) -> &mut Self
pub fn with_cache_namespace<S: Into<String>>( &mut self, namespace: Option<S>, ) -> &mut Self
Partition the cache by an opaque namespace so logically distinct
variants of the same URL (country, proxy pool, tenant, A/B bucket,
device profile, …) never collide on the same cached bytes.
None uses the default (empty) namespace. Has no observable effect
when no cache feature is active, but the configuration is always
settable regardless of feature flags.
Sourcepub fn with_chrome_remote_cache_read_only(
&mut self,
_read_only: bool,
) -> &mut Self
Available on non-crate feature chrome_remote_cache only.
pub fn with_chrome_remote_cache_read_only( &mut self, _read_only: bool, ) -> &mut Self
chrome_remote_cache only. Enable read-only mode for the remote Chrome cache. This method does
nothing without the chrome_remote_cache feature.
Sourcepub fn with_remote_cache_skip_browser(&mut self, _enabled: bool) -> &mut Self
Available on non-crate feature chrome_remote_cache only.
pub fn with_remote_cache_skip_browser(&mut self, _enabled: bool) -> &mut Self
chrome_remote_cache only. Enable publishing of fresh HTTP (skip_browser) responses to the
shared remote cache worker. This method does nothing without the
chrome_remote_cache feature.
Sourcepub fn with_chrome_remote_cache_main_doc_only(
&mut self,
_enabled: bool,
) -> &mut Self
Available on non-crate feature chrome_remote_cache only.
pub fn with_chrome_remote_cache_main_doc_only( &mut self, _enabled: bool, ) -> &mut Self
chrome_remote_cache only. Restrict chrome remote-cache dumps to the main document only.
This method does nothing without the chrome_remote_cache
feature.
Sourcepub fn with_service_worker_enabled(&mut self, enabled: bool) -> &mut Self
Available on crate feature chrome only.
pub fn with_service_worker_enabled(&mut self, enabled: bool) -> &mut Self
chrome only. Enable or disable Service Workers. This method does nothing if the chrome feature is not enabled.
Sourcepub fn with_auto_geolocation(&mut self, enabled: bool) -> &mut Self
Available on crate feature chrome only.
pub fn with_auto_geolocation(&mut self, enabled: bool) -> &mut Self
chrome only. Automatically set up geolocation configurations when using a proxy. This method does nothing if the chrome feature is not enabled.
Sourcepub fn with_retry(&mut self, retry: u8) -> &mut Self
pub fn with_retry(&mut self, retry: u8) -> &mut Self
Set the retry limit for requests. Set the value to 0 for no retries. The default is 0.
Sourcepub fn with_default_http_connect_timeout(
&mut self,
default_http_connect_timeout: Option<Duration>,
) -> &mut Self
pub fn with_default_http_connect_timeout( &mut self, default_http_connect_timeout: Option<Duration>, ) -> &mut Self
The default http connect timeout.
Sourcepub fn with_default_http_read_timeout(
&mut self,
default_http_read_timeout: Option<Duration>,
) -> &mut Self
pub fn with_default_http_read_timeout( &mut self, default_http_read_timeout: Option<Duration>, ) -> &mut Self
The default http read timeout.
Sourcepub fn with_no_control_thread(&mut self, no_control_thread: bool) -> &mut Self
pub fn with_no_control_thread(&mut self, no_control_thread: bool) -> &mut Self
Skip setting up a control thread for pause, start, and shutdown programmatic handling. This does nothing without the ‘control’ flag enabled.
Sourcepub fn with_viewport(&mut self, viewport: Option<Viewport>) -> &mut Self
pub fn with_viewport(&mut self, viewport: Option<Viewport>) -> &mut Self
Configures the viewport of the browser, which defaults to 800x600. This method does nothing if the ‘chrome’ feature is not enabled.
Sourcepub fn with_stealth(&mut self, stealth_mode: bool) -> &mut Self
Available on crate feature chrome only.
pub fn with_stealth(&mut self, stealth_mode: bool) -> &mut Self
chrome only. Use stealth mode for the request. This does nothing without the chrome flag enabled.
Sourcepub fn with_stealth_advanced(&mut self, stealth_mode: Tier) -> &mut Self
Available on crate feature chrome only.
pub fn with_stealth_advanced(&mut self, stealth_mode: Tier) -> &mut Self
chrome only. Use stealth mode for the request. This does nothing without the chrome flag enabled.
Sourcepub fn with_wait_for_idle_network(
&mut self,
wait_for_idle_network: Option<WaitForIdleNetwork>,
) -> &mut Self
Available on crate feature chrome only.
pub fn with_wait_for_idle_network( &mut self, wait_for_idle_network: Option<WaitForIdleNetwork>, ) -> &mut Self
chrome only. Wait for the network to be idle within a time frame (500ms with no network connections). This does nothing without the chrome flag enabled.
Sourcepub fn with_wait_for_idle_network0(
&mut self,
wait_for_idle_network0: Option<WaitForIdleNetwork>,
) -> &mut Self
Available on crate feature chrome only.
pub fn with_wait_for_idle_network0( &mut self, wait_for_idle_network0: Option<WaitForIdleNetwork>, ) -> &mut Self
chrome only. Wait for network request with a max timeout. This does nothing without the chrome flag enabled.
Sourcepub fn with_wait_for_almost_idle_network0(
&mut self,
wait_for_almost_idle_network0: Option<WaitForIdleNetwork>,
) -> &mut Self
Available on crate feature chrome only.
pub fn with_wait_for_almost_idle_network0( &mut self, wait_for_almost_idle_network0: Option<WaitForIdleNetwork>, ) -> &mut Self
chrome only. Wait for network to be almost idle with a max timeout. This does nothing without the chrome flag enabled.
Sourcepub fn with_wait_for_idle_dom(
&mut self,
wait_for_idle_dom: Option<WaitForSelector>,
) -> &mut Self
Available on crate feature chrome only.
pub fn with_wait_for_idle_dom( &mut self, wait_for_idle_dom: Option<WaitForSelector>, ) -> &mut Self
chrome only. Wait for idle DOM mutations on the target element. This method does nothing if the chrome feature is not enabled.
Sourcepub fn with_wait_for_selector(
&mut self,
wait_for_selector: Option<WaitForSelector>,
) -> &mut Self
Available on crate feature chrome only.
pub fn with_wait_for_selector( &mut self, wait_for_selector: Option<WaitForSelector>, ) -> &mut Self
chrome only. Wait for a selector. This method does nothing if the chrome feature is not enabled.
Sourcepub fn with_wait_for_delay(
&mut self,
wait_for_delay: Option<WaitForDelay>,
) -> &mut Self
Available on crate feature chrome only.
pub fn with_wait_for_delay( &mut self, wait_for_delay: Option<WaitForDelay>, ) -> &mut Self
chrome only. Wait with a delay. Should only be used for testing. This method does nothing if the ‘chrome’ feature is not enabled.
Sourcepub fn with_chrome_intercept(
&mut self,
chrome_intercept: RequestInterceptConfiguration,
url: &Option<Box<Url>>,
) -> &mut Self
Available on crate feature chrome_intercept only.
pub fn with_chrome_intercept( &mut self, chrome_intercept: RequestInterceptConfiguration, url: &Option<Box<Url>>, ) -> &mut Self
chrome_intercept only. Use request intercept for the request to only allow content that matches the host. If the content is from a 3rd party it needs to be part of our include list. This method does nothing if the chrome_intercept feature is not enabled.
Sourcepub fn with_remote_local_policy(&mut self, enabled: bool) -> &mut Self
Available on crate feature chrome_intercept only.
pub fn with_remote_local_policy(&mut self, enabled: bool) -> &mut Self
chrome_intercept only. Push the interception policy (the chrome_intercept flags + per-job
blacklist/whitelist + page url) to a capable remote rendering engine
once per navigation, so it resolves block/allow decisions locally
instead of round-tripping every paused request. Enables request
interception. No-op against a normal Chrome target (the vendor method is
ignored), so it only changes behavior for engines that implement it.
Sourcepub fn with_chrome_connection(
&mut self,
chrome_connection_url: Option<String>,
) -> &mut Self
Available on crate feature chrome only.
pub fn with_chrome_connection( &mut self, chrome_connection_url: Option<String>, ) -> &mut Self
chrome only. Set the connection url for the chrome instance. This method does nothing if the chrome feature is not enabled.
Sourcepub fn with_chrome_connections(&mut self, urls: Vec<String>) -> &mut Self
Available on crate feature chrome only.
pub fn with_chrome_connections(&mut self, urls: Vec<String>) -> &mut Self
chrome only. Set multiple remote Chrome connection URLs for failover. When a
connection fails after retries, the next URL is tried. Takes
priority over chrome_connection_url when set.
A single-URL vec routes through chrome_connection_url so the
normal single-endpoint path (10 retries w/ backoff) is used
instead of the failover path (3 retries, no other endpoint to try).
Sourcepub fn with_worker_connection(
&mut self,
worker_connection_url: Option<String>,
) -> &mut Self
Available on crate feature decentralized only.
pub fn with_worker_connection( &mut self, worker_connection_url: Option<String>, ) -> &mut Self
decentralized only. Set the Spider worker URL for crawl requests. None clears the
per-website override so this Website falls back to the process-wide
SPIDER_WORKER env var (default http://127.0.0.1:3030). Some with
a non-empty URL routes crawl traffic through that worker; Some with
an empty/whitespace URL disables the crawl worker pool for this
Website without affecting any other Website in the process.
Sourcepub fn with_worker_connections(&mut self, urls: Vec<String>) -> &mut Self
Available on crate feature decentralized only.
pub fn with_worker_connections(&mut self, urls: Vec<String>) -> &mut Self
decentralized only. Set multiple Spider worker URLs for crawl requests. Empty/whitespace
entries are dropped. An empty resulting list disables the crawl worker
pool for this Website only.
Sourcepub fn with_scraper_worker_connection(
&mut self,
scraper_worker_connection_url: Option<String>,
) -> &mut Self
Available on crate feature decentralized only.
pub fn with_scraper_worker_connection( &mut self, scraper_worker_connection_url: Option<String>, ) -> &mut Self
decentralized only. Set the Spider scraper worker URL for scrape requests. None clears
the per-website override so this Website falls back to the
process-wide SPIDER_WORKER_SCRAPER env var (default
http://127.0.0.1:3031). Some with an empty/whitespace URL disables
the scraper worker pool for this Website only.
Sourcepub fn with_scraper_worker_connections(
&mut self,
urls: Vec<String>,
) -> &mut Self
Available on crate feature decentralized only.
pub fn with_scraper_worker_connections( &mut self, urls: Vec<String>, ) -> &mut Self
decentralized only. Set multiple Spider scraper worker URLs for scrape requests.
Empty/whitespace entries are dropped. An empty resulting list disables
the scraper worker pool for this Website only.
Sourcepub fn with_chrome_first_byte_timeout(
&mut self,
timeout: Option<Duration>,
) -> &mut Self
Available on crate feature chrome only.
pub fn with_chrome_first_byte_timeout( &mut self, timeout: Option<Duration>, ) -> &mut Self
chrome only. Set the first-byte watchdog timeout for Chrome navigations. None
disables it; Some(d) fires after d of silence on both
Network.responseReceived and Network.dataReceived and force-stops
the page so the caller can rotate to a different Chrome backend.
Sourcepub fn with_chrome_first_byte_timeout_jitter(
&mut self,
jitter: Option<Duration>,
) -> &mut Self
Available on crate feature chrome only.
pub fn with_chrome_first_byte_timeout_jitter( &mut self, jitter: Option<Duration>, ) -> &mut Self
chrome only. Set the per-fetch jitter window for the first-byte watchdog. None
disables jitter; Some(j) randomizes each fetch’s timeout uniformly
in [base, base + j). Ignored when the base timeout is None.
Sourcepub fn with_http_first_byte_timeout(
&mut self,
timeout: Option<Duration>,
) -> &mut Self
pub fn with_http_first_byte_timeout( &mut self, timeout: Option<Duration>, ) -> &mut Self
Set the first-byte watchdog timeout for HTTP fetches. None
disables it; Some(d) wraps each client.get(url).send() in
tokio::time::timeout(d + rand(0..jitter)) and returns a
synthetic 524 GATEWAY_TIMEOUT response on fire so the retry
path rotates the proxy. Covers stalls between TCP connect and
the first byte of the response — distinct from
connect_timeout (handshake-only) and chunk_idle_timeout
(body-streaming idle).
Sourcepub fn with_http_first_byte_timeout_jitter(
&mut self,
jitter: Option<Duration>,
) -> &mut Self
pub fn with_http_first_byte_timeout_jitter( &mut self, jitter: Option<Duration>, ) -> &mut Self
Set the per-fetch jitter window for the HTTP first-byte
watchdog. Same semantics as
with_chrome_first_byte_timeout_jitter.
Sourcepub fn with_execution_scripts(
&mut self,
execution_scripts: Option<ExecutionScriptsMap>,
) -> &mut Self
Available on crate feature chrome only.
pub fn with_execution_scripts( &mut self, execution_scripts: Option<ExecutionScriptsMap>, ) -> &mut Self
chrome only. Set JS to run on certain pages. This method does nothing if the chrome feature is not enabled.
Sourcepub fn with_automation_scripts(
&mut self,
automation_scripts: Option<AutomationScriptsMap>,
) -> &mut Self
Available on crate feature chrome only.
pub fn with_automation_scripts( &mut self, automation_scripts: Option<AutomationScriptsMap>, ) -> &mut Self
chrome only. Run automated web actions on certain pages. This method does nothing if the chrome feature is not enabled.
Sourcepub fn with_budget(&mut self, budget: Option<HashMap<&str, u32>>) -> &mut Self
pub fn with_budget(&mut self, budget: Option<HashMap<&str, u32>>) -> &mut Self
Set a crawl budget per path with levels support /a/b/c or for all paths with “*”. This does nothing without the budget flag enabled.
Sourcepub fn with_external_domains<'a, 'b>(
&mut self,
external_domains: Option<impl Iterator<Item = String> + 'a>,
) -> &mut Self
pub fn with_external_domains<'a, 'b>( &mut self, external_domains: Option<impl Iterator<Item = String> + 'a>, ) -> &mut Self
Group external domains to treat the crawl as one. If None is passed this will clear all prior domains.
Sourcepub fn with_danger_accept_invalid_certs(
&mut self,
accept_invalid_certs: bool,
) -> &mut Self
pub fn with_danger_accept_invalid_certs( &mut self, accept_invalid_certs: bool, ) -> &mut Self
Dangerously accept invalid certificates - this should be used as a last resort.
Sourcepub fn with_normalize(&mut self, normalize: bool) -> &mut Self
pub fn with_normalize(&mut self, normalize: bool) -> &mut Self
Normalize the content, de-duplicating trailing slash pages and other pages that can be duplicated. This may initially show the link in your links_visited or subscription calls, but the following links will not be crawled.
Available on crate feature disk only.
disk only. Store all the links found on the disk to share the state. This does nothing without the disk flag enabled.
Sourcepub fn with_timezone_id(&mut self, timezone_id: Option<String>) -> &mut Self
Available on crate feature chrome only.
pub fn with_timezone_id(&mut self, timezone_id: Option<String>) -> &mut Self
chrome only. Overrides default host system timezone with the specified one. This does nothing without the chrome flag enabled.
Sourcepub fn with_locale(&mut self, locale: Option<String>) -> &mut Self
Available on crate feature chrome only.
pub fn with_locale(&mut self, locale: Option<String>) -> &mut Self
chrome only. Overrides default host system locale with the specified one. This does nothing without the chrome flag enabled.
Sourcepub fn with_event_tracker(
&mut self,
track_events: Option<ChromeEventTracker>,
) -> &mut Self
Available on crate feature chrome only.
pub fn with_event_tracker( &mut self, track_events: Option<ChromeEventTracker>, ) -> &mut Self
chrome only. Track the events made via chrome.
Sourcepub fn with_screenshot(
&mut self,
screenshot_config: Option<ScreenShotConfig>,
) -> &mut Self
Available on crate feature chrome only.
pub fn with_screenshot( &mut self, screenshot_config: Option<ScreenShotConfig>, ) -> &mut Self
chrome only. Set the chrome screenshot configuration. This does nothing without the chrome flag enabled.
Sourcepub fn with_max_page_bytes(&mut self, max_page_bytes: Option<f64>) -> &mut Self
pub fn with_max_page_bytes(&mut self, max_page_bytes: Option<f64>) -> &mut Self
Set the max amount of bytes to collect per page. This method does nothing if the chrome feature is not enabled.
Sourcepub fn with_max_bytes_allowed(
&mut self,
max_bytes_allowed: Option<u64>,
) -> &mut Self
pub fn with_max_bytes_allowed( &mut self, max_bytes_allowed: Option<u64>, ) -> &mut Self
Set the max amount of bytes to collect for the browser context. This method does nothing if the chrome feature is not enabled.
Sourcepub fn with_block_assets(&mut self, only_html: bool) -> &mut Self
pub fn with_block_assets(&mut self, only_html: bool) -> &mut Self
Block assets from loading from the network.
Sourcepub fn with_modify_headers(&mut self, modify_headers: bool) -> &mut Self
pub fn with_modify_headers(&mut self, modify_headers: bool) -> &mut Self
Modify the headers to mimic a real browser.
Sourcepub fn with_modify_http_client_headers(
&mut self,
modify_http_client_headers: bool,
) -> &mut Self
pub fn with_modify_http_client_headers( &mut self, modify_http_client_headers: bool, ) -> &mut Self
Modify the HTTP client headers to mimic a real browser.
Sourcepub fn with_cache_policy(
&mut self,
cache_policy: Option<BasicCachePolicy>,
) -> &mut Self
pub fn with_cache_policy( &mut self, cache_policy: Option<BasicCachePolicy>, ) -> &mut Self
Set the cache policy.
Sourcepub fn with_webdriver_config(
&mut self,
webdriver_config: Option<WebDriverConfig>,
) -> &mut Self
Available on crate feature webdriver only.
pub fn with_webdriver_config( &mut self, webdriver_config: Option<WebDriverConfig>, ) -> &mut Self
webdriver only. Set the WebDriver configuration. This does nothing without the webdriver flag enabled.
Sourcepub fn auto_http_first_byte_args(&self) -> (Option<Duration>, Option<Duration>)
pub fn auto_http_first_byte_args(&self) -> (Option<Duration>, Option<Duration>)
Resolve the HTTP first-byte watchdog args.
Returns the configured http_first_byte_timeout + _jitter
whenever the timeout field is Some(_) — caller opted in by
setting the field, so honor it regardless of proxy count.
Previously this was gated on balance feature + ≥2 HTTP-eligible
proxies, on the premise that the watchdog firing without a
rotation target was wasted. That was wrong for the
proxy-shrouded NXDOMAIN case: a single proxy still returns an
upstream-DNS-shaped 5xx after ~15-22s, and reqwest’s .timeout()
is not enforced through the proxy CONNECT tunnel for that phase.
The watchdog is the only knob that fires reliably, and a fast
524 surfaced to the caller is strictly better than waiting for
the proxy’s internal DNS deadline — even when no rotation
target exists.
When the timeout field is None, returns (None, None) —
pure passthrough, no overhead. Setting the field on Configuration
is the opt-in.
Sourcepub fn chrome_fetch_params(&self) -> ChromeFetchParams<'_>
Available on crate feature chrome only.
pub fn chrome_fetch_params(&self) -> ChromeFetchParams<'_>
chrome only. Build the borrowed chrome fetch parameter bundle.
Zero-copy: all fields borrow directly from self. Build once at
the top of a call chain and pass & through the layers to keep
the hot path inlineable.
Sourcepub fn with_search_config(
&mut self,
search_config: Option<SearchConfig>,
) -> &mut Self
Available on crate feature search only.
pub fn with_search_config( &mut self, search_config: Option<SearchConfig>, ) -> &mut Self
search only. Configure web search integration. This does nothing without the search flag enabled.
Sourcepub fn with_spider_cloud(&mut self, api_key: &str) -> &mut Self
Available on crate feature spider_cloud only.
pub fn with_spider_cloud(&mut self, api_key: &str) -> &mut Self
spider_cloud only. Set a spider.cloud API key (Proxy mode).
Sourcepub fn with_spider_cloud_config(
&mut self,
config: SpiderCloudConfig,
) -> &mut Self
Available on crate feature spider_cloud only.
pub fn with_spider_cloud_config( &mut self, config: SpiderCloudConfig, ) -> &mut Self
spider_cloud only. Set a spider.cloud config.
Sourcepub fn with_spider_browser(&mut self, api_key: &str) -> &mut Self
Available on crate features chrome and spider_cloud only.
pub fn with_spider_browser(&mut self, api_key: &str) -> &mut Self
chrome and spider_cloud only. Connect to Spider Browser Cloud via CDP over WebSocket using an API key.
Sets chrome_connection_url to wss://browser.spider.cloud/v1/browser?token=API_KEY.
Sourcepub fn with_spider_browser_config(
&mut self,
config: SpiderBrowserConfig,
) -> &mut Self
Available on crate features chrome and spider_cloud only.
pub fn with_spider_browser_config( &mut self, config: SpiderBrowserConfig, ) -> &mut Self
chrome and spider_cloud only. Connect to Spider Browser Cloud with full configuration (stealth, country, browser type, etc.).
Sourcepub fn with_hedge(&mut self, config: HedgeConfig) -> &mut Self
Available on crate feature hedge only.
pub fn with_hedge(&mut self, config: HedgeConfig) -> &mut Self
hedge only. Set the hedged request (work-stealing) configuration.
Sourcepub fn with_auto_throttle(&mut self, config: AutoThrottleConfig) -> &mut Self
Available on crate feature auto_throttle only.
pub fn with_auto_throttle(&mut self, config: AutoThrottleConfig) -> &mut Self
auto_throttle only. Set the auto-throttle configuration for latency-based adaptive delay.
Sourcepub fn with_etag_cache(&mut self, enabled: bool) -> &mut Self
Available on crate feature etag_cache only.
pub fn with_etag_cache(&mut self, enabled: bool) -> &mut Self
etag_cache only. Enable or disable ETag / conditional request caching for bandwidth-efficient re-crawls.
Sourcepub fn with_warc(&mut self, config: WarcConfig) -> &mut Self
Available on crate feature warc only.
pub fn with_warc(&mut self, config: WarcConfig) -> &mut Self
warc only. Configure WARC output for writing a web archive file during crawl.
Trait Implementations§
Source§impl Clone for Configuration
impl Clone for Configuration
Source§fn clone(&self) -> Configuration
fn clone(&self) -> Configuration
1.0.0 (const: unstable) · Source§fn clone_from(&mut self, source: &Self)
fn clone_from(&mut self, source: &Self)
Performs copy-assignment from source. Read more
Source§impl Debug for Configuration
impl Debug for Configuration
Source§impl Default for Configuration
impl Default for Configuration
Source§fn default() -> Configuration
fn default() -> Configuration
Source§impl<'de> Deserialize<'de> for Configuration where
Configuration: Default,
impl<'de> Deserialize<'de> for Configuration where
Configuration: Default,
Source§fn deserialize<__D>(__deserializer: __D) -> Result<Self, __D::Error> where
__D: Deserializer<'de>,
fn deserialize<__D>(__deserializer: __D) -> Result<Self, __D::Error> where
__D: Deserializer<'de>,
Auto Trait Implementations§
impl !RefUnwindSafe for Configuration
impl !UnwindSafe for Configuration
impl Freeze for Configuration
impl Send for Configuration
impl Sync for Configuration
impl Unpin for Configuration
impl UnsafeUnpin for Configuration
Blanket Implementations§
Source§impl<T> BorrowMut<T> for T where
T: ?Sized,
impl<T> BorrowMut<T> for T where
T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Source§impl<T> CloneToUninit for T where
T: Clone,
impl<T> CloneToUninit for T where
T: Clone,
impl<T> DeserializeOwned for T where
T: for<'de> Deserialize<'de>,
Source§impl<T> Instrument for T
impl<T> Instrument for T
Source§fn instrument(self, span: Span) -> Instrumented<Self>
fn instrument(self, span: Span) -> Instrumented<Self>
Source§fn in_current_span(self) -> Instrumented<Self>
fn in_current_span(self) -> Instrumented<Self>
Source§impl<T> IntoEither for T
impl<T> IntoEither for T
Source§fn into_either(self, into_left: bool) -> Either<Self, Self>
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self>
if into_left is true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read more
Source§fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self>
if into_left(&self) returns true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read more