Skip to main content

Configuration

Struct Configuration 

Source
pub struct Configuration {
Show 47 fields pub respect_robots_txt: bool, pub subdomains: bool, pub tld: bool, pub crawl_timeout: Option<Duration>, pub preserve_host_header: bool, pub blacklist_url: Option<Vec<CompactString>>, pub whitelist_url: Option<Vec<CompactString>>, pub user_agent: Option<Box<CompactString>>, pub delay: u64, pub request_timeout: Option<Box<Duration>>, pub http2_prior_knowledge: bool, pub proxies: Option<Vec<RequestProxy>>, pub headers: Option<Box<SerializableHeaderMap>>, pub redirect_limit: Box<usize>, pub redirect_policy: RedirectPolicy, pub cookie_str: Box<String>, pub depth: usize, pub depth_distance: usize, pub stealth_mode: Tier, pub viewport: Option<Viewport>, pub budget: Option<HashMap<CaseInsensitiveString, u32>>, pub wild_card_budgeting: bool, pub external_domains_caseless: Box<HashSet<CaseInsensitiveString>>, pub full_resources: bool, pub accept_invalid_certs: bool, pub auth_challenge_response: Option<AuthChallengeResponse>, pub openai_config: Option<Box<GPTConfigs>>, pub gemini_config: Option<Box<GeminiConfigs>>, pub remote_multimodal: Option<Box<RemoteMultimodalConfigs>>, pub shared_queue: bool, pub return_page_links: bool, pub retry: u8, pub no_control_thread: bool, pub only_html: bool, pub concurrency_limit: Option<usize>, pub normalize: bool, pub shared: bool, pub modify_headers: bool, pub modify_http_client_headers: bool, pub referer: Option<String>, pub max_page_bytes: Option<f64>, pub max_bytes_allowed: Option<u64>, pub cache_policy: Option<BasicCachePolicy>, pub network_interface: Option<String>, pub local_address: Option<IpAddr>, pub default_http_connect_timeout: Option<Duration>, pub default_http_read_timeout: Option<Duration>, /* private fields */
}
Expand description

Structure to configure Website crawler

use spider::website::Website;
let mut website: Website = Website::new("https://choosealicense.com");
website.configuration.blacklist_url.insert(Default::default()).push("https://choosealicense.com/licenses/".to_string().into());
website.configuration.respect_robots_txt = true;
website.configuration.subdomains = true;
website.configuration.tld = true;

Fields§

§respect_robots_txt: bool

Respect the robots.txt file and do not scrape disallowed files. This may slow down crawls if the robots.txt file includes a delay.

§subdomains: bool

Allow sub-domains.

§tld: bool

Allow all tlds for domain.

§crawl_timeout: Option<Duration>

The max timeout for the crawl.

§preserve_host_header: bool

Preserve the HTTP host header from being included.

§blacklist_url: Option<Vec<CompactString>>

List of pages to not crawl. [optional: regex pattern matching]

§whitelist_url: Option<Vec<CompactString>>

List of pages to only crawl. [optional: regex pattern matching]

§user_agent: Option<Box<CompactString>>

User-Agent for request.

§delay: u64

Polite crawling delay in milliseconds.

§request_timeout: Option<Box<Duration>>

Request max timeout per page. By default the request times out in 15s. Set to None to disable.

§http2_prior_knowledge: bool

Use HTTP2 for connection. Enable if you know the website has http2 support.

§proxies: Option<Vec<RequestProxy>>

Use proxy list for performing network request.

§headers: Option<Box<SerializableHeaderMap>>

Headers to include with request.

§redirect_limit: Box<usize>

The max redirections allowed for request.

§redirect_policy: RedirectPolicy

The redirect policy type to use.

§cookie_str: Box<String>

Cookie string to use for network requests ex: “foo=bar; Domain=blog.spider”

§depth: usize

The max depth to crawl for a website. Defaults to 25 to help prevent infinite recursion.

§depth_distance: usize

The depth to crawl pertaining to the root.

§stealth_mode: Tier

Use stealth mode for requests.

§viewport: Option<Viewport>

Configure the viewport for chrome and viewport headers.

§budget: Option<HashMap<CaseInsensitiveString, u32>>

Crawl budget for the paths. This helps prevent crawling extra pages and limiting the amount.

§wild_card_budgeting: bool

If wild card budgeting is found for the website.

§external_domains_caseless: Box<HashSet<CaseInsensitiveString>>

External domains to include case-insensitive.

§full_resources: bool

Collect all the resources found on the page.

§accept_invalid_certs: bool

Dangerously accept invalid certificates.

§auth_challenge_response: Option<AuthChallengeResponse>

The auth challenge response. The ‘chrome_intercept’ flag is also required in order to intercept the response.

§openai_config: Option<Box<GPTConfigs>>

The OpenAI configs to use to help drive the chrome browser. This does nothing without the ‘openai’ flag.

§gemini_config: Option<Box<GeminiConfigs>>

The Gemini configs to use to help drive the chrome browser. This does nothing without the ‘gemini’ flag.

§remote_multimodal: Option<Box<RemoteMultimodalConfigs>>

Remote multimodal automation config (vision + LLM-driven steps). Requires the agent feature for full functionality, uses stub type otherwise.

§shared_queue: bool

Use a shared queue strategy when crawling. This can scale workloads evenly that do not need priority.

§return_page_links: bool

Return the page links in the subscription channels. This does nothing without the sync flag enabled.

§retry: u8

Retry count to attempt to swap proxies etc.

§no_control_thread: bool

Skip spawning a control thread that can pause, start, and shutdown the crawl.

§only_html: bool

Expect only to handle HTML to save on resources. This mainly only blocks the crawling and returning of resources from the server.

§concurrency_limit: Option<usize>

The concurrency limits to apply.

§normalize: bool

Normalize the HTML, de-duplicating the content.

§shared: bool

Share the state of the crawl. Requires the ‘disk’ feature flag.

§modify_headers: bool

Modify the headers to act like a real-browser

§modify_http_client_headers: bool

Modify the HTTP client headers only to act like a real-browser

§referer: Option<String>

The referer to use.

§max_page_bytes: Option<f64>

Determine the max bytes per page.

§max_bytes_allowed: Option<u64>

Determine the max bytes per browser context.

§cache_policy: Option<BasicCachePolicy>

The cache policy to use.

§network_interface: Option<String>

Bind the connections only on the network interface.

§local_address: Option<IpAddr>

Bind to a local IP Address.

§default_http_connect_timeout: Option<Duration>

The default http connect timeout

§default_http_read_timeout: Option<Duration>

The default http read timeout

Implementations§

Source§

impl Configuration

Source

pub fn new() -> Self

Represents crawl configuration for a website.

Source

pub fn get_blacklist(&self) -> AllowList

Handle the blacklist options.

Source

pub fn set_whitelist(&mut self)

Set the whitelist

Source

pub fn configure_allowlist(&mut self)

Configure the allow list.

Source

pub fn get_blacklist_compiled(&self) -> &AllowList

Get the blacklist compiled.

Source

pub fn configure_budget(&mut self)

Setup the budget for crawling.

Source

pub fn get_whitelist_compiled(&self) -> &AllowList

Get the whitelist compiled.

Source

pub fn get_whitelist(&self) -> AllowList

Handle the whitelist options.

Source

pub fn with_respect_robots_txt(&mut self, respect_robots_txt: bool) -> &mut Self

Respect robots.txt file.

Source

pub fn with_subdomains(&mut self, subdomains: bool) -> &mut Self

Include subdomains detection.

Source

pub fn with_csp_bypass(&mut self, _enabled: bool) -> &mut Self

Bypass CSP protection detection. This does nothing without the feat flag chrome enabled.

Source

pub fn with_network_interface( &mut self, network_interface: Option<String>, ) -> &mut Self

Bind the connections only on the network interface.

Source

pub fn with_local_address(&mut self, local_address: Option<IpAddr>) -> &mut Self

Bind to a local IP Address.

Source

pub fn with_tld(&mut self, tld: bool) -> &mut Self

Include tld detection.

Source

pub fn with_crawl_timeout( &mut self, crawl_timeout: Option<Duration>, ) -> &mut Self

The max duration for the crawl. This is useful when websites use a robots.txt with long durations and throttle the timeout removing the full concurrency.

Source

pub fn with_delay(&mut self, delay: u64) -> &mut Self

Delay between request as ms.

Source

pub fn with_http2_prior_knowledge( &mut self, http2_prior_knowledge: bool, ) -> &mut Self

Only use HTTP/2.

Source

pub fn with_request_timeout( &mut self, request_timeout: Option<Duration>, ) -> &mut Self

Max time to wait for request. By default request times out in 15s. Set to None to disable.

Source

pub fn with_sitemap(&mut self, _sitemap_url: Option<&str>) -> &mut Self

Set the sitemap url. This does nothing without the sitemap feature flag.

Source

pub fn with_ignore_sitemap(&mut self, _ignore_sitemap: bool) -> &mut Self

Ignore the sitemap when crawling. This method does nothing if the sitemap is not enabled.

Source

pub fn with_user_agent(&mut self, user_agent: Option<&str>) -> &mut Self

Add user agent to request.

Source

pub fn with_preserve_host_header(&mut self, preserve: bool) -> &mut Self

Preserve the HOST header.

Source

pub fn with_remote_multimodal( &mut self, remote_multimodal: Option<RemoteMultimodalConfigs>, ) -> &mut Self

Use a remote multimodal model to drive browser automation. When the agent feature is not enabled, this uses a stub type.

Source

pub fn with_openai(&mut self, _openai_config: Option<GPTConfigs>) -> &mut Self

The OpenAI configs to use to drive the browser. This method does nothing if the openai is not enabled.

Source

pub fn with_gemini( &mut self, _gemini_config: Option<GeminiConfigs>, ) -> &mut Self

The Gemini configs to use to drive the browser. This method does nothing if the gemini is not enabled.

Source

pub fn with_cookies(&mut self, cookie_str: &str) -> &mut Self

Cookie string to use in request. This does nothing without the cookies flag enabled.

Source

pub fn with_fingerprint(&mut self, _fingerprint: bool) -> &mut Self

Set custom fingerprint ID for request. This does nothing without the chrome flag enabled.

Source

pub fn with_proxies(&mut self, proxies: Option<Vec<String>>) -> &mut Self

Use proxies for request.

Source

pub fn with_proxies_direct( &mut self, proxies: Option<Vec<RequestProxy>>, ) -> &mut Self

Use proxies for request with control between chrome and http.

Source

pub fn with_shared_queue(&mut self, shared_queue: bool) -> &mut Self

Use a shared semaphore to evenly handle workloads. The default is false.

Source

pub fn with_blacklist_url<T>( &mut self, blacklist_url: Option<Vec<T>>, ) -> &mut Self
where Vec<CompactString>: From<Vec<T>>,

Add blacklist urls to ignore.

Source

pub fn with_whitelist_url<T>( &mut self, whitelist_url: Option<Vec<T>>, ) -> &mut Self
where Vec<CompactString>: From<Vec<T>>,

Add whitelist urls to allow.

Return the links found on the page in the channel subscriptions. This method does nothing if the decentralized feature is enabled.

Source

pub fn with_headers(&mut self, headers: Option<HeaderMap>) -> &mut Self

Set HTTP headers for request using reqwest::header::HeaderMap.

Source

pub fn with_redirect_limit(&mut self, redirect_limit: usize) -> &mut Self

Set the max redirects allowed for request.

Source

pub fn with_redirect_policy(&mut self, policy: RedirectPolicy) -> &mut Self

Set the redirect policy to use.

Source

pub fn with_referer(&mut self, referer: Option<String>) -> &mut Self

Add a referer (mis-spelling) to the request.

Source

pub fn with_referrer(&mut self, referer: Option<String>) -> &mut Self

Add a referer to the request.

Source

pub fn with_full_resources(&mut self, full_resources: bool) -> &mut Self

Determine whether to collect all the resources found on pages.

Source

pub fn with_dismiss_dialogs(&mut self, _dismiss_dialogs: bool) -> &mut Self

Determine whether to dismiss dialogs. This method does nothing if the chrome feature is not enabled.

Source

pub fn with_cron(&mut self, _cron_str: &str, _cron_type: CronType) -> &mut Self

Setup cron jobs to run. This does nothing without the cron flag enabled.

Source

pub fn with_limit(&mut self, limit: u32) -> &mut Self

Set a crawl page limit. If the value is 0 there is no limit.

Source

pub fn with_concurrency_limit(&mut self, limit: Option<usize>) -> &mut Self

Set the concurrency limits. Set the value to None to use the default limits based on the system CPU cores * n.

Source

pub fn with_evaluate_on_new_document( &mut self, _evaluate_on_new_document: Option<Box<String>>, ) -> &mut Self

Set a custom script to evaluate on new document creation. This does nothing without the feat flag chrome enabled.

Source

pub fn with_auth_challenge_response( &mut self, _auth_challenge_response: Option<AuthChallengeResponse>, ) -> &mut Self

Set the authentication challenge response. This does nothing without the feat flag chrome enabled.

Source

pub fn with_depth(&mut self, depth: usize) -> &mut Self

Set a crawl depth limit. If the value is 0 there is no limit.

Source

pub fn with_caching(&mut self, _cache: bool) -> &mut Self

Cache the page following HTTP rules. This method does nothing if the cache feature is not enabled.

Source

pub fn with_cache_skip_browser(&mut self, _skip: bool) -> &mut Self

Skip browser rendering entirely if cached response exists. This method does nothing if the cache features are not enabled.

Source

pub fn with_service_worker_enabled(&mut self, _enabled: bool) -> &mut Self

Enable or disable Service Workers. This method does nothing if the chrome feature is not enabled.

Source

pub fn with_auto_geolocation(&mut self, _enabled: bool) -> &mut Self

Automatically setup geo-location configurations when using a proxy. This method does nothing if the chrome feature is not enabled.

Source

pub fn with_retry(&mut self, retry: u8) -> &mut Self

Set the retry limit for request. Set the value to 0 for no retries. The default is 0.

Source

pub fn with_default_http_connect_timeout( &mut self, default_http_connect_timeout: Option<Duration>, ) -> &mut Self

The default http connect timeout.

Source

pub fn with_default_http_read_timeout( &mut self, default_http_read_timeout: Option<Duration>, ) -> &mut Self

The default http read timeout.

Source

pub fn with_no_control_thread(&mut self, no_control_thread: bool) -> &mut Self

Skip setting up a control thread for pause, start, and shutdown programmatic handling. This does nothing without the ‘control’ flag enabled.

Source

pub fn with_viewport(&mut self, viewport: Option<Viewport>) -> &mut Self

Configures the viewport of the browser, which defaults to 800x600. This method does nothing if the ‘chrome’ feature is not enabled.

Source

pub fn with_stealth(&mut self, _stealth_mode: bool) -> &mut Self

Use stealth mode for the request. This does nothing without the chrome flag enabled.

Source

pub fn with_wait_for_almost_idle_network0( &mut self, _wait_for_almost_idle_network0: Option<WaitForIdleNetwork>, ) -> &mut Self

Wait for network to be almost idle with a max timeout. This does nothing without the chrome flag enabled.

Source

pub fn with_wait_for_idle_network0( &mut self, _wait_for_idle_network0: Option<WaitForIdleNetwork>, ) -> &mut Self

Wait for network request with a max timeout. This does nothing without the chrome flag enabled.

Source

pub fn with_wait_for_idle_network( &mut self, _wait_for_idle_network: Option<WaitForIdleNetwork>, ) -> &mut Self

Wait for idle network request. This method does nothing if the chrome feature is not enabled.

Source

pub fn with_wait_for_idle_dom( &mut self, _wait_for_idle_dom: Option<WaitForSelector>, ) -> &mut Self

Wait for idle dom mutations for target element. This method does nothing if the chrome feature is not enabled.

Source

pub fn with_wait_for_selector( &mut self, _wait_for_selector: Option<WaitForSelector>, ) -> &mut Self

Wait for a selector. This method does nothing if the chrome feature is not enabled.

Source

pub fn with_wait_for_delay( &mut self, _wait_for_delay: Option<WaitForDelay>, ) -> &mut Self

Wait with a delay. Should only be used for testing. This method does nothing if the ‘chrome’ feature is not enabled.

Source

pub fn with_chrome_intercept( &mut self, _chrome_intercept: RequestInterceptConfiguration, _url: &Option<Box<Url>>, ) -> &mut Self

Use request intercept for the request to only allow content required for the page that matches the host. If the content is from a 3rd party it needs to be part of our include list. This method does nothing if the chrome_intercept is not enabled.

Source

pub fn with_chrome_connection( &mut self, _chrome_connection_url: Option<String>, ) -> &mut Self

Set the connection url for the chrome instance. This method does nothing if the chrome is not enabled.

Source

pub fn with_execution_scripts( &mut self, _execution_scripts: Option<ExecutionScriptsMap>, ) -> &mut Self

Set JS to run on certain pages. This method does nothing if the chrome is not enabled.

Source

pub fn with_automation_scripts( &mut self, _automation_scripts: Option<AutomationScriptsMap>, ) -> &mut Self

Run web automated actions on certain pages. This method does nothing if the chrome is not enabled.

Source

pub fn with_budget(&mut self, budget: Option<HashMap<&str, u32>>) -> &mut Self

Set a crawl budget per path with levels support /a/b/c or for all paths with “*”. This does nothing without the budget flag enabled.

Source

pub fn with_external_domains<'a, 'b>( &mut self, external_domains: Option<impl Iterator<Item = String> + 'a>, ) -> &mut Self

Group external domains to treat the crawl as one. If None is passed this will clear all prior domains.

Source

pub fn with_danger_accept_invalid_certs( &mut self, accept_invalid_certs: bool, ) -> &mut Self

Dangerously accept invalid certificates - this should be used as a last resort.

Source

pub fn with_normalize(&mut self, normalize: bool) -> &mut Self

Normalize the content de-duplicating trailing slash pages and other pages that can be duplicated. This may initially show the link in your links_visited or subscription calls but, the following links will not be crawled.

Source

pub fn with_shared_state(&mut self, shared: bool) -> &mut Self

Store all the links found on the disk to share the state. This does nothing without the disk flag enabled.

Source

pub fn with_timezone_id(&mut self, _timezone_id: Option<String>) -> &mut Self

Overrides default host system timezone with the specified one. This does nothing without the chrome flag enabled.

Source

pub fn with_locale(&mut self, _locale: Option<String>) -> &mut Self

Overrides default host system locale with the specified one. This does nothing without the chrome flag enabled.

Source

pub fn with_screenshot( &mut self, _screenshot_config: Option<ScreenShotConfig>, ) -> &mut Self

Set the chrome screenshot configuration. This does nothing without the chrome flag enabled.

Source

pub fn with_max_page_bytes(&mut self, max_page_bytes: Option<f64>) -> &mut Self

Set the max amount of bytes to collect per page. This method does nothing if the chrome is not enabled.

Source

pub fn with_max_bytes_allowed( &mut self, max_bytes_allowed: Option<u64>, ) -> &mut Self

Set the max amount of bytes to collect for the browser context. This method does nothing if the chrome is not enabled.

Source

pub fn with_block_assets(&mut self, only_html: bool) -> &mut Self

Block assets from loading from the network.

Source

pub fn with_modify_headers(&mut self, modify_headers: bool) -> &mut Self

Modify the headers to mimic a real browser.

Source

pub fn with_modify_http_client_headers( &mut self, modify_http_client_headers: bool, ) -> &mut Self

Modify the HTTP client headers to mimic a real browser.

Source

pub fn with_cache_policy( &mut self, cache_policy: Option<BasicCachePolicy>, ) -> &mut Self

Set the cache policy.

Source

pub fn with_webdriver_config( &mut self, _webdriver_config: Option<WebDriverConfig>, ) -> &mut Self

Set the WebDriver configuration. This does nothing without the webdriver flag enabled.

Source

pub fn build(&self) -> Self

Build the website configuration when using with_builder.

Source

pub fn with_search_config(&mut self, _search_config: Option<()>) -> &mut Self

Configure web search integration. This does nothing without the search flag enabled.

Trait Implementations§

Source§

impl Clone for Configuration

Source§

fn clone(&self) -> Configuration

Returns a duplicate of the value. Read more
1.0.0 · Source§

fn clone_from(&mut self, source: &Self)

Performs copy-assignment from source. Read more
Source§

impl Debug for Configuration

Source§

fn fmt(&self, f: &mut Formatter<'_>) -> Result

Formats the value using the given formatter. Read more
Source§

impl Default for Configuration

Source§

fn default() -> Configuration

Returns the “default value” for a type. Read more
Source§

impl PartialEq for Configuration

Source§

fn eq(&self, other: &Configuration) -> bool

Tests for self and other values to be equal, and is used by ==.
1.0.0 · Source§

fn ne(&self, other: &Rhs) -> bool

Tests for !=. The default implementation is almost always sufficient, and should not be overridden without very good reason.
Source§

impl StructuralPartialEq for Configuration

Auto Trait Implementations§

Blanket Implementations§

Source§

impl<T> Any for T
where T: 'static + ?Sized,

Source§

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more
Source§

impl<T> Borrow<T> for T
where T: ?Sized,

Source§

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more
Source§

impl<T> BorrowMut<T> for T
where T: ?Sized,

Source§

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more
Source§

impl<T> CloneToUninit for T
where T: Clone,

Source§

unsafe fn clone_to_uninit(&self, dest: *mut u8)

🔬This is a nightly-only experimental API. (clone_to_uninit)
Performs copy-assignment from self to dest. Read more
Source§

impl<T> From<T> for T

Source§

fn from(t: T) -> T

Returns the argument unchanged.

Source§

impl<T> Instrument for T

Source§

fn instrument(self, span: Span) -> Instrumented<Self>

Instruments this type with the provided Span, returning an Instrumented wrapper. Read more
Source§

fn in_current_span(self) -> Instrumented<Self>

Instruments this type with the current Span, returning an Instrumented wrapper. Read more
Source§

impl<T, U> Into<U> for T
where U: From<T>,

Source§

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

Source§

impl<T> IntoEither for T

Source§

fn into_either(self, into_left: bool) -> Either<Self, Self>

Converts self into a Left variant of Either<Self, Self> if into_left is true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more
Source§

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more
Source§

impl<T> PolicyExt for T
where T: ?Sized,

Source§

fn and<P, B, E>(self, other: P) -> And<T, P>
where T: Policy<B, E>, P: Policy<B, E>,

Create a new Policy that returns Action::Follow only if self and other return Action::Follow. Read more
Source§

fn or<P, B, E>(self, other: P) -> Or<T, P>
where T: Policy<B, E>, P: Policy<B, E>,

Create a new Policy that returns Action::Follow if either self or other returns Action::Follow. Read more
Source§

impl<T> Same for T

Source§

type Output = T

Should always be Self
Source§

impl<T> ToOwned for T
where T: Clone,

Source§

type Owned = T

The resulting type after obtaining ownership.
Source§

fn to_owned(&self) -> T

Creates owned data from borrowed data, usually by cloning. Read more
Source§

fn clone_into(&self, target: &mut T)

Uses borrowed data to replace owned data, usually by cloning. Read more
Source§

impl<T, U> TryFrom<U> for T
where U: Into<T>,

Source§

type Error = Infallible

The type returned in the event of a conversion error.
Source§

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.
Source§

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

Source§

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.
Source§

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.
Source§

impl<V, T> VZip<V> for T
where V: MultiLane<T>,

Source§

fn vzip(self) -> V

Source§

impl<T> WithSubscriber for T

Source§

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self>
where S: Into<Dispatch>,

Attaches the provided Subscriber to this type, returning a WithDispatch wrapper. Read more
Source§

fn with_current_subscriber(self) -> WithDispatch<Self>

Attaches the current default Subscriber to this type, returning a WithDispatch wrapper. Read more