Struct Configuration

Source

pub struct Configuration {
    pub respect_robots_txt: bool,
    pub subdomains: bool,
    pub tld: bool,
    pub preserve_host_header: bool,
    pub blacklist_url: Option<Box<Vec<CompactString>>>,
    pub whitelist_url: Option<Box<Vec<CompactString>>>,
    pub user_agent: Option<Box<CompactString>>,
    pub delay: u64,
    pub request_timeout: Option<Box<Duration>>,
    pub http2_prior_knowledge: bool,
    pub proxies: Option<Box<Vec<String>>>,
    pub headers: Option<Box<SerializableHeaderMap>>,
    pub redirect_limit: Box<usize>,
    pub redirect_policy: RedirectPolicy,
    pub cookie_str: Box<String>,
    pub depth: usize,
    pub depth_distance: usize,
    pub budget: Option<HashMap<CaseInsensitiveString, u32>>,
    pub wild_card_budgeting: bool,
    pub external_domains_caseless: Box<HashSet<CaseInsensitiveString>>,
    pub full_resources: bool,
    pub accept_invalid_certs: bool,
    pub auth_challenge_response: Option<AuthChallengeResponse>,
    pub openai_config: Option<GPTConfigs>,
    pub shared_queue: bool,
    pub return_page_links: bool,
    pub retry: u8,
    pub only_html: bool,
    pub concurrency_limit: Option<usize>,
    /* private fields */
}

Expand description

Structure to configure Website crawler

use spider::website::Website;
let mut website: Website = Website::new("https://choosealicense.com");
website.configuration.blacklist_url.insert(Default::default()).push("https://choosealicense.com/licenses/".to_string().into());
website.configuration.respect_robots_txt = true;
website.configuration.subdomains = true;
website.configuration.tld = true;

Fields§

§respect_robots_txt: bool

Respect the robots.txt file and do not scrape disallowed files. This may slow down crawls if the robots.txt file includes a delay.

§subdomains: bool

Allow sub-domains.

§tld: bool

Allow all tlds for domain.

§preserve_host_header: bool

Preserve the HTTP host header from being included.

§blacklist_url: Option<Box<Vec<CompactString>>>

List of pages to not crawl. [optional: regex pattern matching]

§whitelist_url: Option<Box<Vec<CompactString>>>

List of pages to only crawl. [optional: regex pattern matching]

§user_agent: Option<Box<CompactString>>

User-Agent for request.

§delay: u64

Polite crawling delay in milliseconds.

§request_timeout: Option<Box<Duration>>

Request max timeout per page. By default the request times out in 15s. Set to None to disable.

§http2_prior_knowledge: bool

Use HTTP2 for connection. Enable if you know the website has http2 support.

§proxies: Option<Box<Vec<String>>>

Use proxy list for performing network request.

§headers: Option<Box<SerializableHeaderMap>>

Headers to include with request.

§redirect_limit: Box<usize>

The max redirections allowed for request.

§redirect_policy: RedirectPolicy

The redirect policy type to use.

§cookie_str: Box<String>

Cookie string to use for network requests ex: “foo=bar; Domain=blog.spider”

§depth: usize

The max depth to crawl for a website.

§depth_distance: usize

The depth to crawl pertaining to the root.

§budget: Option<HashMap<CaseInsensitiveString, u32>>

Crawl budget for the paths. This helps prevent crawling extra pages and limiting the amount.

§wild_card_budgeting: bool

If wild card budgeting is found for the website.

§external_domains_caseless: Box<HashSet<CaseInsensitiveString>>

External domains to include case-insensitive.

§full_resources: bool

Collect all the resources found on the page.

§accept_invalid_certs: bool

Dangerously accept invalid certificates.

§auth_challenge_response: Option<AuthChallengeResponse>

The auth challenge response. The ‘chrome_intercept’ flag is also required in order to intercept the response.

§openai_config: Option<GPTConfigs>

The OpenAI configs to use to help drive the chrome browser. This does nothing without the ‘openai’ flag.

§shared_queue: bool

Use a shared queue strategy when crawling. This can scale workloads evenly that do not need priority.

§return_page_links: bool

Return the page links in the subscription channels. This does nothing without the flag sync enabled.

§retry: u8

Retry count to attempt to swap proxies etc.

§only_html: bool

Expect only to handle HTML to save on resources. This mainly only blocks the crawling and returning of resources from the server.

§concurrency_limit: Option<usize>

The concurrency limits to apply.

Implementations§

Source §

impl Configuration

Source

pub fn new() -> Self

Represents crawl configuration for a website.

Source

pub fn get_blacklist(&self) -> Box<Vec<CompactString>>

Handle the blacklist options.

Source

pub fn get_whitelist(&self) -> Box<Vec<CompactString>>

Handle the whitelist options.

Source

pub fn with_respect_robots_txt(&mut self, respect_robots_txt: bool) -> &mut Self

Respect robots.txt file.

Source

pub fn with_subdomains(&mut self, subdomains: bool) -> &mut Self

Include subdomains detection.

Source

pub fn with_tld(&mut self, tld: bool) -> &mut Self

Include tld detection.

Source

pub fn with_delay(&mut self, delay: u64) -> &mut Self

Delay between request as ms.

Source

pub fn with_http2_prior_knowledge( &mut self, http2_prior_knowledge: bool, ) -> &mut Self

Only use HTTP/2.

Source

pub fn with_request_timeout( &mut self, request_timeout: Option<Duration>, ) -> &mut Self

Max time to wait for request. By default request times out in 15s. Set to None to disable.

Source

pub fn with_sitemap(&mut self, _sitemap_url: Option<&str>) -> &mut Self

Set the sitemap url. This does nothing without the sitemap feature flag.

Source

pub fn with_ignore_sitemap(&mut self, _ignore_sitemap: bool) -> &mut Self

Ignore the sitemap when crawling. This method does nothing if the sitemap feature is not enabled.

Source

pub fn with_user_agent(&mut self, user_agent: Option<&str>) -> &mut Self

Add user agent to request.

Source

pub fn with_preserve_host_header(&mut self, preserve: bool) -> &mut Self

Preserve the HOST header.

Source

pub fn with_openai(&mut self, _openai_config: Option<GPTConfigs>) -> &mut Self

The OpenAI configs to use to drive the browser. This method does nothing if the openai feature is not enabled.

Source

pub fn with_cookies(&mut self, cookie_str: &str) -> &mut Self

Cookie string to use in request. This does nothing without the cookies flag enabled.

Source

pub fn with_fingerprint(&mut self, _fingerprint: bool) -> &mut Self

Set custom fingerprint ID for request. This does nothing without the chrome flag enabled.

Source

pub fn with_proxies(&mut self, proxies: Option<Vec<String>>) -> &mut Self

Use proxies for request.

Source

pub fn with_shared_queue(&mut self, shared_queue: bool) -> &mut Self

Use a shared semaphore to evenly handle workloads. The default is false.

Source

pub fn with_blacklist_url<T>( &mut self, blacklist_url: Option<Vec<T>>, ) -> &mut Self
where Vec<CompactString>: From<Vec<T>>,

Add blacklist urls to ignore.

Source

pub fn with_whitelist_url<T>( &mut self, whitelist_url: Option<Vec<T>>, ) -> &mut Self
where Vec<CompactString>: From<Vec<T>>,

Add whitelist urls to allow.

Source

pub fn with_return_page_links(&mut self, return_page_links: bool) -> &mut Self

Return the links found on the page in the channel subscriptions. This method does nothing if the decentralized feature is enabled.

Source

pub fn with_headers(&mut self, headers: Option<HeaderMap>) -> &mut Self

Set HTTP headers for request using reqwest::header::HeaderMap.

Source

pub fn with_redirect_limit(&mut self, redirect_limit: usize) -> &mut Self

Set the max redirects allowed for request.

Source

pub fn with_redirect_policy(&mut self, policy: RedirectPolicy) -> &mut Self

Set the redirect policy to use.

Source

pub fn with_full_resources(&mut self, full_resources: bool) -> &mut Self

Determine whether to collect all the resources found on pages.

Source

pub fn with_cron(&mut self, _cron_str: &str, _cron_type: CronType) -> &mut Self

Setup cron jobs to run. This does nothing without the cron flag enabled.

Source

pub fn with_limit(&mut self, limit: u32) -> &mut Self

Set a crawl page limit. If the value is 0 there is no limit. This does nothing without the feat flag budget enabled.

Source

pub fn with_concurrency_limit(&mut self, limit: Option<usize>) -> &mut Self

Set the concurrency limits. Set the value to None to use the default limits based on the system CPU cores * n.

Source

pub fn with_evaluate_on_new_document( &mut self, _evaluate_on_new_document: Option<Box<String>>, ) -> &mut Self

Set a custom script to evaluate on new document creation. This does nothing without the feat flag chrome enabled.

Source

pub fn with_auth_challenge_response( &mut self, _auth_challenge_response: Option<AuthChallengeResponse>, ) -> &mut Self

Set the authentication challenge response. This does nothing without the feat flag chrome enabled.

Source

pub fn with_depth(&mut self, depth: usize) -> &mut Self

Set a crawl depth limit. If the value is 0 there is no limit. This does nothing without the feat flag budget enabled.

Source

pub fn with_caching(&mut self, _cache: bool) -> &mut Self

Cache the page following HTTP rules. This method does nothing if the cache feature is not enabled.

Source

pub fn with_viewport(&mut self, _viewport: Option<Viewport>) -> &mut Self

Configures the view port for chrome. This method does nothing if the chrome feature is not enabled.

Source

pub fn with_retry(&mut self, retry: u8) -> &mut Self

Set the retry limit for request. Set the value to 0 for no retries. The default is 0.

Source

pub fn with_stealth(&mut self, _stealth_mode: bool) -> &mut Self

Use stealth mode for the request. This does nothing without the chrome flag enabled.

Source

pub fn with_wait_for_idle_network( &mut self, _wait_for_idle_network: Option<WaitForIdleNetwork>, ) -> &mut Self

Wait for idle network request. This method does nothing if the chrome feature is not enabled.

Source

pub fn with_wait_for_idle_dom( &mut self, _wait_for_idle_dom: Option<WaitForSelector>, ) -> &mut Self

Wait for idle dom mutations for target element. This method does nothing if the chrome feature is not enabled.

Source

pub fn with_wait_for_selector( &mut self, _wait_for_selector: Option<WaitForSelector>, ) -> &mut Self

Wait for a selector. This method does nothing if the chrome feature is not enabled.

Source

pub fn with_wait_for_delay( &mut self, _wait_for_delay: Option<WaitForDelay>, ) -> &mut Self

Wait for a delay. Should only be used for testing. This method does nothing if the ‘chrome’ feature is not enabled.

Source

pub fn with_chrome_intercept( &mut self, _chrome_intercept: RequestInterceptConfiguration, ) -> &mut Self

Use request intercept for the request to only allow content required for the page that matches the host. If the content is from a 3rd party it needs to be part of our include list. This method does nothing if the chrome_intercept feature is not enabled.

Source

pub fn with_chrome_connection( &mut self, _chrome_connection_url: Option<String>, ) -> &mut Self

Set the connection url for the chrome instance. This method does nothing if the chrome feature is not enabled.

Source

pub fn with_execution_scripts( &mut self, _execution_scripts: Option<ExecutionScriptsMap>, ) -> &mut Self

Set JS to run on certain pages. This method does nothing if the chrome feature is not enabled.

Source

pub fn with_automation_scripts( &mut self, _automation_scripts: Option<AutomationScriptsMap>, ) -> &mut Self

Run web automated actions on certain pages. This method does nothing if the chrome feature is not enabled.

Source

pub fn with_budget(&mut self, budget: Option<HashMap<&str, u32>>) -> &mut Self

Set a crawl budget per path with levels support /a/b/c or for all paths with “*”. This does nothing without the budget flag enabled.

Source

pub fn with_external_domains<'a, 'b>( &mut self, external_domains: Option<impl Iterator<Item = String> + 'a>, ) -> &mut Self

Group external domains to treat the crawl as one. If None is passed this will clear all prior domains.

Source

pub fn with_danger_accept_invalid_certs( &mut self, accept_invalid_certs: bool, ) -> &mut Self

Dangerously accept invalid certificates - this should be used as a last resort.

Source

pub fn with_timezone_id(&mut self, _timezone_id: Option<String>) -> &mut Self

Overrides default host system timezone with the specified one. This does nothing without the chrome flag enabled.

Source

pub fn with_locale(&mut self, _locale: Option<String>) -> &mut Self

Overrides default host system locale with the specified one. This does nothing without the chrome flag enabled.

Source

pub fn with_screenshot( &mut self, _screenshot_config: Option<ScreenShotConfig>, ) -> &mut Self

Set the chrome screenshot configuration. This does nothing without the chrome flag enabled.

Source

pub fn with_block_assets(&mut self, only_html: bool) -> &mut Self

Block assets from loading from the network.

Source

pub fn build(&self) -> Self

Build the website configuration when using with_builder.

Trait Implementations§

Source §

impl Clone for Configuration

Source §

fn clone(&self) -> Configuration

Returns a copy of the value. Read more

1.0.0 · Source§

fn clone_from(&mut self, source: &Self)

Performs copy-assignment from source. Read more

Source §

impl Debug for Configuration

Source §

fn fmt(&self, f: &mut Formatter<'_>) -> Result

Formats the value using the given formatter. Read more

Source §

impl Default for Configuration

Source §

fn default() -> Configuration

Returns the “default value” for a type. Read more

Source §

impl PartialEq for Configuration

Source §

fn eq(&self, other: &Configuration) -> bool

Tests for self and other values to be equal, and is used by ==.

1.0.0 · Source§

fn ne(&self, other: &Rhs) -> bool

Tests for !=. The default implementation is almost always sufficient, and should not be overridden without very good reason.

Source §

impl StructuralPartialEq for Configuration

Auto Trait Implementations§

§

impl UnwindSafe for Configuration

Blanket Implementations§

Source §

impl<T> Any for T
where T: 'static + ?Sized,

Source §

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more

Source §

impl<T> Borrow<T> for T
where T: ?Sized,

Source §

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more

Source §

impl<T> BorrowMut<T> for T
where T: ?Sized,

Source §

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more

Source §

impl<T> CloneToUninit for T
where T: Clone,

Source §

unsafe fn clone_to_uninit(&self, dst: *mut u8)

🔬This is a nightly-only experimental API. (clone_to_uninit)

Performs copy-assignment from self to dst. Read more

Source §

impl<T> From<T> for T

Source §

fn from(t: T) -> T

Returns the argument unchanged.

Source §

impl<T> Instrument for T

Source §

fn instrument(self, span: Span) -> Instrumented<Self>

Instruments this type with the provided Span, returning an Instrumented wrapper. Read more

Source §

fn in_current_span(self) -> Instrumented<Self>

Instruments this type with the current Span, returning an Instrumented wrapper. Read more

Source §

impl<T, U> Into<U> for T
where U: From<T>,

Source §

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

Source §

impl<T> ToOwned for T
where T: Clone,

Source §

type Owned = T

The resulting type after obtaining ownership.

Source §

fn to_owned(&self) -> T

Creates owned data from borrowed data, usually by cloning. Read more

Source §

fn clone_into(&self, target: &mut T)

Uses borrowed data to replace owned data, usually by cloning. Read more

Source §

impl<T, U> TryFrom<U> for T
where U: Into<T>,

Source §

type Error = Infallible

The type returned in the event of a conversion error.

Source §

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.

Source §

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

Source §

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.

Source §

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.

Source §

impl<T> WithSubscriber for T

Source §

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self>
where S: Into<Dispatch>,

Attaches the provided Subscriber to this type, returning a WithDispatch wrapper. Read more

Source §

fn with_current_subscriber(self) -> WithDispatch<Self>

Attaches the current default Subscriber to this type, returning a WithDispatch wrapper. Read more

Source §

impl<T> ErasedDestructor for T
where T: 'static,

Source §

Struct Configuration

Fields§

Implementations§

impl Configuration

pub fn new() -> Self

pub fn get_blacklist(&self) -> Box<Vec<CompactString>>

pub fn get_whitelist(&self) -> Box<Vec<CompactString>>

pub fn with_respect_robots_txt(&mut self, respect_robots_txt: bool) -> &mut Self

pub fn with_subdomains(&mut self, subdomains: bool) -> &mut Self

pub fn with_tld(&mut self, tld: bool) -> &mut Self

pub fn with_delay(&mut self, delay: u64) -> &mut Self

pub fn with_http2_prior_knowledge( &mut self, http2_prior_knowledge: bool, ) -> &mut Self

pub fn with_request_timeout( &mut self, request_timeout: Option<Duration>, ) -> &mut Self

pub fn with_sitemap(&mut self, _sitemap_url: Option<&str>) -> &mut Self

pub fn with_ignore_sitemap(&mut self, _ignore_sitemap: bool) -> &mut Self

pub fn with_user_agent(&mut self, user_agent: Option<&str>) -> &mut Self

pub fn with_preserve_host_header(&mut self, preserve: bool) -> &mut Self

pub fn with_openai(&mut self, _openai_config: Option<GPTConfigs>) -> &mut Self

pub fn with_cookies(&mut self, cookie_str: &str) -> &mut Self

pub fn with_fingerprint(&mut self, _fingerprint: bool) -> &mut Self

pub fn with_proxies(&mut self, proxies: Option<Vec<String>>) -> &mut Self

pub fn with_shared_queue(&mut self, shared_queue: bool) -> &mut Self

pub fn with_blacklist_url<T>( &mut self, blacklist_url: Option<Vec<T>>, ) -> &mut Self where Vec<CompactString>: From<Vec<T>>,

pub fn with_whitelist_url<T>( &mut self, whitelist_url: Option<Vec<T>>, ) -> &mut Self where Vec<CompactString>: From<Vec<T>>,

pub fn with_return_page_links(&mut self, return_page_links: bool) -> &mut Self

pub fn with_headers(&mut self, headers: Option<HeaderMap>) -> &mut Self

pub fn with_redirect_limit(&mut self, redirect_limit: usize) -> &mut Self

pub fn with_redirect_policy(&mut self, policy: RedirectPolicy) -> &mut Self

pub fn with_full_resources(&mut self, full_resources: bool) -> &mut Self

pub fn with_cron(&mut self, _cron_str: &str, _cron_type: CronType) -> &mut Self

pub fn with_limit(&mut self, limit: u32) -> &mut Self

pub fn with_concurrency_limit(&mut self, limit: Option<usize>) -> &mut Self

pub fn with_evaluate_on_new_document( &mut self, _evaluate_on_new_document: Option<Box<String>>, ) -> &mut Self

pub fn with_auth_challenge_response( &mut self, _auth_challenge_response: Option<AuthChallengeResponse>, ) -> &mut Self

pub fn with_depth(&mut self, depth: usize) -> &mut Self

pub fn with_caching(&mut self, _cache: bool) -> &mut Self

pub fn with_viewport(&mut self, _viewport: Option<Viewport>) -> &mut Self

pub fn with_retry(&mut self, retry: u8) -> &mut Self

pub fn with_stealth(&mut self, _stealth_mode: bool) -> &mut Self

pub fn with_wait_for_idle_network( &mut self, _wait_for_idle_network: Option<WaitForIdleNetwork>, ) -> &mut Self

pub fn with_wait_for_idle_dom( &mut self, _wait_for_idle_dom: Option<WaitForSelector>, ) -> &mut Self

pub fn with_wait_for_selector( &mut self, _wait_for_selector: Option<WaitForSelector>, ) -> &mut Self

pub fn with_wait_for_delay( &mut self, _wait_for_delay: Option<WaitForDelay>, ) -> &mut Self

pub fn with_chrome_intercept( &mut self, _chrome_intercept: RequestInterceptConfiguration, ) -> &mut Self

pub fn with_chrome_connection( &mut self, _chrome_connection_url: Option<String>, ) -> &mut Self

pub fn with_execution_scripts( &mut self, _execution_scripts: Option<ExecutionScriptsMap>, ) -> &mut Self

pub fn with_automation_scripts( &mut self, _automation_scripts: Option<AutomationScriptsMap>, ) -> &mut Self

pub fn with_budget(&mut self, budget: Option<HashMap<&str, u32>>) -> &mut Self

pub fn with_external_domains<'a, 'b>( &mut self, external_domains: Option<impl Iterator<Item = String> + 'a>, ) -> &mut Self

pub fn with_danger_accept_invalid_certs( &mut self, accept_invalid_certs: bool, ) -> &mut Self

pub fn with_timezone_id(&mut self, _timezone_id: Option<String>) -> &mut Self

pub fn with_locale(&mut self, _locale: Option<String>) -> &mut Self

pub fn with_screenshot( &mut self, _screenshot_config: Option<ScreenShotConfig>, ) -> &mut Self

pub fn with_block_assets(&mut self, only_html: bool) -> &mut Self

pub fn build(&self) -> Self

Trait Implementations§

impl Clone for Configuration

fn clone(&self) -> Configuration

fn clone_from(&mut self, source: &Self)

impl Debug for Configuration

fn fmt(&self, f: &mut Formatter<'_>) -> Result

impl Default for Configuration

fn default() -> Configuration

impl PartialEq for Configuration

fn eq(&self, other: &Configuration) -> bool

fn ne(&self, other: &Rhs) -> bool

impl StructuralPartialEq for Configuration

Auto Trait Implementations§

impl Freeze for Configuration

impl RefUnwindSafe for Configuration

impl Send for Configuration

impl Sync for Configuration

impl Unpin for Configuration

impl UnwindSafe for Configuration

Blanket Implementations§

impl<T> Any for T where T: 'static + ?Sized,

fn type_id(&self) -> TypeId

impl<T> Borrow<T> for T where T: ?Sized,

fn borrow(&self) -> &T

impl<T> BorrowMut<T> for T where T: ?Sized,

Struct Configuration

pub fn with_blacklist_url<T>( &mut self, blacklist_url: Option<Vec<T>>, ) -> &mut Self
where Vec<CompactString>: From<Vec<T>>,

pub fn with_whitelist_url<T>( &mut self, whitelist_url: Option<Vec<T>>, ) -> &mut Self
where Vec<CompactString>: From<Vec<T>>,

impl<T> Any for T
where T: 'static + ?Sized,

impl<T> Borrow<T> for T
where T: ?Sized,

impl<T> BorrowMut<T> for T
where T: ?Sized,

impl<T> CloneToUninit for T
where T: Clone,

impl<T, U> Into<U> for T
where U: From<T>,

impl<T> ToOwned for T
where T: Clone,

impl<T, U> TryFrom<U> for T
where U: Into<T>,

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self>
where S: Into<Dispatch>,

impl<T> ErasedDestructor for T
where T: 'static,