Struct spider::configuration::Configuration

source ·
pub struct Configuration {
Show 18 fields pub respect_robots_txt: bool, pub subdomains: bool, pub tld: bool, pub blacklist_url: Option<Box<Vec<CompactString>>>, pub user_agent: Option<Box<CompactString>>, pub delay: u64, pub request_timeout: Option<Box<Duration>>, pub http2_prior_knowledge: bool, pub proxies: Option<Box<Vec<String>>>, pub headers: Option<Box<HeaderMap>>, pub redirect_limit: Box<usize>, pub redirect_policy: RedirectPolicy, pub cookie_str: Box<String>, pub external_domains_caseless: Box<HashSet<CaseInsensitiveString>>, pub full_resources: bool, pub accept_invalid_certs: bool, pub auth_challenge_response: Option<AuthChallengeResponse>, pub openai_config: Option<GPTConfigs>,
}
Expand description

Structure to configure Website crawler

use spider::website::Website;
let mut website: Website = Website::new("https://choosealicense.com");
website.configuration.blacklist_url.insert(Default::default()).push("https://choosealicense.com/licenses/".to_string().into());
website.configuration.respect_robots_txt = true;
website.configuration.subdomains = true;
website.configuration.tld = true;

Fields§

§respect_robots_txt: bool

Respect the robots.txt file and do not scrape disallowed files. This may slow down crawls if the robots.txt file includes a delay.

§subdomains: bool

Allow sub-domains.

§tld: bool

Allow all tlds for domain.

§blacklist_url: Option<Box<Vec<CompactString>>>

List of pages to not crawl. [optional: regex pattern matching]

§user_agent: Option<Box<CompactString>>

User-Agent for request.

§delay: u64

Polite crawling delay in milliseconds.

§request_timeout: Option<Box<Duration>>

Request max timeout per page. By default the request times out in 15s. Set to None to disable.

§http2_prior_knowledge: bool

Use HTTP2 for connection. Enable if you know the website has http2 support.

§proxies: Option<Box<Vec<String>>>

Use proxy list for performing network request.

§headers: Option<Box<HeaderMap>>

Headers to include with request.

§redirect_limit: Box<usize>

The max redirections allowed for request.

§redirect_policy: RedirectPolicy

The redirect policy type to use.

§cookie_str: Box<String>

Cookie string to use for network requests, e.g. “foo=bar; Domain=blog.spider”.

§external_domains_caseless: Box<HashSet<CaseInsensitiveString>>

External domains to include case-insensitive.

§full_resources: bool

Collect all the resources found on the page.

§accept_invalid_certs: bool

Dangerously accept invalid certificates.

§auth_challenge_response: Option<AuthChallengeResponse>

The auth challenge response. The ‘chrome_intercept’ flag is also required in order to intercept the response.

§openai_config: Option<GPTConfigs>

The OpenAI configs to use to help drive the chrome browser. This does nothing without the ‘openai’ flag.

Implementations§

source§

impl Configuration

source

pub fn new() -> Self

Represents crawl configuration for a website.

source

pub fn get_blacklist(&self) -> Box<Vec<CompactString>>

Handle the blacklist options.

source

pub fn with_respect_robots_txt(&mut self, respect_robots_txt: bool) -> &mut Self

Respect robots.txt file.

source

pub fn with_subdomains(&mut self, subdomains: bool) -> &mut Self

Include subdomains detection.

source

pub fn with_tld(&mut self, tld: bool) -> &mut Self

Include tld detection.

source

pub fn with_delay(&mut self, delay: u64) -> &mut Self

Delay between requests in ms.

source

pub fn with_http2_prior_knowledge( &mut self, http2_prior_knowledge: bool ) -> &mut Self

Only use HTTP/2.

source

pub fn with_request_timeout( &mut self, request_timeout: Option<Duration> ) -> &mut Self

Max time to wait for request. By default request times out in 15s. Set to None to disable.

source

pub fn with_sitemap(&mut self, _sitemap_url: Option<&str>) -> &mut Self

Set the sitemap url. This does nothing without the sitemap feature flag.

source

pub fn with_ignore_sitemap(&mut self, _ignore_sitemap: bool) -> &mut Self

Ignore the sitemap when crawling. This method does nothing if the sitemap feature is not enabled.

source

pub fn with_user_agent(&mut self, user_agent: Option<&str>) -> &mut Self

Add user agent to request.

source

pub fn with_openai(&mut self, _openai_config: Option<GPTConfigs>) -> &mut Self

The OpenAI configs to use to drive the browser. This method does nothing if the openai feature is not enabled.

source

pub fn with_cookies(&mut self, cookie_str: &str) -> &mut Self

Cookie string to use in request. This does nothing without the cookies flag enabled.

source

pub fn with_fingerprint(&mut self, _fingerprint: bool) -> &mut Self

Set custom fingerprint ID for request. This does nothing without the chrome flag enabled.

source

pub fn with_proxies(&mut self, proxies: Option<Vec<String>>) -> &mut Self

Use proxies for request.

source

pub fn with_blacklist_url<T>( &mut self, blacklist_url: Option<Vec<T>> ) -> &mut Self
where Vec<CompactString>: From<Vec<T>>,

Add blacklist urls to ignore.

source

pub fn with_headers(&mut self, headers: Option<HeaderMap>) -> &mut Self

Set HTTP headers for request using reqwest::header::HeaderMap.

source

pub fn with_redirect_limit(&mut self, redirect_limit: usize) -> &mut Self

Set the max redirects allowed for request.

source

pub fn with_redirect_policy(&mut self, policy: RedirectPolicy) -> &mut Self

Set the redirect policy to use.

source

pub fn with_full_resources(&mut self, full_resources: bool) -> &mut Self

Determine whether to collect all the resources found on pages.

source

pub fn with_cron(&mut self, _cron_str: &str, _cron_type: CronType) -> &mut Self

Setup cron jobs to run. This does nothing without the cron flag enabled.

source

pub fn with_limit(&mut self, _limit: u32) -> &mut Self

Set a crawl page limit. If the value is 0 there is no limit. This does nothing without the feat flag budget enabled.

source

pub fn with_evaluate_on_new_document( &mut self, _evaluate_on_new_document: Option<Box<String>> ) -> &mut Self

Set a custom script to evaluate on new document creation. This does nothing without the feat flag chrome enabled.

source

pub fn with_auth_challenge_response( &mut self, _auth_challenge_response: Option<AuthChallengeResponse> ) -> &mut Self

Set the authentication challenge response. This does nothing without the feat flag chrome enabled.

source

pub fn with_depth(&mut self, _depth: usize) -> &mut Self

Set a crawl depth limit. If the value is 0 there is no limit. This does nothing without the feat flag budget enabled.

source

pub fn with_caching(&mut self, _cache: bool) -> &mut Self

Cache the page following HTTP rules. This method does nothing if the cache feature is not enabled.

source

pub fn with_viewport(&mut self, _viewport: Option<Viewport>) -> &mut Self

Configures the view port for chrome. This method does nothing if the chrome feature is not enabled.

source

pub fn with_stealth(&mut self, _stealth_mode: bool) -> &mut Self

Use stealth mode for the request. This does nothing without the chrome flag enabled.

source

pub fn with_wait_for_idle_network( &mut self, _wait_for_idle_network: Option<WaitForIdleNetwork> ) -> &mut Self

Wait for idle network request. This method does nothing if the chrome feature is not enabled.

source

pub fn with_wait_for_selector( &mut self, _wait_for_selector: Option<WaitForSelector> ) -> &mut Self

Wait for a selector. This method does nothing if the chrome feature is not enabled.

source

pub fn with_wait_for_delay( &mut self, _wait_for_delay: Option<WaitForDelay> ) -> &mut Self

Wait for a delay. Should only be used for testing. This method does nothing if the ‘chrome’ feature is not enabled.

source

pub fn with_chrome_intercept( &mut self, _chrome_intercept: bool, _block_images: bool ) -> &mut Self

Use request intercept for the request to only allow content required for the page that matches the host. If the content is from a 3rd party it needs to be part of our include list. This method does nothing if the chrome_intercept feature is not enabled.

source

pub fn with_budget(&mut self, _budget: Option<HashMap<&str, u32>>) -> &mut Self

Set a crawl budget per path with levels support /a/b/c or for all paths with “*”. This does nothing without the budget flag enabled.

source

pub fn with_external_domains<'a, 'b>( &mut self, external_domains: Option<impl Iterator<Item = String> + 'a> ) -> &mut Self

Group external domains to treat the crawl as one. If None is passed this will clear all prior domains.

source

pub fn with_danger_accept_invalid_certs( &mut self, accept_invalid_certs: bool ) -> &mut Self

Dangerously accept invalid certificates - this should be used as a last resort.

source

pub fn with_timezone_id(&mut self, _timezone_id: Option<String>) -> &mut Self

Overrides default host system timezone with the specified one. This does nothing without the chrome flag enabled.

source

pub fn with_locale(&mut self, _locale: Option<String>) -> &mut Self

Overrides default host system locale with the specified one. This does nothing without the chrome flag enabled.

source

pub fn with_screenshot( &mut self, _screenshot_config: Option<ScreenShotConfig> ) -> &mut Self

Set the chrome screenshot configuration. This does nothing without the chrome flag enabled.

source

pub fn build(&self) -> Self

Build the website configuration when using with_builder.

Trait Implementations§

source§

impl Clone for Configuration

source§

fn clone(&self) -> Configuration

Returns a copy of the value. Read more
1.0.0 · source§

fn clone_from(&mut self, source: &Self)

Performs copy-assignment from source. Read more
source§

impl Debug for Configuration

source§

fn fmt(&self, f: &mut Formatter<'_>) -> Result

Formats the value using the given formatter. Read more
source§

impl Default for Configuration

source§

fn default() -> Configuration

Returns the “default value” for a type. Read more

Auto Trait Implementations§

Blanket Implementations§

source§

impl<T> Any for T
where T: 'static + ?Sized,

source§

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more
source§

impl<T> Borrow<T> for T
where T: ?Sized,

source§

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more
source§

impl<T> BorrowMut<T> for T
where T: ?Sized,

source§

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more
source§

impl<T> From<T> for T

source§

fn from(t: T) -> T

Returns the argument unchanged.

source§

impl<T> Instrument for T

source§

fn instrument(self, span: Span) -> Instrumented<Self>

Instruments this type with the provided Span, returning an Instrumented wrapper. Read more
source§

fn in_current_span(self) -> Instrumented<Self>

Instruments this type with the current Span, returning an Instrumented wrapper. Read more
source§

impl<T, U> Into<U> for T
where U: From<T>,

source§

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

source§

impl<T> ToOwned for T
where T: Clone,

§

type Owned = T

The resulting type after obtaining ownership.
source§

fn to_owned(&self) -> T

Creates owned data from borrowed data, usually by cloning. Read more
source§

fn clone_into(&self, target: &mut T)

Uses borrowed data to replace owned data, usually by cloning. Read more
source§

impl<T, U> TryFrom<U> for T
where U: Into<T>,

§

type Error = Infallible

The type returned in the event of a conversion error.
source§

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.
source§

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

§

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.
source§

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.
source§

impl<T> WithSubscriber for T

source§

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self>
where S: Into<Dispatch>,

Attaches the provided Subscriber to this type, returning a WithDispatch wrapper. Read more
source§

fn with_current_subscriber(self) -> WithDispatch<Self>

Attaches the current default Subscriber to this type, returning a WithDispatch wrapper. Read more