Struct spider::configuration::Configuration

source ·

pub struct Configuration {Show 18 fields
    pub respect_robots_txt: bool,
    pub subdomains: bool,
    pub tld: bool,
    pub blacklist_url: Option<Box<Vec<CompactString>>>,
    pub user_agent: Option<Box<CompactString>>,
    pub delay: u64,
    pub request_timeout: Option<Box<Duration>>,
    pub http2_prior_knowledge: bool,
    pub proxies: Option<Box<Vec<String>>>,
    pub headers: Option<Box<HeaderMap>>,
    pub redirect_limit: Box<usize>,
    pub redirect_policy: RedirectPolicy,
    pub cookie_str: Box<String>,
    pub external_domains_caseless: Box<HashSet<CaseInsensitiveString>>,
    pub full_resources: bool,
    pub accept_invalid_certs: bool,
    pub auth_challenge_response: Option<AuthChallengeResponse>,
    pub openai_config: Option<GPTConfigs>,
}

Expand description

Structure to configure Website crawler

use spider::website::Website;
let mut website: Website = Website::new("https://choosealicense.com");
website.configuration.blacklist_url.insert(Default::default()).push("https://choosealicense.com/licenses/".to_string().into());
website.configuration.respect_robots_txt = true;
website.configuration.subdomains = true;
website.configuration.tld = true;

Fields§

§respect_robots_txt: bool

Respect the robots.txt file and do not scrape disallowed files. This may slow down crawls if the robots.txt file includes a delay.

§subdomains: bool

Allow sub-domains.

§tld: bool

Allow all tlds for domain.

§blacklist_url: Option<Box<Vec<CompactString>>>

List of pages to not crawl. [optional: regex pattern matching]

§user_agent: Option<Box<CompactString>>

User-Agent for request.

§delay: u64

Polite crawling delay in milliseconds.

§request_timeout: Option<Box<Duration>>

Request max timeout per page. By default the request times out in 15s. Set to None to disable.

§http2_prior_knowledge: bool

Use HTTP2 for connection. Enable if you know the website has http2 support.

§proxies: Option<Box<Vec<String>>>

Use a proxy list for performing network requests.

§headers: Option<Box<HeaderMap>>

Headers to include with request.

§redirect_limit: Box<usize>

The max redirections allowed for request.

§redirect_policy: RedirectPolicy

The redirect policy type to use.

§cookie_str: Box<String>

Cookie string to use for network requests, e.g. “foo=bar; Domain=blog.spider”.

§external_domains_caseless: Box<HashSet<CaseInsensitiveString>>

External domains to include case-insensitive.

§full_resources: bool

Collect all the resources found on the page.

§accept_invalid_certs: bool

Dangerously accept invalid certificates.

§auth_challenge_response: Option<AuthChallengeResponse>

The auth challenge response. The ‘chrome_intercept’ flag is also required in order to intercept the response.

§openai_config: Option<GPTConfigs>

The OpenAI configs to use to help drive the chrome browser. This does nothing without the ‘openai’ flag.

Implementations§

source §

impl Configuration

source

pub fn new() -> Self

Represents crawl configuration for a website.

source

pub fn get_blacklist(&self) -> Box<Vec<CompactString>>

Handle the blacklist options.

source

pub fn with_respect_robots_txt(&mut self, respect_robots_txt: bool) -> &mut Self

Respect robots.txt file.

source

pub fn with_subdomains(&mut self, subdomains: bool) -> &mut Self

Include subdomains detection.

source

pub fn with_tld(&mut self, tld: bool) -> &mut Self

Include tld detection.

source

pub fn with_delay(&mut self, delay: u64) -> &mut Self

Delay between requests in ms.

source

pub fn with_http2_prior_knowledge( &mut self, http2_prior_knowledge: bool ) -> &mut Self

Only use HTTP/2.

source

pub fn with_request_timeout( &mut self, request_timeout: Option<Duration> ) -> &mut Self

Max time to wait for request. By default request times out in 15s. Set to None to disable.

source

pub fn with_sitemap(&mut self, _sitemap_url: Option<&str>) -> &mut Self

Set the sitemap url. This does nothing without the sitemap feature flag.

source

pub fn with_ignore_sitemap(&mut self, _ignore_sitemap: bool) -> &mut Self

Ignore the sitemap when crawling. This method does nothing if the sitemap feature is not enabled.

source

pub fn with_user_agent(&mut self, user_agent: Option<&str>) -> &mut Self

Add user agent to request.

source

pub fn with_openai(&mut self, _openai_config: Option<GPTConfigs>) -> &mut Self

The OpenAI configs to use to drive the browser. This method does nothing if the openai feature is not enabled.

source

pub fn with_cookies(&mut self, cookie_str: &str) -> &mut Self

Cookie string to use in request. This does nothing without the cookies flag enabled.

source

pub fn with_fingerprint(&mut self, _fingerprint: bool) -> &mut Self

Set custom fingerprint ID for request. This does nothing without the chrome flag enabled.

source

pub fn with_proxies(&mut self, proxies: Option<Vec<String>>) -> &mut Self

Use proxies for request.

source

pub fn with_blacklist_url<T>( &mut self, blacklist_url: Option<Vec<T>> ) -> &mut Self
where Vec<CompactString>: From<Vec<T>>,

Add blacklist urls to ignore.

source

pub fn with_headers(&mut self, headers: Option<HeaderMap>) -> &mut Self

Set HTTP headers for request using reqwest::header::HeaderMap.

source

pub fn with_redirect_limit(&mut self, redirect_limit: usize) -> &mut Self

Set the max redirects allowed for request.

source

pub fn with_redirect_policy(&mut self, policy: RedirectPolicy) -> &mut Self

Set the redirect policy to use.

source

pub fn with_full_resources(&mut self, full_resources: bool) -> &mut Self

Determine whether to collect all the resources found on pages.

source

pub fn with_cron(&mut self, _cron_str: &str, _cron_type: CronType) -> &mut Self

Setup cron jobs to run. This does nothing without the cron flag enabled.

source

pub fn with_limit(&mut self, _limit: u32) -> &mut Self

Set a crawl page limit. If the value is 0 there is no limit. This does nothing without the feat flag budget enabled.

source

pub fn with_evaluate_on_new_document( &mut self, _evaluate_on_new_document: Option<Box<String>> ) -> &mut Self

Set a custom script to evaluate on new document creation. This does nothing without the feat flag chrome enabled.

source

pub fn with_auth_challenge_response( &mut self, _auth_challenge_response: Option<AuthChallengeResponse> ) -> &mut Self

Set the authentication challenge response. This does nothing without the feat flag chrome enabled.

source

pub fn with_depth(&mut self, _depth: usize) -> &mut Self

Set a crawl depth limit. If the value is 0 there is no limit. This does nothing without the feat flag budget enabled.

source

pub fn with_caching(&mut self, _cache: bool) -> &mut Self

Cache the page following HTTP rules. This method does nothing if the cache feature is not enabled.

source

pub fn with_viewport(&mut self, _viewport: Option<Viewport>) -> &mut Self

Configures the view port for chrome. This method does nothing if the chrome feature is not enabled.

source

pub fn with_stealth(&mut self, _stealth_mode: bool) -> &mut Self

Use stealth mode for the request. This does nothing without the chrome flag enabled.

source

pub fn with_wait_for_idle_network( &mut self, _wait_for_idle_network: Option<WaitForIdleNetwork> ) -> &mut Self

Wait for idle network request. This method does nothing if the chrome feature is not enabled.

source

pub fn with_wait_for_selector( &mut self, _wait_for_selector: Option<WaitForSelector> ) -> &mut Self

Wait for a selector. This method does nothing if the chrome feature is not enabled.

source

pub fn with_wait_for_delay( &mut self, _wait_for_delay: Option<WaitForDelay> ) -> &mut Self

Wait for a delay. Should only be used for testing. This method does nothing if the ‘chrome’ feature is not enabled.

source

pub fn with_chrome_intercept( &mut self, _chrome_intercept: bool, _block_images: bool ) -> &mut Self

Use request intercept for the request to only allow content required for the page that matches the host. If the content is from a 3rd party it needs to be part of our include list. This method does nothing if the chrome_intercept feature is not enabled.

source