Struct spider::configuration::Configuration
source · pub struct Configuration {Show 18 fields
pub respect_robots_txt: bool,
pub subdomains: bool,
pub tld: bool,
pub blacklist_url: Option<Box<Vec<CompactString>>>,
pub user_agent: Option<Box<CompactString>>,
pub delay: u64,
pub request_timeout: Option<Box<Duration>>,
pub http2_prior_knowledge: bool,
pub proxies: Option<Box<Vec<String>>>,
pub headers: Option<Box<HeaderMap>>,
pub redirect_limit: Box<usize>,
pub redirect_policy: RedirectPolicy,
pub cookie_str: Box<String>,
pub external_domains_caseless: Box<HashSet<CaseInsensitiveString>>,
pub full_resources: bool,
pub accept_invalid_certs: bool,
pub auth_challenge_response: Option<AuthChallengeResponse>,
pub openai_config: Option<GPTConfigs>,
}
Expand description
Structure to configure Website
crawler
use spider::website::Website;
let mut website: Website = Website::new("https://choosealicense.com");
website.configuration.blacklist_url.insert(Default::default()).push("https://choosealicense.com/licenses/".to_string().into());
website.configuration.respect_robots_txt = true;
website.configuration.subdomains = true;
website.configuration.tld = true;
Fields§
§respect_robots_txt: bool
Respect the robots.txt file and do not scrape disallowed paths. This may slow down crawls if the robots.txt file includes a crawl delay.
subdomains: bool
Allow sub-domains.
tld: bool
Allow all tlds for domain.
blacklist_url: Option<Box<Vec<CompactString>>>
List of pages to not crawl. [optional: regex pattern matching]
user_agent: Option<Box<CompactString>>
User-Agent for request.
delay: u64
Polite crawling delay in milliseconds.
request_timeout: Option<Box<Duration>>
Request max timeout per page. By default the request times out in 15s. Set to None to disable.
http2_prior_knowledge: bool
Use HTTP2 for connection. Enable if you know the website has http2 support.
proxies: Option<Box<Vec<String>>>
Use proxy list for performing network request.
headers: Option<Box<HeaderMap>>
Headers to include with request.
redirect_limit: Box<usize>
The max redirections allowed for request.
redirect_policy: RedirectPolicy
The redirect policy type to use.
cookie_str: Box<String>
Cookie string to use for network requests ex: “foo=bar; Domain=blog.spider”
external_domains_caseless: Box<HashSet<CaseInsensitiveString>>
External domains to include case-insensitive.
full_resources: bool
Collect all the resources found on the page.
accept_invalid_certs: bool
Dangerously accept invalid certificates.
auth_challenge_response: Option<AuthChallengeResponse>
The auth challenge response. The ‘chrome_intercept’ flag is also required in order to intercept the response.
openai_config: Option<GPTConfigs>
The OpenAI configs to use to help drive the chrome browser. This does nothing without the ‘openai’ flag.
Implementations§
source§impl Configuration
impl Configuration
sourcepub fn get_blacklist(&self) -> Box<Vec<CompactString>>
pub fn get_blacklist(&self) -> Box<Vec<CompactString>>
Handle the blacklist options.
sourcepub fn with_respect_robots_txt(&mut self, respect_robots_txt: bool) -> &mut Self
pub fn with_respect_robots_txt(&mut self, respect_robots_txt: bool) -> &mut Self
Respect robots.txt file.
sourcepub fn with_subdomains(&mut self, subdomains: bool) -> &mut Self
pub fn with_subdomains(&mut self, subdomains: bool) -> &mut Self
Include subdomains detection.
sourcepub fn with_delay(&mut self, delay: u64) -> &mut Self
pub fn with_delay(&mut self, delay: u64) -> &mut Self
Delay between requests in milliseconds.
sourcepub fn with_http2_prior_knowledge(
&mut self,
http2_prior_knowledge: bool
) -> &mut Self
pub fn with_http2_prior_knowledge( &mut self, http2_prior_knowledge: bool ) -> &mut Self
Only use HTTP/2.
sourcepub fn with_request_timeout(
&mut self,
request_timeout: Option<Duration>
) -> &mut Self
pub fn with_request_timeout( &mut self, request_timeout: Option<Duration> ) -> &mut Self
Max time to wait for request. By default request times out in 15s. Set to None to disable.
sourcepub fn with_sitemap(&mut self, _sitemap_url: Option<&str>) -> &mut Self
pub fn with_sitemap(&mut self, _sitemap_url: Option<&str>) -> &mut Self
Set the sitemap url. This does nothing without the sitemap
feature flag.
sourcepub fn with_ignore_sitemap(&mut self, _ignore_sitemap: bool) -> &mut Self
pub fn with_ignore_sitemap(&mut self, _ignore_sitemap: bool) -> &mut Self
Ignore the sitemap when crawling. This method does nothing if the sitemap
is not enabled.
sourcepub fn with_user_agent(&mut self, user_agent: Option<&str>) -> &mut Self
pub fn with_user_agent(&mut self, user_agent: Option<&str>) -> &mut Self
Add user agent to request.
sourcepub fn with_openai(&mut self, _openai_config: Option<GPTConfigs>) -> &mut Self
pub fn with_openai(&mut self, _openai_config: Option<GPTConfigs>) -> &mut Self
The OpenAI configs to use to drive the browser. This method does nothing if the openai
is not enabled.
sourcepub fn with_cookies(&mut self, cookie_str: &str) -> &mut Self
pub fn with_cookies(&mut self, cookie_str: &str) -> &mut Self
Cookie string to use in request. This does nothing without the cookies
flag enabled.
sourcepub fn with_fingerprint(&mut self, _fingerprint: bool) -> &mut Self
pub fn with_fingerprint(&mut self, _fingerprint: bool) -> &mut Self
Set custom fingerprint ID for request. This does nothing without the chrome
flag enabled.
sourcepub fn with_proxies(&mut self, proxies: Option<Vec<String>>) -> &mut Self
pub fn with_proxies(&mut self, proxies: Option<Vec<String>>) -> &mut Self
Use proxies for request.
sourcepub fn with_blacklist_url<T>(
&mut self,
blacklist_url: Option<Vec<T>>
) -> &mut Self
pub fn with_blacklist_url<T>( &mut self, blacklist_url: Option<Vec<T>> ) -> &mut Self
Add blacklist urls to ignore.
sourcepub fn with_headers(&mut self, headers: Option<HeaderMap>) -> &mut Self
pub fn with_headers(&mut self, headers: Option<HeaderMap>) -> &mut Self
Set HTTP headers for request using reqwest::header::HeaderMap.
sourcepub fn with_redirect_limit(&mut self, redirect_limit: usize) -> &mut Self
pub fn with_redirect_limit(&mut self, redirect_limit: usize) -> &mut Self
Set the max redirects allowed for request.
sourcepub fn with_redirect_policy(&mut self, policy: RedirectPolicy) -> &mut Self
pub fn with_redirect_policy(&mut self, policy: RedirectPolicy) -> &mut Self
Set the redirect policy to use.
sourcepub fn with_full_resources(&mut self, full_resources: bool) -> &mut Self
pub fn with_full_resources(&mut self, full_resources: bool) -> &mut Self
Determine whether to collect all the resources found on pages.
sourcepub fn with_cron(&mut self, _cron_str: &str, _cron_type: CronType) -> &mut Self
pub fn with_cron(&mut self, _cron_str: &str, _cron_type: CronType) -> &mut Self
Setup cron jobs to run. This does nothing without the cron
flag enabled.
sourcepub fn with_limit(&mut self, _limit: u32) -> &mut Self
pub fn with_limit(&mut self, _limit: u32) -> &mut Self
Set a crawl page limit. If the value is 0 there is no limit. This does nothing without the feat flag budget
enabled.
sourcepub fn with_evaluate_on_new_document(
&mut self,
_evaluate_on_new_document: Option<Box<String>>
) -> &mut Self
pub fn with_evaluate_on_new_document( &mut self, _evaluate_on_new_document: Option<Box<String>> ) -> &mut Self
Set a custom script to evaluate on new document creation. This does nothing without the feat flag chrome
enabled.
sourcepub fn with_auth_challenge_response(
&mut self,
_auth_challenge_response: Option<AuthChallengeResponse>
) -> &mut Self
pub fn with_auth_challenge_response( &mut self, _auth_challenge_response: Option<AuthChallengeResponse> ) -> &mut Self
Set the authentication challenge response. This does nothing without the feat flag chrome
enabled.
sourcepub fn with_depth(&mut self, _depth: usize) -> &mut Self
pub fn with_depth(&mut self, _depth: usize) -> &mut Self
Set a crawl depth limit. If the value is 0 there is no limit. This does nothing without the feat flag budget
enabled.
sourcepub fn with_caching(&mut self, _cache: bool) -> &mut Self
pub fn with_caching(&mut self, _cache: bool) -> &mut Self
Cache the page following HTTP rules. This method does nothing if the cache
feature is not enabled.
sourcepub fn with_viewport(&mut self, _viewport: Option<Viewport>) -> &mut Self
pub fn with_viewport(&mut self, _viewport: Option<Viewport>) -> &mut Self
Configures the view port for chrome. This method does nothing if the chrome
feature is not enabled.
sourcepub fn with_stealth(&mut self, _stealth_mode: bool) -> &mut Self
pub fn with_stealth(&mut self, _stealth_mode: bool) -> &mut Self
Use stealth mode for the request. This does nothing without the chrome
flag enabled.
sourcepub fn with_wait_for_idle_network(
&mut self,
_wait_for_idle_network: Option<WaitForIdleNetwork>
) -> &mut Self
pub fn with_wait_for_idle_network( &mut self, _wait_for_idle_network: Option<WaitForIdleNetwork> ) -> &mut Self
Wait for idle network request. This method does nothing if the chrome
feature is not enabled.
sourcepub fn with_wait_for_selector(
&mut self,
_wait_for_selector: Option<WaitForSelector>
) -> &mut Self
pub fn with_wait_for_selector( &mut self, _wait_for_selector: Option<WaitForSelector> ) -> &mut Self
Wait for a selector. This method does nothing if the chrome
feature is not enabled.
sourcepub fn with_wait_for_delay(
&mut self,
_wait_for_delay: Option<WaitForDelay>
) -> &mut Self
pub fn with_wait_for_delay( &mut self, _wait_for_delay: Option<WaitForDelay> ) -> &mut Self
Wait for with delay. Should only be used for testing. This method does nothing if the ‘chrome’ feature is not enabled.
sourcepub fn with_chrome_intercept(
&mut self,
_chrome_intercept: bool,
_block_images: bool
) -> &mut Self
pub fn with_chrome_intercept( &mut self, _chrome_intercept: bool, _block_images: bool ) -> &mut Self
Use request intercept for the request to only allow content required for the page that matches the host. If the content is from a 3rd party it needs to be part of our include list. This method does nothing if the chrome_intercept
is not enabled.
sourcepub fn with_budget(&mut self, _budget: Option<HashMap<&str, u32>>) -> &mut Self
pub fn with_budget(&mut self, _budget: Option<HashMap<&str, u32>>) -> &mut Self
Set a crawl budget per path with levels support /a/b/c or for all paths with “*”. This does nothing without the budget
flag enabled.
sourcepub fn with_external_domains<'a, 'b>(
&mut self,
external_domains: Option<impl Iterator<Item = String> + 'a>
) -> &mut Self
pub fn with_external_domains<'a, 'b>( &mut self, external_domains: Option<impl Iterator<Item = String> + 'a> ) -> &mut Self
Group external domains to treat the crawl as one. If None is passed this will clear all prior domains.
sourcepub fn with_danger_accept_invalid_certs(
&mut self,
accept_invalid_certs: bool
) -> &mut Self
pub fn with_danger_accept_invalid_certs( &mut self, accept_invalid_certs: bool ) -> &mut Self
Dangerously accept invalid certificates - this should be used as a last resort.
sourcepub fn with_timezone_id(&mut self, _timezone_id: Option<String>) -> &mut Self
pub fn with_timezone_id(&mut self, _timezone_id: Option<String>) -> &mut Self
Overrides default host system timezone with the specified one. This does nothing without the chrome
flag enabled.
sourcepub fn with_locale(&mut self, _locale: Option<String>) -> &mut Self
pub fn with_locale(&mut self, _locale: Option<String>) -> &mut Self
Overrides default host system locale with the specified one. This does nothing without the chrome
flag enabled.
sourcepub fn with_screenshot(
&mut self,
_screenshot_config: Option<ScreenShotConfig>
) -> &mut Self
pub fn with_screenshot( &mut self, _screenshot_config: Option<ScreenShotConfig> ) -> &mut Self
Set the chrome screenshot configuration. This does nothing without the chrome
flag enabled.
Trait Implementations§
source§impl Clone for Configuration
impl Clone for Configuration
source§fn clone(&self) -> Configuration
fn clone(&self) -> Configuration
1.0.0 · source§fn clone_from(&mut self, source: &Self)
fn clone_from(&mut self, source: &Self)
source
. Read more