pub struct Website {
pub configuration: Box<Configuration>,
pub on_link_find_callback: Option<fn(_: CaseInsensitiveString, _: Option<String>) -> (CaseInsensitiveString, Option<String>)>,
/* private fields */
}
Represents a website to crawl and gather all links or page content.
use spider::tokio;
use spider::website::Website;

#[tokio::main]
async fn main() {
    let mut website = Website::new("http://example.com");
    website.crawl().await;
    // `Website` will be filled with links or pages when crawled. If you need pages with the resource,
    // call the `website.scrape` method and use `website.get_pages` instead.
    for link in website.get_links() {
        // do something
    }
}
Fields

configuration: Box<Configuration>
Configuration properties for website.

on_link_find_callback: Option<fn(_: CaseInsensitiveString, _: Option<String>) -> (CaseInsensitiveString, Option<String>)>
The callback when a link is found.

Implementations

impl Website
pub fn set_url(&mut self, url: &str) -> &mut Self
Set the URL of the website to re-use configuration and data.
pub fn is_allowed( &self, link: &CaseInsensitiveString, blacklist_url: &Box<Vec<CompactString>> ) -> bool
Returns true if the URL:
- is not already crawled
- is not blacklisted
- is not forbidden by the robots.txt file (if the parameter is defined)
pub fn is_allowed_default( &self, link: &CompactString, blacklist_url: &Box<Vec<CompactString>> ) -> bool
Returns true if the URL:
- is not blacklisted
- is not forbidden by the robots.txt file (if the parameter is defined)
pub fn is_allowed_robots(&self, link: &str) -> bool
Returns true if the URL is not forbidden by the robots.txt file (if the parameter is defined).
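A minimal sketch checking a single URL against robots.txt before fetching; the /private path is a placeholder, and the check is only meaningful once robots.txt handling is enabled and the parser has been configured:

use spider::website::Website;

let mut website = Website::new("http://example.com");
website.with_respect_robots_txt(true);
// Hypothetical path used for illustration.
if website.is_allowed_robots("http://example.com/private") {
    // safe to request this page
}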
pub fn drain_links(&mut self) -> Drain<'_, CaseInsensitiveString>
Drain the links visited.
pub fn drain_extra_links(&mut self) -> Drain<'_, CaseInsensitiveString>
Drain the extra links used for things like the sitemap.
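A short sketch draining the visited links after a crawl; Debug formatting is used since the exact string accessor on CaseInsensitiveString is not shown in this page:

use spider::tokio;
use spider::website::Website;

#[tokio::main]
async fn main() {
    let mut website = Website::new("http://example.com");
    website.crawl().await;
    // drain_links empties the visited set, yielding each link once.
    for link in website.drain_links() {
        println!("{:?}", link);
    }
}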
pub fn set_extra_links( &mut self, extra_links: HashSet<CaseInsensitiveString> ) -> &HashSet<CaseInsensitiveString>
Set extra links to crawl. This could be used in conjunction with website.persist_links to extend the crawl on the next run.
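A minimal sketch seeding extra links before a run; it assumes CaseInsensitiveString is re-exported at the crate root and converts from &str, and the orphaned page URL is a placeholder:

use spider::website::Website;
use spider::CaseInsensitiveString;
use std::collections::HashSet;

let mut website = Website::new("http://example.com");
let mut extra: HashSet<CaseInsensitiveString> = HashSet::new();
// Hypothetical page that is not linked anywhere on the site.
extra.insert("http://example.com/orphaned-page".into());
website.set_extra_links(extra);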
pub fn get_client(&self) -> &Option<Client>
Get the HTTP request client. The client is set after the crawl has started.
pub fn get_links(&self) -> &HashSet<CaseInsensitiveString>
Links visited getter.
pub fn get_url_parsed(&self) -> &Option<Box<Url>>
Getter for the parsed domain URL.
pub fn get_url(&self) -> &CaseInsensitiveString
Domain name getter.
pub fn get_status(&self) -> &CrawlStatus
Get the active crawl status.
pub fn persist_links(&mut self) -> &mut Self
Set the crawl status to persist between runs, for example crawling a sitemap and then all links after it (see the sketch below).
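A minimal sketch of that flow; since crawl_sitemap returns (), the chained form from the original doc comment is written as separate statements here (requires the sitemap flag):

use spider::tokio;
use spider::website::Website;

#[tokio::main]
async fn main() {
    let mut website = Website::new("http://example.com");
    // Crawl the sitemap first, keep the visited links, then crawl normally.
    website.crawl_sitemap().await;
    website.persist_links();
    website.crawl().await;
}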
pub fn get_absolute_path(&self, domain: Option<&str>) -> Option<Url>
Absolute base URL of the crawl.
pub async fn configure_robots_parser(&mut self, client: Client) -> Client
Configure the robots parser on the initial crawl attempt and run.
pub fn set_http_client(&mut self, client: Client) -> &Option<Client>
Set the HTTP client to use directly. This is helpful if you manually call website.configure_http_client before the crawl.
pub fn configure_http_client(&mut self) -> Client
Configure the HTTP client.
pub async fn crawl_sitemap(&mut self)
Start to crawl the website with async concurrency using the sitemap. This does not crawl the links found on the sitemap pages. This does nothing without the sitemap flag enabled.
pub async fn crawl_smart(&mut self)
Start to crawl the website with smart async concurrency: HTTP is used first and JavaScript rendering only as needed. This has no effect without the smart flag enabled.
pub async fn crawl_raw(&mut self)
Start to crawl the website with async concurrency using the base raw functionality. Useful when using the chrome feature and defaulting to the basic implementation.
pub async fn scrape_raw(&mut self)
Start to scrape the website with async concurrency using the base raw functionality. Useful when using the chrome feature and defaulting to the basic implementation.
pub async fn sitemap_crawl( &mut self, _client: &Client, _handle: &Option<Arc<AtomicI8>>, _scrape: bool )
Crawl the entire sitemap list. Note: this method does not re-crawl the links of the pages found on the sitemap. This does nothing without the sitemap flag.
pub fn with_respect_robots_txt(&mut self, respect_robots_txt: bool) -> &mut Self
Respect robots.txt file.
pub fn with_subdomains(&mut self, subdomains: bool) -> &mut Self
Include subdomains detection.
pub fn with_http2_prior_knowledge( &mut self, http2_prior_knowledge: bool ) -> &mut Self
Only use HTTP/2.
pub fn with_delay(&mut self, delay: u64) -> &mut Self
Delay between requests in milliseconds.
pub fn with_request_timeout( &mut self, request_timeout: Option<Duration> ) -> &mut Self
Max time to wait for a request.
pub fn with_danger_accept_invalid_certs( &mut self, accept_invalid_certs: bool ) -> &mut Self
Dangerously accept invalid certificates - this should be used as a last resort.
pub fn with_user_agent(&mut self, user_agent: Option<&str>) -> &mut Self
Add user agent to request.
pub fn with_sitemap(&mut self, _sitemap_url: Option<&str>) -> &mut Self
Set the sitemap URL to use for the crawl. This does nothing without the sitemap flag enabled.
pub fn with_proxies(&mut self, proxies: Option<Vec<String>>) -> &mut Self
Use proxies for request.
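A minimal sketch routing requests through a proxy; the SOCKS5 address is a placeholder:

use spider::website::Website;

let mut website = Website::new("http://example.com");
// Placeholder local SOCKS5 proxy for illustration.
website.with_proxies(Some(vec!["socks5://127.0.0.1:9050".to_string()]));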
pub fn with_crawl_id(&mut self, _crawl_id: String) -> &mut Self
Set a crawl ID to use for tracking crawls. This does nothing without the control flag enabled.
pub fn with_blacklist_url<T>( &mut self, blacklist_url: Option<Vec<T>> ) -> &mut Self
Add blacklist URLs to ignore.
pub fn with_headers(&mut self, headers: Option<HeaderMap>) -> &mut Self
Set HTTP headers for request using reqwest::header::HeaderMap.
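A minimal sketch attaching a custom header; it assumes reqwest is available as a direct dependency, and the HeaderMap passed in must come from the same reqwest version spider uses:

use reqwest::header::{HeaderMap, HeaderValue, ACCEPT_LANGUAGE};
use spider::website::Website;

let mut website = Website::new("http://example.com");
let mut headers = HeaderMap::new();
headers.insert(ACCEPT_LANGUAGE, HeaderValue::from_static("en-US,en;q=0.9"));
website.with_headers(Some(headers));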
pub fn with_budget(&mut self, budget: Option<HashMap<&str, u32>>) -> &mut Self
Set a crawl budget per path with level support, e.g. /a/b/c, or for all paths with "*". This does nothing without the budget flag enabled.
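A short sketch capping the crawl at 300 pages site-wide and 50 under a hypothetical /docs path (requires the budget flag):

use spider::website::Website;
use std::collections::HashMap;

let mut website = Website::new("http://example.com");
// "*" applies to every path; "/docs" is a hypothetical section of the site.
website.with_budget(Some(HashMap::from([("*", 300), ("/docs", 50)])));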
pub fn set_crawl_budget( &mut self, _budget: Option<HashMap<CaseInsensitiveString, u32>> )
Set the crawl budget directly. This does nothing without the budget flag enabled.
pub fn with_depth(&mut self, depth: usize) -> &mut Self
Set a crawl depth limit. If the value is 0 there is no limit. This does nothing without the budget feature flag enabled.
pub fn with_external_domains<'a, 'b>( &mut self, external_domains: Option<impl Iterator<Item = String> + 'a> ) -> &mut Self
Group external domains to treat the crawl as one. If None is passed this will clear all prior domains.
pub fn with_on_link_find_callback( &mut self, on_link_find_callback: Option<fn(_: CaseInsensitiveString, _: Option<String>) -> (CaseInsensitiveString, Option<String>)> ) -> &mut Self
Perform a callback to run on each link find.
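A minimal sketch of a link callback; it assumes CaseInsensitiveString is re-exported at the crate root, and the callback must return the link (and optional page content) it received:

use spider::website::Website;
use spider::CaseInsensitiveString;

fn on_link(
    link: CaseInsensitiveString,
    content: Option<String>,
) -> (CaseInsensitiveString, Option<String>) {
    println!("found {:?}", link);
    (link, content)
}

let mut website = Website::new("http://example.com");
website.with_on_link_find_callback(Some(on_link));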
pub fn with_cookies(&mut self, cookie_str: &str) -> &mut Self
Cookie string to use in request. This does nothing without the cookies flag enabled.
pub fn with_cron(&mut self, cron_str: &str, cron_type: CronType) -> &mut Self
Setup cron jobs to run. This does nothing without the cron flag enabled.
pub fn with_locale(&mut self, locale: Option<String>) -> &mut Self
Overrides default host system locale with the specified one. This does nothing without the chrome flag enabled.
pub fn with_stealth(&mut self, stealth_mode: bool) -> &mut Self
Use stealth mode for the request. This does nothing without the chrome flag enabled.
pub fn with_openai(&mut self, openai_configs: Option<GPTConfigs>) -> &mut Self
Use OpenAI to get dynamic JavaScript to drive the browser. This does nothing without the openai flag enabled.
pub fn with_caching(&mut self, cache: bool) -> &mut Self
Cache the page following HTTP rules. This method does nothing if the cache feature is not enabled.
pub fn with_fingerprint(&mut self, fingerprint: bool) -> &mut Self
Setup custom fingerprinting for chrome. This method does nothing if the chrome feature is not enabled.
pub fn with_viewport(&mut self, viewport: Option<Viewport>) -> &mut Self
Configures the viewport of the browser, which defaults to 800x600. This method does nothing if the chrome feature is not enabled.
pub fn with_wait_for_idle_network( &mut self, wait_for_idle_network: Option<WaitForIdleNetwork> ) -> &mut Self
Wait for idle network request. This method does nothing if the chrome feature is not enabled.
pub fn with_wait_for_selector( &mut self, wait_for_selector: Option<WaitForSelector> ) -> &mut Self
Wait for a CSS query selector. This method does nothing if the chrome feature is not enabled.
pub fn with_wait_for_delay( &mut self, wait_for_delay: Option<WaitForDelay> ) -> &mut Self
Wait for a delay. Should only be used for testing. This method does nothing if the chrome feature is not enabled.
pub fn with_redirect_limit(&mut self, redirect_limit: usize) -> &mut Self
Set the max redirects allowed for request.
pub fn with_redirect_policy(&mut self, policy: RedirectPolicy) -> &mut Self
Set the redirect policy to use, either Strict or Loose (the default).
pub fn with_chrome_intercept( &mut self, chrome_intercept: bool, block_images: bool ) -> &mut Self
Use request interception to only allow content that matches the host. Content from a third party needs to be part of the include list. This method does nothing if the chrome_intercept flag is not enabled.
pub fn with_full_resources(&mut self, full_resources: bool) -> &mut Self
Determine whether to collect all the resources found on pages.
pub fn with_ignore_sitemap(&mut self, ignore_sitemap: bool) -> &mut Self
Ignore the sitemap when crawling. This method does nothing if the sitemap flag is not enabled.
pub fn with_timezone_id(&mut self, timezone_id: Option<String>) -> &mut Self
Overrides default host system timezone with the specified one. This does nothing without the chrome flag enabled.
pub fn with_evaluate_on_new_document( &mut self, evaluate_on_new_document: Option<Box<String>> ) -> &mut Self
Set a custom script to evaluate on new document creation. This does nothing without the chrome feature flag enabled.
pub fn with_config(&mut self, config: Configuration) -> &mut Self
Set the configuration for the website directly.
pub fn with_limit(&mut self, limit: u32) -> &mut Self
Set a crawl page limit. If the value is 0 there is no limit. This does nothing without the budget feature flag enabled.
pub fn with_screenshot( &mut self, screenshot_config: Option<ScreenShotConfig> ) -> &mut Self
Set the chrome screenshot configuration. This does nothing without the chrome flag enabled.
pub fn with_auth_challenge_response( &mut self, auth_challenge_response: Option<AuthChallengeResponse> ) -> &mut Self
Set the authentication challenge response. This does nothing without the chrome feature flag enabled.
pub fn build(&self) -> Result<Self, Error>
Build the website configuration when using with_builder.
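A minimal sketch chaining several builder methods into an owned Website; the user agent and delay values are placeholders:

use spider::tokio;
use spider::website::Website;

#[tokio::main]
async fn main() {
    let mut website = Website::new("http://example.com")
        .with_respect_robots_txt(true)
        .with_subdomains(false)
        .with_delay(250) // placeholder delay in ms
        .with_user_agent(Some("myapp/0.1.0")) // placeholder user agent
        .build()
        .unwrap();
    website.crawl().await;
}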
pub fn subscribe(&mut self, capacity: usize) -> Option<Receiver<Page>>
Setup subscription for data. This will panic if capacity is equal to 0 or larger than usize::MAX / 2. This does nothing without the sync flag enabled.
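A minimal sketch consuming pages as they are crawled (requires the sync flag); the capacity of 16 is arbitrary:

use spider::tokio;
use spider::website::Website;

#[tokio::main]
async fn main() {
    let mut website = Website::new("http://example.com");
    let mut rx = website.subscribe(16).unwrap();

    tokio::spawn(async move {
        while let Ok(page) = rx.recv().await {
            println!("{:?}", page.get_url());
        }
    });

    website.crawl().await;
    website.unsubscribe();
}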
pub fn queue(&mut self, capacity: usize) -> Option<Sender<String>>
Get a sender for queueing extra links mid-crawl.
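A hedged sketch of queueing an extra link; it assumes the returned Sender is a tokio broadcast sender whose send is synchronous and can fail when no receiver is active, and the extra URL is a placeholder:

use spider::tokio;
use spider::website::Website;

#[tokio::main]
async fn main() {
    let mut website = Website::new("http://example.com");
    let q = website.queue(16).unwrap();

    // Placeholder URL pushed into the crawl from outside; errors are ignored here.
    let _ = q.send("http://example.com/extra-page".to_string());

    website.crawl().await;
}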
pub fn unsubscribe(&mut self)
Remove subscriptions for data. This is useful for auto-dropping subscriptions that are running on another thread. This does nothing without the sync flag enabled.
pub fn subscribe_guard(&mut self) -> Option<ChannelGuard>
Setup subscription counter to track concurrent operation completions. This helps keep a chrome instance active until all operations are completed from all threads, so screenshots and other actions can be taken safely. Make sure to call inc if you take a guard; without calling inc in the subscription receiver, the crawl will stay in an infinite loop. This does nothing without the sync flag enabled. You also need to use chrome_store_page to keep the page alive between requests.
Example
use spider::tokio;
use spider::website::Website;

#[tokio::main]
async fn main() {
    let mut website: Website = Website::new("http://example.com");
    let mut rx2 = website.subscribe(18).unwrap();
    let mut rxg = website.subscribe_guard().unwrap();

    tokio::spawn(async move {
        while let Ok(page) = rx2.recv().await {
            println!("📸 - {:?}", page.get_url());
            page.screenshot(
                true,
                true,
                spider::configuration::CaptureScreenshotFormat::Png,
                Some(75),
                None::<std::path::PathBuf>,
                None,
            )
            .await;
            rxg.inc();
        }
    });

    website.crawl().await;
}