Struct Website

Source

pub struct Website {
    pub configuration: Box<Configuration>,
    pub on_link_find_callback: Option<OnLinkFindCallback>,
    pub on_should_crawl_callback: Option<OnShouldCrawlCallback>,
    pub crawl_id: Box<String>,
    pub cookie_jar: Arc<Jar>,
    /* private fields */
}

Expand description

Represents a website to crawl and gather all links or page content.

use spider::website::Website;
let mut website = Website::new("http://example.com");
website.crawl();
// `Website` will be filled with links or pages when crawled. If you need pages with the resource
// call the `website.scrape` method with `website.get_pages` instead.
for link in website.get_links() {
    // do something
}

Fields§

§configuration: Box<Configuration>

Configuration properties for website.

§on_link_find_callback: Option<OnLinkFindCallback>

The callback when a link is found.

§on_should_crawl_callback: Option<OnShouldCrawlCallback>

The callback to use if a page should be ignored. Return false to ensure that the discovered links are not crawled.

§crawl_id: Box<String>

Set the crawl ID to track. This allows explicit targeting for shutdown, pause, etc.

§cookie_jar: Arc<Jar>

Cookie jar shared between requests.

Implementations§

Source §

impl Website

Source

pub fn new(url: &str) -> Self

Initialize the Website with a starting link to crawl.

Source

pub fn new_with_firewall(url: &str, check_firewall: bool) -> Self

Initialize the Website with a starting link to crawl and check the firewall.

Source

pub fn setup_database_handler(&self) -> Box<DatabaseHandler>

Setup a shared database.

Source

pub fn setup_shared_db(&mut self, db: Box<DatabaseHandler>)

Setup the sqlite usage with a shared database handler.

Source

pub fn setup_sqlite(&mut self)

Setup the sqlite usage.

Source

pub fn set_url(&mut self, url: &str) -> &mut Self

Set the url of the website to re-use configuration and data.

Source

pub fn set_url_only(&mut self, url: &str) -> &mut Self

Set the direct url of the website to re-use configuration and data without parsing the domain.

Source

pub fn target_id(&self) -> String

Get the target id for a crawl. This takes the crawl ID and the url and concats it without delimiters.

Source

pub fn single_page(&self) -> bool

Single page request.

Source

pub fn setup_disk(&mut self)

Setup SQLite. This does nothing without the disk flag enabled.

Source

pub fn set_disk_persistance(&mut self, persist: bool) -> &mut Self

Set the sqlite disk persistence.

Source

pub fn get_robots_parser(&self) -> &Option<Box<RobotFileParser>>

Get the robots.txt parser.

Source

pub fn get_requires_javascript(&self) -> bool

Does the website require javascript to run?

Source

pub fn get_website_meta_info(&self) -> &WebsiteMetaInfo

Get the website meta information that can help with retry handling.

Source

pub async fn is_allowed_disk(&self, url_to_check: &str) -> bool

Check if URL exists (ignore case). This does nothing without the disk flag enabled.

Source

pub async fn is_allowed_signature_disk(&self, signature_to_check: u64) -> bool

Check if signature exists (ignore case). This does nothing without the disk flag enabled.

Source

pub async fn is_signature_allowed(&self, signature: u64) -> bool

Is the signature allowed.

Source

pub async fn clear_disk(&self)

Clear the disk. This does nothing without the disk flag enabled.

Source

pub async fn insert_url_disk(&self, new_url: &str)

Insert a new URL to disk if it doesn’t exist. This does nothing without the disk flag enabled.

Source

pub async fn insert_signature_disk(&self, signature: u64)

Insert a new signature to disk if it doesn’t exist. This does nothing without the disk flag enabled.

Source

pub async fn insert_link(&mut self, new_url: CaseInsensitiveString)

Insert a new URL if it doesn’t exist. This does nothing with disk flag enabled.

Source

pub async fn insert_signature(&mut self, new_signature: u64)

Insert a new signature if it doesn’t exist. This does nothing with disk flag enabled.

Source

pub async fn seed(&mut self) -> Result<(), Error>

Seed the DB and clear the Hashset. This does nothing with disk flag enabled.

Source

pub fn is_allowed(&mut self, link: &CaseInsensitiveString) -> ProcessLinkStatus

return true if URL:

is not already crawled
is not over depth
is not over crawl budget
is optionally whitelisted
is not blacklisted
is not forbidden in robot.txt file (if parameter is defined)

Source

pub fn is_allowed_budgetless( &mut self, link: &CaseInsensitiveString, ) -> ProcessLinkStatus

return true if URL:

is not already crawled
is not over depth
is optionally whitelisted
is not blacklisted
is not forbidden in robot.txt file (if parameter is defined)

Source

pub fn is_allowed_default(&self, link: &CompactString) -> ProcessLinkStatus

return true if URL:

is optionally whitelisted
is not blacklisted
is not forbidden in robot.txt file (if parameter is defined)

Source

pub fn is_allowed_robots(&self, link: &str) -> bool

return true if URL:

is not forbidden in robot.txt file (if parameter is defined)

Source

pub fn size(&self) -> usize

Amount of pages crawled in memory only. Use get_size for full links between memory and disk.

Source

pub async fn get_size(&self) -> usize

Get the amount of resources collected.

Source

pub fn drain_extra_links(&mut self) -> Drain<'_, CaseInsensitiveString>

Drain the extra links used for things like the sitemap.

Source

pub fn set_initial_status_code(&mut self, initial_status_code: StatusCode)

Set the initial status code of the request.

Source

pub fn get_initial_status_code(&self) -> &StatusCode

Get the initial status code of the request.

Source

pub fn set_initial_html_length(&mut self, initial_html_length: usize)

Set the initial html size of the request.

Source

pub fn get_initial_html_length(&self) -> usize

Get the initial html size of the request.

Source

pub fn set_initial_anti_bot_tech(&mut self, initial_anti_bot_tech: AntiBotTech)

Set the initial anti-bot tech code used for the initial request.

Source

pub fn get_initial_anti_bot_tech(&self) -> &AntiBotTech

Get the initial anti-bot tech code used for the initial request.

Source

pub fn set_initial_page_waf_check(&mut self, initial_page_waf_check: bool)

Set the initial WAF detection result for the initial request.

Source

pub fn get_initial_page_waf_check(&self) -> bool

Get the initial WAF detection result for the initial request.

Source

pub fn set_initial_page_should_retry(&mut self, initial_page_should_retry: bool)

Set the initial page should retry determination used for the initial request.

Source

pub fn get_initial_page_should_retry(&self) -> bool

Get the initial page should retry determination used for the initial request.

Source

pub fn drain_links(&mut self) -> Drain<'_, SymbolUsize>

Drain the links visited.

Source

pub fn drain_signatures(&mut self) -> Drain<'_, u64>

Drain the signatures visited.

Source

pub fn set_extra_links( &mut self, extra_links: HashSet<CaseInsensitiveString>, ) -> &HashSet<CaseInsensitiveString>

Set extra links to crawl. This could be used in conjunction with ‘website.persist_links’ to extend the crawl on the next run.

Source

pub fn get_extra_links(&self) -> &HashSet<CaseInsensitiveString>

Get the extra links.

Source

pub async fn clear_all(&mut self)

Clear all pages, disk, and links stored in memory.

Source

pub fn clear(&mut self)

Clear all pages and links stored in memory.

Source

pub fn get_client(&self) -> &Option<Client>

Get the HTTP request client. The client is set after the crawl has started.

Source

pub fn get_pages(&self) -> Option<&Vec<Page>>

Page getter.

Source

pub async fn get_links_disk(&self) -> HashSet<CaseInsensitiveString>

Links visited getter for disk. This does nothing without the disk flag enabled.

Source

pub async fn get_all_links_visited(&self) -> HashSet<CaseInsensitiveString>

Get all the links visited between memory and disk.

Source

pub fn get_links(&self) -> HashSet<CaseInsensitiveString>

Links visited getter for memory resources.

Source

pub fn get_url_parsed(&self) -> &Option<Box<Url>>

Domain parsed url getter.

Source

pub fn get_url(&self) -> &CaseInsensitiveString

Domain name getter.

Source

pub fn get_delay(&self) -> Duration

Crawl delay getter.

Source

pub fn get_status(&self) -> &CrawlStatus

Get the active crawl status.

Source

pub fn set_status(&mut self, status: CrawlStatus) -> &CrawlStatus

Set the active crawl status. This is helpful when chaining crawls concurrently.

Source

pub fn reset_status(&mut self) -> &CrawlStatus

Reset the active crawl status to bypass websites that are blocked.

Source

pub fn persist_links(&mut self) -> &mut Self

Set the crawl status to persist between the run. Example crawling a sitemap and all links after - website.crawl_sitemap().await.persist_links().crawl().await

Source

pub fn get_absolute_path(&self, domain: Option<&str>) -> Option<Url>

Absolute base url of crawl.

Source

pub fn stop(&mut self)

Stop all crawls for the website.

Source

pub fn start(&mut self)

Crawls commenced from fresh run.

Source

pub async fn configure_robots_parser(&mut self, client: &Client)

configure the robots parser on initial crawl attempt and run.

Source

pub fn setup_strict_policy(&self) -> Policy

Setup a strict redirect policy for request. All redirects need to match the host.

Source

pub fn setup_redirect_policy(&self) -> Policy

Setup redirect policy for reqwest.

Source

pub fn configure_headers(&mut self)

Configure the headers to use.

Source

pub fn configure_base_client(&self) -> ClientBuilder

Base client configuration.

Source

pub fn configure_http_client_builder(&self) -> ClientBuilder

Build the HTTP client.

Source

pub fn configure_http_client_cookies( &self, client: ClientBuilder, ) -> ClientBuilder

Build the HTTP client with cookie configurations.

Source

pub fn set_http_client(&mut self, client: Client) -> &Option<Client>

Set the HTTP client to use directly. This is helpful if you manually call ‘website.configure_http_client’ before the crawl.

Source

pub fn configure_http_client(&self) -> Client

Configure http client.

Source

pub fn configure_handler(&self) -> Option<(Arc<AtomicI8>, JoinHandle<()>)>

Setup atomic controller. This does nothing without the ‘control’ feature flag enabled.

Source

pub fn setup_selectors(&self) -> RelativeSelectors

Setup selectors for handling link targets.

Source

pub fn setup_base( &mut self, ) -> (Client, Option<(Arc<AtomicI8>, JoinHandle<()>)>)

Base configuration setup.

Source

pub async fn setup( &mut self, ) -> (Client, Option<(Arc<AtomicI8>, JoinHandle<()>)>)

Setup config for crawl.

Source

pub fn setup_crawl(&self) -> (Pin<Box<Interval>>, Pin<Box<Duration>>)

Setup shared concurrent configs.

Source

pub fn set_crawl_initial_status( &mut self, page: &Page, links: &HashSet<CaseInsensitiveString>, )

Set the initial crawl status by page output.

Source

pub async fn _crawl_establish( &mut self, client: &Client, base: &mut RelativeSelectors, _: bool, ) -> HashSet<CaseInsensitiveString>

Expand links for crawl base establish.

Source

pub fn set_crawl_status(&mut self)

Set the crawl status depending on crawl state. The crawl status only changes if the state is Start or Active.

Source

pub fn setup_semaphore(&self) -> Arc<Semaphore>

Setup the Semaphore for the crawl.

Source

pub async fn crawl(&mut self)

Start to crawl website with async concurrency.

Source

pub async fn crawl_sitemap(&mut self)

Start to crawl website with async concurrency using the sitemap. This does not page forward into the request. This does nothing without the sitemap flag enabled.

Source

pub async fn configure_setup(&mut self)

Configures the website crawling process for concurrent execution with the ability to send it across threads for subscriptions.

Source

pub fn configure_setup_norobots(&mut self)

Configures the website crawling process for concurrent execution with the ability to send it across threads for subscriptions without robot protection. You can manually call website.configure_robots_parser after.

Source

pub async fn crawl_raw_send(&self, url: Option<&str>)

Initiates the website crawling http process concurrently with the ability to send it across threads for subscriptions. Ensure that website.configure_setup() has been called before executing this function. It checks the status to ensure it is not firewall-blocked before proceeding with concurrent crawling. You can pass in a manual url in order to setup a new crawl directly with pre-configurations ready.

Source

pub async fn crawl_smart(&mut self)

Start to crawl website with async concurrency smart. Use HTTP first and JavaScript Rendering as needed. This has no effect without the smart flag enabled.

Source

pub async fn crawl_raw(&mut self)

Start to crawl website with async concurrency using the base raw functionality. Useful when using the chrome feature and defaulting to the basic implementation.

Source

pub async fn scrape(&mut self)

Start to scrape/download website with async concurrency.

Source

pub async fn scrape_raw(&mut self)

Start to scrape website with async concurrency using the base raw functionality. Useful when using the “chrome” feature and defaulting to the basic implementation.

Source

pub async fn scrape_smart(&mut self)

Start to scrape website with async concurrency smart. Use HTTP first and JavaScript Rendering as needed. This has no effect without the smart flag enabled.

Source

pub async fn scrape_sitemap(&mut self)

Start to scrape website sitemap with async concurrency. Use HTTP first and JavaScript Rendering as needed. This has no effect without the sitemap flag enabled.

Source

pub async fn crawl_concurrent( &mut self, client: &Client, handle: &Option<Arc<AtomicI8>>, )

Start to crawl website concurrently.

Source

pub async fn sitemap_crawl( &mut self, _client: &Client, _handle: &Option<Arc<AtomicI8>>, _scrape: bool, )

Sitemap crawl entire lists. Note: this method does not re-crawl the links of the pages found on the sitemap. This does nothing without the sitemap flag.

Source

pub async fn sitemap_crawl_chain( &mut self, _client: &Client, _handle: &Option<Arc<AtomicI8>>, _scrape: bool, )

Sitemap crawl entire lists chain. Note: this method does not re-crawl the links of the pages found on the sitemap. This does nothing without the sitemap flag.

Source

pub fn get_base_link(&self) -> &CompactString

get base link for crawl establishing.

Source

pub async fn subscription_guard(&self)

Guard the channel from closing until all subscription events complete.

Source

pub fn with_respect_robots_txt(&mut self, respect_robots_txt: bool) -> &mut Self

Respect robots.txt file.

Source

pub fn with_subdomains(&mut self, subdomains: bool) -> &mut Self

Include subdomains detection.

Source

pub fn with_csp_bypass(&mut self, enabled: bool) -> &mut Self

Bypass CSP protection detection. This does nothing without the feat flag chrome enabled.

Source

pub fn with_webdriver(&mut self, _webdriver_config: ()) -> &mut Self

Configure WebDriver for browser automation. This does nothing without the webdriver feature flag enabled.

Source

pub fn with_sqlite(&mut self, sqlite: bool) -> &mut Self

Use sqlite to store data and track large crawls. This does nothing without the disk flag enabled.

Source

pub fn with_tld(&mut self, tld: bool) -> &mut Self

Include tld detection.

Source

pub fn with_crawl_timeout( &mut self, crawl_timeout: Option<Duration>, ) -> &mut Self

The max duration for the crawl. This is useful when websites use a robots.txt with long durations and throttle the timeout removing the full concurrency.

Source

pub fn with_http2_prior_knowledge( &mut self, http2_prior_knowledge: bool, ) -> &mut Self

Only use HTTP/2.

Source

pub fn with_delay(&mut self, delay: u64) -> &mut Self

Delay between request as ms.

Source

pub fn with_request_timeout( &mut self, request_timeout: Option<Duration>, ) -> &mut Self

Max time to wait for request.

Source

pub fn with_danger_accept_invalid_certs( &mut self, accept_invalid_certs: bool, ) -> &mut Self

Dangerously accept invalid certificates - this should be used as a last resort.

Source

pub fn with_user_agent(&mut self, user_agent: Option<&str>) -> &mut Self

Add user agent to request.

Source

pub fn with_preserve_host_header(&mut self, preserve: bool) -> &mut Self

Preserve the HOST header.

Source

pub fn with_sitemap(&mut self, _sitemap_url: Option<&str>) -> &mut Self

Set the sitemap URL to use. This does nothing without the sitemap flag enabled.

Source

pub fn with_proxies(&mut self, proxies: Option<Vec<String>>) -> &mut Self

Use proxies for request.

Source

pub fn with_proxies_direct( &mut self, proxies: Option<Vec<RequestProxy>>, ) -> &mut Self

Use proxies for request with control between chrome and http.

Source

pub fn with_concurrency_limit(&mut self, limit: Option<usize>) -> &mut Self

Set the concurrency limits. Set the value to None to use the default limits based on the system CPU cores * n.

Source

pub fn with_crawl_id(&mut self, _crawl_id: String) -> &mut Self

Set a crawl ID to use for tracking crawls. This does nothing without the control flag enabled.

Source

pub fn with_blacklist_url<T>( &mut self, blacklist_url: Option<Vec<T>>, ) -> &mut Self
where Vec<CompactString>: From<Vec<T>>,

Add blacklist urls to ignore.

Source

pub fn with_retry(&mut self, retry: u8) -> &mut Self

Set the retry limit for request. Set the value to 0 for no retries. The default is 0.

Source

pub fn with_no_control_thread(&mut self, no_control_thread: bool) -> &mut Self

Skip setting up a control thread for pause, start, and shutdown programmatic handling. This does nothing without the ‘control’ flag enabled.

Source

pub fn with_whitelist_url<T>( &mut self, whitelist_url: Option<Vec<T>>, ) -> &mut Self
where Vec<CompactString>: From<Vec<T>>,

Add whitelist urls to allow.

Source

pub fn with_headers(&mut self, headers: Option<HeaderMap>) -> &mut Self

Set HTTP headers for request using reqwest::header::HeaderMap.

Source

pub fn with_modify_headers(&mut self, modify_headers: bool) -> &mut Self

Modify the headers to mimic a real browser.

Source

pub fn with_modify_http_client_headers( &mut self, modify_http_client_headers: bool, ) -> &mut Self

Modify the HTTP client headers to mimic a real browser.

Source

pub fn with_budget(&mut self, budget: Option<HashMap<&str, u32>>) -> &mut Self

Set a crawl budget per path with levels support /a/b/c or for all paths with “*”. This does nothing without the budget flag enabled.

Source

pub fn set_crawl_budget( &mut self, budget: Option<HashMap<CaseInsensitiveString, u32>>, )

Set the crawl budget directly. This does nothing without the budget flag enabled.

Source

pub fn with_depth(&mut self, depth: usize) -> &mut Self

Set a crawl depth limit. If the value is 0 there is no limit.

Source

pub fn with_external_domains<'a, 'b>( &mut self, external_domains: Option<impl Iterator<Item = String> + 'a>, ) -> &mut Self

Group external domains to treat the crawl as one. If None is passed this will clear all prior domains.

Source

pub fn with_on_link_find_callback( &mut self, on_link_find_callback: Option<OnLinkFindCallback>, ) -> &mut Self

Perform a callback to run on each link find.

Source

pub fn set_on_link_find<F>(&mut self, f: F)
where F: Fn(CaseInsensitiveString, Option<String>) -> (CaseInsensitiveString, Option<String>) + Send + Sync + 'static,

Perform a callback to run on each link find shorthand.

Source

pub fn with_on_should_crawl_callback( &mut self, on_should_crawl_callback: Option<fn(&Page) -> bool>, ) -> &mut Self

Use a callback to determine if a page should be ignored. Return false to ensure that the discovered links are not crawled.

Source

pub fn with_on_should_crawl_callback_closure<F: OnShouldCrawlClosure>( &mut self, on_should_crawl_closure: Option<F>, ) -> &mut Self

Use an immutable closure to determine if a page should be ignored. Return false to ensure that the discovered links are not crawled.

Slightly slower than Self::with_on_should_crawl_callback.

Source

pub fn with_cookies(&mut self, cookie_str: &str) -> &mut Self

Cookie string to use in request. This does nothing without the cookies flag enabled.

Source

pub fn with_cron(&mut self, cron_str: &str, cron_type: CronType) -> &mut Self

Setup cron jobs to run. This does nothing without the cron flag enabled.

Source

pub fn with_locale(&mut self, locale: Option<String>) -> &mut Self

Overrides default host system locale with the specified one. This does nothing without the chrome flag enabled.

Source

pub fn with_stealth(&mut self, stealth_mode: bool) -> &mut Self

Use stealth mode for the request. This does nothing without the chrome flag enabled.

Source

pub fn with_cache_policy( &mut self, cache_policy: Option<BasicCachePolicy>, ) -> &mut Self

Set the cache policy.

Source

pub fn with_openai(&mut self, openai_configs: Option<GPTConfigs>) -> &mut Self

Use OpenAI to get dynamic javascript to drive the browser. This does nothing without the openai flag enabled.

Source

pub fn with_gemini( &mut self, gemini_configs: Option<GeminiConfigs>, ) -> &mut Self

Use Gemini to get dynamic javascript to drive the browser. This does nothing without the gemini flag enabled.

Source

pub fn with_caching(&mut self, cache: bool) -> &mut Self

Cache the page following HTTP rules. This method does nothing if the cache feature is not enabled.

Source

pub fn with_cache_skip_browser(&mut self, skip: bool) -> &mut Self

Skip browser rendering entirely if cached content exists.

Source

pub fn with_service_worker_enabled(&mut self, enabled: bool) -> &mut Self

Enable or disable Service Workers. This method does nothing if the chrome feature is not enabled.

Source

pub fn with_auto_geolocation(&mut self, enabled: bool) -> &mut Self

Automatically setup geo-location configurations when using a proxy. This method does nothing if the chrome feature is not enabled.

Source

pub fn with_fingerprint(&mut self, fingerprint: bool) -> &mut Self

Setup custom fingerprinting for chrome. This method does nothing if the chrome feature is not enabled.

Source

pub fn with_viewport(&mut self, viewport: Option<Viewport>) -> &mut Self

Configures the viewport of the browser, which defaults to 800x600. This method does nothing if the chrome feature is not enabled.

Source

pub fn with_wait_for_idle_network( &mut self, wait_for_idle_network: Option<WaitForIdleNetwork>, ) -> &mut Self

Wait for network request to be idle within a time frame period (500ms no network connections). This does nothing without the chrome flag enabled.

Source

pub fn with_wait_for_idle_network0( &mut self, wait_for_idle_network: Option<WaitForIdleNetwork>, ) -> &mut Self

Wait for network request with a max timeout. This does nothing without the chrome flag enabled.

Source

pub fn with_wait_for_almost_idle_network0( &mut self, wait_for_idle_network: Option<WaitForIdleNetwork>, ) -> &mut Self

Wait for network to be almost idle with a max timeout. This does nothing without the chrome flag enabled.

Source

pub fn with_wait_for_selector( &mut self, wait_for_selector: Option<WaitForSelector>, ) -> &mut Self

Wait for a CSS query selector. This method does nothing if the chrome feature is not enabled.

Source

pub fn with_wait_for_idle_dom( &mut self, wait_for_selector: Option<WaitForSelector>, ) -> &mut Self

Wait for idle dom mutations for target element. This method does nothing if the chrome feature is not enabled.

Source

pub fn with_wait_for_delay( &mut self, wait_for_delay: Option<WaitForDelay>, ) -> &mut Self

Wait for a delay. Should only be used for testing. This method does nothing if the chrome feature is not enabled.

Source

pub fn with_default_http_connect_timeout( &mut self, default_http_connect_timeout: Option<Duration>, ) -> &mut Self

The default http connect timeout.

Source

pub fn with_default_http_read_timeout( &mut self, default_http_read_timeout: Option<Duration>, ) -> &mut Self

The default http read timeout.

Source

pub fn with_redirect_limit(&mut self, redirect_limit: usize) -> &mut Self

Set the max redirects allowed for request.

Source

pub fn with_redirect_policy(&mut self, policy: RedirectPolicy) -> &mut Self

Set the redirect policy to use, either Strict or Loose by default.

Source

pub fn with_chrome_intercept( &mut self, chrome_intercept: RequestInterceptConfiguration, ) -> &mut Self

Use request intercept for the request to only allow content that matches the host. If the content is from a 3rd party it needs to be part of our include list. This method does nothing if the chrome_intercept flag is not enabled.

Source

pub fn with_referer(&mut self, referer: Option<String>) -> &mut Self

Add a referer to the request.

Source

pub fn with_referrer(&mut self, referer: Option<String>) -> &mut Self

Add a referer to the request.

Source

pub fn with_full_resources(&mut self, full_resources: bool) -> &mut Self

Determine whether to collect all the resources found on pages.

Source

pub fn with_dismiss_dialogs(&mut self, full_resources: bool) -> &mut Self

Dismiss all dialogs on the page. This method does nothing if the chrome feature is not enabled.

Source

pub fn with_ignore_sitemap(&mut self, ignore_sitemap: bool) -> &mut Self

Ignore the sitemap when crawling. This method does nothing if the sitemap flag is not enabled.

Source

pub fn with_timezone_id(&mut self, timezone_id: Option<String>) -> &mut Self

Overrides default host system timezone with the specified one. This does nothing without the chrome flag enabled.

Source

pub fn with_evaluate_on_new_document( &mut self, evaluate_on_new_document: Option<Box<String>>, ) -> &mut Self

Set a custom script to evaluate on new document creation. This does nothing without the feat flag chrome enabled.

Source

pub fn with_limit(&mut self, limit: u32) -> &mut Self

Set a crawl page limit. If the value is 0 there is no limit.

Source

pub fn with_screenshot( &mut self, screenshot_config: Option<ScreenShotConfig>, ) -> &mut Self

Set the chrome screenshot configuration. This does nothing without the chrome flag enabled.

Source

pub fn with_shared_queue(&mut self, shared_queue: bool) -> &mut Self

Use a shared semaphore to evenly handle workloads. The default is false.

Source

pub fn with_auth_challenge_response( &mut self, auth_challenge_response: Option<AuthChallengeResponse>, ) -> &mut Self

Set the authentication challenge response. This does nothing without the feat flag chrome enabled.

Source

pub fn with_return_page_links(&mut self, return_page_links: bool) -> &mut Self

Return the links found on the page in the channel subscriptions. This method does nothing if the decentralized feature is enabled.

Source

pub fn with_chrome_connection( &mut self, chrome_connection_url: Option<String>, ) -> &mut Self

Set the connection url for the chrome instance. This method does nothing if the chrome feature is not enabled.

Source

pub fn with_execution_scripts( &mut self, execution_scripts: Option<ExecutionScriptsMap>, ) -> &mut Self

Set JS to run on certain pages. This method does nothing if the chrome feature is not enabled.

Source

pub fn with_automation_scripts( &mut self, automation_scripts: Option<AutomationScriptsMap>, ) -> &mut Self

Run web automated actions on certain pages. This method does nothing if the chrome feature is not enabled.

Source

pub fn with_network_interface( &mut self, network_interface: Option<String>, ) -> &mut Self

Bind the connections only on the network interface.

Source

pub fn with_local_address(&mut self, local_address: Option<IpAddr>) -> &mut Self

Bind to a local IP Address.

Source

pub fn with_block_assets(&mut self, only_html: bool) -> &mut Self

Block assets from loading from the network. Focus primarily on HTML documents.

Source

pub fn with_normalize(&mut self, normalize: bool) -> &mut Self

Normalize the content de-duplicating trailing slash pages and other pages that can be duplicated. This may initially show the link in your links_visited or subscription calls but, the following links will not be crawled.

Source

pub fn with_shared_state(&mut self, shared: bool) -> &mut Self

Store all the links found on the disk to share the state. This does nothing without the disk flag enabled.

Source

pub fn with_max_page_bytes(&mut self, max_page_bytes: Option<f64>) -> &mut Self

Set the max amount of bytes to collect per page. Only used for chrome atm.

Source

pub fn with_max_bytes_allowed( &mut self, max_bytes_allowed: Option<u64>, ) -> &mut Self

Set the max amount of bytes to collect for the browser context. Only used for chrome atm.

Source

pub fn with_config(&mut self, config: Configuration) -> &mut Self

Set the configuration for the website directly.

Source

pub fn with_spider_cloud(&mut self, _api_key: &str) -> &mut Self

Set a spider.cloud API key (no-op without spider_cloud feature).

Source

pub fn with_spider_cloud_config(&mut self, _config: ()) -> &mut Self

Set a spider.cloud config (no-op without spider_cloud feature).

Source

pub fn build(&self) -> Result<Self, Self>

Build the website configuration when using with_builder.

Source

pub fn clear_headers(&mut self)

Clear the HTTP headers for the requests.

Source

pub fn determine_limits(&mut self)

Determine if the budget has a wildcard path and the depth limit distance. This does nothing without the budget flag enabled.

Sets up a subscription to receive concurrent data. This will panic if it is larger than usize::MAX / 2. Set the value to 0 to use the semaphore permits. If the subscription is going to block or use async methods, make sure to spawn a task to avoid losing messages. This does nothing unless the sync flag is enabled.

§Examples

Subscribe and receive messages using an async tokio environment:

use spider::{tokio, website::Website};

#[tokio::main]
async fn main() {
    let mut website = Website::new("http://example.com");
    let mut rx = website.subscribe(0).unwrap();

    tokio::spawn(async move {
        while let Ok(page) = rx.recv().await {
            tokio::spawn(async move {
                // Process the received page.
                // If performing non-blocking tasks or managing a high subscription count, configure accordingly.
            });
        }
    });

    website.crawl().await;
}

Source

pub fn queue(&mut self, capacity: usize) -> Option<Sender<String>>

Get a sender for queueing extra links mid crawl. This does nothing unless the sync flag is enabled.

Source

pub fn unsubscribe(&mut self)

Remove subscriptions for data. This is useful for automatically dropping subscriptions that are running on another thread. This does nothing without the sync flag enabled.

Source

pub fn get_channel(&self) -> &Option<(Sender<Page>, Arc<Receiver<Page>>)>

Get the channel sender to send manual subscriptions.

Source

pub fn get_channel_guard(&self) -> &Option<ChannelGuard>

Get the channel guard to send manual subscriptions from closing.

Source

pub fn subscribe_guard(&mut self) -> Option<ChannelGuard>

Setup subscription counter to track concurrent operation completions. This helps keep a chrome instance active until all operations are completed from all threads to safely take screenshots and other actions. Make sure to call inc if you take a guard. Without calling inc in the subscription receiver the crawl will stay in an infinite loop. This does nothing without the sync flag enabled. You also need to use the ‘chrome_store_page’ to keep the page alive between requests.

§Example

use spider::tokio;
use spider::website::Website;

#[tokio::main]
async fn main() {
    let mut website: Website = Website::new("http://example.com");
    let mut rx2 = website.subscribe(18).unwrap();
    let mut rxg = website.subscribe_guard().unwrap();

    tokio::spawn(async move {
        while let Ok(page) = rx2.recv().await {
            println!("📸 - {:?}", page.get_url());
            page
                .screenshot(
                    true,
                    true,
                    spider::configuration::CaptureScreenshotFormat::Png,
                    Some(75),
                    None::<std::path::PathBuf>,
                    None,
                )
                .await;
            rxg.inc();
        }
    });
    website.crawl().await;
}

Source

pub fn get_crawl_id(&self) -> Option<&Box<String>>

Get the attached crawl id.

Source

pub fn set_seeded_html(&mut self, html: Option<String>)

Set the initial HTML page instead of firing a request to the URL.

Source

pub fn get_seeded_html(&self) -> &Option<String>

Get the initial seeded html.

Trait Implementations§

Source §

impl Clone for Website

Source §

fn clone(&self) -> Website

Returns a duplicate of the value. Read more

1.0.0 · Source§

fn clone_from(&mut self, source: &Self)

Performs copy-assignment from source. Read more

Source §

impl Debug for Website

Source §

fn fmt(&self, f: &mut Formatter<'_>) -> Result

Formats the value using the given formatter. Read more

Source §

impl Default for Website

Source §

fn default() -> Website

Returns the “default value” for a type. Read more

Source §

impl Display for Website

Source §

fn fmt(&self, f: &mut Formatter<'_>) -> Result

Formats the value using the given formatter. Read more

Source §

impl Error for Website

1.30.0 · Source§

fn source(&self) -> Option<&(dyn Error + 'static)>

Returns the lower-level source of this error, if any. Read more

1.0.0 · Source§

fn description(&self) -> &str

👎Deprecated since 1.42.0: use the Display impl or to_string()

fn cause(&self) -> Option<&dyn Error>

👎Deprecated since 1.33.0: replaced by Error::source, which can support downcasting

Source §

fn provide<'a>(&'a self, request: &mut Request<'a>)

🔬This is a nightly-only experimental API. (error_generic_member_access)

Provides type-based access to context intended for error reports. Read more

Auto Trait Implementations§

§

impl !UnwindSafe for Website

Blanket Implementations§

Source §

impl<T> Any for T
where T: 'static + ?Sized,

Source §

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more

Source §

impl<T> Borrow<T> for T
where T: ?Sized,

Source §

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more

Source §

impl<T> BorrowMut<T> for T
where T: ?Sized,

Source §

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more

Source §

impl<T> CloneToUninit for T
where T: Clone,

Source §

unsafe fn clone_to_uninit(&self, dest: *mut u8)

🔬This is a nightly-only experimental API. (clone_to_uninit)

Performs copy-assignment from self to dest. Read more

Source §

impl<T> From<T> for T

Source §

fn from(t: T) -> T

Returns the argument unchanged.

Source §

impl<T> Instrument for T

Source §

fn instrument(self, span: Span) -> Instrumented<Self>

Instruments this type with the provided Span, returning an Instrumented wrapper. Read more

Source §

fn in_current_span(self) -> Instrumented<Self>

Instruments this type with the current Span, returning an Instrumented wrapper. Read more

Source §

impl<T, U> Into<U> for T
where U: From<T>,

Source §

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

Source §

impl<T> IntoEither for T

Source §

fn into_either(self, into_left: bool) -> Either<Self, Self>

Converts self into a Left variant of Either<Self, Self> if into_left is true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more

Source §

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more

Source §