pub struct Website {
pub configuration: Box<Configuration>,
pub on_link_find_callback: Option<fn(_: CaseInsensitiveString) -> CaseInsensitiveString>,
/* private fields */
}
Represents a website to crawl and gather all links.
use spider::website::Website;
let mut website = Website::new("http://example.com");
website.crawl();
// `Website` will be filled with `Pages` when crawled. To get them, just use `get_pages`:
while let Some(page) = website.get_pages() {
    // do something
}

Fields
configuration: Box<Configuration>
Configuration properties for the website.
on_link_find_callback: Option<fn(_: CaseInsensitiveString) -> CaseInsensitiveString>
Callback invoked when a link is found.
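A minimal sketch of wiring up the public on_link_find_callback field declared above; the spider::CaseInsensitiveString import path is an assumption, and the callback body is only a placeholder:

use spider::website::Website;
use spider::CaseInsensitiveString; // re-export path assumed

// inspect or rewrite the link here, then hand it back to the crawler
fn on_link(link: CaseInsensitiveString) -> CaseInsensitiveString {
    link
}

let mut website = Website::new("http://example.com");
website.on_link_find_callback = Some(on_link);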
Implementations
impl Website
pub fn is_allowed(
    &self,
    link: &CaseInsensitiveString,
    blacklist_url: &Box<Vec<CompactString>>
) -> bool
Returns true if the URL:
- has not already been crawled
- is not blacklisted
- is not forbidden by the robots.txt file (if that parameter is enabled)
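A minimal sketch of calling is_allowed directly; the import paths for CaseInsensitiveString and CompactString and their From<&str> conversions are assumptions, and the blacklist would normally come from the crawl configuration rather than being built inline:

use spider::website::Website;
use spider::CaseInsensitiveString; // re-export path assumed
use spider::CompactString;         // re-export path assumed

let website = Website::new("http://example.com");
let blacklist: Box<Vec<CompactString>> =
    Box::new(vec![CompactString::from("http://example.com/login")]);
let link = CaseInsensitiveString::from("http://example.com/about");
if website.is_allowed(&link, &blacklist) {
    // not visited yet, not blacklisted, and not blocked by robots.txt
}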
pub fn is_allowed_default(
    &self,
    link: &CompactString,
    blacklist_url: &Box<Vec<CompactString>>
) -> bool
Returns true if the URL:
- is not blacklisted
- is not forbidden by the robots.txt file (if that parameter is enabled)
pub fn is_allowed_robots(&self, link: &str) -> bool
Returns true if the URL:
- is not forbidden by the robots.txt file (if that parameter is enabled)
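A minimal sketch of the two lighter checks above, with the same assumed import paths as before; is_allowed_robots takes a plain &str, so no conversion is needed for it:

let website = Website::new("http://example.com");
let blacklist: Box<Vec<CompactString>> = Box::new(Vec::new());
let link = CompactString::from("http://example.com/pricing");
// skips the visited-links check that is_allowed performs
let default_ok = website.is_allowed_default(&link, &blacklist);
// only consults the robots.txt rules
let robots_ok = website.is_allowed_robots("http://example.com/pricing");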
pub fn get_links(&self) -> &HashSet<CaseInsensitiveString>
Getter for the links that have been visited.
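A minimal sketch of reading the visited set after a crawl, mirroring the crawl call from the example at the top of this page (whether crawl must be awaited depends on the crate version):

let mut website = Website::new("http://example.com");
website.crawl();
for link in website.get_links() {
    // each entry is a visited URL stored as a CaseInsensitiveString
}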
pub fn get_absolute_path(&self, domain: Option<&str>) -> Option<Url>
Absolute base URL of the crawl.
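A minimal sketch based on the signature above; that the Option<&str> argument resolves against an alternate domain instead of the website's own is an assumption drawn from the parameter name:

let website = Website::new("http://example.com/blog/");
if let Some(base) = website.get_absolute_path(None) {
    // base is the Url the crawl resolves relative links against
}
let other_base = website.get_absolute_path(Some("http://example.com/docs/"));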
pub async fn configure_robots_parser(&mut self, client: Client) -> Client
Configure the robots.txt parser on the initial crawl attempt and run.
pub fn configure_http_client(&mut self) -> Client
Configure the HTTP client.
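A minimal sketch chaining the two configuration methods above, assuming a tokio runtime since configure_robots_parser is async; only signatures shown on this page are used:

use spider::website::Website;

#[tokio::main]
async fn main() {
    let mut website = Website::new("http://example.com");
    let client = website.configure_http_client();
    // hand the client to the robots parser; the configured client is returned for later requests
    let client = website.configure_robots_parser(client).await;
}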
pub async fn crawl_sync(&mut self)
Start to crawl the website in sync.
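A minimal sketch of driving crawl_sync; a tokio runtime is assumed because the method is async:

use spider::website::Website;

#[tokio::main]
async fn main() {
    let mut website = Website::new("http://example.com");
    // run the crawl in sync; the method is async, so it still needs to be awaited
    website.crawl_sync().await;
    // the visited links are then available via get_links / get_pages
}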
Trait Implementations
Auto Trait Implementations
impl RefUnwindSafe for Website
impl Send for Website
impl Sync for Website
impl Unpin for Website
impl UnwindSafe for Website
Blanket Implementations
impl<T> BorrowMut<T> for T
where
    T: ?Sized,
fn borrow_mut(&mut self) -> &mut T
Mutably borrows from an owned value.