Skip to main content

Page

Struct Page 

Source
pub struct Page {
Show 14 fields pub headers: Option<HeaderMap>, pub cookies: Option<HeaderMap>, pub status_code: StatusCode, pub error_status: Option<String>, pub external_domains_caseless: Box<HashSet<CaseInsensitiveString>>, pub final_redirect_destination: Option<String>, pub page_links: Option<Box<HashSet<CaseInsensitiveString>>>, pub should_retry: bool, pub waf_check: bool, pub bytes_transferred: Option<f64>, pub blocked_crawl: bool, pub signature: Option<u64>, pub anti_bot_tech: AntiBotTech, pub metadata: Option<Box<Metadata>>, /* private fields */
}
Expand description

Represent a page visited.

Fields§

§headers: Option<HeaderMap>

The headers of the page request response.

§cookies: Option<HeaderMap>

The cookies of the page request response.

§status_code: StatusCode

The status code of the page request.

§error_status: Option<String>

The error of the request if any.

§external_domains_caseless: Box<HashSet<CaseInsensitiveString>>

The external urls to group with the domain

§final_redirect_destination: Option<String>

The final destination of the page if redirects were performed [Not implemented in the chrome feature].

§page_links: Option<Box<HashSet<CaseInsensitiveString>>>

The links found on the page. This includes all links that have an href url.

§should_retry: bool

The request should retry.

§waf_check: bool

A WAF was found on the page.

§bytes_transferred: Option<f64>

The total bytes transferred for the page. Mainly used for chrome events. Inspect the content for bytes when using http instead.

§blocked_crawl: bool

The page was blocked from crawling, usually as a result of using website::on_should_crawl_callback.

§signature: Option<u64>

The signature of the page to de-duplicate content.

§anti_bot_tech: AntiBotTech

The anti-bot tech used.

§metadata: Option<Box<Metadata>>

Page metadata.

Implementations§

Source§

impl Page

Source

pub async fn new_page(url: &str, client: &Client) -> Self

Instantiate a new page and gather the html repro of standard fetch_page_html.

Source

pub async fn new_page_streaming<A: PartialEq + Eq + Sync + Send + Clone + Default + Hash + From<String>>( url: &str, client: &Client, only_html: bool, selectors: &mut RelativeSelectors, external_domains_caseless: &Box<HashSet<CaseInsensitiveString>>, r_settings: &PageLinkBuildSettings, map: &mut HashSet<A>, ssg_map: Option<&mut HashSet<A>>, prior_domain: &Option<Box<Url>>, domain_parsed: &mut Option<Box<Url>>, links_pages: &mut Option<HashSet<A>>, ) -> Self

New page with rewriter

Source

pub async fn new_page_only_html(url: &str, client: &Client) -> Self

Instantiate a new page and gather the html repro of standard fetch_page_html only gathering resources to crawl.

Source

pub async fn new(url: &str, client: &Client) -> Self

Instantiate a new page and gather the html.

Source

pub async fn screenshot( &self, _full_page: bool, _omit_background: bool, _format: CaptureScreenshotFormat, _quality: Option<i64>, _output_path: Option<impl AsRef<Path>>, _clip: Option<ClipViewport>, ) -> Vec<u8>

Take a screenshot of the page. If the output path is set to None the screenshot will not be saved. The feature flag chrome_store_page is required.

Source

pub fn is_empty(&self) -> bool

Page request is empty. On chrome an empty page has bare html markup.

Source

pub fn get_url(&self) -> &str

Url getter for page.

Source

pub fn get_timeout(&self) -> Option<Duration>

Get the timeout required for rate limiting. The maximum delay that will be respected is 30 seconds. Requires the feature flag headers.

Source

pub fn get_url_final(&self) -> &str

Url getter for page after redirects.

Source

pub fn set_external( &mut self, external_domains_caseless: Box<HashSet<CaseInsensitiveString>>, )

Set the external domains to treat as one

Source

pub fn set_html_bytes(&mut self, html: Option<Vec<u8>>)

Set the html of the page directly.

Source

pub fn set_url(&mut self, url: String)

Set the url directly of the page. Useful for transforming the content and rewriting the url.

Source

pub fn set_url_parsed_direct(&mut self)

Set the url directly parsed url of the page. Useful for transforming the content and rewriting the url.

Source

pub fn set_url_parsed_direct_empty(&mut self)

Set the url directly parsed url of the page. Useful for transforming the content and rewriting the url.

Source

pub fn set_url_parsed(&mut self, url_parsed: Url)

Set the url directly parsed url of the page. Useful for transforming the content and rewriting the url.

Source

pub fn get_url_parsed_ref(&self) -> &Option<Url>

Parsed URL getter for page.

Source

pub fn get_url_parsed(&mut self) -> &Option<Url>

Parsed URL getter for page.

Source

pub fn take_url(&mut self) -> Option<Url>

Take the parsed url.

Source

pub fn get_bytes(&self) -> Option<&Vec<u8>>

Html getter for bytes on the page.

Source

pub fn get_html(&self) -> String

Html getter for bytes on the page as string.

Source

pub fn get_html_bytes_u8(&self) -> &[u8]

Html getter for page to u8.

Source

pub fn get_metadata(&self) -> &Option<Box<Metadata>>

Get the metadata found on the page.

Source

pub fn get_html_encoded(&self, label: &str) -> String

Html getter for getting the content with proper encoding. Pass in a proper encoding label like SHIFT_JIS. This falls back to get_html when the encoding flag is not enabled.

Source

pub fn set_duration_elapsed(&mut self, scraped_at: Option<Instant>)

Set the elapsed duration of the page since scraped from an instant.

Source

pub fn set_duration_elapsed_from_duration(&mut self, elapsed: Option<Duration>)

Set the elapsed duration of the page since scraped from duration.

Source

pub fn get_duration_elapsed(&self) -> Duration

Get the elapsed duration of the page since scraped.

Find the links as a stream using string resource validation for XML files

Find the links as a stream using string resource validation

Find the links as a stream using string resource validation

Find the links as a stream using string resource validation and parsing the script for nextjs initial SSG paths.

Find all href links and return them using CSS selectors.

Find the links as a stream using string resource validation

Find the links as a stream using string resource validation

Find all href links and return them using CSS selectors.

Find all href links and return them using CSS selectors gathering all resources.

Trait Implementations§

Source§

impl Clone for Page

Source§

fn clone(&self) -> Page

Returns a duplicate of the value. Read more
1.0.0 · Source§

fn clone_from(&mut self, source: &Self)

Performs copy-assignment from source. Read more
Source§

impl Debug for Page

Source§

fn fmt(&self, f: &mut Formatter<'_>) -> Result

Formats the value using the given formatter. Read more
Source§

impl Default for Page

Source§

fn default() -> Page

Returns the “default value” for a type. Read more

Auto Trait Implementations§

§

impl Freeze for Page

§

impl RefUnwindSafe for Page

§

impl Send for Page

§

impl Sync for Page

§

impl Unpin for Page

§

impl UnwindSafe for Page

Blanket Implementations§

Source§

impl<T> Any for T
where T: 'static + ?Sized,

Source§

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more
Source§

impl<T> Borrow<T> for T
where T: ?Sized,

Source§

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more
Source§

impl<T> BorrowMut<T> for T
where T: ?Sized,

Source§

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more
Source§

impl<T> CloneToUninit for T
where T: Clone,

Source§

unsafe fn clone_to_uninit(&self, dest: *mut u8)

🔬This is a nightly-only experimental API. (clone_to_uninit)
Performs copy-assignment from self to dest. Read more
Source§

impl<T> From<T> for T

Source§

fn from(t: T) -> T

Returns the argument unchanged.

Source§

impl<T> Instrument for T

Source§

fn instrument(self, span: Span) -> Instrumented<Self>

Instruments this type with the provided Span, returning an Instrumented wrapper. Read more
Source§

fn in_current_span(self) -> Instrumented<Self>

Instruments this type with the current Span, returning an Instrumented wrapper. Read more
Source§

impl<T, U> Into<U> for T
where U: From<T>,

Source§

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

Source§

impl<T> IntoEither for T

Source§

fn into_either(self, into_left: bool) -> Either<Self, Self>

Converts self into a Left variant of Either<Self, Self> if into_left is true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more
Source§

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more
Source§

impl<T> PolicyExt for T
where T: ?Sized,

Source§

fn and<P, B, E>(self, other: P) -> And<T, P>
where T: Policy<B, E>, P: Policy<B, E>,

Create a new Policy that returns Action::Follow only if self and other return Action::Follow. Read more
Source§

fn or<P, B, E>(self, other: P) -> Or<T, P>
where T: Policy<B, E>, P: Policy<B, E>,

Create a new Policy that returns Action::Follow if either self or other returns Action::Follow. Read more
Source§

impl<T> Same for T

Source§

type Output = T

Should always be Self
Source§

impl<T> ToOwned for T
where T: Clone,

Source§

type Owned = T

The resulting type after obtaining ownership.
Source§

fn to_owned(&self) -> T

Creates owned data from borrowed data, usually by cloning. Read more
Source§

fn clone_into(&self, target: &mut T)

Uses borrowed data to replace owned data, usually by cloning. Read more
Source§

impl<T, U> TryFrom<U> for T
where U: Into<T>,

Source§

type Error = Infallible

The type returned in the event of a conversion error.
Source§

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.
Source§

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

Source§

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.
Source§

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.
Source§

impl<V, T> VZip<V> for T
where V: MultiLane<T>,

Source§

fn vzip(self) -> V

Source§

impl<T> WithSubscriber for T

Source§

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self>
where S: Into<Dispatch>,

Attaches the provided Subscriber to this type, returning a WithDispatch wrapper. Read more
Source§

fn with_current_subscriber(self) -> WithDispatch<Self>

Attaches the current default Subscriber to this type, returning a WithDispatch wrapper. Read more