Struct Page

Source

pub struct Page { // 31 fields
    pub headers: Option<HeaderMap>,
    pub remote_addr: Option<SocketAddr>,
    pub cookies: Option<HeaderMap>,
    pub status_code: StatusCode,
    pub error_status: Option<String>,
    pub links: HashSet<CaseInsensitiveString>,
    pub external_domains_caseless: Arc<HashSet<CaseInsensitiveString>>,
    pub final_redirect_destination: Option<String>,
    pub screenshot_bytes: Option<Vec<u8>>,
    pub openai_credits_used: Option<Vec<OpenAIUsage>>,
    pub extra_ai_data: Option<Vec<AIResults>>,
    pub gemini_credits_used: Option<Vec<GeminiUsage>>,
    pub extra_gemini_data: Option<Vec<AIResults>>,
    pub remote_multimodal_usage: Option<Vec<AutomationUsage>>,
    pub extra_remote_multimodal_data: Option<Vec<AutomationResults>>,
    pub spawn_pages: Option<Vec<String>>,
    pub content_map: Option<HashMap<String, Bytes>>,
    pub page_links: Option<Box<HashSet<CaseInsensitiveString>>>,
    pub should_retry: bool,
    pub waf_check: bool,
    pub bytes_transferred: Option<f64>,
    pub blocked_crawl: bool,
    pub signature: Option<u64>,
    pub response_map: Option<HashMap<String, f64>>,
    pub request_map: Option<HashMap<String, f64>>,
    pub anti_bot_tech: AntiBotTech,
    pub metadata: Option<Box<Metadata>>,
    pub content_truncated: bool,
    pub proxy_configured: bool,
    pub binary_file: bool,
    pub backend_source: Option<CompactString>,
    /* private fields */
}

Available on crate feature decentralized only.

Expand description

Represent a page visited.

Fields§

§headers: Option<HeaderMap>

The headers of the page request response.

§remote_addr: Option<SocketAddr>

Available on crate feature remote_addr only.

The remote address of the page.

§cookies: Option<HeaderMap>

Available on crate feature cookies only.

The cookies of the page request response.

§status_code: StatusCode

The status code of the page request.

§error_status: Option<String>

The error of the request if any.

§links: HashSet<CaseInsensitiveString>

The current links for the page.

§external_domains_caseless: Arc<HashSet<CaseInsensitiveString>>

The external urls to group with the domain.

§final_redirect_destination: Option<String>

The final destination of the page if redirects were performed [Unused].

§screenshot_bytes: Option<Vec<u8>>

Available on crate feature chrome only.

The screenshot bytes of the page.

§openai_credits_used: Option<Vec<OpenAIUsage>>

Available on crate feature openai only.

The credits used from OpenAI in order.

§extra_ai_data: Option<Vec<AIResults>>

Available on crate feature openai only.

The extra data from the AI, for example extracted data.

§gemini_credits_used: Option<Vec<GeminiUsage>>

Available on crate feature gemini only.

The credits used from Gemini in order.

§extra_gemini_data: Option<Vec<AIResults>>

Available on crate feature gemini only.

The extra data from the Gemini AI.

§remote_multimodal_usage: Option<Vec<AutomationUsage>>

The usage from remote multimodal automation (extraction, etc.). Works with both Chrome and HTTP-only crawls.

§extra_remote_multimodal_data: Option<Vec<AutomationResults>>

The extra data from the remote multimodal automation (extraction results, etc.). Works with both Chrome and HTTP-only crawls.

§spawn_pages: Option<Vec<String>>

URLs requested by automation to spawn as additional pages.

§content_map: Option<HashMap<String, Bytes>>

Available on crate feature spider_cloud only.

Additional content keyed by return format (e.g. "markdown", "text"). Populated when multiple formats are requested via SpiderCloudConfig::with_return_formats.

§page_links: Option<Box<HashSet<CaseInsensitiveString>>>

The links found on the page. Unused until we can structure the buffers to match.

§should_retry: bool

The request should retry.

§waf_check: bool

A WAF was found on the page.

§bytes_transferred: Option<f64>

The total bytes transferred for the page. Mainly used for Chrome events.

§blocked_crawl: bool

The page was blocked from crawling, usually by website::on_should_crawl_callback.

§signature: Option<u64>

The signature of the page to de-duplicate content.

§response_map: Option<HashMap<String, f64>>

Available on crate feature chrome only.

All of the response events mapped with the amount of bytes used.

§request_map: Option<HashMap<String, f64>>

Available on crate feature chrome only.

All of the request events mapped with the time period of the event sent.

§anti_bot_tech: AntiBotTech

The anti-bot tech used.

§metadata: Option<Box<Metadata>>

Page metadata.

§content_truncated: bool

Whether the response content was truncated due to a stream error, chunk idle timeout, or Content-Length mismatch.

§proxy_configured: bool

Whether a proxy was configured for this request. When true, 401 responses are retried (proxy rotation may fix auth).

§binary_file: bool

Whether the content is a binary file (image, PDF, etc.). Set once when HTML bytes are first available so the flag remains accurate after content is spooled to disk.

§backend_source: Option<CompactString>

Available on crate feature parallel_backends only.

Identifies which backend produced this page (e.g. “primary”, “cdp”, “servo”). None when parallel backends are not active.

Struct Page Copy item path

Fields§

Implementations§

impl Page

pub fn needs_retry(&self) -> bool

pub async fn new_page(url: &str, client: &Client) -> Self

pub async fn new_page_with_cache( url: &str, client: &Client, cache_options: Option<CacheOptions>, cache_policy: &Option<BasicCachePolicy>, cache_namespace: Option<&str>, ) -> Self

pub fn new_webdriver(url: &str, html: String, status_code: StatusCode) -> Self

pub async fn new_page_webdriver( url: &str, driver: &Arc<WebDriver>, timeout: Option<Duration>, ) -> Self

pub async fn new_page_webdriver_full( url: &str, driver: &Arc<WebDriver>, timeout: Option<Duration>, wait_for: &Option<WaitFor>, execution_scripts: &Option<ExecutionScripts>, automation_scripts: &Option<AutomationScripts>, ) -> Self

pub async fn new_page_only_html(url: &str, client: &Client) -> Self

pub async fn new(url: &str, client: &Client) -> Self

pub async fn new_links_only(url: &str, client: &Client) -> Self

pub async fn screenshot( &self, _full_page: bool, _omit_background: bool, _format: CaptureScreenshotFormat, _quality: Option<i64>, _output_path: Option<impl AsRef<Path>>, _clip: Option<ClipViewport>, ) -> Vec<u8> ⓘ

pub fn get_chrome_page(&self) -> Option<&Page>

pub async fn close_page(&mut self)

pub fn is_empty(&self) -> bool

pub fn get_timeout(&self) -> Option<Duration>

pub fn set_external( &mut self, external_domains_caseless: Arc<HashSet<CaseInsensitiveString>>, )

pub fn set_html_bytes(&mut self, html: Option<Vec<u8>>)

pub fn is_html_on_disk(&self) -> bool

pub fn is_binary_spool_aware(&self) -> bool

pub fn stream_html_bytes<F>(&self, chunk_size: usize, cb: F) -> usize where F: FnMut(&[u8]) -> bool,

pub async fn stream_html_bytes_async<F>( &self, chunk_size: usize, cb: F, ) -> usize where F: FnMut(&[u8]) -> bool,

pub async fn get_html_async(&self) -> String

pub fn set_url_parsed_direct(&mut self)

pub fn set_url_parsed_direct_empty(&mut self)

pub fn get_url_parsed(&self) -> &Option<Url>

pub fn get_url_parsed_ref(&self) -> &Option<Url>

pub fn take_url(&mut self) -> Option<Url>

pub fn get_url(&self) -> &str

pub fn get_bytes(&self) -> Option<&[u8]>

pub fn get_html(&self) -> String

pub fn get_content(&self) -> String

pub fn get_html_cow(&self) -> Cow<'_, str>

pub fn get_html_bytes_u8(&self) -> &[u8] ⓘ

pub fn get_content_bytes(&self) -> &[u8] ⓘ

pub fn get_content_for(&self, format: &str) -> Option<String>

pub fn get_content_bytes_for(&self, format: &str) -> Option<&[u8]>

pub fn has_content_map(&self) -> bool

pub fn quality_score(&self) -> u16

pub fn get_responses(&self) -> &Option<HashMap<String, f64>>

pub fn get_metadata(&self) -> &Option<Box<Metadata>>

pub fn get_request(&self) -> &Option<HashMap<String, f64>>

pub fn get_html_encoded(&self, label: &str) -> String

pub fn set_duration_elapsed(&mut self, scraped_at: Option<Instant>)

pub fn set_duration_elapsed_from_duration(&mut self, elapsed: Option<Duration>)

pub fn get_duration_elapsed(&self) -> Duration

pub async fn links_stream_xml_links_stream_base<A: PartialEq + Eq + Sync + Send + Clone + Default + ToString + Hash + From<String> + Into<CaseInsensitiveString> + for<'a> From<&'a str>>( &mut self, selectors: &RelativeSelectors, xml: &[u8], map: &mut HashSet<A>, base: &Option<Box<Url>>, )

pub async fn links_stream<A: PartialEq + Eq + Sync + Send + Clone + Default + ToString + Hash + From<String> + Into<CaseInsensitiveString> + for<'a> From<&'a str>>( &mut self, _: &RelativeSelectors, ) -> HashSet<A>

pub async fn links( &self, _: &RelativeSelectors, _: &Option<Box<Url>>, ) -> HashSet<CaseInsensitiveString>

pub async fn links_full( &self, _: &RelativeSelectors, _: &Option<Box<Url>>, ) -> HashSet<CaseInsensitiveString>

Trait Implementations§

impl Clone for Page

fn clone(&self) -> Page

fn clone_from(&mut self, source: &Self)

impl Debug for Page

fn fmt(&self, f: &mut Formatter<'_>) -> Result

impl Default for Page

fn default() -> Page

impl PageChromeExt for Page

fn chrome_page(&self) -> Option<&Page>

fn screenshot_bytes(&self) -> Option<&[u8]>

impl PageData for Page

fn url(&self) -> &str

fn url_final(&self) -> &str

fn bytes(&self) -> Option<&[u8]>

fn html(&self) -> String

fn html_bytes_u8(&self) -> &[u8] ⓘ

fn status_code(&self) -> StatusCode

fn headers(&self) -> Option<&HeaderMap>

fn is_empty(&self) -> bool

impl PageTimingExt for Page

fn duration_elapsed(&self) -> Duration

Auto Trait Implementations§

impl !Freeze for Page

impl RefUnwindSafe for Page

impl Send for Page

impl Sync for Page

impl Unpin for Page

Struct Page

pub fn stream_html_bytes<F>(&self, chunk_size: usize, cb: F) -> usize
where F: FnMut(&[u8]) -> bool,

pub async fn stream_html_bytes_async<F>( &self, chunk_size: usize, cb: F, ) -> usize
where F: FnMut(&[u8]) -> bool,

impl<T> Any for T
where T: 'static + ?Sized,

impl<T> Borrow<T> for T
where T: ?Sized,

impl<T> BorrowMut<T> for T
where T: ?Sized,

impl<T> CloneToUninit for T
where T: Clone,

impl<T> DynClone for T
where T: Clone,

impl<T, U> Into<U> for T
where U: From<T>,

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

impl<T> PolicyExt for T
where T: ?Sized,

fn and<P, B, E>(self, other: P) -> And<T, P>
where T: Policy<B, E>, P: Policy<B, E>,

fn or<P, B, E>(self, other: P) -> Or<T, P>
where T: Policy<B, E>, P: Policy<B, E>,

impl<T> ToOwned for T
where T: Clone,

impl<T, U> TryFrom<U> for T
where U: Into<T>,

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

impl<V, T> VZip<V> for T
where V: MultiLane<T>,

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self>
where S: Into<Dispatch>,