pub struct Page {
pub headers: Option<HeaderMap>,
pub remote_addr: Option<SocketAddr>,
pub cookies: Option<HeaderMap>,
pub status_code: StatusCode,
pub error_status: Option<String>,
pub links: HashSet<CaseInsensitiveString>,
pub external_domains_caseless: Arc<HashSet<CaseInsensitiveString>>,
pub final_redirect_destination: Option<String>,
pub screenshot_bytes: Option<Vec<u8>>,
pub openai_credits_used: Option<Vec<OpenAIUsage>>,
pub extra_ai_data: Option<Vec<AIResults>>,
pub gemini_credits_used: Option<Vec<GeminiUsage>>,
pub extra_gemini_data: Option<Vec<AIResults>>,
pub remote_multimodal_usage: Option<Vec<AutomationUsage>>,
pub extra_remote_multimodal_data: Option<Vec<AutomationResults>>,
pub spawn_pages: Option<Vec<String>>,
pub content_map: Option<HashMap<String, Bytes>>,
pub page_links: Option<Box<HashSet<CaseInsensitiveString>>>,
pub should_retry: bool,
pub waf_check: bool,
pub bytes_transferred: Option<f64>,
pub blocked_crawl: bool,
pub signature: Option<u64>,
pub response_map: Option<HashMap<String, f64>>,
pub request_map: Option<HashMap<String, f64>>,
pub anti_bot_tech: AntiBotTech,
pub metadata: Option<Box<Metadata>>,
pub content_truncated: bool,
pub proxy_configured: bool,
pub binary_file: bool,
pub backend_source: Option<CompactString>,
/* private fields */
}decentralized only.Expand description
Represent a page visited.
Fields§
§headers: Option<HeaderMap>The headers of the page request response.
remote_addr: Option<SocketAddr>remote_addr only.The remote address of the page.
cookies: Option<HeaderMap>cookies only.The cookies of the page request response.
status_code: StatusCodeThe status code of the page request.
error_status: Option<String>The error of the request if any.
links: HashSet<CaseInsensitiveString>The current links for the page.
external_domains_caseless: Arc<HashSet<CaseInsensitiveString>>The external urls to group with the domain.
final_redirect_destination: Option<String>The final destination of the page if redirects were performed [Unused].
screenshot_bytes: Option<Vec<u8>>chrome only.The screenshot bytes of the page.
openai_credits_used: Option<Vec<OpenAIUsage>>openai only.The credits used from OpenAI in order.
extra_ai_data: Option<Vec<AIResults>>openai only.The extra data from the AI, for example extracted data, etc.
gemini_credits_used: Option<Vec<GeminiUsage>>gemini only.The credits used from Gemini in order.
extra_gemini_data: Option<Vec<AIResults>>gemini only.The extra data from the Gemini AI.
remote_multimodal_usage: Option<Vec<AutomationUsage>>The usage from remote multimodal automation (extraction, etc.). Works with both Chrome and HTTP-only crawls.
extra_remote_multimodal_data: Option<Vec<AutomationResults>>The extra data from the remote multimodal automation (extraction results, etc.). Works with both Chrome and HTTP-only crawls.
spawn_pages: Option<Vec<String>>URLs requested by automation to spawn as additional pages.
content_map: Option<HashMap<String, Bytes>>spider_cloud only.Additional content keyed by return format (e.g. "markdown", "text").
Populated when multiple formats are requested via
SpiderCloudConfig::with_return_formats.
page_links: Option<Box<HashSet<CaseInsensitiveString>>>The links found on the page. Unused until we can structure the buffers to match.
should_retry: boolThe request should retry.
waf_check: boolA WAF was found on the page.
bytes_transferred: Option<f64>The total bytes transferred for the page. Mainly used for chrome events.
blocked_crawl: boolThe page was blocked from crawling, usually by website::on_should_crawl_callback.
signature: Option<u64>The signature of the page to de-duplicate content.
response_map: Option<HashMap<String, f64>>chrome only.All of the response events mapped with the amount of bytes used.
request_map: Option<HashMap<String, f64>>chrome only.All of the request events mapped with the time period of the event sent.
anti_bot_tech: AntiBotTechThe anti-bot tech used.
metadata: Option<Box<Metadata>>Page metadata.
content_truncated: boolWhether the response content was truncated due to a stream error, chunk idle timeout, or Content-Length mismatch.
proxy_configured: boolWhether a proxy was configured for this request. When true, 401 responses are retried (proxy rotation may fix auth).
binary_file: boolWhether the content is a binary file (image, PDF, etc.). Set once when HTML bytes are first available so the flag remains accurate after content is spooled to disk.
backend_source: Option<CompactString>parallel_backends only.Identifies which backend produced this page (e.g. “primary”,
“cdp”, “servo”). None when parallel backends are not active.
Implementations§
Source§impl Page
impl Page
Sourcepub fn needs_retry(&self) -> bool
pub fn needs_retry(&self) -> bool
Whether the page needs a retry based on should_retry, a retryable status code,
a truncated response (upstream stream ended prematurely), or a proxy-retryable
401 (when proxy_configured is set, proxy rotation may resolve the auth failure).
Sourcepub async fn new_page(url: &str, client: &Client) -> Self
pub async fn new_page(url: &str, client: &Client) -> Self
Instantiate a new page and gather the html repro of standard fetch_page_html.
Sourcepub async fn new_page_with_cache(
url: &str,
client: &Client,
cache_options: Option<CacheOptions>,
cache_policy: &Option<BasicCachePolicy>,
cache_namespace: Option<&str>,
) -> Self
pub async fn new_page_with_cache( url: &str, client: &Client, cache_options: Option<CacheOptions>, cache_policy: &Option<BasicCachePolicy>, cache_namespace: Option<&str>, ) -> Self
Instantiate a new page using cache options when available.
Sourcepub fn new_webdriver(url: &str, html: String, status_code: StatusCode) -> Self
pub fn new_webdriver(url: &str, html: String, status_code: StatusCode) -> Self
Create a new page from WebDriver content.
Sourcepub async fn new_page_webdriver(
url: &str,
driver: &Arc<WebDriver>,
timeout: Option<Duration>,
) -> Self
pub async fn new_page_webdriver( url: &str, driver: &Arc<WebDriver>, timeout: Option<Duration>, ) -> Self
Create a new page from WebDriver with full response.
Sourcepub async fn new_page_webdriver_full(
url: &str,
driver: &Arc<WebDriver>,
timeout: Option<Duration>,
wait_for: &Option<WaitFor>,
execution_scripts: &Option<ExecutionScripts>,
automation_scripts: &Option<AutomationScripts>,
) -> Self
pub async fn new_page_webdriver_full( url: &str, driver: &Arc<WebDriver>, timeout: Option<Duration>, wait_for: &Option<WaitFor>, execution_scripts: &Option<ExecutionScripts>, automation_scripts: &Option<AutomationScripts>, ) -> Self
Create a new page from WebDriver with full response and automation support.
Sourcepub async fn new_page_streaming<A: PartialEq + Eq + Sync + Send + Clone + Default + Hash + From<String> + for<'a> From<&'a str>>(
url: &str,
client: &Client,
only_html: bool,
selectors: &mut RelativeSelectors,
external_domains_caseless: &Arc<HashSet<CaseInsensitiveString>>,
r_settings: &PageLinkBuildSettings,
map: &mut HashSet<A>,
ssg_map: Option<&mut HashSet<A>>,
prior_domain: &Option<Box<Url>>,
domain_parsed: &mut Option<Box<Url>>,
links_pages: &mut Option<HashSet<A>>,
) -> Self
pub async fn new_page_streaming<A: PartialEq + Eq + Sync + Send + Clone + Default + Hash + From<String> + for<'a> From<&'a str>>( url: &str, client: &Client, only_html: bool, selectors: &mut RelativeSelectors, external_domains_caseless: &Arc<HashSet<CaseInsensitiveString>>, r_settings: &PageLinkBuildSettings, map: &mut HashSet<A>, ssg_map: Option<&mut HashSet<A>>, prior_domain: &Option<Box<Url>>, domain_parsed: &mut Option<Box<Url>>, links_pages: &mut Option<HashSet<A>>, ) -> Self
New page with rewriter
Sourcepub async fn new_page_only_html(url: &str, client: &Client) -> Self
pub async fn new_page_only_html(url: &str, client: &Client) -> Self
Instantiate a new page and gather the html repro of standard fetch_page_html only gathering resources to crawl.
Sourcepub async fn new_page_streaming_from_bytes<A: PartialEq + Eq + Sync + Send + Clone + Default + Hash + From<String> + for<'a> From<&'a str>>(
url: &str,
input_bytes: &[u8],
selectors: &mut RelativeSelectors,
external_domains_caseless: &Arc<HashSet<CaseInsensitiveString>>,
r_settings: &PageLinkBuildSettings,
map: &mut HashSet<A>,
ssg_map: Option<&mut HashSet<A>>,
prior_domain: &Option<Box<Url>>,
domain_parsed: &mut Option<Box<Url>>,
links_pages: &mut Option<HashSet<A>>,
) -> Self
pub async fn new_page_streaming_from_bytes<A: PartialEq + Eq + Sync + Send + Clone + Default + Hash + From<String> + for<'a> From<&'a str>>( url: &str, input_bytes: &[u8], selectors: &mut RelativeSelectors, external_domains_caseless: &Arc<HashSet<CaseInsensitiveString>>, r_settings: &PageLinkBuildSettings, map: &mut HashSet<A>, ssg_map: Option<&mut HashSet<A>>, prior_domain: &Option<Box<Url>>, domain_parsed: &mut Option<Box<Url>>, links_pages: &mut Option<HashSet<A>>, ) -> Self
Instantiate a new page and gather the links from input bytes.
Sourcepub async fn new(url: &str, client: &Client) -> Self
pub async fn new(url: &str, client: &Client) -> Self
Instantiate a new page and gather the headers and links.
Sourcepub async fn new_links_only(url: &str, client: &Client) -> Self
pub async fn new_links_only(url: &str, client: &Client) -> Self
Instantiate a new page and gather the links.
Sourcepub async fn screenshot(
&self,
_full_page: bool,
_omit_background: bool,
_format: CaptureScreenshotFormat,
_quality: Option<i64>,
_output_path: Option<impl AsRef<Path>>,
_clip: Option<ClipViewport>,
) -> Vec<u8> ⓘ
pub async fn screenshot( &self, _full_page: bool, _omit_background: bool, _format: CaptureScreenshotFormat, _quality: Option<i64>, _output_path: Option<impl AsRef<Path>>, _clip: Option<ClipViewport>, ) -> Vec<u8> ⓘ
Take a screenshot of the page. If the output path is set to None the screenshot will not be saved.
The feature flag chrome_store_page is required.
Sourcepub fn get_chrome_page(&self) -> Option<&Page>
Available on crate feature chrome only.
pub fn get_chrome_page(&self) -> Option<&Page>
chrome only.Get the chrome page used. The feature flag chrome is required.
Sourcepub async fn close_page(&mut self)
Available on crate feature chrome only.
pub async fn close_page(&mut self)
chrome only.Close the chrome page used. Useful when storing the page for subscription usage. The feature flag chrome_store_page is required.
Sourcepub fn is_empty(&self) -> bool
pub fn is_empty(&self) -> bool
Page request is empty. On chrome an empty page has bare html markup.
When the balance feature is active, a page whose HTML has been
spooled to disk is not considered empty.
Sourcepub fn get_timeout(&self) -> Option<Duration>
Available on crate feature headers only.
pub fn get_timeout(&self) -> Option<Duration>
headers only.Get the timeout required for rate limiting. The delay that is respected is capped at 30 seconds. Requires the feature flag headers.
Sourcepub fn set_external(
&mut self,
external_domains_caseless: Arc<HashSet<CaseInsensitiveString>>,
)
pub fn set_external( &mut self, external_domains_caseless: Arc<HashSet<CaseInsensitiveString>>, )
Set the external domains to treat as one
Sourcepub fn set_html_bytes(&mut self, html: Option<Vec<u8>>)
pub fn set_html_bytes(&mut self, html: Option<Vec<u8>>)
Set the HTML of the page directly.
Sourcepub fn is_html_on_disk(&self) -> bool
Available on non-crate feature balance or crate feature decentralized only.
pub fn is_html_on_disk(&self) -> bool
balance or crate feature decentralized only.Whether this page’s HTML currently lives on disk rather than in memory.
Always returns false when the balance feature is not enabled or
the decentralized feature is active.
Sourcepub fn is_binary_spool_aware(&self) -> bool
pub fn is_binary_spool_aware(&self) -> bool
Check if this page contains binary content, even when the HTML is spooled to disk.
Zero disk I/O: binary_file is snapshotted at spool time (before
bytes leave memory). For in-memory pages the magic-number check
runs on the existing buffer. Spooled pages rely solely on the
pre-cached flag — no disk peek needed.
Sourcepub fn stream_html_bytes<F>(&self, chunk_size: usize, cb: F) -> usize
Available on non-crate feature balance or crate feature decentralized only.
pub fn stream_html_bytes<F>(&self, chunk_size: usize, cb: F) -> usize
balance or crate feature decentralized only.Stream the HTML content in fixed-size chunks to a caller-supplied
callback. Works the same as
stream_html_bytes but is available
without the balance feature — it simply chunks the in-memory HTML.
Sourcepub async fn stream_html_bytes_async<F>(
&self,
chunk_size: usize,
cb: F,
) -> usize
Available on non-crate feature balance or crate feature decentralized only.
pub async fn stream_html_bytes_async<F>( &self, chunk_size: usize, cb: F, ) -> usize
balance or crate feature decentralized only.Async version of stream_html_bytes.
Without the balance feature this simply chunks the in-memory HTML
(no disk path exists).
Sourcepub async fn get_html_async(&self) -> String
Available on non-crate feature balance or crate feature decentralized only.
pub async fn get_html_async(&self) -> String
balance or crate feature decentralized only.Async version of get_html. Without balance this
delegates to the sync version (no disk path).
Sourcepub fn set_url_parsed_direct(&mut self)
pub fn set_url_parsed_direct(&mut self)
Set the parsed URL of the page directly.
Sourcepub fn set_url_parsed_direct_empty(&mut self)
pub fn set_url_parsed_direct_empty(&mut self)
Set the parsed URL of the page directly. Useful for transforming the content and rewriting the URL.
Sourcepub fn get_url_parsed(&self) -> &Option<Url>
pub fn get_url_parsed(&self) -> &Option<Url>
Parsed URL getter for page.
Sourcepub fn get_url_parsed_ref(&self) -> &Option<Url>
pub fn get_url_parsed_ref(&self) -> &Option<Url>
Parsed URL getter for page.
Sourcepub fn get_bytes(&self) -> Option<&[u8]>
pub fn get_bytes(&self) -> Option<&[u8]>
Html getter for bytes on the page.
Returns None when HTML is spooled to disk. Use [get_html],
[get_html_async], or [stream_html_bytes] for disk-aware access.
Sourcepub fn get_html(&self) -> String
pub fn get_html(&self) -> String
Html getter for bytes on the page as string.
When the balance feature is active and the HTML was spooled to disk,
this transparently reads from the temporary file and returns the
content. The spool file is not deleted here (use
ensure_html_loaded to reload + delete).
Sourcepub fn get_content(&self) -> String
pub fn get_content(&self) -> String
Content getter — returns the page body as a string.
This is an alias for get_html that works with any
return format (HTML, markdown, text, etc.) set via
SpiderCloudConfig::with_return_format
or transformed locally with spider_transformations.
Sourcepub fn get_html_cow(&self) -> Cow<'_, str>
pub fn get_html_cow(&self) -> Cow<'_, str>
Html getter that avoids allocation when the content is already valid UTF-8.
Returns Cow::Borrowed for UTF-8 content (common case), Cow::Owned when
encoding conversion is needed or content is loaded from a disk spool.
Sourcepub fn get_html_bytes_u8(&self) -> &[u8] ⓘ
pub fn get_html_bytes_u8(&self) -> &[u8] ⓘ
Html getter for page to u8.
Sourcepub fn get_content_bytes(&self) -> &[u8] ⓘ
pub fn get_content_bytes(&self) -> &[u8] ⓘ
Content getter as raw bytes — alias for get_html_bytes_u8.
Works with any return format (HTML, markdown, text, etc.).
Sourcepub fn get_content_for(&self, format: &str) -> Option<String>
Available on crate feature spider_cloud only.
pub fn get_content_for(&self, format: &str) -> Option<String>
spider_cloud only.Get content for a specific return format from a multi-format response.
Returns None if multi-format was not requested or the format is not present.
Use with_return_formats
on SpiderCloudConfig to request multiple formats.
Sourcepub fn get_content_bytes_for(&self, format: &str) -> Option<&[u8]>
Available on crate feature spider_cloud only.
pub fn get_content_bytes_for(&self, format: &str) -> Option<&[u8]>
spider_cloud only.Get content for a specific return format as raw bytes.
Returns None if multi-format was not requested or the format is not present.
Sourcepub fn has_content_map(&self) -> bool
Available on crate feature spider_cloud only.
pub fn has_content_map(&self) -> bool
spider_cloud only.Check if this page has multi-format content available.
Sourcepub fn quality_score(&self) -> u16
Available on crate feature parallel_backends only.
pub fn quality_score(&self) -> u16
parallel_backends only.Compute an HTML quality score (0–100) for this page.
Uses status code, content length, structural HTML checks, and anti-bot detection to score the response.
Sourcepub fn get_responses(&self) -> &Option<HashMap<String, f64>>
Available on crate feature chrome only.
pub fn get_responses(&self) -> &Option<HashMap<String, f64>>
chrome only.Get the response events mapped.
Sourcepub fn get_metadata(&self) -> &Option<Box<Metadata>>
pub fn get_metadata(&self) -> &Option<Box<Metadata>>
Get the metadata found on the page.
Sourcepub fn get_request(&self) -> &Option<HashMap<String, f64>>
Available on crate feature chrome only.
pub fn get_request(&self) -> &Option<HashMap<String, f64>>
chrome only.Get the request events mapped.
Sourcepub fn get_html_encoded(&self, label: &str) -> String
Available on crate feature encoding only.
pub fn get_html_encoded(&self, label: &str) -> String
encoding only.Html getter for getting the content with proper encoding. Pass in a proper encoding label like SHIFT_JIS. This falls back to get_html when the encoding flag is not enabled.
Sourcepub fn set_duration_elapsed(&mut self, scraped_at: Option<Instant>)
Available on crate feature time only.
pub fn set_duration_elapsed(&mut self, scraped_at: Option<Instant>)
time only.Set the elapsed duration of the page since scraped, computed from the scraped_at instant.
Sourcepub fn set_duration_elapsed_from_duration(&mut self, elapsed: Option<Duration>)
Available on crate feature time only.
pub fn set_duration_elapsed_from_duration(&mut self, elapsed: Option<Duration>)
time only.Set the elapsed duration of the page since scraped from duration.
Sourcepub fn get_duration_elapsed(&self) -> Duration
Available on crate feature time only.
pub fn get_duration_elapsed(&self) -> Duration
time only.Get the elapsed duration of the page since scraped.
Sourcepub async fn links_stream_xml_links_stream_base<A: PartialEq + Eq + Sync + Send + Clone + Default + ToString + Hash + From<String> + Into<CaseInsensitiveString> + for<'a> From<&'a str>>(
&mut self,
selectors: &RelativeSelectors,
xml: &[u8],
map: &mut HashSet<A>,
base: &Option<Box<Url>>,
)
pub async fn links_stream_xml_links_stream_base<A: PartialEq + Eq + Sync + Send + Clone + Default + ToString + Hash + From<String> + Into<CaseInsensitiveString> + for<'a> From<&'a str>>( &mut self, selectors: &RelativeSelectors, xml: &[u8], map: &mut HashSet<A>, base: &Option<Box<Url>>, )
Find the links as a stream using string resource validation for XML files.
Sourcepub async fn links_stream<A: PartialEq + Eq + Sync + Send + Clone + Default + ToString + Hash + From<String> + Into<CaseInsensitiveString> + for<'a> From<&'a str>>(
&mut self,
_: &RelativeSelectors,
) -> HashSet<A>
pub async fn links_stream<A: PartialEq + Eq + Sync + Send + Clone + Default + ToString + Hash + From<String> + Into<CaseInsensitiveString> + for<'a> From<&'a str>>( &mut self, _: &RelativeSelectors, ) -> HashSet<A>
Find the links as a stream using string resource validation
Sourcepub async fn links(
&self,
_: &RelativeSelectors,
_: &Option<Box<Url>>,
) -> HashSet<CaseInsensitiveString>
pub async fn links( &self, _: &RelativeSelectors, _: &Option<Box<Url>>, ) -> HashSet<CaseInsensitiveString>
Find all href links and return them using CSS selectors.
Sourcepub async fn links_full(
&self,
_: &RelativeSelectors,
_: &Option<Box<Url>>,
) -> HashSet<CaseInsensitiveString>
pub async fn links_full( &self, _: &RelativeSelectors, _: &Option<Box<Url>>, ) -> HashSet<CaseInsensitiveString>
Find all href links and return them using CSS selectors gathering all resources.
Trait Implementations§
Source§impl PageChromeExt for Page
Available on crate feature chrome only.
impl PageChromeExt for Page
chrome only.Source§fn chrome_page(&self) -> Option<&Page>
fn chrome_page(&self) -> Option<&Page>
Source§fn screenshot_bytes(&self) -> Option<&[u8]>
fn screenshot_bytes(&self) -> Option<&[u8]>
Source§impl PageData for Page
impl PageData for Page
Source§fn html_bytes_u8(&self) -> &[u8] ⓘ
fn html_bytes_u8(&self) -> &[u8] ⓘ
Source§fn status_code(&self) -> StatusCode
fn status_code(&self) -> StatusCode
Source§impl PageTimingExt for Page
Available on crate feature time only.
impl PageTimingExt for Page
time only.Source§fn duration_elapsed(&self) -> Duration
fn duration_elapsed(&self) -> Duration
Auto Trait Implementations§
impl !Freeze for Page
impl RefUnwindSafe for Page
impl Send for Page
impl Sync for Page
impl Unpin for Page
impl UnsafeUnpin for Page
impl UnwindSafe for Page
Blanket Implementations§
Source§impl<T> BorrowMut<T> for Twhere
T: ?Sized,
impl<T> BorrowMut<T> for Twhere
T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Source§impl<T> CloneToUninit for Twhere
T: Clone,
impl<T> CloneToUninit for Twhere
T: Clone,
Source§impl<T> Instrument for T
impl<T> Instrument for T
Source§fn instrument(self, span: Span) -> Instrumented<Self>
fn instrument(self, span: Span) -> Instrumented<Self>
Source§fn in_current_span(self) -> Instrumented<Self>
fn in_current_span(self) -> Instrumented<Self>
Source§impl<T> IntoEither for T
impl<T> IntoEither for T
Source§fn into_either(self, into_left: bool) -> Either<Self, Self>
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self>
if into_left is true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read moreSource§fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self>
if into_left(&self) returns true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read more