use crate::utils::abs::convert_abs_path;
use crate::utils::templates::EMPTY_HTML_BASIC;
#[cfg(not(feature = "decentralized"))]
use crate::utils::RequestError;
use crate::utils::{
css_selectors::{compiled_base_element_selector, compiled_selector, compiled_xml_selector},
get_domain_from_url, hash_html, networking_capable, BasicCachePolicy, CacheOptions,
PageResponse,
};
use crate::CaseInsensitiveString;
use crate::Client;
use crate::RelativeSelectors;
use crate::{compact_str::CompactString, utils::templates::EMPTY_HTML};
use auto_encoder::auto_encode_bytes;
use hashbrown::HashSet;
use lol_html::AsciiCompatibleEncoding;
use phf::phf_set;
use regex::bytes::Regex;
use reqwest::StatusCode;
use std::sync::Arc;
use tokio::time::Duration;
#[cfg(feature = "time")]
use tokio::time::Instant;
#[cfg(all(feature = "decentralized", feature = "headers"))]
use crate::utils::FetchPageResult;
use lazy_static::lazy_static;
use url::Url;
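/// Pair a pre-compiled CSS selector with a typed lol_html element handler.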
macro_rules! element_precompiled {
($selector:expr, $handler:expr) => {{
#[inline(always)]
const fn type_hint<'h, T, H: lol_html::HandlerTypes>(h: T) -> T
where
T: FnMut(&mut lol_html::html_content::Element<'_, '_, H>) -> lol_html::HandlerResult
+ 'h,
{
h
}
(
std::borrow::Cow::Borrowed($selector),
lol_html::send::ElementContentHandlers::default().element(type_hint($handler)),
)
}};
}
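/// Default pre-allocation size for an HTML page buffer (128 KiB).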
pub(crate) const MAX_PRE_ALLOCATED_HTML_PAGE_SIZE: u64 = 128 * 1024;
pub(crate) const MAX_PRE_ALLOCATED_HTML_PAGE_SIZE_USIZE: usize =
MAX_PRE_ALLOCATED_HTML_PAGE_SIZE as usize;
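/// Maximum pre-allocation size (10 MiB).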
pub(crate) const MAX_PREALLOC: usize = 10 * 1024 * 1024;
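/// Maximum content length handled for a response body (2 GiB).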
pub(crate) const MAX_CONTENT_LENGTH: u64 = 2 * 1024 * 1024 * 1024;
#[cfg(feature = "chrome")]
pub(crate) const TURNSTILE_WALL_PAGE_SIZE: usize = MAX_PRE_ALLOCATED_HTML_PAGE_SIZE_USIZE * 4;
lazy_static! {
static ref CASELESS_WILD_CARD: CaseInsensitiveString = CaseInsensitiveString::new("*");
static ref SSG_CAPTURE: Regex = Regex::new(r#""(.*?)""#).unwrap();
static ref GATSBY: Option<&'static str> = Some("gatsby-chunk-mapping");
static ref NUXT_DATA: Option<&'static str> = Some("__NUXT_DATA__");
static ref NUXT: Option<&'static str> = Some("__nuxt");
static ref REACT_SSR: Option<&'static str> = Some("react-app.embeddedData");
pub(crate) static ref UNKNOWN_STATUS_ERROR: StatusCode =
StatusCode::from_u16(599).expect("valid status code");
pub(crate) static ref CHROME_UNKNOWN_STATUS_ERROR: StatusCode =
StatusCode::from_u16(598).expect("valid status code");
pub(crate) static ref CONNECTION_TIMEOUT_ERROR: StatusCode =
StatusCode::from_u16(524).expect("valid status code");
pub(crate) static ref CONNECTION_REFUSED_ERROR: StatusCode =
StatusCode::from_u16(521).expect("valid status code");
pub(crate) static ref CONNECTION_ABORTED_ERROR: StatusCode =
StatusCode::from_u16(522).expect("valid status code");
pub(crate) static ref CONNECTION_RESET_ERROR: StatusCode =
StatusCode::from_u16(523).expect("valid status code");
pub(crate) static ref DNS_RESOLVE_ERROR: StatusCode =
StatusCode::from_u16(525).expect("valid status code");
pub(crate) static ref ADDRESS_UNREACHABLE_ERROR: StatusCode =
StatusCode::from_u16(526).expect("valid status code");
pub(crate) static ref TOO_MANY_REDIRECTS_ERROR: StatusCode =
StatusCode::from_u16(310).expect("valid status code");
pub(crate) static ref BODY_DECODE_ERROR: StatusCode =
StatusCode::from_u16(400).expect("valid status code");
pub(crate) static ref UNREACHABLE_REQUEST_ERROR: StatusCode =
StatusCode::from_u16(503).expect("valid status code");
}
lazy_static! {
static ref DNS_ERROR_AC: aho_corasick::AhoCorasick = aho_corasick::AhoCorasick::new([
"dns error",
"failed to lookup address",
"Name or service not known",
"No address associated with hostname",
"ENOTFOUND",
"no record found",
"dns resolution returned no addresses",
]).expect("valid patterns");
static ref PROXY_TUNNEL_FAILURE_AC: aho_corasick::AhoCorasick = aho_corasick::AhoCorasick::new([
"tunnel error", "TunnelUnsuccessful", "error connecting to socks proxy", "SOCKS error", ]).expect("valid patterns");
static ref SSL_HANDSHAKE_ERROR_AC: aho_corasick::AhoCorasick = aho_corasick::AhoCorasick::new([
"alert: HandshakeFailure",
"alert: ProtocolVersion",
"received fatal alert",
"no shared cipher",
"wrong_version_number",
"wrong version number",
"unsupported protocol",
"tls handshake error",
"TLS handshake error",
"tls handshake failure",
"TLS handshake failure",
"tls handshake failed",
"TLS handshake failed",
"ssl handshake failure",
"SSL handshake failure",
"ssl handshake failed",
"SSL handshake failed",
"ERR_SSL_VERSION_OR_CIPHER_MISMATCH",
"ERR_SSL_PROTOCOL_ERROR",
]).expect("valid patterns");
}
#[cfg(feature = "cache_request")]
lazy_static! {
static ref CACHE_WRAPPED_TRANSPORT_AC: aho_corasick::AhoCorasick =
aho_corasick::AhoCorasick::new(["Cache error: error sending request"])
.expect("valid patterns");
}
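/// Determine if a request error, or any error in its source chain, indicates a DNS resolution failure.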
fn is_dns_error(err: &crate::client::Error) -> bool {
use std::error::Error;
if DNS_ERROR_AC.is_match(&err.to_string()) {
return true;
}
let mut source: Option<&(dyn Error + 'static)> = err.source();
let mut depth = 0u8;
while let Some(e) = source {
if depth >= 6 {
break;
}
if let Some(io_err) = e.downcast_ref::<std::io::Error>() {
if matches!(io_err.kind(), std::io::ErrorKind::NotFound) {
return true;
}
}
if DNS_ERROR_AC.is_match(&e.to_string()) {
return true;
}
source = e.source();
depth += 1;
}
false
}
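/// Determine if a request error, or any error in its source chain, indicates a proxy tunnel failure.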
fn is_proxy_tunnel_failure(err: &crate::client::Error) -> bool {
use std::error::Error;
if PROXY_TUNNEL_FAILURE_AC.is_match(&err.to_string()) {
return true;
}
let mut source: Option<&(dyn Error + 'static)> = err.source();
let mut depth = 0u8;
while let Some(e) = source {
if depth >= 6 {
break;
}
if PROXY_TUNNEL_FAILURE_AC.is_match(&e.to_string()) {
return true;
}
source = e.source();
depth += 1;
}
false
}
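/// Result of a local DNS lookup for a host.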
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LocalDnsState {
Resolved,
NxDomain,
TimedOut,
}
pub const DEFAULT_HOST_DNS_CACHE_CAPACITY: usize = 1024;
pub const DEFAULT_HOST_DNS_CACHE_TTL_SECS: u64 = 300;
#[derive(Debug, Clone, Copy)]
struct HostDnsCacheEntry {
state: LocalDnsState,
inserted_at: std::time::Instant,
}
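/// Bounded, TTL-based cache of local DNS lookup results keyed by host.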
pub struct HostDnsCache {
entries: dashmap::DashMap<String, HostDnsCacheEntry>,
capacity: usize,
ttl: std::time::Duration,
}
impl HostDnsCache {
pub fn new(capacity: usize, ttl: std::time::Duration) -> Self {
let capacity = capacity.max(1);
Self {
entries: dashmap::DashMap::with_capacity(capacity),
capacity,
ttl,
}
}
pub fn get(&self, host: &str) -> Option<LocalDnsState> {
let entry = *self.entries.get(host)?;
if entry.inserted_at.elapsed() < self.ttl {
Some(entry.state)
} else {
None
}
}
pub fn insert(&self, host: String, state: LocalDnsState) {
let entry = HostDnsCacheEntry {
state,
inserted_at: std::time::Instant::now(),
};
let was_present = self.entries.insert(host, entry).is_some();
if was_present {
return;
}
while self.entries.len() > self.capacity {
if !self.evict_one() {
                break;
            }
}
}
fn evict_one(&self) -> bool {
let now = std::time::Instant::now();
let mut oldest_key: Option<String> = None;
let mut oldest_at: Option<std::time::Instant> = None;
for kv in self.entries.iter() {
let inserted = kv.value().inserted_at;
if now.duration_since(inserted) >= self.ttl {
let key = kv.key().clone();
                drop(kv);
                return self.entries.remove(&key).is_some();
}
match oldest_at {
None => {
oldest_at = Some(inserted);
oldest_key = Some(kv.key().clone());
}
Some(prev) if inserted < prev => {
oldest_at = Some(inserted);
oldest_key = Some(kv.key().clone());
}
_ => {}
}
}
if let Some(k) = oldest_key {
self.entries.remove(&k).is_some()
} else {
false
}
}
pub fn len(&self) -> usize {
self.entries.len()
}
pub fn is_empty(&self) -> bool {
self.entries.is_empty()
}
#[cfg(test)]
pub fn clear(&self) {
self.entries.clear();
}
}
static HOST_DNS_CACHE: std::sync::OnceLock<HostDnsCache> = std::sync::OnceLock::new();
#[inline]
fn host_dns_cache() -> &'static HostDnsCache {
HOST_DNS_CACHE.get_or_init(|| {
let capacity = std::env::var("SPIDER_HOST_DNS_CACHE_CAPACITY")
.ok()
.and_then(|s| s.parse::<usize>().ok())
.filter(|c| *c > 0)
.unwrap_or(DEFAULT_HOST_DNS_CACHE_CAPACITY);
let ttl_secs = std::env::var("SPIDER_HOST_DNS_CACHE_TTL_SECS")
.ok()
.and_then(|s| s.parse::<u64>().ok())
.unwrap_or(DEFAULT_HOST_DNS_CACHE_TTL_SECS);
HostDnsCache::new(capacity, std::time::Duration::from_secs(ttl_secs))
})
}
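/// Resolve a host locally, consulting the shared cache first and storing definitive results.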
pub async fn host_resolves_locally_cached(
host: &str,
timeout: std::time::Duration,
) -> LocalDnsState {
let cache = host_dns_cache();
if let Some(cached) = cache.get(host) {
return cached;
}
let state = host_resolves_locally(host, timeout).await;
if !matches!(state, LocalDnsState::TimedOut) {
cache.insert(host.to_string(), state);
}
state
}
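/// Perform a local DNS lookup for the host (port 443) within the given timeout.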
pub async fn host_resolves_locally(host: &str, timeout: std::time::Duration) -> LocalDnsState {
use std::io::ErrorKind;
let addr = format!("{host}:443");
match tokio::time::timeout(timeout, tokio::net::lookup_host(addr)).await {
Ok(Ok(mut iter)) => {
if iter.next().is_some() {
LocalDnsState::Resolved
} else {
LocalDnsState::NxDomain
}
}
Ok(Err(e)) => {
if matches!(e.kind(), ErrorKind::NotFound) || DNS_ERROR_AC.is_match(&e.to_string()) {
LocalDnsState::NxDomain
} else {
LocalDnsState::TimedOut
}
}
Err(_) => LocalDnsState::TimedOut,
}
}
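/// Re-classify a proxy tunnel or cache-wrapped transport failure as a DNS error when the target host does not resolve locally.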
pub async fn confirm_tunnel_failure_with_local_dns(
initial_status: StatusCode,
err: &crate::client::Error,
target_url: &str,
timeout: std::time::Duration,
) -> StatusCode {
let qualifies_503_tunnel =
initial_status == *UNREACHABLE_REQUEST_ERROR && is_proxy_tunnel_failure(err);
#[cfg(feature = "cache_request")]
let qualifies_cache_wrapped = (initial_status == *ADDRESS_UNREACHABLE_ERROR
|| initial_status == *UNREACHABLE_REQUEST_ERROR)
&& err.is_middleware()
&& CACHE_WRAPPED_TRANSPORT_AC.is_match(&err.to_string());
#[cfg(not(feature = "cache_request"))]
let qualifies_cache_wrapped = false;
if !qualifies_503_tunnel && !qualifies_cache_wrapped {
return initial_status;
}
let host = match url::Url::parse(target_url)
.ok()
.and_then(|u| u.host_str().map(str::to_owned))
{
Some(h) => h,
None => return initial_status,
};
match host_resolves_locally_cached(&host, timeout).await {
LocalDnsState::NxDomain => *DNS_RESOLVE_ERROR,
LocalDnsState::Resolved | LocalDnsState::TimedOut => initial_status,
}
}
#[cfg(all(not(feature = "decentralized"), feature = "chrome"))]
async fn confirm_chrome_tunnel_failure_with_local_dns(
page: &mut Page,
target_url: &str,
timeout: std::time::Duration,
) {
let chrome_unknown_status = StatusCode::from_u16(599).unwrap_or(StatusCode::BAD_GATEWAY);
let chrome_empty_504_status = StatusCode::from_u16(504).unwrap_or(StatusCode::GATEWAY_TIMEOUT);
let is_chrome_unknown = page.status_code == chrome_unknown_status;
let is_chrome_empty_504 = page.status_code == chrome_empty_504_status;
if !is_chrome_unknown && !is_chrome_empty_504 {
return;
}
let qualifies_for_dns_check = {
let content = page.get_html_bytes_u8();
if is_chrome_unknown {
if content.is_empty() || !is_chrome_error_page(content) {
false
} else {
extract_chrome_error_code(content)
.map(|code| {
let trimmed = code.strip_prefix("net::").unwrap_or(code);
matches!(
trimmed,
"ERR_TUNNEL_CONNECTION_FAILED" | "ERR_PROXY_CONNECTION_FAILED"
)
})
.unwrap_or(false)
}
} else {
content.len() <= 256
}
};
if !qualifies_for_dns_check {
return;
}
let host = match url::Url::parse(target_url)
.ok()
.and_then(|u| u.host_str().map(str::to_owned))
{
Some(h) => h,
None => return,
};
if matches!(
host_resolves_locally_cached(&host, timeout).await,
LocalDnsState::NxDomain
) {
page.status_code = *DNS_RESOLVE_ERROR;
page.should_retry = false;
}
}
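/// Determine if a connect or request error indicates a TLS/SSL handshake failure.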
fn is_ssl_handshake_error(err: &crate::client::Error) -> bool {
if !(err.is_connect() || err.is_request()) {
return false;
}
SSL_HANDSHAKE_ERROR_AC.is_match(&err.to_string())
}
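/// Determine if a status code is worth retrying.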
#[inline]
pub fn is_retryable_status(status: StatusCode) -> bool {
status != *DNS_RESOLVE_ERROR
&& status != *ADDRESS_UNREACHABLE_ERROR
&& status != *TOO_MANY_REDIRECTS_ERROR
&& status != StatusCode::NOT_IMPLEMENTED
&& status != StatusCode::HTTP_VERSION_NOT_SUPPORTED
&& status != StatusCode::NETWORK_AUTHENTICATION_REQUIRED
&& (status.is_server_error()
|| matches!(
status,
StatusCode::TOO_MANY_REQUESTS | StatusCode::REQUEST_TIMEOUT
))
}
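/// Determine if a status code represents a permanent target failure that should not be retried.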
#[inline]
pub fn is_permanent_target_failure(status: StatusCode) -> bool {
status == *DNS_RESOLVE_ERROR
|| status == *ADDRESS_UNREACHABLE_ERROR
|| status == *TOO_MANY_REDIRECTS_ERROR
}
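/// Map a request error to an internal status-code classification.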
pub(crate) fn get_error_http_status_code(err: &crate::client::Error) -> StatusCode {
use std::error::Error;
use std::io;
if let Some(status) = err.status() {
return status;
}
if err.is_timeout() {
return *CONNECTION_TIMEOUT_ERROR;
}
if is_ssl_handshake_error(err) {
return *ADDRESS_UNREACHABLE_ERROR;
}
#[cfg(not(feature = "decentralized"))]
if let Some(status) = h2_permanent_reason_status(err) {
return status;
}
if is_dns_error(err) {
return *DNS_RESOLVE_ERROR;
}
#[cfg(feature = "cache_request")]
if err.is_middleware() && CACHE_WRAPPED_TRANSPORT_AC.is_match(&err.to_string()) {
return *ADDRESS_UNREACHABLE_ERROR;
}
if err.is_connect() {
if let Some(io_err) = err.source().and_then(|e| e.downcast_ref::<io::Error>()) {
match io_err.kind() {
io::ErrorKind::ConnectionRefused => return *CONNECTION_REFUSED_ERROR,
io::ErrorKind::ConnectionAborted => return *CONNECTION_ABORTED_ERROR,
io::ErrorKind::ConnectionReset => return *CONNECTION_RESET_ERROR,
io::ErrorKind::NotFound => return *UNREACHABLE_REQUEST_ERROR,
io::ErrorKind::HostUnreachable | io::ErrorKind::NetworkUnreachable => {
return *ADDRESS_UNREACHABLE_ERROR
}
io::ErrorKind::TimedOut => return *CONNECTION_TIMEOUT_ERROR,
_ => (),
}
}
return *UNREACHABLE_REQUEST_ERROR;
}
if err.is_body() {
return *BODY_DECODE_ERROR;
}
if err.is_request() {
return StatusCode::BAD_REQUEST;
}
*UNKNOWN_STATUS_ERROR
}
#[cfg(all(not(feature = "decentralized"), feature = "smart"))]
#[inline]
fn is_tracker_script(src: &str) -> bool {
use chromiumoxide::spider_network_blocker;
if src.starts_with("http") {
spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(src)
} else {
spider_network_blocker::adblock::ADBLOCK_PATTERNS
.iter()
.any(|p| src.contains(p))
}
}
#[cfg(all(not(feature = "decentralized"), feature = "smart"))]
lazy_static! {
static ref NO_SCRIPT_JS_REQUIRED: aho_corasick::AhoCorasick = {
let patterns = &[
"enable javascript", "requires javascript", "turn on javascript",
];
aho_corasick::AhoCorasick::new(patterns).expect("valid dom script patterns")
};
static ref DOM_SCRIPT_WATCH_METHODS: aho_corasick::AhoCorasick = {
let patterns = &[
".createElementNS", ".removeChild", ".insertBefore", ".createElement(",
".createTextNode", ".replaceChildren(", ".prepend(",
".appendChild(", "document.write(", "window.location.href",
".innerHTML", ".outerHTML", ".insertAdjacentHTML(", ".insertAdjacentElement(",
".replaceWith(", ".replaceChild(", ".cloneNode(",
"new DOMParser",
"history.pushState", "history.replaceState",
"location.assign(", "location.replace(",
"window.location=", "document.location=",
"fetch(", "new XMLHttpRequest",
"window.__NUXT__"
];
aho_corasick::AhoCorasick::new(patterns).expect("valid dom script patterns")
};
static ref DOM_WATCH_ATTRIBUTE_PATTERNS: [&'static str; 5] = [
"__NEXT_DATA__", "__NUXT__", "data-reactroot",
"ng-version", "data-v-app",
];
pub(crate) static ref HYDRATION_IDS: phf::Set<&'static str> = phf_set! {
"__nuxt",
"__nuxt-loader",
"__NUXT_DATA__",
"__next",
"__NEXT_DATA__",
"___gatsby",
"redwood-app",
"sapper"
};
}
lazy_static! {
pub(crate) static ref DOWNLOADABLE_MEDIA_TYPES: phf::Set<&'static str> = phf_set! {
"audio/mpeg", "audio/wav", "audio/ogg", "audio/flac", "audio/aac", "audio/webm", "audio/midi", "audio/x-midi", "audio/mp4", "audio/x-m4a", "audio/aiff", "audio/x-aiff", "audio/3gpp", "audio/3gpp2", "video/mp4", "video/webm", "video/ogg", "video/x-matroska", "video/x-msvideo", "video/quicktime", "video/x-ms-wmv", "video/x-flv", "video/mpeg", "video/mp2t", "video/3gpp", "video/3gpp2", "image/jpeg", "image/png", "image/gif", "image/webp", "image/svg+xml", "image/bmp", "image/tiff", "image/vnd.microsoft.icon", "image/apng", "image/avif", "image/heic", "image/heif", "font/woff", "font/woff2", "font/ttf", "font/otf", "application/vnd.ms-fontobject", "application/pdf", "application/rtf", "text/plain", "text/csv", "text/markdown", "text/calendar", "application/msword", "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "application/vnd.ms-powerpoint", "application/vnd.openxmlformats-officedocument.presentationml.presentation", "application/vnd.ms-excel", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "application/vnd.oasis.opendocument.text", "application/vnd.oasis.opendocument.spreadsheet", "application/vnd.oasis.opendocument.presentation", "application/vnd.visio", "application/epub+zip", "application/vnd.amazon.ebook", "application/x-abiword", "application/octet-stream", "application/zip", "application/x-zip-compressed", "application/vnd.rar", "application/x-rar-compressed", "application/x-7z-compressed", "application/x-tar", "application/gzip", "application/x-gzip", "application/x-bzip", "application/x-bzip2", "application/java-archive", "application/x-freearc", "application/vnd.apple.installer+xml", "application/ogg", };
pub(crate) static ref STREAMING_CHUNK_SIZE: usize = {
let default_streaming_chunk_size: usize = (8192 * num_cpus::get_physical().min(64)).min(65536);
let min_streaming_chunk_size: usize = default_streaming_chunk_size * 2 / 3;
std::env::var("SPIDER_STREAMING_CHUNK_SIZE")
.ok()
.and_then(|val| val.parse::<usize>().ok())
.map(|val| {
if val < min_streaming_chunk_size {
min_streaming_chunk_size
} else {
val
}
})
.unwrap_or(default_streaming_chunk_size)
};
}
pub(crate) static IGNORE_EXTENSIONS: phf::Set<&'static str> = phf_set! {
"jpg", "jpeg", "png", "gif", "svg", "webp", "bmp", "tiff", "tif",
"heic", "heif", "apng", "avif", "ico",
"mp4", "avi", "mov", "wmv", "flv", "mkv", "webm", "m4v", "mpeg",
"3gp", "3g2",
"mp3", "wav", "ogg", "aac", "flac", "m4a", "aiff", "cda", "mid",
"midi", "oga", "opus", "weba",
"woff", "woff2", "ttf", "otf", "eot",
"pdf", "eps", "rtf", "txt", "doc", "docx", "csv", "epub",
"abw", "azw", "odt", "ods", "odp", "ppt", "pptx", "xls", "xlsx", "vsd",
"yaml", "yml", "ics", "md", "webmanifest",
"gz", "arc", "bin", "bz", "bz2", "jar", "mpkg", "rar", "tar", "zip", "7z",
"swf", "xap",
"ogv", "ogx",
"ts",
};
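/// Determine if a file extension (matched case-insensitively, up to 16 bytes) should be ignored while crawling.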
#[inline]
pub(crate) fn is_ignored_extension(ext: &str) -> bool {
let bytes = ext.as_bytes();
if bytes.len() > 16 || bytes.is_empty() {
return false;
}
let mut buf = [0u8; 16];
let dest = &mut buf[..bytes.len()];
dest.copy_from_slice(bytes);
dest.make_ascii_lowercase();
debug_assert!(std::str::from_utf8(dest).is_ok());
let lowered = std::str::from_utf8(dest).unwrap_or_default();
IGNORE_EXTENSIONS.contains(lowered)
}
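/// Running hint of links found per page, used to pre-size link sets.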
static LINK_CAPACITY_HINT: std::sync::atomic::AtomicUsize = std::sync::atomic::AtomicUsize::new(32);
#[inline(always)]
pub(crate) fn link_set_capacity() -> usize {
LINK_CAPACITY_HINT
.load(std::sync::atomic::Ordering::Relaxed)
.max(32)
}
#[inline(always)]
pub(crate) fn update_link_capacity_hint(count: usize) {
let prev = LINK_CAPACITY_HINT.load(std::sync::atomic::Ordering::Relaxed);
let next = if prev == 0 {
count.max(32)
} else {
((prev * 3 + count) / 4).max(32)
};
LINK_CAPACITY_HINT.store(next, std::sync::atomic::Ordering::Relaxed);
}
thread_local! {
static XML_PARSE_BUF: std::cell::Cell<Vec<u8>> = const { std::cell::Cell::new(Vec::new()) };
}
pub const REWRITER_YIELD_THRESHOLD: usize = 512 * 1024;
pub const REWRITER_YIELD_INTERVAL: usize = 8;
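/// AI processing results for a page.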
#[derive(Debug, Clone, Default)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct AIResults {
pub input: String,
pub js_output: String,
pub content_output: Vec<String>,
pub screenshot_output: Option<Vec<u8>>,
pub error: Option<String>,
}
#[cfg(feature = "serde")]
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize)]
pub struct AutomationResults {
pub input: String,
pub content_output: serde_json::Value,
pub screenshot_output: Option<String>,
pub error: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub usage: Option<crate::features::automation::AutomationUsage>,
#[serde(skip_serializing_if = "Option::is_none")]
pub relevant: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
pub steps_executed: Option<usize>,
#[serde(skip_serializing_if = "Option::is_none")]
pub reasoning: Option<String>,
}
#[cfg(not(feature = "serde"))]
#[derive(Debug, Clone, Default)]
pub struct AutomationResults {
pub input: String,
pub content_output: String,
pub screenshot_output: Option<String>,
pub error: Option<String>,
pub usage: Option<crate::features::automation::AutomationUsage>,
pub relevant: Option<bool>,
pub steps_executed: Option<usize>,
pub reasoning: Option<String>,
}
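/// Basic metadata extracted from the document head.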
#[derive(Debug, Default, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct Metadata {
pub title: Option<CompactString>,
pub description: Option<CompactString>,
pub image: Option<CompactString>,
#[cfg(feature = "chrome")]
    pub automation: Option<Vec<AutomationResults>>,
}
impl Metadata {
pub fn exist(&self) -> bool {
self.title.is_some() || self.description.is_some() || self.image.is_some()
}
}
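/// Anti-bot / WAF technology detected on a page.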
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub enum AntiBotTech {
Cloudflare,
DataDome,
HUMAN,
PerimeterX,
Kasada,
FingerprintJS,
ArkoseLabs,
Imperva,
F5,
QueueIt,
Netacea,
AppsFlyer,
Adjust,
AppTrana,
AkamaiBotManager,
RadwareBotManager,
Reblaze,
CHEQ,
Incode,
Singula,
AlibabaTMD,
Sucuri,
DDoSGuard,
Vercel,
AwsWaf,
Wordfence,
GeeTest,
HCaptcha,
Custom,
#[default]
None,
}
#[cfg(feature = "balance")]
#[derive(Debug)]
struct SpoolInner {
path: std::path::PathBuf,
#[allow(dead_code)]
dir_handle: Option<Arc<crate::utils::html_spool::WebsiteSpoolDir>>,
}
#[cfg(feature = "balance")]
impl Drop for SpoolInner {
#[inline]
fn drop(&mut self) {
crate::utils::html_spool::track_page_unspooled();
let path = std::mem::take(&mut self.path);
if !path.as_os_str().is_empty() {
crate::utils::html_spool::queue_spool_delete(path);
}
}
}
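/// Guard tracking spooled HTML on disk; queues deletion of the file when dropped.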
#[cfg(feature = "balance")]
#[derive(Debug, Clone, Default)]
pub(crate) struct HtmlSpoolGuard {
inner: Option<Arc<SpoolInner>>,
}
#[cfg(feature = "balance")]
impl HtmlSpoolGuard {
pub fn new(
path: std::path::PathBuf,
dir_handle: Option<Arc<crate::utils::html_spool::WebsiteSpoolDir>>,
) -> Self {
Self {
inner: Some(Arc::new(SpoolInner { path, dir_handle })),
}
}
#[inline]
pub fn path(&self) -> Option<&std::path::Path> {
self.inner.as_ref().map(|s| s.path.as_path())
}
}
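/// Represents a single crawled web page: its HTML, response details, and extraction state.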
#[derive(Debug, Clone, Default)]
#[cfg(not(feature = "decentralized"))]
pub struct Page {
pub(crate) html: Option<bytes::Bytes>,
pub(crate) base: Option<Url>,
pub(crate) url: String,
pub headers: Option<reqwest::header::HeaderMap>,
#[cfg(feature = "remote_addr")]
pub remote_addr: Option<core::net::SocketAddr>,
#[cfg(feature = "cookies")]
pub cookies: Option<reqwest::header::HeaderMap>,
pub status_code: StatusCode,
#[cfg(not(feature = "page_error_status_details"))]
pub error_status: Option<String>,
#[cfg(feature = "page_error_status_details")]
pub error_status: Option<std::sync::Arc<reqwest::Error>>,
pub external_domains_caseless: Arc<HashSet<CaseInsensitiveString>>,
pub final_redirect_destination: Option<String>,
#[cfg(feature = "time")]
duration: Option<Instant>,
#[cfg(feature = "chrome")]
chrome_page: Option<chromiumoxide::Page>,
#[cfg(feature = "chrome")]
pub screenshot_bytes: Option<Vec<u8>>,
#[cfg(feature = "openai")]
pub openai_credits_used: Option<Vec<crate::features::openai_common::OpenAIUsage>>,
#[cfg(feature = "openai")]
pub extra_ai_data: Option<Vec<AIResults>>,
#[cfg(feature = "gemini")]
pub gemini_credits_used: Option<Vec<crate::features::gemini_common::GeminiUsage>>,
#[cfg(feature = "gemini")]
pub extra_gemini_data: Option<Vec<AIResults>>,
pub remote_multimodal_usage: Option<Vec<crate::features::automation::AutomationUsage>>,
pub extra_remote_multimodal_data: Option<Vec<AutomationResults>>,
pub spawn_pages: Option<Vec<String>>,
#[cfg(feature = "spider_cloud")]
pub content_map: Option<hashbrown::HashMap<String, bytes::Bytes>>,
pub page_links: Option<Box<HashSet<CaseInsensitiveString>>>,
pub should_retry: bool,
pub waf_check: bool,
pub bytes_transferred: Option<f64>,
pub blocked_crawl: bool,
pub signature: Option<u64>,
#[cfg(feature = "chrome")]
pub response_map: Option<hashbrown::HashMap<String, f64>>,
#[cfg(feature = "chrome")]
pub request_map: Option<hashbrown::HashMap<String, f64>>,
pub anti_bot_tech: AntiBotTech,
pub metadata: Option<Box<Metadata>>,
pub content_truncated: bool,
pub proxy_configured: bool,
pub profile_key: Option<CompactString>,
pub binary_file: bool,
pub(crate) is_valid_utf8: bool,
pub(crate) is_xml: bool,
#[cfg(feature = "parallel_backends")]
pub backend_source: Option<crate::compact_str::CompactString>,
#[cfg(feature = "balance")]
pub(crate) html_spool_path: Option<HtmlSpoolGuard>,
#[cfg(all(feature = "balance", not(feature = "decentralized")))]
pub(crate) balance_bytes_tracked: bool,
#[cfg(all(feature = "balance", not(feature = "decentralized")))]
pub(crate) content_byte_len: usize,
}
#[cfg(all(feature = "balance", not(feature = "decentralized")))]
impl Drop for Page {
#[inline]
fn drop(&mut self) {
if self.balance_bytes_tracked {
if let Some(ref html) = self.html {
let len = html.len();
if len > 0 {
crate::utils::html_spool::track_bytes_sub(len);
}
}
}
}
}
#[cfg(feature = "decentralized")]
#[derive(Debug, Clone, Default)]
pub struct Page {
pub(crate) html: Option<bytes::Bytes>,
pub(crate) base: Option<Url>,
pub(crate) url: String,
pub headers: Option<reqwest::header::HeaderMap>,
#[cfg(feature = "remote_addr")]
pub remote_addr: Option<core::net::SocketAddr>,
#[cfg(feature = "cookies")]
pub cookies: Option<reqwest::header::HeaderMap>,
pub status_code: StatusCode,
pub error_status: Option<String>,
pub links: HashSet<CaseInsensitiveString>,
pub external_domains_caseless: Arc<HashSet<CaseInsensitiveString>>,
pub final_redirect_destination: Option<String>,
#[cfg(feature = "time")]
duration: Option<Instant>,
#[cfg(feature = "chrome")]
pub screenshot_bytes: Option<Vec<u8>>,
#[cfg(feature = "openai")]
pub openai_credits_used: Option<Vec<crate::features::openai_common::OpenAIUsage>>,
#[cfg(feature = "openai")]
pub extra_ai_data: Option<Vec<AIResults>>,
#[cfg(feature = "gemini")]
pub gemini_credits_used: Option<Vec<crate::features::gemini_common::GeminiUsage>>,
#[cfg(feature = "gemini")]
pub extra_gemini_data: Option<Vec<AIResults>>,
pub remote_multimodal_usage: Option<Vec<crate::features::automation::AutomationUsage>>,
pub extra_remote_multimodal_data: Option<Vec<AutomationResults>>,
pub spawn_pages: Option<Vec<String>>,
#[cfg(feature = "spider_cloud")]
pub content_map: Option<hashbrown::HashMap<String, bytes::Bytes>>,
pub page_links: Option<Box<HashSet<CaseInsensitiveString>>>,
pub should_retry: bool,
pub waf_check: bool,
pub bytes_transferred: Option<f64>,
pub blocked_crawl: bool,
pub signature: Option<u64>,
#[cfg(feature = "chrome")]
pub response_map: Option<hashbrown::HashMap<String, f64>>,
#[cfg(feature = "chrome")]
pub request_map: Option<hashbrown::HashMap<String, f64>>,
pub anti_bot_tech: AntiBotTech,
pub metadata: Option<Box<Metadata>>,
pub content_truncated: bool,
pub proxy_configured: bool,
pub profile_key: Option<CompactString>,
pub binary_file: bool,
pub(crate) is_valid_utf8: bool,
pub(crate) is_xml: bool,
#[cfg(feature = "parallel_backends")]
pub backend_source: Option<crate::compact_str::CompactString>,
}
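/// Merge a freshly fetched page into an existing one, keeping the prior content when the new response is an empty default 200.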
#[cfg(feature = "smart")]
pub fn page_assign(page: &mut Page, mut new_page: Page) {
if let Some(s) = new_page.final_redirect_destination.as_deref() {
let bad = match s.as_bytes().first().copied() {
None => true,
Some(b'a') => s.starts_with("about:blank"),
Some(b'c') => s.starts_with("chrome-error://chromewebdata"),
_ => false,
};
if !bad {
page.final_redirect_destination = Some(s.into());
}
}
let chrome_default_empty_200 =
new_page.status_code == 200 && new_page.bytes_transferred.is_none() && new_page.is_empty();
page.anti_bot_tech = new_page.anti_bot_tech;
page.base = std::mem::take(&mut new_page.base);
page.blocked_crawl = new_page.blocked_crawl;
if !chrome_default_empty_200 {
page.status_code = new_page.status_code;
page.bytes_transferred = new_page.bytes_transferred;
if new_page.html.is_some() {
#[cfg(all(feature = "balance", not(feature = "decentralized")))]
{
if page.balance_bytes_tracked {
if let Some(ref old_html) = page.html {
crate::utils::html_spool::track_bytes_sub(old_html.len());
}
}
page.balance_bytes_tracked = new_page.balance_bytes_tracked;
new_page.balance_bytes_tracked = false;
}
page.html = std::mem::take(&mut new_page.html);
page.is_valid_utf8 = new_page.is_valid_utf8;
page.is_xml = new_page.is_xml;
}
} else {
page.should_retry = true;
}
#[cfg(feature = "remote_addr")]
{
page.remote_addr = new_page.remote_addr;
}
#[cfg(feature = "time")]
{
page.duration = new_page.duration;
}
#[cfg(feature = "page_error_status_details")]
{
page.error_status = std::mem::take(&mut new_page.error_status);
}
#[cfg(feature = "chrome")]
{
page.request_map = std::mem::take(&mut new_page.request_map);
page.response_map = std::mem::take(&mut new_page.response_map);
}
#[cfg(feature = "cookies")]
{
if new_page.cookies.is_some() {
page.cookies = std::mem::take(&mut new_page.cookies);
}
}
if new_page.headers.is_some() {
page.headers = std::mem::take(&mut new_page.headers);
}
page.waf_check = new_page.waf_check;
page.should_retry = new_page.should_retry;
page.signature = new_page.signature;
if let Some(mut new_spawn_pages) = std::mem::take(&mut new_page.spawn_pages) {
const MAX_SPAWN_PAGES: usize = 1000;
match page.spawn_pages.as_mut() {
Some(existing) => {
let remaining = MAX_SPAWN_PAGES.saturating_sub(existing.len());
new_spawn_pages.truncate(remaining);
existing.append(&mut new_spawn_pages);
}
None => {
new_spawn_pages.truncate(MAX_SPAWN_PAGES);
page.spawn_pages = Some(new_spawn_pages);
}
}
}
page.metadata = std::mem::take(&mut new_page.metadata);
}
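/// Resolve an href against the base URL and return it when the link falls within the crawl scope.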
pub(crate) fn validate_link<
A: PartialEq + Eq + std::hash::Hash + From<String> + for<'a> From<&'a str>,
>(
base: &Option<&Url>,
href: &str,
base_domain: &CompactString,
parent_host: &CompactString,
base_input_domain: &CompactString,
sub_matcher: &CompactString,
external_domains_caseless: &Arc<HashSet<CaseInsensitiveString>>,
links_pages: &mut Option<HashSet<A>>,
) -> Option<Url> {
if let Some(b) = base {
let abs = convert_abs_path(b, href);
if let Some(link_map) = links_pages {
link_map.insert(A::from(href));
}
let scheme = abs.scheme();
if scheme == "https" || scheme == "http" {
let host_name = abs.host_str();
let mut can_process = parent_host_match(
host_name,
base_domain,
parent_host,
base_input_domain,
sub_matcher,
);
if !can_process && host_name.is_some() && abs.port().is_some() {
if let Some(host) = host_name {
let hname =
string_concat!(host, ":", abs.port().unwrap_or_default().to_string());
can_process = parent_host_match(
Some(&hname),
base_domain,
parent_host,
base_input_domain,
sub_matcher,
);
}
}
if !can_process && host_name.is_some() && !external_domains_caseless.is_empty() {
can_process = external_domains_caseless
.contains::<CaseInsensitiveString>(&host_name.unwrap_or_default().into())
|| external_domains_caseless
.contains::<CaseInsensitiveString>(&CASELESS_WILD_CARD);
}
if can_process {
return Some(abs);
}
}
}
None
}
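/// Determine if an href is relative (e.g. "./", "../", protocol-relative, or a bare path without a leading slash).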
pub(crate) fn relative_directory_url(href: &str) -> bool {
if href.starts_with("./") || href.starts_with("//") || href.starts_with("../") {
true
} else {
let network_capable = networking_capable(href);
if network_capable {
false
} else {
!href.starts_with("/")
}
}
}
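/// Resolve an href and insert it into the link set, normalizing its scheme to the parent host scheme.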
pub(crate) fn push_link<
A: PartialEq + Eq + std::hash::Hash + From<String> + for<'a> From<&'a str>,
>(
base: &Option<&Url>,
href: &str,
map: &mut HashSet<A>,
base_domain: &CompactString,
parent_host: &CompactString,
parent_host_scheme: &CompactString,
base_input_domain: &CompactString,
sub_matcher: &CompactString,
external_domains_caseless: &Arc<HashSet<CaseInsensitiveString>>,
links_pages: &mut Option<HashSet<A>>,
) {
let abs = validate_link(
base,
href,
base_domain,
parent_host,
base_input_domain,
sub_matcher,
external_domains_caseless,
links_pages,
);
if let Some(mut abs) = abs {
if abs.scheme() != parent_host_scheme.as_str() {
let _ = abs.set_scheme(parent_host_scheme.as_str());
}
map.insert(A::from(abs.as_str()));
}
}
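/// Resolve an href and insert it, optionally verifying the path extension against ignored asset types.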
pub(crate) fn push_link_verify<
A: PartialEq + Eq + std::hash::Hash + From<String> + for<'a> From<&'a str>,
>(
base: &Option<&Url>,
href: &str,
map: &mut HashSet<A>,
base_domain: &CompactString,
parent_host: &CompactString,
parent_host_scheme: &CompactString,
base_input_domain: &CompactString,
sub_matcher: &CompactString,
external_domains_caseless: &Arc<HashSet<CaseInsensitiveString>>,
full_resources: bool,
links_pages: &mut Option<HashSet<A>>,
verify: bool,
) {
let abs = validate_link(
base,
href,
base_domain,
parent_host,
base_input_domain,
sub_matcher,
external_domains_caseless,
links_pages,
);
if let Some(mut abs) = abs {
if abs.scheme() != parent_host_scheme.as_str() {
let _ = abs.set_scheme(parent_host_scheme.as_str());
}
if verify {
push_link_check(&mut abs, map, full_resources, &mut true);
} else {
map.insert(A::from(abs.as_str()));
}
}
}
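/// Determine if a URL points to a static asset based on its file extension.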
pub fn is_asset_url(url: &str) -> bool {
if let Some(position) = url.rfind('.') {
if url.len() - position >= 3 {
return is_ignored_extension(&url[position + 1..]);
}
}
false
}
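/// Insert a resolved URL unless its path extension is an ignored asset type and full resources are disabled.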
pub(crate) fn push_link_check<
A: PartialEq + Eq + std::hash::Hash + From<String> + for<'a> From<&'a str>,
>(
abs: &mut Url,
map: &mut HashSet<A>,
full_resources: bool,
can_process: &mut bool,
) {
let hchars = abs.path();
if let Some(position) = hchars.rfind('.') {
let hlen = hchars.len();
let has_asset = hlen - position;
if has_asset >= 3 {
let next_position = position + 1;
if !full_resources && is_ignored_extension(&hchars[next_position..]) {
*can_process = false;
}
}
}
if *can_process {
map.insert(A::from(abs.as_str()));
}
}
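/// Get the host portion of a URL.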
pub(crate) fn domain_name(domain: &Url) -> &str {
domain.host_str().unwrap_or_default()
}
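/// Extract the root domain portion of a host (the last two labels, or the label before the dot when only one dot is present).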
#[inline]
fn extract_root_domain(domain: &str) -> &str {
let bytes = domain.as_bytes();
if let Some(last_dot) = memchr::memrchr(b'.', bytes) {
if let Some(second_last_dot) = memchr::memrchr(b'.', &bytes[..last_dot]) {
&domain[second_last_dot + 1..]
} else {
&domain[..last_dot]
}
} else {
domain
}
}
#[inline]
#[cfg_attr(not(test), allow(dead_code))]
fn is_subdomain(subdomain: &str, domain: &str) -> bool {
extract_root_domain(subdomain) == extract_root_domain(domain)
}
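/// Determine if a host belongs to the crawl scope by comparing it against the parent host, base host, and root-domain matchers.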
pub(crate) fn parent_host_match(
host_name: Option<&str>,
base_domain: &str,
parent_host: &CompactString,
base_host: &CompactString,
sub_matcher: &CompactString,
) -> bool {
match host_name {
Some(host) => {
if parent_host.eq(&host) || base_host.eq(&host) {
return true;
}
if base_domain.is_empty() {
return false;
}
let host_root = extract_root_domain(host);
extract_root_domain(parent_host) == host_root
|| extract_root_domain(sub_matcher) == host_root
}
_ => false,
}
}
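/// Build the relative selectors (root-domain matcher, host, and scheme) used to scope link extraction for a URL.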
pub(crate) fn get_page_selectors_base(u: &str, subdomains: bool, tld: bool) -> RelativeSelectors {
let dname = get_domain_from_url(u);
let host_name = CompactString::from(dname);
let scheme = if u.starts_with("https://") {
"https"
} else if u.starts_with("http://") {
"http"
} else if u.starts_with("file://") {
"file"
} else if u.starts_with("wss://") {
"wss"
} else if u.starts_with("ws://") {
"ws"
} else {
"https"
};
if tld || subdomains {
let dname = if tld {
extract_root_domain(dname)
} else {
dname
};
(
            dname.into(),
            smallvec::SmallVec::from([host_name, CompactString::from(scheme)]),
CompactString::default(),
)
} else {
(
CompactString::default(),
smallvec::SmallVec::from([host_name, CompactString::from(scheme)]),
CompactString::default(),
)
}
}
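/// Get the page selectors used to scope link extraction.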
pub fn get_page_selectors(url: &str, subdomains: bool, tld: bool) -> RelativeSelectors {
get_page_selectors_base(url, subdomains, tld)
}
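/// Determine if response content is non-empty and not a known placeholder or Chrome error page.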
#[cfg(not(feature = "decentralized"))]
pub fn validate_empty(content: &Option<Vec<u8>>, is_success: bool) -> bool {
match &content {
Some(content) => {
            !(content.is_empty()
                || content.starts_with(b"<html><head></head><body></body></html>")
                || is_success
                    && content.starts_with(b"<html>\r\n<head>\r\n<META NAME=\"robots\" CONTENT=\"noindex,nofollow\">\r\n<script src=\"/")
                    && content.ends_with(b"\">\r\n</script>\r\n<body>\r\n</body></html>\r\n")
                || is_chrome_error_page(content))
}
_ => false,
}
}
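/// Determine if the content looks like a Chrome network-error page (contains an "errorCode" payload).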
#[cfg(not(feature = "decentralized"))]
#[inline]
pub fn is_chrome_error_page(content: &[u8]) -> bool {
const TAIL: &[u8] = b"};</script></html>";
const NEEDLE: &[u8] = b"\"errorCode\":\"ERR";
if content.len() < 500 {
return false;
}
let mut end = content.len();
while end > 0 && content[end - 1].is_ascii_whitespace() {
end -= 1;
}
let trimmed = &content[..end];
if !trimmed.ends_with(TAIL) {
return false;
}
let region = if trimmed.len() > 4096 {
&trimmed[trimmed.len() - 4096..]
} else {
trimmed
};
memchr::memmem::find(region, NEEDLE).is_some()
}
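/// Extract the Chrome network error code (e.g. "ERR_NAME_NOT_RESOLVED") from an error page body.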
#[inline]
pub fn extract_chrome_error_code(content: &[u8]) -> Option<&str> {
const NEEDLE: &[u8] = b"\"errorCode\":\"";
let region = if content.len() > 4096 {
&content[content.len() - 4096..]
} else {
content
};
let start = memchr::memmem::find(region, NEEDLE)? + NEEDLE.len();
let rest = region.get(start..)?;
let end = memchr::memchr(b'"', rest)?;
std::str::from_utf8(&rest[..end]).ok()
}
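/// Determine if a Chrome net error code indicates a DNS name-resolution failure.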
#[inline]
pub fn is_chrome_name_resolution_error(code: &str) -> bool {
let trimmed = code.strip_prefix("net::").unwrap_or(code);
matches!(
trimmed,
"ERR_NAME_NOT_RESOLVED" | "ERR_NAME_RESOLUTION_FAILED"
)
}
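/// Determine if a Chrome net error code represents a permanent, non-retryable failure.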
#[inline]
pub fn is_chrome_permanent_failure(code: &str) -> bool {
let trimmed = code.strip_prefix("net::").unwrap_or(code);
matches!(
trimmed,
"ERR_NAME_NOT_RESOLVED"
| "ERR_NAME_RESOLUTION_FAILED"
| "ERR_ADDRESS_UNREACHABLE"
| "ERR_CONNECTION_REFUSED"
| "ERR_HTTP2_INADEQUATE_TRANSPORT_SECURITY"
| "ERR_INVALID_URL"
| "ERR_UNSAFE_PORT"
| "ERR_DISALLOWED_URL_SCHEME"
| "ERR_UNKNOWN_URL_SCHEME"
)
}
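/// Map a permanent Chrome net error code to an internal status code.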
#[inline]
pub(crate) fn chrome_permanent_failure_status(code: &str) -> StatusCode {
let trimmed = code.strip_prefix("net::").unwrap_or(code);
match trimmed {
"ERR_ADDRESS_UNREACHABLE"
| "ERR_CONNECTION_REFUSED"
| "ERR_HTTP2_INADEQUATE_TRANSPORT_SECURITY" => *ADDRESS_UNREACHABLE_ERROR,
"ERR_INVALID_URL"
| "ERR_UNSAFE_PORT"
| "ERR_DISALLOWED_URL_SCHEME"
| "ERR_UNKNOWN_URL_SCHEME" => StatusCode::BAD_REQUEST,
_ => *DNS_RESOLVE_ERROR,
}
}
#[cfg(not(feature = "decentralized"))]
fn extract_specific_error<'a, T: std::error::Error + 'static>(
error: &'a (dyn std::error::Error + 'static),
) -> Option<&'a T> {
let mut current_error = Some(error);
while let Some(err) = current_error {
if let Some(desired_error) = err.downcast_ref::<T>() {
return Some(desired_error);
}
current_error = err.source();
}
None
}
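/// Determine if an error warrants a retry based on HTTP/2 GOAWAY or stream-refusal reasons.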
#[cfg(not(feature = "decentralized"))]
fn should_attempt_retry(error: &(dyn std::error::Error + 'static)) -> bool {
if let Some(e) = extract_specific_error::<h2::Error>(error) {
if e.is_go_away() && e.is_remote() && e.reason() == Some(h2::Reason::NO_ERROR) {
return true;
}
if e.is_remote() {
if let Some(reason) = e.reason() {
return matches!(
reason,
h2::Reason::REFUSED_STREAM | h2::Reason::ENHANCE_YOUR_CALM
);
}
}
}
false
}
#[cfg(not(feature = "decentralized"))]
#[inline]
fn h2_permanent_reason_status(err: &crate::client::Error) -> Option<StatusCode> {
let h2_err = extract_specific_error::<h2::Error>(err)?;
if !h2_err.is_remote() {
return None;
}
let reason = h2_err.reason()?;
if matches!(
reason,
h2::Reason::INADEQUATE_SECURITY | h2::Reason::HTTP_1_1_REQUIRED
) {
Some(*ADDRESS_UNREACHABLE_ERROR)
} else {
None
}
}
#[cfg(not(feature = "decentralized"))]
fn get_error_status_base(
should_retry: &mut bool,
error_for_status: Option<Result<crate::utils::RequestResponse, RequestError>>,
pre_classified_status: StatusCode,
) -> Option<RequestError> {
match error_for_status {
Some(e) => match e {
Ok(_) => None,
Err(er) => {
if !is_retryable_status(pre_classified_status) {
return Some(er);
}
let mapped_status = get_error_http_status_code(&er);
if er.is_timeout() || (er.is_connect() && is_retryable_status(mapped_status)) {
*should_retry = true;
}
if !*should_retry && should_attempt_retry(&er) {
*should_retry = true;
}
if let Some(status_code) = er.status() {
let retry = matches!(
status_code,
StatusCode::TOO_MANY_REQUESTS
| StatusCode::INTERNAL_SERVER_ERROR
| StatusCode::BAD_GATEWAY
| StatusCode::SERVICE_UNAVAILABLE
| StatusCode::GATEWAY_TIMEOUT
);
if retry {
*should_retry = true;
}
}
if !*should_retry && is_retryable_status(mapped_status) {
*should_retry = true;
}
Some(er)
}
},
_ => None,
}
}
#[cfg(all(
not(feature = "page_error_status_details"),
not(feature = "decentralized")
))]
fn get_error_status(
should_retry: &mut bool,
error_for_status: Option<Result<crate::utils::RequestResponse, RequestError>>,
pre_classified_status: StatusCode,
) -> Option<String> {
get_error_status_base(should_retry, error_for_status, pre_classified_status)
.map(|e| e.to_string())
}
#[cfg(all(feature = "page_error_status_details", not(feature = "decentralized")))]
fn get_error_status(
should_retry: &mut bool,
error_for_status: Option<Result<crate::utils::RequestResponse, RequestError>>,
pre_classified_status: StatusCode,
) -> Option<std::sync::Arc<reqwest::Error>> {
get_error_status_base(should_retry, error_for_status, pre_classified_status)
.map(std::sync::Arc::new)
}
#[cfg(not(feature = "decentralized"))]
pub fn build_with_parse(url: &str, res: PageResponse) -> Page {
let mut page = build(url, res);
page.set_url_parsed_direct_empty();
page
}
#[cfg(feature = "decentralized")]
pub fn build_with_parse(url: &str, res: PageResponse) -> Page {
build(url, res)
}
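/// Build a `Page` from a raw `PageResponse`, classifying errors and deciding whether a retry is warranted.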
#[cfg(not(feature = "decentralized"))]
pub fn build(url: &str, mut res: PageResponse) -> Page {
use crate::utils::validation::is_false_403;
let chrome_error =
res.status_code.is_success() && res.content.as_deref().is_some_and(is_chrome_error_page);
if chrome_error {
let permanent_code = res
.content
.as_deref()
.and_then(extract_chrome_error_code)
.filter(|code| is_chrome_permanent_failure(code));
res.status_code = if let Some(code) = permanent_code {
chrome_permanent_failure_status(code)
} else {
StatusCode::from_u16(599).unwrap_or(StatusCode::BAD_GATEWAY)
};
}
let success_initial = res.status_code.is_success() || res.status_code == StatusCode::NOT_FOUND;
#[cfg(all(feature = "balance", not(feature = "decentralized")))]
let resource_found_initial = if res.content_spool.is_some() {
true
} else {
validate_empty(&res.content, success_initial)
};
#[cfg(not(all(feature = "balance", not(feature = "decentralized"))))]
let resource_found_initial = validate_empty(&res.content, success_initial);
if !chrome_error
&& res.status_code.is_success()
&& !res.content_truncated
&& !resource_found_initial
{
res.status_code = StatusCode::GATEWAY_TIMEOUT;
}
let success = res.status_code.is_success() || res.status_code == StatusCode::NOT_FOUND;
let resource_found = resource_found_initial;
let status = res.status_code;
let should_retry_status = status != *DNS_RESOLVE_ERROR
&& status != *ADDRESS_UNREACHABLE_ERROR
&& status != StatusCode::NOT_IMPLEMENTED
&& status != StatusCode::HTTP_VERSION_NOT_SUPPORTED
&& status != StatusCode::NETWORK_AUTHENTICATION_REQUIRED
&& (status.is_server_error()
|| matches!(
status,
StatusCode::TOO_MANY_REQUESTS | StatusCode::FORBIDDEN | StatusCode::REQUEST_TIMEOUT
));
let should_retry_resource = resource_found && !success && status != StatusCode::UNAUTHORIZED;
let should_retry_empty_success = success && !resource_found && !res.content_truncated;
let should_retry_antibot_false_403 = res.anti_bot_tech != AntiBotTech::None
&& res.status_code.is_success()
&& is_false_403(
res.content.as_deref(),
res.headers
.as_ref()
.and_then(|h| h.get(reqwest::header::CONTENT_LANGUAGE))
.and_then(|v| v.to_str().ok()),
);
let mut should_retry = should_retry_resource
|| should_retry_status
|| should_retry_empty_success
|| should_retry_antibot_false_403;
let mut empty_page = false;
if let Some(final_url) = &res.final_url {
if final_url.starts_with("chrome-error://chromewebdata")
|| final_url.starts_with("about:blank")
{
should_retry = false;
empty_page = true;
}
}
if should_retry
&& !resource_found
&& res.status_code == StatusCode::FORBIDDEN
&& res.headers.is_some()
&& res.anti_bot_tech == AntiBotTech::None
{
should_retry = false;
}
#[cfg(all(feature = "balance", not(feature = "decentralized")))]
if let Some(spool) = res.content_spool.take() {
let precomputed_signature = spool.signature.or(res.signature);
return Page {
html: None,
binary_file: spool.vitals.binary_file,
is_valid_utf8: spool.vitals.is_valid_utf8,
is_xml: spool.vitals.is_xml,
headers: res.headers,
#[cfg(feature = "remote_addr")]
remote_addr: res.remote_addr,
#[cfg(feature = "cookies")]
cookies: res.cookies,
url: url.into(),
#[cfg(feature = "time")]
duration: res.duration,
status_code: res.status_code,
error_status: get_error_status(&mut should_retry, res.error_for_status, status),
final_redirect_destination: if empty_page { None } else { res.final_url },
#[cfg(feature = "chrome")]
chrome_page: None,
#[cfg(feature = "chrome")]
screenshot_bytes: res.screenshot_bytes,
#[cfg(feature = "openai")]
openai_credits_used: res.openai_credits_used,
#[cfg(feature = "openai")]
extra_ai_data: res.extra_ai_data,
#[cfg(feature = "gemini")]
gemini_credits_used: res.gemini_credits_used,
#[cfg(feature = "gemini")]
extra_gemini_data: res.extra_gemini_data,
remote_multimodal_usage: res.remote_multimodal_usage,
extra_remote_multimodal_data: res.extra_remote_multimodal_data,
spawn_pages: res.spawn_pages,
#[cfg(feature = "spider_cloud")]
content_map: res.content_map,
should_retry,
waf_check: res.waf_check,
bytes_transferred: res.bytes_transferred,
blocked_crawl: false,
signature: precomputed_signature,
#[cfg(feature = "chrome")]
response_map: res.response_map,
#[cfg(feature = "chrome")]
request_map: res.request_map,
anti_bot_tech: res.anti_bot_tech,
metadata: res.metadata,
content_truncated: res.content_truncated,
balance_bytes_tracked: false,
base: None,
external_domains_caseless: Default::default(),
page_links: None,
proxy_configured: false,
profile_key: None,
html_spool_path: Some(HtmlSpoolGuard::new(
spool.path,
crate::utils::html_spool::current_website_spool_dir(),
)),
content_byte_len: spool.vitals.byte_len,
#[cfg(feature = "parallel_backends")]
backend_source: None,
};
}
let binary_file = res
.content
.as_deref()
.is_some_and(crate::utils::is_binary_body);
let is_valid_utf8 = match res.is_valid_utf8 {
Some(v) => v,
None => res
.content
.as_deref()
.is_some_and(|b| simdutf8::basic::from_utf8(b).is_ok()),
};
let is_xml = res
.content
.as_deref()
.is_some_and(|b| b.starts_with(b"<?xml"));
let content_byte_len = res.content.as_ref().map_or(0, |c| c.len());
#[cfg(feature = "balance")]
let balance_has_bytes = if content_byte_len > 0 {
crate::utils::html_spool::track_bytes_add(content_byte_len);
true
} else {
false
};
Page {
html: res.content.map(bytes::Bytes::from),
binary_file,
is_valid_utf8,
is_xml,
headers: res.headers,
#[cfg(feature = "remote_addr")]
remote_addr: res.remote_addr,
#[cfg(feature = "cookies")]
cookies: res.cookies,
url: url.into(),
#[cfg(feature = "time")]
duration: res.duration,
status_code: res.status_code,
error_status: get_error_status(&mut should_retry, res.error_for_status, status),
final_redirect_destination: if empty_page { None } else { res.final_url },
#[cfg(feature = "chrome")]
chrome_page: None,
#[cfg(feature = "chrome")]
screenshot_bytes: res.screenshot_bytes,
#[cfg(feature = "openai")]
openai_credits_used: res.openai_credits_used,
#[cfg(feature = "openai")]
extra_ai_data: res.extra_ai_data,
#[cfg(feature = "gemini")]
gemini_credits_used: res.gemini_credits_used,
#[cfg(feature = "gemini")]
extra_gemini_data: res.extra_gemini_data,
remote_multimodal_usage: res.remote_multimodal_usage,
extra_remote_multimodal_data: res.extra_remote_multimodal_data,
spawn_pages: res.spawn_pages,
#[cfg(feature = "spider_cloud")]
content_map: res.content_map,
should_retry,
waf_check: res.waf_check,
bytes_transferred: res.bytes_transferred,
blocked_crawl: false,
signature: res.signature,
#[cfg(feature = "chrome")]
response_map: res.response_map,
#[cfg(feature = "chrome")]
request_map: res.request_map,
anti_bot_tech: res.anti_bot_tech,
metadata: res.metadata,
content_truncated: res.content_truncated,
#[cfg(all(feature = "balance", not(feature = "decentralized")))]
balance_bytes_tracked: balance_has_bytes,
base: None,
external_domains_caseless: Default::default(),
page_links: None,
proxy_configured: false,
profile_key: None,
#[cfg(feature = "balance")]
html_spool_path: None,
#[cfg(all(feature = "balance", not(feature = "decentralized")))]
content_byte_len,
#[cfg(feature = "parallel_backends")]
backend_source: None,
}
}
#[cfg(feature = "decentralized")]
pub fn build(_: &str, res: PageResponse) -> Page {
Page {
html: res.content.map(bytes::Bytes::from),
headers: res.headers,
#[cfg(feature = "remote_addr")]
remote_addr: res.remote_addr,
#[cfg(feature = "cookies")]
cookies: res.cookies,
final_redirect_destination: res.final_url,
status_code: res.status_code,
metadata: res.metadata,
spawn_pages: res.spawn_pages,
content_truncated: res.content_truncated,
error_status: match res.error_for_status {
Some(e) => match e {
Ok(_) => None,
Err(er) => Some(er.to_string()),
},
_ => None,
},
..Default::default()
}
}
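/// Build a `Cookie` request header from the `Set-Cookie` response headers of a page.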
#[cfg(all(feature = "headers", feature = "cookies"))]
pub fn build_cookie_header_from_set_cookie(page: &Page) -> Option<reqwest::header::HeaderValue> {
use reqwest::header::HeaderValue;
let mut cookie_pairs = Vec::with_capacity(8);
if let Some(headers) = &page.headers {
for cookie in headers.get_all(crate::client::header::SET_COOKIE).iter() {
if let Ok(cookie_str) = cookie.to_str() {
if let Ok(parsed) = cookie::Cookie::parse(cookie_str) {
cookie_pairs.push(format!("{}={}", parsed.name(), parsed.value()));
}
}
}
}
if cookie_pairs.is_empty() {
None
} else {
let cookie_header_str = cookie_pairs.join("; ");
HeaderValue::from_str(&cookie_header_str).ok()
}
}
#[cfg(not(all(feature = "headers", feature = "cookies")))]
pub fn build_cookie_header_from_set_cookie(_page: &Page) -> Option<reqwest::header::HeaderValue> {
None
}
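/// Settings controlling how links are extracted and built from a page.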
#[derive(Debug, Default, Clone, Copy)]
pub struct PageLinkBuildSettings {
pub ssg_build: bool,
pub full_resources: bool,
pub tld: bool,
pub subdomains: bool,
pub normalize: bool,
pub skip_links: bool,
}
impl PageLinkBuildSettings {
pub(crate) fn new(ssg_build: bool, full_resources: bool) -> Self {
Self {
ssg_build,
full_resources,
..Default::default()
}
}
pub(crate) fn new_full(
ssg_build: bool,
full_resources: bool,
subdomains: bool,
tld: bool,
normalize: bool,
) -> Self {
Self {
ssg_build,
full_resources,
subdomains,
tld,
normalize,
skip_links: false,
}
}
}
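/// Extract an ASCII-compatible charset from the `Content-Type` header, if present.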
pub(crate) fn get_charset_from_content_type(
headers: &reqwest::header::HeaderMap,
) -> Option<AsciiCompatibleEncoding> {
use auto_encoder::encoding_rs;
if let Some(content_type) = headers.get(reqwest::header::CONTENT_TYPE) {
if let Ok(content_type_str) = content_type.to_str() {
for part in content_type_str.split(';') {
let part = part.trim();
if part.len() >= 8 && part.as_bytes()[..8].eq_ignore_ascii_case(b"charset=") {
let stripped = &part[8..];
if let Some(encoding) = encoding_rs::Encoding::for_label(stripped.as_bytes()) {
if let Some(ascii_encoding) = AsciiCompatibleEncoding::new(encoding) {
return Some(ascii_encoding);
}
}
}
}
}
}
None
}
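/// Carry over automation results from the existing metadata into the newly extracted metadata.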
#[cfg(feature = "chrome")]
pub(crate) fn set_metadata(mdata: &mut Option<Box<Metadata>>, metadata: &mut Metadata) {
if let Some(mdata) = mdata {
if mdata.automation.is_some() {
metadata.automation = mdata.automation.take();
}
}
}
#[cfg(not(feature = "chrome"))]
pub(crate) fn set_metadata(_mdata: &mut Option<Box<Metadata>>, _metadata: &mut Metadata) {}
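/// Compare two URLs for equality, ignoring a trailing-slash difference.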
fn exact_url_match(url: &str, target_url: &str) -> bool {
let end_target_slash = target_url.ends_with('/');
let main_slash = url.ends_with('/');
if end_target_slash && !main_slash {
strip_trailing_slash(target_url) == url
} else if !end_target_slash && main_slash {
url == strip_trailing_slash(target_url)
} else {
url == target_url
}
}
fn strip_trailing_slash(s: &str) -> &str {
if s.ends_with('/') {
s.trim_end_matches('/')
} else {
s
}
}
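/// Build lol_html handlers that capture the page title, meta description, and og:image.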
pub(crate) fn metadata_handlers<'h>(
meta_title: &'h mut Option<CompactString>,
meta_description: &'h mut Option<CompactString>,
meta_og_image: &'h mut Option<CompactString>,
) -> Vec<(
std::borrow::Cow<'static, lol_html::Selector>,
lol_html::send::ElementContentHandlers<'h>,
)> {
vec![
lol_html::text!("head title", |el| {
let t = el.as_str();
if !t.is_empty() {
*meta_title = Some(t.into());
}
Ok(())
}),
lol_html::element!(r#"meta[name="description"]"#, |el| {
if let Some(content) = el.get_attribute("content") {
if !content.is_empty() {
*meta_description = Some(content.into());
}
}
Ok(())
}),
lol_html::element!(r#"meta[property="og:image"]"#, |el| {
if let Some(content) = el.get_attribute("content") {
if !content.is_empty() {
*meta_og_image = Some(content.into());
}
}
Ok(())
}),
]
}
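/// Output sink that discards rewriter output; used when only the element handlers matter.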
#[cfg(feature = "chrome")]
pub(crate) struct NoopOutputSink;
#[cfg(feature = "chrome")]
impl lol_html::OutputSink for NoopOutputSink {
#[inline]
fn handle_chunk(&mut self, _: &[u8]) {}
}
#[cfg(feature = "chrome")]
const ASSET_SNIFF_BYTES: usize = 64;
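/// Streaming lol_html rewriter that runs element handlers over response chunks without buffering output, skipping binary content.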
#[cfg(feature = "chrome")]
#[allow(dead_code)]
pub(crate) struct ChromeStreamingExtractor<'h> {
rewriter: lol_html::send::HtmlRewriter<'h, NoopOutputSink>,
write_failed: bool,
sniffed: bool,
streamed_through: bool,
}
#[cfg(feature = "chrome")]
#[allow(dead_code)]
impl<'h> ChromeStreamingExtractor<'h> {
pub(crate) fn new(
handlers: Vec<(
std::borrow::Cow<'static, lol_html::Selector>,
lol_html::send::ElementContentHandlers<'h>,
)>,
encoding: Option<lol_html::AsciiCompatibleEncoding>,
adjust_charset_on_meta_tag: bool,
) -> Self {
let settings = lol_html::send::Settings {
element_content_handlers: handlers,
adjust_charset_on_meta_tag,
encoding: encoding.unwrap_or_else(lol_html::AsciiCompatibleEncoding::utf_8),
..lol_html::send::Settings::new_for_handler_types()
};
Self {
rewriter: lol_html::send::HtmlRewriter::new(settings, NoopOutputSink),
write_failed: false,
sniffed: false,
streamed_through: false,
}
}
#[inline]
pub(crate) fn write(&mut self, chunk: &[u8]) {
if self.write_failed {
return;
}
if !self.sniffed {
self.sniffed = true;
if !chunk.is_empty() {
let trimmed = crate::utils::skip_leading_ascii_whitespace(chunk);
if !trimmed.is_empty() {
let head_len = trimmed.len().min(ASSET_SNIFF_BYTES);
if auto_encoder::is_binary_file(&trimmed[..head_len]) {
self.write_failed = true;
return;
}
} else {
self.sniffed = false;
}
}
}
if self.rewriter.write(chunk).is_err() {
self.write_failed = true;
}
}
#[inline]
pub(crate) fn mark_streamed(&mut self) {
self.streamed_through = true;
}
#[inline]
pub(crate) fn invalidate(&mut self) {
self.write_failed = true;
}
pub(crate) fn end(self) -> bool {
if self.write_failed || !self.streamed_through {
return false;
}
self.rewriter.end().is_ok()
}
#[inline]
pub(crate) fn ok(&self) -> bool {
!self.write_failed
}
}
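/// Context required to build the streaming link-extraction handlers.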
pub(crate) struct LinkExtractCtx<'h, A> {
pub selectors: &'h RelativeSelectors,
pub external_domains_caseless: &'h Arc<HashSet<CaseInsensitiveString>>,
pub map: &'h mut hashbrown::HashSet<A>,
pub links_pages: &'h mut Option<hashbrown::HashSet<A>>,
pub base_input_url: &'h tokio::sync::OnceCell<Url>,
pub base: Option<&'h Url>,
pub original_page: Option<&'h Url>,
pub ssg_raw_src_cell: Option<&'h tokio::sync::OnceCell<String>>,
pub ssg_resolved_path_cell: Option<&'h tokio::sync::OnceCell<String>>,
pub xml_file: bool,
pub full_resources: bool,
pub skip_links: bool,
}
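/// Build the lol_html element handlers used to extract links, the base href, metadata, and Next.js SSG manifests.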
pub(crate) fn build_link_extract_handlers<'h, A>(
ctx: LinkExtractCtx<'h, A>,
meta_title: &'h mut Option<CompactString>,
meta_description: &'h mut Option<CompactString>,
meta_og_image: &'h mut Option<CompactString>,
) -> Vec<(
std::borrow::Cow<'static, lol_html::Selector>,
lol_html::send::ElementContentHandlers<'h>,
)>
where
A: PartialEq
+ Eq
+ Sync
+ Send
+ Clone
+ Default
+ std::hash::Hash
+ From<String>
+ for<'a> From<&'a str>,
{
let LinkExtractCtx {
selectors,
external_domains_caseless,
map,
links_pages,
base_input_url,
base,
original_page,
ssg_raw_src_cell,
ssg_resolved_path_cell,
xml_file,
full_resources,
skip_links,
} = ctx;
let parent_host = &selectors.1[0];
let parent_host_scheme = &selectors.1[1];
let base_input_domain = &selectors.2;
let sub_matcher = &selectors.0;
let mut handlers = Vec::with_capacity(
3
+ 1
+ (!skip_links) as usize
+ ssg_raw_src_cell.is_some() as usize
+ ssg_resolved_path_cell.is_some() as usize,
);
handlers.extend(metadata_handlers(
meta_title,
meta_description,
meta_og_image,
));
handlers.push(element_precompiled!(
compiled_base_element_selector(),
move |el| {
if let Some(href) = el.get_attribute("href") {
if let Ok(parsed_base) = Url::parse(&href) {
let _ = base_input_url.set(parsed_base);
}
}
Ok(())
}
));
if !skip_links {
if full_resources {
handlers.push(lol_html::element!(
"a[href]:not([aria-hidden=\"true\"]),script[src],link[href]",
move |el| {
let tag_name = el.tag_name();
let attribute = if tag_name == "script" { "src" } else { "href" };
if let Some(href) = el.get_attribute(attribute) {
let b = if relative_directory_url(&href) || base.is_none() {
original_page
} else {
base
};
let b = if base_input_url.initialized() {
base_input_url.get()
} else {
b
};
push_link(
&b,
&href,
map,
sub_matcher,
parent_host,
parent_host_scheme,
base_input_domain,
sub_matcher,
external_domains_caseless,
links_pages,
);
}
Ok(())
}
));
} else {
handlers.push(element_precompiled!(
if xml_file {
compiled_xml_selector()
} else {
compiled_selector()
},
move |el| {
if let Some(href) = el.get_attribute("href") {
let b = if relative_directory_url(&href) || base.is_none() {
original_page
} else {
base
};
let b = if base_input_url.initialized() {
base_input_url.get()
} else {
b
};
push_link(
&b,
&href,
map,
sub_matcher,
parent_host,
parent_host_scheme,
base_input_domain,
sub_matcher,
external_domains_caseless,
links_pages,
);
}
Ok(())
}
));
}
}
if let Some(cell) = ssg_raw_src_cell {
handlers.push(lol_html::element!("script", move |el| {
if let Some(build_path) = el.get_attribute("src") {
if build_path.starts_with("/_next/static/")
&& build_path.ends_with("/_ssgManifest.js")
{
let _ = cell.set(build_path);
}
}
Ok(())
}));
}
if let Some(cell) = ssg_resolved_path_cell {
handlers.push(lol_html::element!("script[src]", move |el| {
if let Some(source) = el.get_attribute("src") {
if source.starts_with("/_next/static/") && source.ends_with("/_ssgManifest.js") {
if let Some(b) = base {
let _ = cell.set(convert_abs_path(b, &source).to_string());
}
}
}
Ok(())
}));
}
handlers
}
impl Page {
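/// Whether the fetch should be retried: an explicit retry flag, truncated content,
/// a retryable status code, or a `401` response while a proxy is configured.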
#[inline]
pub fn needs_retry(&self) -> bool {
self.should_retry
|| self.content_truncated
|| is_retryable_status(self.status_code)
|| (self.proxy_configured && self.status_code == StatusCode::UNAUTHORIZED)
}
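/// Instantiate a new page, fetching the raw HTML over the plain HTTP client.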
#[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
pub async fn new_page(url: &str, client: &Client) -> Self {
let page_resource: PageResponse = crate::utils::fetch_page_html_raw(url, client).await;
build(url, page_resource)
}
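/// Fetch the raw HTML with a first-byte watchdog controlled by the provided
/// timeout and optional jitter.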
#[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
pub async fn new_page_with_watchdog(
url: &str,
client: &Client,
first_byte_timeout: Option<std::time::Duration>,
first_byte_jitter: Option<std::time::Duration>,
) -> Self {
let page_resource: PageResponse = crate::utils::fetch_page_html_raw_with_watchdog(
url,
client,
first_byte_timeout,
first_byte_jitter,
)
.await;
build(url, page_resource)
}
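/// Fetch with the first-byte watchdog parameters derived from the crawl configuration.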
#[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
pub async fn new_page_auto_watchdog(
url: &str,
client: &Client,
config: &crate::configuration::Configuration,
) -> Self {
let (base, jitter) = config.auto_http_first_byte_args();
Self::new_page_with_watchdog(url, client, base, jitter).await
}
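/// Fetch the raw HTML through the HTTP cache layer using the supplied cache
/// options, policy, and namespace.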
#[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
pub async fn new_page_with_cache(
url: &str,
client: &Client,
cache_options: Option<CacheOptions>,
cache_policy: &Option<BasicCachePolicy>,
cache_namespace: Option<&str>,
) -> Self {
let page_resource: PageResponse = crate::utils::fetch_page_html_raw_cached(
url,
client,
cache_options,
cache_policy,
cache_namespace,
)
.await;
build(url, page_resource)
}
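/// Build a page from HTML already rendered by a WebDriver session, without performing a fetch.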
#[cfg(feature = "webdriver")]
#[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
pub fn new_webdriver(url: &str, html: String, status_code: StatusCode) -> Self {
let mut page = Page::default();
page.html = Some(bytes::Bytes::from(html.into_bytes()));
page.url = url.into();
page.status_code = status_code;
page
}
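/// Navigate to the URL with WebDriver and capture the rendered page content;
/// navigation or extraction failures yield a page flagged with the crate's
/// unknown-status error code.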
#[cfg(feature = "webdriver")]
#[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
pub async fn new_page_webdriver(
url: &str,
driver: &std::sync::Arc<thirtyfour::WebDriver>,
timeout: Option<std::time::Duration>,
) -> Self {
use crate::features::webdriver::{attempt_navigation, get_current_url, get_page_content};
if let Err(e) = attempt_navigation(url, driver, &timeout).await {
log::error!("WebDriver navigation failed: {:?}", e);
let mut page = Page::default();
page.url = url.into();
page.status_code = *UNKNOWN_STATUS_ERROR;
#[cfg(not(feature = "page_error_status_details"))]
{
page.error_status = Some(format!("WebDriver navigation failed: {:?}", e));
}
return page;
}
let final_url = get_current_url(driver).await.ok();
match get_page_content(driver).await {
Ok(content) => {
let mut page = Page::default();
page.html = Some(bytes::Bytes::from(content.into_bytes()));
page.url = url.into();
page.status_code = StatusCode::OK;
page.final_redirect_destination = final_url;
page
}
Err(e) => {
log::error!("Failed to get WebDriver page content: {:?}", e);
let mut page = Page::default();
page.url = url.into();
page.status_code = *UNKNOWN_STATUS_ERROR;
#[cfg(not(feature = "page_error_status_details"))]
{
page.error_status = Some(format!("Failed to get page content: {:?}", e));
}
page
}
}
}
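/// Navigate with WebDriver, run the configured execution and automation scripts,
/// honor the `WaitFor` settings (delay, selector, idle network), then capture the
/// rendered page content.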
#[cfg(all(feature = "webdriver", feature = "chrome"))]
#[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
pub async fn new_page_webdriver_full(
url: &str,
driver: &std::sync::Arc<thirtyfour::WebDriver>,
timeout: Option<std::time::Duration>,
wait_for: &Option<crate::configuration::WaitFor>,
execution_scripts: &Option<crate::features::chrome_common::ExecutionScripts>,
automation_scripts: &Option<crate::features::chrome_common::AutomationScripts>,
) -> Self {
use crate::features::webdriver::{
attempt_navigation, get_current_url, get_page_content, run_execution_scripts,
run_url_automation_scripts,
};
if let Err(e) = attempt_navigation(url, driver, &timeout).await {
log::error!("WebDriver navigation failed: {:?}", e);
let mut page = Page::default();
page.url = url.into();
page.status_code = *UNKNOWN_STATUS_ERROR;
#[cfg(not(feature = "page_error_status_details"))]
{
page.error_status = Some(format!("WebDriver navigation failed: {:?}", e));
}
return page;
}
run_execution_scripts(driver, url, execution_scripts).await;
run_url_automation_scripts(driver, url, automation_scripts).await;
if let Some(wait_config) = wait_for {
if let Some(ref delay) = wait_config.delay {
if let Some(timeout_duration) = delay.timeout {
tokio::time::sleep(timeout_duration).await;
}
}
if let Some(ref selector_wait) = wait_config.selector {
let wait_timeout = selector_wait
.timeout
.unwrap_or(std::time::Duration::from_secs(30));
let _ = crate::features::webdriver::wait_for_element(
driver,
&selector_wait.selector,
wait_timeout,
)
.await;
}
if let Some(ref idle) = wait_config.idle_network {
let wait_time = idle.timeout.unwrap_or(std::time::Duration::from_secs(5));
tokio::time::sleep(wait_time).await;
}
}
let final_url = get_current_url(driver).await.ok();
match get_page_content(driver).await {
Ok(content) => {
let mut page = Page::default();
page.html = Some(bytes::Bytes::from(content.into_bytes()));
page.url = url.into();
page.status_code = StatusCode::OK;
page.final_redirect_destination = final_url;
page
}
Err(e) => {
log::error!("Failed to get WebDriver page content: {:?}", e);
let mut page = Page::default();
page.url = url.into();
page.status_code = *UNKNOWN_STATUS_ERROR;
#[cfg(not(feature = "page_error_status_details"))]
{
page.error_status = Some(format!("Failed to get page content: {:?}", e));
}
page
}
}
}
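/// Fetch a page while streaming the body through `lol_html`, extracting links,
/// metadata, and the `<base href>` as bytes arrive. Handles charset negotiation,
/// optional Next.js `_ssgManifest.js` expansion, oversized bodies, and a single
/// retry when the response is truncated.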
#[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
pub async fn new_page_streaming<
A: PartialEq
+ Eq
+ Sync
+ Send
+ Clone
+ Default
+ std::hash::Hash
+ From<String>
+ for<'a> From<&'a str>,
>(
url: &str,
client: &Client,
only_html: bool,
selectors: &mut RelativeSelectors,
external_domains_caseless: &Arc<HashSet<CaseInsensitiveString>>,
r_settings: &PageLinkBuildSettings,
map: &mut hashbrown::HashSet<A>,
ssg_map: Option<&mut hashbrown::HashSet<A>>,
prior_domain: &Option<Box<Url>>,
domain_parsed: &mut Option<Box<Url>>,
links_pages: &mut Option<hashbrown::HashSet<A>>,
http_first_byte_args: (Option<std::time::Duration>, Option<std::time::Duration>),
) -> Self {
let (http_first_byte_timeout, http_first_byte_timeout_jitter) = http_first_byte_args;
use crate::utils::{
handle_response_bytes, handle_response_bytes_writer, modify_selectors,
AllowedDomainTypes,
};
let mut metadata: Option<Box<Metadata>> = None;
let mut meta_title: Option<_> = None;
let mut meta_description: Option<_> = None;
let mut meta_og_image: Option<_> = None;
let duration = if cfg!(feature = "time") {
Some(tokio::time::Instant::now())
} else {
None
};
#[cfg(feature = "balance")]
crate::utils::vitals::request_start();
let send_outcome = crate::utils::timeout_first_byte(
client.get(url).send(),
http_first_byte_timeout,
http_first_byte_timeout_jitter,
)
.await;
let mut page_response: PageResponse = match send_outcome {
crate::utils::HttpSendOutcome::FirstByteTimeout(_) => {
#[cfg(feature = "balance")]
{
crate::utils::vitals::request_error();
crate::utils::vitals::request_end();
}
return build(
url,
crate::utils::build_first_byte_timeout_page_response(url),
);
}
crate::utils::HttpSendOutcome::Ok(send_result) => match send_result {
Ok(res)
if crate::utils::valid_parsing_status(&res)
&& !crate::utils::block_streaming(&res, only_html) =>
{
let cell = if r_settings.ssg_build {
Some(tokio::sync::OnceCell::new())
} else {
None
};
let base_input_url = tokio::sync::OnceCell::new();
let (encoding, adjust_charset_on_meta_tag) =
match get_charset_from_content_type(res.headers()) {
Some(h) => (h, false),
_ => (AsciiCompatibleEncoding::utf_8(), true),
};
let target_url = res.url().as_str();
if ssg_map.is_some() && url != target_url && !exact_url_match(url, target_url) {
let mut url = Box::new(CaseInsensitiveString::new(&url));
modify_selectors(
prior_domain,
target_url,
domain_parsed,
&mut url,
selectors,
AllowedDomainTypes::new(r_settings.subdomains, r_settings.tld),
);
}
let base = if domain_parsed.is_none() {
prior_domain
} else {
domain_parsed
};
let original_page = Url::parse(url).ok();
let xml_file = target_url.ends_with(".xml");
let parent_host = &selectors.1[0];
let parent_host_scheme = &selectors.1[1];
let base_input_domain = &selectors.2;
let sub_matcher = &selectors.0;
let element_content_handlers = build_link_extract_handlers(
LinkExtractCtx {
selectors,
external_domains_caseless,
map,
links_pages,
base_input_url: &base_input_url,
base: base.as_deref(),
original_page: original_page.as_ref(),
ssg_raw_src_cell: if r_settings.ssg_build && !r_settings.skip_links {
cell.as_ref()
} else {
None
},
ssg_resolved_path_cell: None,
xml_file,
full_resources: r_settings.full_resources,
skip_links: r_settings.skip_links,
},
&mut meta_title,
&mut meta_description,
&mut meta_og_image,
);
let settings = lol_html::send::Settings {
element_content_handlers,
adjust_charset_on_meta_tag,
encoding,
..lol_html::send::Settings::new_for_handler_types()
};
let mut rewriter = lol_html::send::HtmlRewriter::new(settings, |_c: &[u8]| {});
let mut collected_bytes = match res.content_length() {
Some(cap) if cap > MAX_CONTENT_LENGTH => {
log::warn!("{url} Content-Length {cap} exceeds 2 GB limit, rejecting");
Vec::new()
}
Some(cap) if cap > 0 => {
Vec::with_capacity((cap as usize).min(MAX_PREALLOC))
}
_ => Vec::with_capacity(MAX_PRE_ALLOCATED_HTML_PAGE_SIZE_USIZE),
};
let mut response = handle_response_bytes_writer(
res,
url,
only_html,
&mut rewriter,
&mut collected_bytes,
)
.await;
let rewrite_error = response.1;
if !rewrite_error {
let _ = rewriter.end();
}
if r_settings.normalize {
response.0.signature = Some(hash_html(&collected_bytes).await);
}
response.0.content = if collected_bytes.is_empty() {
None
} else {
Some(collected_bytes)
};
if r_settings.ssg_build {
if let Some(ssg_map) = ssg_map {
if let Some(cell) = &cell {
if let Some(source) = cell.get() {
if let Some(url_base) = &base {
let build_ssg_path = convert_abs_path(url_base, source);
let build_page =
Page::new_page(build_ssg_path.as_str(), client).await;
for cap in SSG_CAPTURE
.captures_iter(build_page.get_html_bytes_u8())
{
if let Some(matched) = cap.get(1) {
let href = auto_encode_bytes(matched.as_bytes())
.replace(r#"\u002F"#, "/");
let last_segment =
crate::utils::get_last_segment(&href);
if !(last_segment.starts_with("[")
&& last_segment.ends_with("]"))
{
let base = if relative_directory_url(&href)
|| base.is_none()
{
original_page.as_ref()
} else {
base.as_deref()
};
let base = if base_input_url.initialized() {
base_input_url.get()
} else {
base
};
push_link(
&base,
&href,
ssg_map,
&selectors.0,
parent_host,
parent_host_scheme,
base_input_domain,
sub_matcher,
external_domains_caseless,
&mut None,
);
}
}
}
}
}
}
}
}
response.0
}
Ok(res) => {
let pr = handle_response_bytes(res, url, only_html).await;
if pr.content_truncated {
log::warn!("Response truncated for {url}, retrying once");
match client.get(url).send().await {
Ok(res2) => handle_response_bytes(res2, url, only_html).await,
Err(_) => pr,
}
} else {
pr
}
}
Err(err) => {
#[cfg(feature = "balance")]
crate::utils::vitals::request_error();
crate::utils::build_error_page_response(url, err).await
}
},
};
#[cfg(feature = "balance")]
crate::utils::vitals::request_end();
let valid_meta = meta_title.is_some()
|| meta_description.is_some()
|| meta_og_image.is_some()
|| metadata.is_some();
if valid_meta {
let mut metadata_inner = Metadata::default();
metadata_inner.title = meta_title;
metadata_inner.description = meta_description;
metadata_inner.image = meta_og_image;
if metadata_inner.exist() {
set_metadata(&mut metadata, &mut metadata_inner);
metadata.replace(Box::new(metadata_inner));
}
if metadata.is_some() {
page_response.metadata = metadata;
}
}
crate::utils::set_page_response_duration(&mut page_response, duration);
build(url, page_response)
}
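/// Fetch the page, gathering the body only when it is HTML.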
#[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
pub async fn new_page_only_html(url: &str, client: &Client) -> Self {
let page_resource = crate::utils::fetch_page_html_raw_only_html(url, client).await;
build(url, page_resource)
}
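/// Instantiate a new page and fetch it (plain HTTP client path, no Chrome).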
#[cfg(all(not(feature = "decentralized"), not(feature = "chrome")))]
#[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
pub async fn new(url: &str, client: &Client) -> Self {
let page_resource = crate::utils::fetch_page_html(url, client).await;
build(url, page_resource)
}
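/// Run the streaming link and metadata extraction over bytes already in memory
/// instead of a network response.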
#[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
#[cfg(feature = "cmd")]
pub async fn new_page_streaming_from_bytes<
A: PartialEq
+ Eq
+ Sync
+ Send
+ Clone
+ Default
+ std::hash::Hash
+ From<String>
+ for<'a> From<&'a str>,
>(
url: &str,
input_bytes: &[u8],
selectors: &mut RelativeSelectors,
external_domains_caseless: &Arc<HashSet<CaseInsensitiveString>>,
r_settings: &PageLinkBuildSettings,
map: &mut hashbrown::HashSet<A>,
ssg_map: Option<&mut hashbrown::HashSet<A>>,
prior_domain: &Option<Box<Url>>,
domain_parsed: &mut Option<Box<Url>>,
links_pages: &mut Option<hashbrown::HashSet<A>>,
) -> Self {
use crate::utils::{modify_selectors, AllowedDomainTypes};
let mut metadata: Option<Box<Metadata>> = None;
let mut meta_title: Option<_> = None;
let mut meta_description: Option<_> = None;
let mut meta_og_image: Option<_> = None;
let duration = if cfg!(feature = "time") {
Some(tokio::time::Instant::now())
} else {
None
};
let encoding = AsciiCompatibleEncoding::utf_8();
let adjust_charset_on_meta_tag = true;
let base_input_url = tokio::sync::OnceCell::new();
let original_page = Url::parse(url).ok();
if ssg_map.is_some() {
let mut ci_url = Box::new(CaseInsensitiveString::new(url));
modify_selectors(
prior_domain,
url,
domain_parsed,
&mut ci_url,
selectors,
AllowedDomainTypes::new(r_settings.subdomains, r_settings.tld),
);
}
let base = if domain_parsed.is_none() {
prior_domain
} else {
domain_parsed
};
let xml_file = url.ends_with(".xml");
let element_content_handlers = build_link_extract_handlers(
LinkExtractCtx {
selectors,
external_domains_caseless,
map,
links_pages,
base_input_url: &base_input_url,
base: base.as_deref(),
original_page: original_page.as_ref(),
ssg_raw_src_cell: None,
ssg_resolved_path_cell: None,
xml_file,
full_resources: r_settings.full_resources,
skip_links: false,
},
&mut meta_title,
&mut meta_description,
&mut meta_og_image,
);
let settings = lol_html::send::Settings {
element_content_handlers,
adjust_charset_on_meta_tag,
encoding,
..lol_html::send::Settings::new_for_handler_types()
};
let mut collected_bytes: Vec<u8> = match input_bytes.len() {
n if n >= MAX_PRE_ALLOCATED_HTML_PAGE_SIZE_USIZE => Vec::with_capacity(n),
n if n > 0 => Vec::with_capacity(n),
_ => Vec::with_capacity(MAX_PRE_ALLOCATED_HTML_PAGE_SIZE_USIZE),
};
let mut rewriter = lol_html::send::HtmlRewriter::new(settings, |c: &[u8]| {
collected_bytes.extend_from_slice(c);
});
let _ = rewriter.write(input_bytes);
let _ = rewriter.end();
let mut page_response = PageResponse::default();
page_response.status_code = StatusCode::OK;
if r_settings.normalize {
page_response.signature = Some(hash_html(&collected_bytes).await);
}
if !collected_bytes.is_empty() {
page_response.content = Some(collected_bytes);
}
let valid_meta = meta_title.is_some()
|| meta_description.is_some()
|| meta_og_image.is_some()
|| metadata.is_some();
if valid_meta {
let mut metadata_inner = Metadata::default();
metadata_inner.title = meta_title;
metadata_inner.description = meta_description;
metadata_inner.image = meta_og_image;
if metadata_inner.exist() {
set_metadata(&mut metadata, &mut metadata_inner);
metadata.replace(Box::new(metadata_inner));
}
if metadata.is_some() {
page_response.metadata = metadata;
}
}
crate::utils::set_page_response_duration(&mut page_response, duration);
build(url, page_response)
}
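/// Shared Chrome fetch entry point: drives the headless page (optionally with a
/// seeded response body), builds the `Page`, stores the Chrome handle when
/// `chrome_store_page` is enabled, and re-checks apparent tunnel failures against
/// local DNS.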
#[cfg(all(not(feature = "decentralized"), feature = "chrome"))]
#[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
pub(crate) async fn new_base<'h>(
url: &str,
client: &Client,
page: &chromiumoxide::Page,
page_set: bool,
referrer: Option<String>,
max_page_bytes: Option<f64>,
cache_options: Option<CacheOptions>,
seeded_resource: Option<String>,
jar: Option<&std::sync::Arc<crate::client::cookie::Jar>>,
cache_namespace: Option<&str>,
params: &crate::utils::ChromeFetchParams<'_>,
extract: Option<&mut ChromeStreamingExtractor<'h>>,
) -> Self {
let page_resource = if seeded_resource.is_some() {
crate::utils::fetch_page_html_seeded(
url,
client,
page,
page_set,
referrer,
max_page_bytes,
cache_options,
seeded_resource,
jar,
cache_namespace,
params,
extract,
)
.await
} else {
#[cfg(feature = "fs")]
{
crate::utils::fetch_page_html(
url,
client,
page,
page_set,
referrer,
max_page_bytes,
cache_options,
#[cfg(feature = "cookies")]
jar,
cache_namespace,
params,
extract,
)
.await
}
#[cfg(not(feature = "fs"))]
{
let _ = jar;
crate::utils::fetch_page_html(
url,
client,
page,
page_set,
referrer,
max_page_bytes,
cache_options,
cache_namespace,
params,
extract,
)
.await
}
};
let mut p = build(url, page_resource);
if cfg!(feature = "chrome_store_page") {
p.chrome_page = Some(page.clone());
}
confirm_chrome_tunnel_failure_with_local_dns(
&mut p,
url,
std::time::Duration::from_millis(1_500),
)
.await;
p
}
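/// Instantiate a new page and fetch it through the Chrome rendering path.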
#[cfg(all(not(feature = "decentralized"), feature = "chrome"))]
#[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
pub async fn new(
url: &str,
client: &Client,
page: &chromiumoxide::Page,
page_set: bool,
referrer: Option<String>,
max_page_bytes: Option<f64>,
cache_options: Option<CacheOptions>,
cache_namespace: Option<&str>,
params: &crate::utils::ChromeFetchParams<'_>,
) -> Self {
Self::new_base(
url,
client,
page,
page_set,
referrer,
max_page_bytes,
cache_options,
None,
None,
cache_namespace,
params,
None,
)
.await
}
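/// Chrome fetch combined with streaming link extraction. Returns the page plus a
/// flag indicating whether streaming extraction succeeded; asset URLs skip
/// streaming, XML bodies fall back to the XML link parser, and redirected
/// responses invalidate the streamed result.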
#[cfg(all(not(feature = "decentralized"), feature = "chrome"))]
#[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
pub(crate) async fn new_streaming<A>(
url: &str,
client: &Client,
page: &chromiumoxide::Page,
page_set: bool,
referrer: Option<String>,
max_page_bytes: Option<f64>,
cache_options: Option<CacheOptions>,
cache_namespace: Option<&str>,
params: &crate::utils::ChromeFetchParams<'_>,
selectors: &RelativeSelectors,
external_domains_caseless: &Arc<HashSet<CaseInsensitiveString>>,
links: &mut HashSet<A>,
links_pages: &mut Option<HashSet<A>>,
full_resources: bool,
skip_links: bool,
ssg_enabled: bool,
) -> (Self, bool)
where
A: PartialEq
+ Eq
+ Sync
+ Send
+ Clone
+ Default
+ ToString
+ std::hash::Hash
+ From<String>
+ Into<CaseInsensitiveString>
+ for<'a> From<&'a str>,
{
let parsed_target = Url::parse(url).ok();
let xml_file = url.ends_with(".xml");
let base_input_url = tokio::sync::OnceCell::new();
let asset_url = is_asset_url(url);
let ssg_cell = if ssg_enabled && !skip_links && !xml_file && !asset_url {
Some(tokio::sync::OnceCell::new())
} else {
None
};
let mut meta_title: Option<CompactString> = None;
let mut meta_description: Option<CompactString> = None;
let mut meta_og_image: Option<CompactString> = None;
let (page_out, mut extract_succeeded) = if asset_url {
let p = Self::new_base(
url,
client,
page,
page_set,
referrer,
max_page_bytes,
cache_options,
None,
None,
cache_namespace,
params,
None,
)
.await;
(p, false)
} else {
let handlers = build_link_extract_handlers(
LinkExtractCtx {
selectors,
external_domains_caseless,
map: links,
links_pages,
base_input_url: &base_input_url,
base: parsed_target.as_ref(),
original_page: parsed_target.as_ref(),
ssg_raw_src_cell: None,
ssg_resolved_path_cell: ssg_cell.as_ref(),
xml_file,
full_resources,
skip_links,
},
&mut meta_title,
&mut meta_description,
&mut meta_og_image,
);
let mut extract = ChromeStreamingExtractor::new(handlers, None, true);
let p = Self::new_base(
url,
client,
page,
page_set,
referrer,
max_page_bytes,
cache_options,
None,
None,
cache_namespace,
params,
Some(&mut extract),
)
.await;
let succeeded = extract.end();
(p, succeeded)
};
let mut p = page_out;
if extract_succeeded && !skip_links {
if let Some(cell) = ssg_cell.as_ref() {
if let Some(build_ssg_path) = cell.get() {
if !build_ssg_path.is_empty() {
let build_page = Self::new_page(build_ssg_path, client).await;
let parent_host = &selectors.1[0];
let parent_host_scheme = &selectors.1[1];
let base_input_domain = &selectors.2;
let sub_matcher = &selectors.0;
let ssg_base = if base_input_url.initialized() {
base_input_url.get()
} else {
parsed_target.as_ref()
};
for cap in SSG_CAPTURE.captures_iter(build_page.get_html_bytes_u8()) {
if let Some(matched) = cap.get(1) {
let href =
auto_encode_bytes(matched.as_bytes()).replace(r#"\u002F"#, "/");
let last_segment = crate::utils::get_last_segment(&href);
if !(last_segment.starts_with("[") && last_segment.ends_with("]")) {
let resolved_base =
if relative_directory_url(&href) || ssg_base.is_none() {
parsed_target.as_ref()
} else {
ssg_base
};
push_link(
&resolved_base,
&href,
links,
sub_matcher,
parent_host,
parent_host_scheme,
base_input_domain,
sub_matcher,
external_domains_caseless,
&mut None,
);
}
}
}
}
}
}
}
if !extract_succeeded && !skip_links && p.is_xml {
if let Some(html_bytes) = p.html.take() {
p.links_stream_xml_links_stream_base(selectors, &html_bytes, links, &None)
.await;
p.html = Some(html_bytes);
extract_succeeded = true;
}
}
if extract_succeeded {
if let Some(redirect) = p.final_redirect_destination.as_deref() {
if redirect != url {
extract_succeeded = false;
}
}
}
if extract_succeeded {
let valid_meta =
meta_title.is_some() || meta_description.is_some() || meta_og_image.is_some();
if valid_meta {
let mut metadata_inner = Metadata::default();
metadata_inner.title = meta_title;
metadata_inner.description = meta_description;
metadata_inner.image = meta_og_image;
if metadata_inner.exist() {
set_metadata(&mut p.metadata, &mut metadata_inner);
p.metadata.replace(Box::new(metadata_inner));
}
}
update_link_capacity_hint(links.len());
}
confirm_chrome_tunnel_failure_with_local_dns(
&mut p,
url,
std::time::Duration::from_millis(1_500),
)
.await;
(p, extract_succeeded)
}
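/// Chrome fetch for a page whose response body was provided (seeded) up front,
/// forwarding the cookie jar and cache namespace.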
#[cfg(all(not(feature = "decentralized"), feature = "chrome"))]
#[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
pub async fn new_seeded(
url: &str,
client: &Client,
page: &chromiumoxide::Page,
page_set: bool,
referrer: Option<String>,
max_page_bytes: Option<f64>,
cache_options: Option<CacheOptions>,
seeded_resource: Option<String>,
jar: Option<&std::sync::Arc<crate::client::cookie::Jar>>,
cache_namespace: Option<&str>,
params: &crate::utils::ChromeFetchParams<'_>,
) -> Self {
Self::new_base(
url,
client,
page,
page_set,
referrer,
max_page_bytes,
cache_options,
seeded_resource,
jar,
cache_namespace,
params,
None,
)
.await
}
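/// Seeded Chrome fetch with streaming link extraction; mirrors `new_streaming`
/// while forwarding the seeded resource and cookie jar.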
#[allow(dead_code)]
#[cfg(all(not(feature = "decentralized"), feature = "chrome"))]
#[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
pub(crate) async fn new_seeded_streaming<A>(
url: &str,
client: &Client,
page: &chromiumoxide::Page,
page_set: bool,
referrer: Option<String>,
max_page_bytes: Option<f64>,
cache_options: Option<CacheOptions>,
seeded_resource: Option<String>,
jar: Option<&std::sync::Arc<crate::client::cookie::Jar>>,
cache_namespace: Option<&str>,
params: &crate::utils::ChromeFetchParams<'_>,
selectors: &RelativeSelectors,
external_domains_caseless: &Arc<HashSet<CaseInsensitiveString>>,
links: &mut HashSet<A>,
links_pages: &mut Option<HashSet<A>>,
full_resources: bool,
skip_links: bool,
ssg_enabled: bool,
) -> (Self, bool)
where
A: PartialEq
+ Eq
+ Sync
+ Send
+ Clone
+ Default
+ ToString
+ std::hash::Hash
+ From<String>
+ Into<CaseInsensitiveString>
+ for<'a> From<&'a str>,
{
let parsed_target = Url::parse(url).ok();
let xml_file = url.ends_with(".xml");
let base_input_url = tokio::sync::OnceCell::new();
let asset_url = is_asset_url(url);
let ssg_cell = if ssg_enabled && !skip_links && !xml_file && !asset_url {
Some(tokio::sync::OnceCell::new())
} else {
None
};
let mut meta_title: Option<CompactString> = None;
let mut meta_description: Option<CompactString> = None;
let mut meta_og_image: Option<CompactString> = None;
let (page_out, mut extract_succeeded) = if asset_url {
let p = Self::new_base(
url,
client,
page,
page_set,
referrer,
max_page_bytes,
cache_options,
seeded_resource,
jar,
cache_namespace,
params,
None,
)
.await;
(p, false)
} else {
let handlers = build_link_extract_handlers(
LinkExtractCtx {
selectors,
external_domains_caseless,
map: links,
links_pages,
base_input_url: &base_input_url,
base: parsed_target.as_ref(),
original_page: parsed_target.as_ref(),
ssg_raw_src_cell: None,
ssg_resolved_path_cell: ssg_cell.as_ref(),
xml_file,
full_resources,
skip_links,
},
&mut meta_title,
&mut meta_description,
&mut meta_og_image,
);
let mut extract = ChromeStreamingExtractor::new(handlers, None, true);
let p = Self::new_base(
url,
client,
page,
page_set,
referrer,
max_page_bytes,
cache_options,
seeded_resource,
jar,
cache_namespace,
params,
Some(&mut extract),
)
.await;
let succeeded = extract.end();
(p, succeeded)
};
let mut p = page_out;
if extract_succeeded {
if let Some(redirect) = p.final_redirect_destination.as_deref() {
if redirect != url {
extract_succeeded = false;
}
}
}
if extract_succeeded && !skip_links {
if let Some(cell) = ssg_cell.as_ref() {
if let Some(build_ssg_path) = cell.get() {
if !build_ssg_path.is_empty() {
let build_page = Self::new_page(build_ssg_path, client).await;
let parent_host = &selectors.1[0];
let parent_host_scheme = &selectors.1[1];
let base_input_domain = &selectors.2;
let sub_matcher = &selectors.0;
let ssg_base = if base_input_url.initialized() {
base_input_url.get()
} else {
parsed_target.as_ref()
};
for cap in SSG_CAPTURE.captures_iter(build_page.get_html_bytes_u8()) {
if let Some(matched) = cap.get(1) {
let href =
auto_encode_bytes(matched.as_bytes()).replace(r#"\u002F"#, "/");
let last_segment = crate::utils::get_last_segment(&href);
if !(last_segment.starts_with("[") && last_segment.ends_with("]")) {
let resolved_base =
if relative_directory_url(&href) || ssg_base.is_none() {
parsed_target.as_ref()
} else {
ssg_base
};
push_link(
&resolved_base,
&href,
links,
sub_matcher,
parent_host,
parent_host_scheme,
base_input_domain,
sub_matcher,
external_domains_caseless,
&mut None,
);
}
}
}
}
}
}
}
if !extract_succeeded && !skip_links && p.is_xml {
if let Some(html_bytes) = p.html.take() {
p.links_stream_xml_links_stream_base(selectors, &html_bytes, links, &None)
.await;
p.html = Some(html_bytes);
extract_succeeded = true;
}
}
if extract_succeeded {
let valid_meta =
meta_title.is_some() || meta_description.is_some() || meta_og_image.is_some();
if valid_meta {
let mut metadata_inner = Metadata::default();
metadata_inner.title = meta_title;
metadata_inner.description = meta_description;
metadata_inner.image = meta_og_image;
if metadata_inner.exist() {
set_metadata(&mut p.metadata, &mut metadata_inner);
p.metadata.replace(Box::new(metadata_inner));
}
}
update_link_capacity_hint(links.len());
}
confirm_chrome_tunnel_failure_with_local_dns(
&mut p,
url,
std::time::Duration::from_millis(1_500),
)
.await;
(p, extract_succeeded)
}
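/// Decentralized fetch that only gathers links (headers feature disabled).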
#[cfg(all(feature = "decentralized", not(feature = "headers")))]
#[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
pub async fn new(url: &str, client: &Client) -> Self {
Self::new_links_only(url, client).await
}
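/// Decentralized fetch returning the response headers along with the link set
/// decoded from flexbuffers.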
#[cfg(all(feature = "decentralized", feature = "headers"))]
#[cfg_attr(feature = "tracing", tracing::instrument(skip_all,))]
pub async fn new(url: &str, client: &Client) -> Self {
use crate::serde::Deserialize;
match crate::utils::fetch_page_and_headers(url, client).await {
FetchPageResult::Success(headers, page_content) => {
let links = match page_content {
Some(b) => match flexbuffers::Reader::get_root(b.as_slice()) {
Ok(buf) => match HashSet::<CaseInsensitiveString>::deserialize(buf) {
Ok(link) => link,
_ => Default::default(),
},
_ => Default::default(),
},
_ => Default::default(),
};
Page {
html: None,
headers: Some(headers),
links,
..Default::default()
}
}
FetchPageResult::NoSuccess(headers) => Page {
headers: Some(headers),
..Default::default()
},
FetchPageResult::FetchError => Default::default(),
}
}
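/// Decentralized fetch that decodes only the remote link set; no HTML body is kept.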
#[cfg(feature = "decentralized")]
#[cfg_attr(feature = "tracing", tracing::instrument(skip_all,))]
pub async fn new_links_only(url: &str, client: &Client) -> Self {
use crate::serde::Deserialize;
let links = match crate::utils::fetch_page(url, client).await {
Some(b) => match flexbuffers::Reader::get_root(b.as_slice()) {
Ok(buf) => match HashSet::<CaseInsensitiveString>::deserialize(buf) {
Ok(link) => link,
_ => Default::default(),
},
_ => Default::default(),
},
_ => Default::default(),
};
Page {
html: None,
links,
..Default::default()
}
}
#[cfg(not(all(not(feature = "decentralized"), feature = "chrome")))]
#[cfg_attr(feature = "tracing", tracing::instrument(skip_all,))]
pub async fn screenshot(
&self,
_full_page: bool,
_omit_background: bool,
_format: crate::configuration::CaptureScreenshotFormat,
_quality: Option<i64>,
_output_path: Option<impl AsRef<std::path::Path>>,
_clip: Option<crate::configuration::ClipViewport>,
) -> Vec<u8> {
Default::default()
}
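/// Capture a screenshot through the attached Chrome page, optionally persisting it
/// to `output_path`; returns the image bytes, or an empty buffer on failure.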
#[cfg(all(not(feature = "decentralized"), feature = "chrome"))]
#[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
pub async fn take_screenshot(
page: &Page,
full_page: bool,
omit_background: bool,
format: crate::configuration::CaptureScreenshotFormat,
quality: Option<i64>,
output_path: Option<impl AsRef<std::path::Path>>,
clip: Option<crate::configuration::ClipViewport>,
) -> Vec<u8> {
match &page.chrome_page {
Some(chrome_page) => {
let format: chromiumoxide::cdp::browser_protocol::page::CaptureScreenshotFormat =
chromiumoxide::cdp::browser_protocol::page::CaptureScreenshotFormat::from(
format,
);
let screenshot_configs = chromiumoxide::page::ScreenshotParams::builder()
.format(format)
.full_page(full_page)
.omit_background(omit_background);
let screenshot_configs = match quality {
Some(q) => screenshot_configs.quality(q),
_ => screenshot_configs,
};
let screenshot_configs = match clip {
Some(vp) => screenshot_configs.clip(
chromiumoxide::cdp::browser_protocol::page::Viewport::from(vp),
),
_ => screenshot_configs,
};
if output_path.is_none() {
match chrome_page.screenshot(screenshot_configs.build()).await {
Ok(v) => {
log::debug!("took screenshot: {:?}", page.url);
v
}
Err(e) => {
log::error!("failed to took screenshot: {:?} - {:?}", e, page.url);
Default::default()
}
}
} else {
let output_path = match output_path {
Some(out) => out.as_ref().to_path_buf(),
_ => Default::default(),
};
match chrome_page
.save_screenshot(screenshot_configs.build(), &output_path)
.await
{
Ok(v) => {
log::debug!("saved screenshot: {:?}", output_path);
v
}
Err(e) => {
log::error!("failed to save screenshot: {:?} - {:?}", e, output_path);
Default::default()
}
}
}
}
_ => Default::default(),
}
}
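/// Capture a screenshot, bounded by a 30-second timeout around `take_screenshot`.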
#[cfg(all(not(feature = "decentralized"), feature = "chrome"))]
#[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
pub async fn screenshot(
&self,
full_page: bool,
omit_background: bool,
format: crate::configuration::CaptureScreenshotFormat,
quality: Option<i64>,
output_path: Option<impl AsRef<std::path::Path>>,
clip: Option<crate::configuration::ClipViewport>,
) -> Vec<u8> {
let screenshot_result = tokio::time::timeout(
tokio::time::Duration::from_secs(30),
Page::take_screenshot(
self,
full_page,
omit_background,
format,
quality,
output_path,
clip,
),
)
.await;
match screenshot_result {
Ok(sb) => sb,
_ => Default::default(),
}
}
#[cfg(all(feature = "chrome", not(feature = "decentralized")))]
pub fn get_chrome_page(&self) -> Option<&chromiumoxide::Page> {
self.chrome_page.as_ref()
}
#[cfg(all(feature = "chrome", feature = "decentralized"))]
pub fn get_chrome_page(&self) -> Option<&chromiumoxide::Page> {
None
}
#[cfg(all(not(feature = "decentralized"), feature = "chrome"))]
pub async fn close_page(&mut self) {
if let Some(page) = self.chrome_page.as_mut() {
let _ = page
.send_command(chromiumoxide::cdp::browser_protocol::page::CloseParams::default())
.await;
}
}
#[cfg(all(feature = "decentralized", feature = "chrome"))]
pub async fn close_page(&mut self) {}
#[inline]
pub fn is_empty(&self) -> bool {
match self.html.as_deref() {
None => {
#[cfg(all(feature = "balance", not(feature = "decentralized")))]
if self.html_spool_path.is_some() {
return false;
}
true
}
Some(html) => {
let html = html.trim_ascii();
html.is_empty() || html.eq(*EMPTY_HTML) || html.eq(*EMPTY_HTML_BASIC)
}
}
}
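/// The size of the HTML in bytes, falling back to the tracked content length when
/// the body has been spooled to disk (0 otherwise).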
#[inline]
pub fn size(&self) -> usize {
if let Some(ref html) = self.html {
return html.len();
}
#[cfg(all(feature = "balance", not(feature = "decentralized")))]
{
self.content_byte_len
}
#[cfg(any(not(feature = "balance"), feature = "decentralized"))]
{
0
}
}
#[cfg(not(feature = "decentralized"))]
pub fn get_url(&self) -> &str {
&self.url
}
#[cfg(not(feature = "headers"))]
pub fn get_timeout(&self) -> Option<Duration> {
if self.status_code == 429 {
return Some(Duration::from_millis(2_500));
} else if self.status_code == StatusCode::GATEWAY_TIMEOUT {
return Some(Duration::from_millis(1_500));
} else if self.status_code.as_u16() >= 598 {
return Some(Duration::from_millis(500));
}
None
}
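/// Suggested delay before retrying, derived from the status code. For `429`
/// responses a `Retry-After` header (seconds or HTTP-date) is honored, capped at
/// 30 seconds; `504` and synthetic `>= 598` statuses get short fixed delays.
///
/// A usage sketch (assumes a fetched `page: Page`):
/// ```ignore
/// if page.needs_retry() {
///     if let Some(backoff) = page.get_timeout() {
///         tokio::time::sleep(backoff).await;
///     }
/// }
/// ```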
#[cfg(feature = "headers")]
pub fn get_timeout(&self) -> Option<Duration> {
if self.status_code == 429 {
const MAX_TIMEOUT: Duration = Duration::from_secs(30);
if let Some(headers) = &self.headers {
if let Some(retry_after) = headers.get(reqwest::header::RETRY_AFTER) {
if let Ok(retry_after_str) = retry_after.to_str() {
if let Ok(seconds) = retry_after_str.parse::<u64>() {
return Some(Duration::from_secs(seconds).min(MAX_TIMEOUT));
}
if let Ok(date) = httpdate::parse_http_date(retry_after_str) {
if let Ok(duration) = date.duration_since(std::time::SystemTime::now())
{
return Some(duration.min(MAX_TIMEOUT));
}
}
}
}
};
return Some(Duration::from_millis(2_500));
} else if self.status_code == StatusCode::GATEWAY_TIMEOUT {
return Some(Duration::from_millis(1_500));
} else if self.status_code.as_u16() >= 598 {
return Some(Duration::from_millis(500));
}
None
}
#[cfg(not(feature = "decentralized"))]
pub fn get_url_final(&self) -> &str {
match self.final_redirect_destination.as_ref() {
Some(u) => u,
_ => &self.url,
}
}
pub fn set_external(&mut self, external_domains_caseless: Arc<HashSet<CaseInsensitiveString>>) {
self.external_domains_caseless = external_domains_caseless;
}
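/// Replace the HTML body, refreshing the binary, UTF-8, and XML flags and (with
/// the `balance` feature) the spool byte accounting.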
pub fn set_html_bytes(&mut self, html: Option<Vec<u8>>) {
#[cfg(all(feature = "balance", not(feature = "decentralized")))]
{
if self.balance_bytes_tracked {
if let Some(old) = &self.html {
crate::utils::html_spool::track_bytes_sub(old.len());
}
self.balance_bytes_tracked = false;
}
self.html_spool_path = None;
}
self.html = html.map(bytes::Bytes::from);
self.binary_file = self
.html
.as_deref()
.is_some_and(crate::utils::is_binary_body);
self.is_valid_utf8 = self
.html
.as_deref()
.is_some_and(|b| simdutf8::basic::from_utf8(b).is_ok());
self.is_xml = self
.html
.as_deref()
.is_some_and(|b| b.starts_with(b"<?xml"));
#[cfg(all(feature = "balance", not(feature = "decentralized")))]
{
self.content_byte_len = self.html.as_ref().map_or(0, |h| h.len());
if let Some(ref h) = self.html {
crate::utils::html_spool::track_bytes_add(h.len());
self.balance_bytes_tracked = true;
}
}
}
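/// Move the in-memory HTML to the on-disk spool to free memory; returns `false`
/// when the body is empty, an empty template, or already spooled.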
#[cfg(all(feature = "balance", not(feature = "decentralized")))]
pub fn spool_html_to_disk(&mut self) -> bool {
let html = match self.html.as_ref() {
Some(h) if !h.is_empty() => h,
_ => return false,
};
{
let trimmed = html.trim_ascii();
if trimmed.is_empty() || trimmed == *EMPTY_HTML || trimmed == *EMPTY_HTML_BASIC {
return false;
}
}
if self.html_spool_path.is_some() {
return false;
}
let path = crate::utils::html_spool::next_spool_path();
if crate::utils::html_spool::spool_write(&path, html).is_ok() {
let len = html.len();
self.content_byte_len = len;
self.html = None;
crate::utils::html_spool::track_bytes_sub(len);
crate::utils::html_spool::track_page_spooled();
self.html_spool_path = Some(HtmlSpoolGuard::new(
path,
crate::utils::html_spool::current_website_spool_dir(),
));
self.balance_bytes_tracked = false;
true
} else {
false
}
}
#[cfg(all(feature = "balance", not(feature = "decentralized")))]
pub async fn spool_html_to_disk_async(&mut self) -> bool {
let html = match self.html.as_ref() {
Some(h) if !h.is_empty() => h,
_ => return false,
};
{
let trimmed = html.trim_ascii();
if trimmed.is_empty() || trimmed == *EMPTY_HTML || trimmed == *EMPTY_HTML_BASIC {
return false;
}
}
if self.html_spool_path.is_some() {
return false;
}
let path = crate::utils::html_spool::next_spool_path();
match crate::utils::html_spool::spool_write_streaming_vitals(&path, html.as_ref()).await {
Ok(vitals) => {
self.content_byte_len = vitals.byte_len;
self.is_valid_utf8 = vitals.is_valid_utf8;
self.binary_file = vitals.binary_file;
self.is_xml = vitals.is_xml;
self.html = None;
crate::utils::html_spool::track_bytes_sub(vitals.byte_len);
crate::utils::html_spool::track_page_spooled();
self.html_spool_path = Some(HtmlSpoolGuard::new(
path,
crate::utils::html_spool::current_website_spool_dir(),
));
self.balance_bytes_tracked = false;
true
}
Err(_) => false,
}
}
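/// Reload spooled HTML back into memory if needed; returns `true` when HTML is
/// available in memory afterwards.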
#[cfg(all(feature = "balance", not(feature = "decentralized")))]
pub fn ensure_html_loaded(&mut self) -> bool {
if self.html.is_some() {
return true;
}
if let Some(guard) = self.html_spool_path.as_ref() {
if let Some(path) = guard.path() {
match crate::utils::html_spool::spool_read_bytes(path) {
Ok(bytes) => {
crate::utils::html_spool::track_bytes_add(bytes.len());
self.html = Some(bytes);
self.html_spool_path = None;
self.balance_bytes_tracked = true;
true
}
Err(_) => {
self.html_spool_path = None;
false
}
}
} else {
self.html_spool_path = None;
false
}
} else {
false
}
}
#[cfg(all(feature = "balance", not(feature = "decentralized")))]
pub async fn ensure_html_loaded_async(&mut self) -> bool {
if self.html.is_some() {
return true;
}
if let Some(guard) = self.html_spool_path.as_ref() {
if let Some(path) = guard.path() {
let path_buf = path.to_path_buf();
match crate::utils::html_spool::spool_read_bytes_async(path_buf).await {
Ok(bytes) => {
crate::utils::html_spool::track_bytes_add(bytes.len());
self.html = Some(bytes);
self.html_spool_path = None;
self.balance_bytes_tracked = true;
true
}
Err(_) => {
self.html_spool_path = None;
false
}
}
} else {
self.html_spool_path = None;
false
}
} else {
false
}
}
#[cfg(all(feature = "balance", not(feature = "decentralized")))]
#[inline]
pub fn is_html_on_disk(&self) -> bool {
self.html.is_none() && self.html_spool_path.is_some()
}
#[cfg(any(not(feature = "balance"), feature = "decentralized"))]
#[inline]
pub fn is_html_on_disk(&self) -> bool {
false
}
#[inline]
pub fn is_binary_spool_aware(&self) -> bool {
if self.binary_file {
return true;
}
match self.html.as_deref() {
Some(bytes) => crate::utils::is_binary_body(bytes),
None => false,
}
}
#[cfg(all(feature = "balance", not(feature = "decentralized")))]
#[inline]
pub fn get_html_spool_path(&self) -> Option<&std::path::Path> {
self.html_spool_path.as_ref().and_then(|guard| guard.path())
}
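/// Stream the HTML in chunks of `chunk_size` bytes, reading from memory or the
/// on-disk spool; the callback returns `false` to stop early. Returns the number
/// of bytes visited.
///
/// A usage sketch (assumes a fetched `page: Page`):
/// ```ignore
/// let mut total = 0usize;
/// page.stream_html_bytes(8 * 1024, |chunk| {
///     total += chunk.len();
///     true // keep streaming
/// });
/// ```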
#[cfg(all(feature = "balance", not(feature = "decentralized")))]
pub fn stream_html_bytes<F>(&self, chunk_size: usize, mut cb: F) -> usize
where
F: FnMut(&[u8]) -> bool,
{
if let Some(ref html) = self.html {
let mut total = 0usize;
for chunk in html.chunks(chunk_size.max(1)) {
total = total.saturating_add(chunk.len());
if !cb(chunk) {
break;
}
}
return total;
}
if let Some(ref guard) = self.html_spool_path {
if let Some(path) = guard.path() {
return crate::utils::html_spool::spool_stream_chunks(path, chunk_size, cb)
.unwrap_or(0);
}
}
0
}
#[cfg(any(not(feature = "balance"), feature = "decentralized"))]
pub fn stream_html_bytes<F>(&self, chunk_size: usize, mut cb: F) -> usize
where
F: FnMut(&[u8]) -> bool,
{
if let Some(ref html) = self.html {
let mut total = 0usize;
for chunk in html.chunks(chunk_size.max(1)) {
total = total.saturating_add(chunk.len());
if !cb(chunk) {
break;
}
}
return total;
}
0
}
#[cfg(all(feature = "balance", not(feature = "decentralized")))]
pub async fn stream_html_bytes_async<F>(&self, chunk_size: usize, mut cb: F) -> usize
where
F: FnMut(&[u8]) -> bool,
{
if let Some(ref html) = self.html {
let mut total = 0usize;
for chunk in html.chunks(chunk_size.max(1)) {
total = total.saturating_add(chunk.len());
if !cb(chunk) {
break;
}
}
return total;
}
if let Some(ref guard) = self.html_spool_path {
if let Some(path) = guard.path() {
let chunk_size = chunk_size.max(1);
let mut total = 0usize;
let _ = crate::utils::uring_fs::read_file_chunked(
path.display().to_string(),
chunk_size,
|chunk| {
total = total.saturating_add(chunk.len());
cb(chunk)
},
)
.await;
return total;
}
}
0
}
#[cfg(any(not(feature = "balance"), feature = "decentralized"))]
pub async fn stream_html_bytes_async<F>(&self, chunk_size: usize, cb: F) -> usize
where
F: FnMut(&[u8]) -> bool,
{
self.stream_html_bytes(chunk_size, cb)
}
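/// Get the HTML as a `String` asynchronously, reading a spooled body from disk
/// when necessary.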
#[cfg(all(feature = "balance", not(feature = "decentralized")))]
pub async fn get_html_async(&self) -> String {
if let Some(bytes) = self.html.as_deref() {
return if self.is_valid_utf8 {
unsafe { std::str::from_utf8_unchecked(bytes) }.to_string()
} else {
auto_encoder::auto_encode_bytes(bytes)
};
}
if let Some(guard) = &self.html_spool_path {
if let Some(path) = guard.path() {
if let Ok(bytes) =
crate::utils::html_spool::spool_read_async(path.to_path_buf()).await
{
return if self.is_valid_utf8 {
unsafe { String::from_utf8_unchecked(bytes) }
} else {
String::from_utf8(bytes)
.unwrap_or_else(|e| auto_encoder::auto_encode_bytes(&e.into_bytes()))
};
}
}
}
String::new()
}
#[cfg(any(not(feature = "balance"), feature = "decentralized"))]
pub async fn get_html_async(&self) -> String {
self.get_html()
}
#[cfg(not(feature = "decentralized"))]
pub fn set_url(&mut self, url: String) {
self.url = url;
}
#[cfg(not(feature = "decentralized"))]
pub fn set_url_parsed_direct(&mut self) {
let effective_url = match &self.final_redirect_destination {
Some(u) => u.as_str(),
None => &self.url,
};
if let Ok(base) = Url::parse(effective_url) {
self.base = Some(base);
}
}
#[cfg(not(feature = "decentralized"))]
pub fn set_url_parsed_direct_empty(&mut self) {
if self.base.is_none() && !self.url.is_empty() {
self.set_url_parsed_direct()
}
}
#[cfg(feature = "decentralized")]
pub fn set_url_parsed_direct(&mut self) {}
#[cfg(feature = "decentralized")]
pub fn set_url_parsed_direct_empty(&mut self) {}
#[cfg(not(feature = "decentralized"))]
pub fn set_url_parsed(&mut self, url_parsed: Url) {
self.base = Some(url_parsed);
}
#[cfg(not(feature = "decentralized"))]
pub fn get_url_parsed_ref(&self) -> &Option<Url> {
&self.base
}
#[cfg(not(feature = "decentralized"))]
pub fn get_url_parsed(&mut self) -> &Option<Url> {
if self.base.is_none() && !self.url.is_empty() {
self.base = Url::parse(&self.url).ok();
}
&self.base
}
#[cfg(feature = "decentralized")]
pub fn get_url_parsed(&self) -> &Option<Url> {
&None
}
#[cfg(feature = "decentralized")]
pub fn get_url_parsed_ref(&self) -> &Option<Url> {
&None
}
#[cfg(not(feature = "decentralized"))]
pub fn take_url(&mut self) -> Option<Url> {
self.base.take()
}
#[cfg(feature = "decentralized")]
pub fn take_url(&mut self) -> Option<Url> {
None
}
#[cfg(feature = "decentralized")]
pub fn get_url(&self) -> &str {
&self.url
}
pub fn get_bytes(&self) -> Option<&[u8]> {
self.html.as_deref()
}
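/// Get the HTML as a `String`; non-UTF-8 bodies are transcoded and spooled bodies
/// are read back from disk.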
pub fn get_html(&self) -> String {
if let Some(bytes) = self.html.as_deref() {
return if self.is_valid_utf8 {
unsafe { std::str::from_utf8_unchecked(bytes) }.to_string()
} else {
auto_encoder::auto_encode_bytes(bytes)
};
}
#[cfg(all(feature = "balance", not(feature = "decentralized")))]
if let Some(guard) = &self.html_spool_path {
if let Some(path) = guard.path() {
if let Ok(bytes) = crate::utils::html_spool::spool_read(path) {
return if self.is_valid_utf8 {
unsafe { String::from_utf8_unchecked(bytes) }
} else {
String::from_utf8(bytes)
.unwrap_or_else(|e| auto_encoder::auto_encode_bytes(&e.into_bytes()))
};
}
}
}
String::new()
}
#[inline]
pub fn get_content(&self) -> String {
self.get_html()
}
pub fn get_html_cow(&self) -> std::borrow::Cow<'_, str> {
match self.html.as_deref() {
Some(bytes) => {
if self.is_valid_utf8 {
std::borrow::Cow::Borrowed(unsafe { std::str::from_utf8_unchecked(bytes) })
} else {
std::borrow::Cow::Owned(auto_encoder::auto_encode_bytes(bytes))
}
}
None => {
#[cfg(all(feature = "balance", not(feature = "decentralized")))]
if let Some(guard) = &self.html_spool_path {
if let Some(path) = guard.path() {
if let Ok(bytes) = crate::utils::html_spool::spool_read(path) {
return std::borrow::Cow::Owned(if self.is_valid_utf8 {
unsafe { String::from_utf8_unchecked(bytes) }
} else {
String::from_utf8(bytes).unwrap_or_else(|e| {
auto_encoder::auto_encode_bytes(&e.into_bytes())
})
});
}
}
}
std::borrow::Cow::Borrowed("")
}
}
}
pub fn get_html_bytes_u8(&self) -> &[u8] {
match self.html.as_deref() {
Some(html) => html,
_ => Default::default(),
}
}
#[inline]
pub fn get_content_bytes(&self) -> &[u8] {
self.get_html_bytes_u8()
}
#[cfg(feature = "spider_cloud")]
#[inline]
pub fn get_content_for(&self, format: &str) -> Option<String> {
self.content_map.as_ref().and_then(|map| {
map.get(format).map(|b| {
simdutf8::basic::from_utf8(b)
.map(|s| s.to_string())
.unwrap_or_else(|_| auto_encoder::auto_encode_bytes(b))
})
})
}
#[cfg(feature = "spider_cloud")]
#[inline]
pub fn get_content_bytes_for(&self, format: &str) -> Option<&[u8]> {
self.content_map
.as_ref()
.and_then(|map| map.get(format).map(|b| b.as_ref()))
}
#[cfg(feature = "spider_cloud")]
#[inline]
pub fn has_content_map(&self) -> bool {
self.content_map.as_ref().is_some_and(|m| !m.is_empty())
}
#[cfg(feature = "parallel_backends")]
#[inline]
pub fn quality_score(&self) -> u16 {
crate::utils::parallel_backends::html_quality_score(
self.html.as_deref(),
self.status_code,
&self.anti_bot_tech,
self.get_url(),
)
}
#[cfg(all(
feature = "sitemap",
feature = "chrome",
not(feature = "decentralized")
))]
pub(crate) fn modify_xml_html(&mut self) -> &[u8] {
if let Some(html_bytes) = self.html.take() {
const XML_DECL: &str = r#"<?xml version="1.0" encoding="UTF-8"?>"#;
let xml = html_bytes.as_ref();
if let Ok(xml_str) = simdutf8::basic::from_utf8(xml) {
let stripped = xml_str
.strip_prefix(XML_DECL)
.map(|f| f.trim_start())
.unwrap_or(xml_str);
let offset = stripped.as_ptr() as usize - xml.as_ptr() as usize;
self.html = Some(html_bytes.slice(offset..offset + stripped.len()));
} else {
self.html = Some(html_bytes);
}
}
self.html.as_deref().unwrap_or_default()
}
#[cfg(feature = "chrome")]
pub fn get_responses(&self) -> &Option<hashbrown::HashMap<String, f64>> {
&self.response_map
}
pub fn get_metadata(&self) -> &Option<Box<Metadata>> {
&self.metadata
}
#[cfg(feature = "chrome")]
pub fn get_request(&self) -> &Option<hashbrown::HashMap<String, f64>> {
&self.request_map
}
#[cfg(feature = "encoding")]
pub fn get_html_encoded(&self, label: &str) -> String {
get_html_encoded(&self.html, label)
}
#[cfg(not(feature = "encoding"))]
pub fn get_html_encoded(&self, _label: &str) -> String {
self.get_html()
}
#[inline]
#[cfg(all(feature = "time", not(feature = "decentralized")))]
pub fn set_duration_elapsed(&mut self, scraped_at: Option<Instant>) {
self.duration = scraped_at;
}
#[inline]
#[cfg(all(feature = "time", not(feature = "decentralized")))]
pub fn set_duration_elapsed_from_duration(&mut self, elapsed: Option<std::time::Duration>) {
self.duration = elapsed.map(|d| Instant::now().checked_sub(d).unwrap_or_else(Instant::now));
}
#[cfg(all(feature = "time", not(feature = "decentralized")))]
pub fn get_duration_elapsed(&self) -> Duration {
self.duration
.as_ref()
.map(|t| t.elapsed())
.unwrap_or_default()
}
#[inline]
#[cfg(all(feature = "time", feature = "decentralized"))]
pub fn set_duration_elapsed(&mut self, scraped_at: Option<Instant>) {
self.duration = scraped_at;
}
#[inline]
#[cfg(all(feature = "time", feature = "decentralized"))]
pub fn set_duration_elapsed_from_duration(&mut self, elapsed: Option<std::time::Duration>) {
self.duration = elapsed.map(|d| Instant::now().checked_sub(d).unwrap_or_else(Instant::now));
}
#[cfg(all(feature = "time", feature = "decentralized"))]
pub fn get_duration_elapsed(&self) -> Duration {
self.duration
.as_ref()
.map(|t| t.elapsed())
.unwrap_or_default()
}
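/// Extract links from an XML (e.g. sitemap) body by delegating to the streaming
/// XML reader.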
#[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
pub async fn links_stream_xml_links_stream_base<
A: PartialEq
+ Eq
+ Sync
+ Send
+ Clone
+ Default
+ ToString
+ std::hash::Hash
+ From<String>
+ Into<CaseInsensitiveString>
+ for<'a> From<&'a str>,
>(
&mut self,
selectors: &RelativeSelectors,
xml: &[u8],
map: &mut HashSet<A>,
base: &Option<Box<Url>>,
) {
self.links_stream_xml_from_reader(selectors, xml, map, base)
.await;
}
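/// Extract XML links by streaming the spooled file from disk.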
#[cfg(all(feature = "balance", not(feature = "decentralized")))]
pub async fn links_stream_xml_from_disk<
A: PartialEq
+ Eq
+ Sync
+ Send
+ Clone
+ Default
+ ToString
+ std::hash::Hash
+ From<String>
+ Into<CaseInsensitiveString>
+ for<'a> From<&'a str>,
>(
&mut self,
selectors: &RelativeSelectors,
spool_path: std::path::PathBuf,
map: &mut HashSet<A>,
base: &Option<Box<Url>>,
) {
match tokio::fs::File::open(&spool_path).await {
Ok(file) => {
let reader = tokio::io::BufReader::with_capacity(*STREAMING_CHUNK_SIZE, file);
self.links_stream_xml_from_reader(selectors, reader, map, base)
.await;
}
// The spooled file could not be opened; leave the link set unchanged.
Err(_) => {}
}
}
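/// Extract `<link>` text values from an XML source with `quick_xml`, resolving
/// them against the page URL or the provided base.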
#[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
pub async fn links_stream_xml_from_reader<
R: tokio::io::AsyncBufRead + Unpin + Send,
A: PartialEq
+ Eq
+ Sync
+ Send
+ Clone
+ Default
+ ToString
+ std::hash::Hash
+ From<String>
+ Into<CaseInsensitiveString>
+ for<'a> From<&'a str>,
>(
&mut self,
selectors: &RelativeSelectors,
source: R,
map: &mut HashSet<A>,
base: &Option<Box<Url>>,
) {
use quick_xml::events::Event;
use quick_xml::reader::NsReader;
let mut reader = NsReader::from_reader(source);
reader.config_mut().trim_text(true);
let mut buf = XML_PARSE_BUF.with(|c| c.take());
let parent_host = &selectors.1[0];
let parent_host_scheme = &selectors.1[1];
let base_input_domain = &selectors.2;
let sub_matcher = &selectors.0;
let mut is_link_tag = false;
let mut links_pages: Option<HashSet<A>> = if self.page_links.is_some() {
Some(HashSet::new())
} else {
None
};
let base = if base.is_some() {
base.as_deref()
} else {
self.set_url_parsed_direct_empty();
self.get_url_parsed_ref().as_ref()
};
loop {
match reader.read_event_into_async(&mut buf).await {
Ok(e) => match e {
Event::Start(e) => {
let (_, local) = reader.resolver().resolve_element(e.name());
if local.as_ref() == b"link" {
is_link_tag = true;
}
}
Event::Text(e) => {
if is_link_tag {
if let Ok(v) = e.decode() {
push_link_verify(
&base,
&v,
map,
&selectors.0,
parent_host,
parent_host_scheme,
base_input_domain,
sub_matcher,
&self.external_domains_caseless,
false,
&mut links_pages,
true,
);
}
}
}
Event::End(ref e) => {
let (_, local) = reader.resolver().resolve_element(e.name());
if local.as_ref() == b"link" {
is_link_tag = false;
}
}
Event::Eof => {
break;
}
_ => (),
},
_ => break,
}
buf.clear();
}
buf.clear();
XML_PARSE_BUF.with(|c| c.set(buf));
if let Some(lp) = links_pages {
let page_links = self.page_links.get_or_insert_with(Default::default);
page_links.extend(lp.into_iter().map(Into::into));
}
}
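/// Extract links and metadata from an in-memory HTML or XML body using the shared
/// streaming handlers, yielding periodically on large documents.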
#[inline(always)]
#[cfg(not(feature = "decentralized"))]
pub async fn links_stream_base<
A: PartialEq
+ Eq
+ Sync
+ Send
+ Clone
+ Default
+ ToString
+ std::hash::Hash
+ From<String>
+ Into<CaseInsensitiveString>
+ for<'a> From<&'a str>,
>(
&mut self,
selectors: &RelativeSelectors,
html: &[u8],
base: &Option<Box<Url>>,
) -> HashSet<A> {
let mut map: HashSet<A> = HashSet::with_capacity(link_set_capacity());
let mut links_pages: Option<HashSet<A>> = if self.page_links.is_some() {
Some(HashSet::new())
} else {
None
};
let mut metadata: Option<Box<Metadata>> = None;
let mut meta_title: Option<_> = None;
let mut meta_description: Option<_> = None;
let mut meta_og_image: Option<_> = None;
if !html.is_empty() {
if self.is_xml {
self.links_stream_xml_links_stream_base(selectors, html, &mut map, base)
.await;
} else {
let base_input_url = tokio::sync::OnceCell::new();
let base = base.as_deref();
let xml_file = self.get_url().ends_with(".xml");
self.set_url_parsed_direct_empty();
let original_page = self.get_url_parsed_ref().as_ref();
let external_domains_caseless = &self.external_domains_caseless;
let element_content_handlers = build_link_extract_handlers(
LinkExtractCtx {
selectors,
external_domains_caseless,
map: &mut map,
links_pages: &mut links_pages,
base_input_url: &base_input_url,
base,
original_page,
ssg_raw_src_cell: None,
ssg_resolved_path_cell: None,
xml_file,
full_resources: false,
skip_links: false,
},
&mut meta_title,
&mut meta_description,
&mut meta_og_image,
);
let rewriter_settings = lol_html::send::Settings {
element_content_handlers,
adjust_charset_on_meta_tag: true,
..lol_html::send::Settings::new_for_handler_types()
};
let mut wrote_error = false;
let mut rewriter =
lol_html::send::HtmlRewriter::new(rewriter_settings, |_c: &[u8]| {});
let should_yield = html.len() > REWRITER_YIELD_THRESHOLD;
for (i, chunk) in html.chunks(*STREAMING_CHUNK_SIZE).enumerate() {
if rewriter.write(chunk).is_err() {
wrote_error = true;
break;
}
if should_yield && i % REWRITER_YIELD_INTERVAL == REWRITER_YIELD_INTERVAL - 1 {
tokio::task::yield_now().await;
}
}
if !wrote_error {
let _ = rewriter.end();
}
}
}
if let Some(lp) = links_pages {
let page_links = self.page_links.get_or_insert_with(Default::default);
page_links.extend(lp.into_iter().map(Into::into));
}
let valid_meta =
meta_title.is_some() || meta_description.is_some() || meta_og_image.is_some();
if valid_meta {
let mut metadata_inner = Metadata::default();
metadata_inner.title = meta_title;
metadata_inner.description = meta_description;
metadata_inner.image = meta_og_image;
if metadata_inner.exist() {
metadata.replace(Box::new(metadata_inner));
}
if metadata.is_some() {
self.metadata = metadata;
}
}
update_link_capacity_hint(map.len());
map
}
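/// Extract links and metadata by streaming a spooled HTML or XML file from disk.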
#[cfg(all(not(feature = "decentralized"), feature = "balance"))]
pub async fn links_stream_base_from_disk<
A: PartialEq
+ Eq
+ Sync
+ Send
+ Clone
+ Default
+ ToString
+ std::hash::Hash
+ From<String>
+ Into<CaseInsensitiveString>
+ for<'a> From<&'a str>,
>(
&mut self,
selectors: &RelativeSelectors,
spool_path: std::path::PathBuf,
base: &Option<Box<Url>>,
) -> HashSet<A> {
let mut map: HashSet<A> = HashSet::with_capacity(link_set_capacity());
let mut links_pages: Option<HashSet<A>> = if self.page_links.is_some() {
Some(HashSet::new())
} else {
None
};
let mut metadata: Option<Box<Metadata>> = None;
let mut meta_title: Option<_> = None;
let mut meta_description: Option<_> = None;
let mut meta_og_image: Option<_> = None;
if self.is_xml {
self.links_stream_xml_from_disk(selectors, spool_path.clone(), &mut map, base)
.await;
} else {
let base_input_url = tokio::sync::OnceCell::new();
let base = base.as_deref();
let xml_file = self.get_url().ends_with(".xml");
self.set_url_parsed_direct_empty();
let original_page = self.get_url_parsed_ref().as_ref();
let external_domains_caseless = &self.external_domains_caseless;
let element_content_handlers = build_link_extract_handlers(
LinkExtractCtx {
selectors,
external_domains_caseless,
map: &mut map,
links_pages: &mut links_pages,
base_input_url: &base_input_url,
base,
original_page,
ssg_raw_src_cell: None,
ssg_resolved_path_cell: None,
xml_file,
full_resources: false,
skip_links: false,
},
&mut meta_title,
&mut meta_description,
&mut meta_og_image,
);
let rewriter_settings = lol_html::send::Settings {
element_content_handlers,
adjust_charset_on_meta_tag: true,
..lol_html::send::Settings::new_for_handler_types()
};
let mut rewriter = lol_html::send::HtmlRewriter::new(rewriter_settings, |_c: &[u8]| {});
let chunk_size = *STREAMING_CHUNK_SIZE;
let mut wrote_error = false;
let _ = crate::utils::uring_fs::read_file_chunked(
spool_path.display().to_string(),
chunk_size,
|chunk| {
if rewriter.write(chunk).is_err() {
wrote_error = true;
return false;
}
true
},
)
.await;
if !wrote_error {
let _ = rewriter.end();
}
}
if let Some(lp) = links_pages {
let page_links = self.page_links.get_or_insert_with(Default::default);
page_links.extend(lp.into_iter().map(Into::into));
}
let valid_meta =
meta_title.is_some() || meta_description.is_some() || meta_og_image.is_some();
if valid_meta {
let mut metadata_inner = Metadata::default();
metadata_inner.title = meta_title;
metadata_inner.description = meta_description;
metadata_inner.image = meta_og_image;
if metadata_inner.exist() {
metadata.replace(Box::new(metadata_inner));
}
if metadata.is_some() {
self.metadata = metadata;
}
}
update_link_capacity_hint(map.len());
map
}
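/// Extracts links from the spooled HTML file on disk and, when an SSG build
/// manifest is detected, fetches it with the provided client to capture the
/// statically generated routes as well.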
#[cfg(all(feature = "balance", not(feature = "decentralized")))]
pub async fn links_stream_base_from_disk_ssg<
A: PartialEq
+ Eq
+ Sync
+ Send
+ Clone
+ Default
+ ToString
+ std::hash::Hash
+ From<String>
+ Into<CaseInsensitiveString>
+ for<'a> From<&'a str>,
>(
&mut self,
selectors: &RelativeSelectors,
spool_path: std::path::PathBuf,
client: &Client,
base: &Option<Box<Url>>,
) -> HashSet<A> {
let mut map: HashSet<A> = HashSet::with_capacity(link_set_capacity());
let mut map_ssg: HashSet<A> = HashSet::new();
let mut links_pages: Option<HashSet<A>> = if self.page_links.is_some() {
Some(HashSet::new())
} else {
None
};
let mut metadata: Option<Box<Metadata>> = None;
let mut meta_title: Option<_> = None;
let mut meta_description: Option<_> = None;
let mut meta_og_image: Option<_> = None;
if self.is_xml {
self.links_stream_xml_from_disk(selectors, spool_path.clone(), &mut map, base)
.await;
} else {
let cell = tokio::sync::OnceCell::new();
let base_input_url = tokio::sync::OnceCell::new();
let parent_host = &selectors.1[0];
let parent_host_scheme = &selectors.1[1];
let base_input_domain = &selectors.2;
let sub_matcher = &selectors.0;
let base = base.as_deref();
let xml_file = self.get_url().ends_with(".xml");
self.set_url_parsed_direct_empty();
let original_page = self.get_url_parsed_ref().as_ref();
{
let external_domains_caseless = &self.external_domains_caseless;
let element_content_handlers = build_link_extract_handlers(
LinkExtractCtx {
selectors,
external_domains_caseless,
map: &mut map,
links_pages: &mut links_pages,
base_input_url: &base_input_url,
base,
original_page,
ssg_raw_src_cell: None,
ssg_resolved_path_cell: Some(&cell),
xml_file,
full_resources: false,
skip_links: false,
},
&mut meta_title,
&mut meta_description,
&mut meta_og_image,
);
let rewriter_settings = lol_html::Settings {
element_content_handlers,
adjust_charset_on_meta_tag: true,
..lol_html::send::Settings::new_for_handler_types()
};
let mut rewriter =
lol_html::send::HtmlRewriter::new(rewriter_settings, |_c: &[u8]| {});
let chunk_size = *STREAMING_CHUNK_SIZE;
let mut wrote_error = false;
let _ = crate::utils::uring_fs::read_file_chunked(
spool_path.display().to_string(),
chunk_size,
|chunk| {
if rewriter.write(chunk).is_err() {
wrote_error = true;
return false;
}
true
},
)
.await;
if !wrote_error {
let _ = rewriter.end();
}
}
if let Some(build_ssg_path) = cell.get() {
if !build_ssg_path.is_empty() {
let build_page = Page::new_page(build_ssg_path, client).await;
for cap in SSG_CAPTURE.captures_iter(build_page.get_html_bytes_u8()) {
if let Some(matched) = cap.get(1) {
let href =
auto_encode_bytes(matched.as_bytes()).replace(r#"\u002F"#, "/");
let last_segment = crate::utils::get_last_segment(&href);
if !(last_segment.starts_with("[") && last_segment.ends_with("]")) {
let base = if relative_directory_url(&href) || base.is_none() {
original_page
} else {
base
};
let base = if base_input_url.initialized() {
base_input_url.get()
} else {
base
};
push_link(
&base,
&href,
&mut map_ssg,
&selectors.0,
parent_host,
parent_host_scheme,
base_input_domain,
sub_matcher,
&self.external_domains_caseless,
&mut None,
);
}
}
}
}
}
}
if let Some(lp) = links_pages {
let page_links = self.page_links.get_or_insert_with(Default::default);
page_links.extend(lp.into_iter().map(Into::into));
}
let valid_meta = meta_title.is_some()
|| meta_description.is_some()
|| meta_og_image.is_some()
|| self.get_metadata().is_some();
if valid_meta {
let mut metadata_inner = Metadata::default();
metadata_inner.title = meta_title;
metadata_inner.description = meta_description;
metadata_inner.image = meta_og_image;
if metadata_inner.exist() && self.metadata.is_some() {
set_metadata(&mut self.metadata, &mut metadata_inner);
}
if metadata_inner.exist() {
metadata.replace(Box::new(metadata_inner));
}
if metadata.is_some() {
self.metadata = metadata;
}
}
map.extend(map_ssg);
update_link_capacity_hint(map.len());
map
}
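/// Extracts links from in-memory HTML and, when an SSG build manifest is
/// detected, fetches it with the provided client to capture the statically
/// generated routes as well.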
#[inline(always)]
#[cfg(not(feature = "decentralized"))]
#[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
pub async fn links_stream_base_ssg<
A: PartialEq
+ Eq
+ Sync
+ Send
+ Clone
+ Default
+ ToString
+ std::hash::Hash
+ From<String>
+ Into<CaseInsensitiveString>
+ for<'a> From<&'a str>,
>(
&mut self,
selectors: &RelativeSelectors,
html: &[u8],
client: &Client,
base: &Option<Box<Url>>,
) -> HashSet<A> {
let mut map: HashSet<A> = HashSet::with_capacity(link_set_capacity());
let mut map_ssg: HashSet<A> = HashSet::new();
let mut links_pages: Option<HashSet<A>> = if self.page_links.is_some() {
Some(HashSet::new())
} else {
None
};
let mut metadata: Option<Box<Metadata>> = None;
let mut meta_title: Option<_> = None;
let mut meta_description: Option<_> = None;
let mut meta_og_image: Option<_> = None;
if !html.is_empty() {
if self.is_xml {
self.links_stream_xml_links_stream_base(selectors, html, &mut map, base)
.await;
} else {
let cell = tokio::sync::OnceCell::new();
let base_input_url = tokio::sync::OnceCell::new();
let parent_host = &selectors.1[0];
let parent_host_scheme = &selectors.1[1];
let base_input_domain = &selectors.2;
let sub_matcher = &selectors.0;
let base = base.as_deref();
let original_page = {
self.set_url_parsed_direct_empty();
self.get_url_parsed_ref().as_ref()
};
let xml_file = self.get_url().ends_with(".xml");
{
let external_domains_caseless = &self.external_domains_caseless;
let element_content_handlers = build_link_extract_handlers(
LinkExtractCtx {
selectors,
external_domains_caseless,
map: &mut map,
links_pages: &mut links_pages,
base_input_url: &base_input_url,
base,
original_page,
ssg_raw_src_cell: None,
ssg_resolved_path_cell: Some(&cell),
xml_file,
full_resources: false,
skip_links: false,
},
&mut meta_title,
&mut meta_description,
&mut meta_og_image,
);
let rewriter_settings = lol_html::Settings {
element_content_handlers,
adjust_charset_on_meta_tag: true,
..lol_html::send::Settings::new_for_handler_types()
};
let mut rewriter =
lol_html::send::HtmlRewriter::new(rewriter_settings, |_c: &[u8]| {});
let mut wrote_error = false;
let should_yield = html.len() > REWRITER_YIELD_THRESHOLD;
for (i, chunk) in html.chunks(*STREAMING_CHUNK_SIZE).enumerate() {
if rewriter.write(chunk).is_err() {
wrote_error = true;
break;
}
if should_yield
&& i % REWRITER_YIELD_INTERVAL == REWRITER_YIELD_INTERVAL - 1
{
tokio::task::yield_now().await;
}
}
if !wrote_error {
let _ = rewriter.end();
}
}
if let Some(build_ssg_path) = cell.get() {
if !build_ssg_path.is_empty() {
let build_page = Page::new_page(build_ssg_path, client).await;
for cap in SSG_CAPTURE.captures_iter(build_page.get_html_bytes_u8()) {
if let Some(matched) = cap.get(1) {
let href =
auto_encode_bytes(matched.as_bytes()).replace(r#"\u002F"#, "/");
let last_segment = crate::utils::get_last_segment(&href);
if !(last_segment.starts_with("[") && last_segment.ends_with("]")) {
let base = if relative_directory_url(&href) || base.is_none() {
original_page
} else {
base
};
let base = if base_input_url.initialized() {
base_input_url.get()
} else {
base
};
push_link(
&base,
&href,
&mut map_ssg,
&selectors.0,
parent_host,
parent_host_scheme,
base_input_domain,
sub_matcher,
&self.external_domains_caseless,
&mut None,
);
}
}
}
}
}
}
}
if let Some(lp) = links_pages {
let page_links = self.page_links.get_or_insert_with(Default::default);
page_links.extend(lp.into_iter().map(Into::into));
}
let valid_meta = meta_title.is_some()
|| meta_description.is_some()
|| meta_og_image.is_some()
|| self.get_metadata().is_some();
if valid_meta {
let mut metadata_inner = Metadata::default();
metadata_inner.title = meta_title;
metadata_inner.description = meta_description;
metadata_inner.image = meta_og_image;
if metadata_inner.exist() && self.metadata.is_some() {
set_metadata(&mut self.metadata, &mut metadata_inner);
}
if metadata_inner.exist() {
metadata.replace(Box::new(metadata_inner));
}
if metadata.is_some() {
self.metadata = metadata;
}
}
map.extend(map_ssg);
update_link_capacity_hint(map.len());
map
}
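/// Extracts links with SSG manifest handling, preferring the in-memory HTML
/// and falling back to the disk spool under the `balance` feature. Binary
/// spooled content yields an empty set.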
#[cfg(not(feature = "decentralized"))]
pub async fn links_stream_ssg<
A: PartialEq
+ Eq
+ Sync
+ Send
+ Clone
+ Default
+ ToString
+ std::hash::Hash
+ From<String>
+ Into<CaseInsensitiveString>
+ for<'a> From<&'a str>,
>(
&mut self,
selectors: &RelativeSelectors,
client: &Client,
prior_domain: &Option<Box<Url>>,
) -> HashSet<A> {
if self.is_binary_spool_aware() {
Default::default()
} else if let Some(html_bytes) = self.html.take() {
let result = self
.links_stream_base_ssg(selectors, &html_bytes, client, prior_domain)
.await;
self.html = Some(html_bytes);
result
} else {
#[cfg(all(feature = "balance", not(feature = "decentralized")))]
if let Some(ref guard) = self.html_spool_path {
if let Some(path) = guard.path() {
return self
.links_stream_base_from_disk_ssg(
selectors,
path.to_path_buf(),
client,
prior_domain,
)
.await;
}
}
Default::default()
}
}
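/// Finds the page links as `CaseInsensitiveString`s, resolving SSG build
/// manifests, using the in-memory HTML or the disk spool as available.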
#[inline(always)]
#[cfg(not(feature = "decentralized"))]
pub async fn links_ssg(
&mut self,
selectors: &RelativeSelectors,
client: &Client,
prior_domain: &Option<Box<Url>>,
) -> HashSet<CaseInsensitiveString> {
let has_html = self.html.is_some();
#[cfg(all(feature = "balance", not(feature = "decentralized")))]
let has_html = has_html || self.html_spool_path.is_some();
match has_html {
false => Default::default(),
true => {
#[cfg(all(feature = "balance", not(feature = "decentralized")))]
if self.html.is_none() && self.html_spool_path.is_some() {
if let Some(ref guard) = self.html_spool_path {
if let Some(path) = guard.path() {
return self
.links_stream_base_from_disk_ssg(
selectors,
path.to_path_buf(),
client,
prior_domain,
)
.await;
}
}
return Default::default();
}
self.links_stream_ssg::<CaseInsensitiveString>(selectors, client, prior_domain)
.await
}
}
}
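/// Extracts all anchor links from the page HTML, streaming from memory or
/// from the disk spool when the body has been spooled.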
#[inline(always)]
#[cfg(all(not(feature = "decentralized"), not(feature = "full_resources")))]
pub async fn links_stream<
A: PartialEq
+ Eq
+ Sync
+ Send
+ Clone
+ Default
+ ToString
+ std::hash::Hash
+ From<String>
+ Into<CaseInsensitiveString>
+ for<'a> From<&'a str>,
>(
&mut self,
selectors: &RelativeSelectors,
base: &Option<Box<Url>>,
) -> HashSet<A> {
if self.is_binary_spool_aware() {
Default::default()
} else if let Some(html_bytes) = self.html.take() {
let result = self.links_stream_base(selectors, &html_bytes, base).await;
self.html = Some(html_bytes);
result
} else {
#[cfg(all(feature = "balance", not(feature = "decentralized")))]
if let Some(ref guard) = self.html_spool_path {
if let Some(path) = guard.path() {
return self
.links_stream_base_from_disk(selectors, path.to_path_buf(), base)
.await;
}
}
Default::default()
}
}
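/// Extracts links while scoring the page for JavaScript-rendering hints
/// (framework chunks, hydration markers, noscript requirements). When the
/// score reaches the upgrade threshold the page is re-rendered through
/// headless Chrome and those links are merged in. Returns the link set and
/// any bytes transferred by the browser.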
#[cfg(all(
not(feature = "decentralized"),
not(feature = "full_resources"),
feature = "smart"
))]
#[inline(always)]
#[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
pub(crate) async fn links_stream_smart<
A: PartialEq
+ Eq
+ Sync
+ Send
+ Clone
+ Default
+ ToString
+ std::hash::Hash
+ From<String>
+ Into<CaseInsensitiveString>
+ for<'a> From<&'a str>,
>(
&mut self,
selectors: &RelativeSelectors,
configuration: &crate::configuration::Configuration,
base: &Option<Box<Url>>,
browser: &crate::features::chrome::OnceBrowser,
jar: Option<&std::sync::Arc<crate::client::cookie::Jar>>,
) -> (HashSet<A>, Option<f64>) {
use lol_html::{element, text};
use std::sync::atomic::Ordering;
let mut bytes_transferred: Option<f64> = None;
let mut map: HashSet<A> = HashSet::with_capacity(link_set_capacity());
let mut inner_map: HashSet<A> = HashSet::new();
let mut links_pages: Option<HashSet<A>> = if self.page_links.is_some() {
Some(HashSet::new())
} else {
None
};
let mut metadata: Option<Box<Metadata>> = None;
let mut meta_title: Option<_> = None;
let mut meta_description: Option<_> = None;
let mut meta_og_image: Option<_> = None;
if self.is_xml {
if let Some(html_bytes_taken) = self.html.take() {
self.links_stream_xml_links_stream_base(
selectors,
html_bytes_taken.as_ref(),
&mut map,
&base,
)
.await;
self.html = Some(html_bytes_taken);
} else {
#[cfg(all(feature = "balance", not(feature = "decentralized")))]
if let Some(ref guard) = self.html_spool_path {
if let Some(path) = guard.path() {
self.links_stream_xml_from_disk(
selectors,
path.to_path_buf(),
&mut map,
&base,
)
.await;
}
}
}
} else {
#[cfg(all(feature = "balance", not(feature = "decentralized")))]
if self.html.is_none() && self.html_spool_path.is_some() {
if let Some(ref guard) = self.html_spool_path {
if let Some(path) = guard.path() {
let disk_links: HashSet<A> = self
.links_stream_base_from_disk(selectors, path.to_path_buf(), base)
.await;
map.extend(disk_links);
}
}
}
if let Some(html_bytes_taken) = self.html.take() {
{
let base_input_url = tokio::sync::OnceCell::new();
let base_input_domain = &selectors.2;
let parent_frags = &selectors.1;
let parent_host = &parent_frags[0];
let parent_host_scheme = &parent_frags[1];
let sub_matcher = &selectors.0;
let base1 = base.as_deref();
let original_page = {
self.set_url_parsed_direct_empty();
self.get_url_parsed_ref().as_ref().cloned()
};
let external_domains_caseless = &self.external_domains_caseless;
const SMART_UPGRADE_THRESHOLD: u8 = 10;
let upgrade_score = std::sync::atomic::AtomicU8::new(0);
let mut static_app = false;
let mut script_src_count: u8 = 0;
let xml_file = self.get_url().ends_with(".xml");
let mut element_content_handlers = metadata_handlers(
&mut meta_title,
&mut meta_description,
&mut meta_og_image,
);
element_content_handlers.push(element_precompiled!(
compiled_base_element_selector(),
|el| {
if let Some(href) = el.get_attribute("href") {
if let Ok(parsed_base) = Url::parse(&href) {
let _ = base_input_url.set(parsed_base);
}
}
Ok(())
}
));
element_content_handlers.push(element!("script", |el| {
if static_app
|| upgrade_score.load(Ordering::Relaxed) >= SMART_UPGRADE_THRESHOLD
{
return Ok(());
}
let id = el.get_attribute("id");
if id.as_deref() == *NUXT_DATA {
static_app = true;
upgrade_score.store(SMART_UPGRADE_THRESHOLD, Ordering::Relaxed);
return Ok(());
}
if el.get_attribute("data-target").as_deref() == *REACT_SSR {
upgrade_score.store(SMART_UPGRADE_THRESHOLD, Ordering::Relaxed);
return Ok(());
}
let Some(src) = el.get_attribute("src") else {
return Ok(());
};
if !is_tracker_script(&src) {
script_src_count = script_src_count.saturating_add(1);
if script_src_count >= 4 {
let _ = upgrade_score.fetch_update(
Ordering::Relaxed,
Ordering::Relaxed,
|v| Some(v.saturating_add(SMART_UPGRADE_THRESHOLD)),
);
}
}
if !src.starts_with('/') {
return Ok(());
}
let is_next = src.starts_with("/_next/static/chunks/pages/")
|| src.starts_with("/webpack-runtime-");
let is_gatsby = id.as_deref() == *GATSBY;
let is_nuxt_asset = src.starts_with("/_nuxt/");
if is_next || is_gatsby || is_nuxt_asset {
static_app = true;
}
if is_nuxt_asset {
upgrade_score.store(SMART_UPGRADE_THRESHOLD, Ordering::Relaxed);
return Ok(());
}
if let Some(base) = base1.as_ref() {
let abs = convert_abs_path(base, &src);
if abs.path_segments().is_some_and(|mut segs| {
segs.any(|p| {
chromiumoxide::handler::network::ALLOWED_MATCHER.is_match(p)
})
}) {
upgrade_score.store(SMART_UPGRADE_THRESHOLD, Ordering::Relaxed);
}
}
Ok(())
}));
element_content_handlers.push(element_precompiled!(
if xml_file {
compiled_xml_selector()
} else {
compiled_selector()
},
|el| {
if let Some(href) = el.get_attribute("href") {
let base = if relative_directory_url(&href) || base.is_none() {
original_page.as_ref()
} else {
base.as_deref()
};
let base = if base_input_url.initialized() {
base_input_url.get()
} else {
base
};
push_link(
&base,
&href,
&mut inner_map,
&selectors.0,
parent_host,
parent_host_scheme,
base_input_domain,
sub_matcher,
external_domains_caseless,
&mut links_pages,
);
}
Ok(())
}
));
element_content_handlers.push(text!("noscript", |el| {
if upgrade_score.load(Ordering::Relaxed) < SMART_UPGRADE_THRESHOLD {
if NO_SCRIPT_JS_REQUIRED.find(el.as_str()).is_some() {
upgrade_score.store(SMART_UPGRADE_THRESHOLD, Ordering::Relaxed);
}
}
Ok(())
}));
element_content_handlers.push(text!("script", |el| {
let s = el.as_str();
if !s.is_empty()
&& upgrade_score.load(Ordering::Relaxed) < SMART_UPGRADE_THRESHOLD
{
if DOM_SCRIPT_WATCH_METHODS.find(s).is_some() {
let _ = upgrade_score.fetch_update(
Ordering::Relaxed,
Ordering::Relaxed,
|v| Some(v.saturating_add(7)),
);
}
}
Ok(())
}));
element_content_handlers.push(element!("body", |el| {
if upgrade_score.load(Ordering::Relaxed) < SMART_UPGRADE_THRESHOLD {
let mut matched = false;
if let Some(id) = el.get_attribute("id") {
if HYDRATION_IDS.contains(&id) {
upgrade_score.store(SMART_UPGRADE_THRESHOLD, Ordering::Relaxed);
matched = true;
}
}
if !matched {
for attr in DOM_WATCH_ATTRIBUTE_PATTERNS.iter() {
if el.has_attribute(attr) {
upgrade_score
.store(SMART_UPGRADE_THRESHOLD, Ordering::Relaxed);
break;
}
}
}
}
Ok(())
}));
let rewriter_settings = lol_html::Settings {
element_content_handlers,
adjust_charset_on_meta_tag: true,
..lol_html::send::Settings::new_for_handler_types()
};
let mut rewriter =
lol_html::send::HtmlRewriter::new(rewriter_settings, |_c: &[u8]| {});
let mut wrote_error = false;
let should_yield = html_bytes_taken.len() > REWRITER_YIELD_THRESHOLD;
for (i, chunk) in html_bytes_taken.chunks(*STREAMING_CHUNK_SIZE).enumerate() {
if rewriter.write(chunk).is_err() {
wrote_error = true;
break;
}
if should_yield
&& i % REWRITER_YIELD_INTERVAL == REWRITER_YIELD_INTERVAL - 1
{
tokio::task::yield_now().await;
}
}
if !wrote_error {
let _ = rewriter.end();
} else {
drop(rewriter);
}
let mut score = upgrade_score.load(Ordering::Relaxed);
if score < SMART_UPGRADE_THRESHOLD
&& crate::utils::detect_anti_bot_from_body(&html_bytes_taken).is_some()
{
score = SMART_UPGRADE_THRESHOLD;
}
if score >= SMART_UPGRADE_THRESHOLD {
if let Some(browser_controller) = browser
.get_or_init(|| {
crate::website::Website::setup_browser_base(
&configuration,
&base,
jar,
)
})
.await
{
if let Ok(new_page) = crate::features::chrome::attempt_navigation(
"about:blank",
&browser_controller.browser.0,
&configuration.request_timeout,
&browser_controller.browser.2,
&configuration.viewport,
)
.await
{
let (intercept_handle, _) = tokio::join!(
crate::features::chrome::setup_chrome_interception_base(
&new_page,
configuration.chrome_intercept.enabled,
&configuration.auth_challenge_response,
configuration.chrome_intercept.block_visuals,
&parent_host,
),
crate::features::chrome::setup_chrome_events(
&new_page,
&configuration,
),
);
if let Some(cookie_jar) = jar {
if let Some(u) = &original_page {
if !configuration.cookie_str.is_empty() {
let _ =
crate::features::chrome::seed_jar_from_cookie_header(
cookie_jar,
&configuration.cookie_str,
&u,
);
}
if let Ok(cps) =
crate::features::chrome::cookie_params_from_jar(
cookie_jar, &u,
)
{
let _ = crate::features::chrome::set_page_cookies(
&new_page, cps,
)
.await;
}
}
}
let fetch_params = configuration.chrome_fetch_params();
let chrome_parsed_target = original_page.clone();
let chrome_xml_file = self.get_url().ends_with(".xml");
let chrome_base_input_url = tokio::sync::OnceCell::new();
let mut chrome_meta_title_unused: Option<CompactString> = None;
let mut chrome_meta_description_unused: Option<CompactString> =
None;
let mut chrome_meta_og_image_unused: Option<CompactString> = None;
let mut chrome_links_pages_unused: Option<HashSet<A>> = None;
let mut chrome_extracted_links: HashSet<A> =
HashSet::with_capacity(link_set_capacity());
let (page_resource, chrome_extract_succeeded) = {
let chrome_external_domains_caseless =
&self.external_domains_caseless;
let chrome_handlers = build_link_extract_handlers(
LinkExtractCtx {
selectors,
external_domains_caseless:
chrome_external_domains_caseless,
map: &mut chrome_extracted_links,
links_pages: &mut chrome_links_pages_unused,
base_input_url: &chrome_base_input_url,
base: chrome_parsed_target.as_ref(),
original_page: chrome_parsed_target.as_ref(),
ssg_raw_src_cell: None,
ssg_resolved_path_cell: None,
xml_file: chrome_xml_file,
full_resources: false,
skip_links: false,
},
&mut chrome_meta_title_unused,
&mut chrome_meta_description_unused,
&mut chrome_meta_og_image_unused,
);
let mut chrome_extract =
ChromeStreamingExtractor::new(chrome_handlers, None, true);
let resource = crate::utils::fetch_page_html_chrome_base(
&html_bytes_taken,
&new_page,
true,
true,
false,
Some(&self.url),
configuration.referer.clone(),
configuration.max_page_bytes,
configuration.get_cache_options(),
{
#[cfg(feature = "headers")]
{
&self.headers
}
#[cfg(not(feature = "headers"))]
{
&None
}
},
&Some(&configuration.chrome_intercept),
jar,
configuration.cache_namespace_str(),
&fetch_params,
Some(&mut chrome_extract),
)
.await;
let succeeded = chrome_extract.end();
(resource, succeeded)
};
if let Some(h) = intercept_handle {
let abort_handle = h.abort_handle();
if let Err(elapsed) = tokio::time::timeout(
tokio::time::Duration::from_secs(15),
h,
)
.await
{
log::warn!("Handler timeout exceeded {elasped}");
abort_handle.abort();
}
}
if let Ok(resource) = page_resource {
let base = if base_input_url.initialized() {
base_input_url.get().cloned().map(Box::new)
} else {
base1.as_deref().cloned().map(Box::new)
};
bytes_transferred = resource.bytes_transferred;
let new_page = build(&self.url, resource);
page_assign(self, new_page);
let extended_map: HashSet<A> = if chrome_extract_succeeded {
chrome_extracted_links
} else {
let fallback_bytes = self.html.take();
let m = self
.links_stream_base::<A>(
selectors,
fallback_bytes.as_deref().unwrap_or(&[]),
&base,
)
.await;
if let Some(b) = fallback_bytes {
self.html = Some(b);
}
m
};
map.extend(extended_map);
};
}
}
}
}
map.extend(inner_map);
self.html = Some(html_bytes_taken);
}
}
if let Some(lp) = links_pages {
let page_links = self.page_links.get_or_insert_with(Default::default);
page_links.extend(lp.into_iter().map(Into::into));
page_links.extend(map.iter().map(|item| item.clone().into()));
}
let valid_meta =
meta_title.is_some() || meta_description.is_some() || meta_og_image.is_some();
if valid_meta {
let mut metadata_inner = Metadata::default();
metadata_inner.title = meta_title;
metadata_inner.description = meta_description;
metadata_inner.image = meta_og_image;
if metadata_inner.exist() {
metadata.replace(Box::new(metadata_inner));
}
if metadata.is_some() {
self.metadata = metadata;
}
}
update_link_capacity_hint(map.len());
(map, bytes_transferred)
}
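/// Full-resource variant of the smart link extractor: also records script
/// and stylesheet URLs while applying the same Chrome-upgrade heuristics.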
#[cfg(all(
not(feature = "decentralized"),
feature = "full_resources",
feature = "smart"
))]
#[inline(always)]
#[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
pub async fn links_stream_smart<
A: PartialEq
+ Eq
+ Sync
+ Send
+ Clone
+ Default
+ ToString
+ std::hash::Hash
+ From<String>
+ Into<CaseInsensitiveString>
+ for<'a> From<&'a str>,
>(
&mut self,
selectors: &RelativeSelectors,
configuration: &crate::configuration::Configuration,
base: &Option<Box<Url>>,
browser: &crate::features::chrome::OnceBrowser,
jar: Option<&std::sync::Arc<crate::client::cookie::Jar>>,
) -> (HashSet<A>, Option<f64>) {
use lol_html::{element, text};
use std::sync::atomic::Ordering;
let mut bytes_transferred: Option<f64> = None;
let mut map: HashSet<A> = HashSet::with_capacity(link_set_capacity());
let mut inner_map: HashSet<A> = HashSet::new();
let mut links_pages: Option<HashSet<A>> = if self.page_links.is_some() {
Some(HashSet::new())
} else {
None
};
let mut metadata: Option<Box<Metadata>> = None;
let mut meta_title: Option<_> = None;
let mut meta_description: Option<_> = None;
let mut meta_og_image: Option<_> = None;
if self.is_xml {
if let Some(html_bytes_taken) = self.html.take() {
self.links_stream_xml_links_stream_base(
selectors,
html_bytes_taken.as_ref(),
&mut map,
base,
)
.await;
self.html = Some(html_bytes_taken);
} else {
#[cfg(all(feature = "balance", not(feature = "decentralized")))]
if let Some(ref guard) = self.html_spool_path {
if let Some(path) = guard.path() {
self.links_stream_xml_from_disk(
selectors,
path.to_path_buf(),
&mut map,
base,
)
.await;
}
}
}
} else {
#[cfg(all(feature = "balance", not(feature = "decentralized")))]
if self.html.is_none() && self.html_spool_path.is_some() {
if let Some(ref guard) = self.html_spool_path {
if let Some(path) = guard.path() {
let disk_links: HashSet<A> = self
.links_stream_base_from_disk(selectors, path.to_path_buf(), base)
.await;
map.extend(disk_links);
}
}
}
if let Some(html_bytes_taken) = self.html.take() {
{
let base_input_url = tokio::sync::OnceCell::new();
let base_input_domain = &selectors.2;
let parent_frags = &selectors.1;
let parent_host = &parent_frags[0];
let parent_host_scheme = &parent_frags[1];
let sub_matcher = &selectors.0;
let base1 = base.as_deref();
let original_page = {
self.set_url_parsed_direct_empty();
self.get_url_parsed_ref().as_ref().cloned()
};
let external_domains_caseless = &self.external_domains_caseless;
const SMART_UPGRADE_THRESHOLD: u8 = 10;
let upgrade_score = std::sync::atomic::AtomicU8::new(0);
let mut static_app = false;
let mut script_src_count: u8 = 0;
let mut element_content_handlers = vec![
element_precompiled!(compiled_base_element_selector(), |el| {
if let Some(href) = el.get_attribute("href") {
if let Ok(parsed_base) = Url::parse(&href) {
let _ = base_input_url.set(parsed_base);
}
}
Ok(())
}),
element!("script", |el| {
if static_app
|| upgrade_score.load(Ordering::Relaxed) >= SMART_UPGRADE_THRESHOLD
{
return Ok(());
}
let id = el.get_attribute("id");
if id.as_deref() == *NUXT_DATA {
static_app = true;
upgrade_score.store(SMART_UPGRADE_THRESHOLD, Ordering::Relaxed);
return Ok(());
}
if el.get_attribute("data-target").as_deref() == *REACT_SSR {
upgrade_score.store(SMART_UPGRADE_THRESHOLD, Ordering::Relaxed);
return Ok(());
}
let Some(src) = el.get_attribute("src") else {
return Ok(());
};
if !src.starts_with('/') {
return Ok(());
}
let is_next = src.starts_with("/_next/static/chunks/pages/")
|| src.starts_with("/webpack-runtime-");
let is_gatsby = id.as_deref() == *GATSBY;
let is_nuxt_asset = src.starts_with("/_nuxt/");
if is_next || is_gatsby || is_nuxt_asset {
static_app = true;
}
if is_nuxt_asset {
upgrade_score.store(SMART_UPGRADE_THRESHOLD, Ordering::Relaxed);
return Ok(());
}
if let Some(base) = base1.as_ref() {
let abs = convert_abs_path(base, &src);
if abs.path_segments().is_some_and(|mut segs| {
segs.any(|p| {
chromiumoxide::handler::network::ALLOWED_MATCHER.is_match(p)
})
}) {
upgrade_score.store(SMART_UPGRADE_THRESHOLD, Ordering::Relaxed);
}
}
Ok(())
}),
element!(
"a[href]:not([aria-hidden=\"true\"]),script[src],link[href]",
|el| {
let attribute = if el.tag_name() == "script" {
if let Some(src) = el.get_attribute("src") {
if !is_tracker_script(&src) {
script_src_count = script_src_count.saturating_add(1);
if script_src_count >= 4 {
let _ = upgrade_score.fetch_update(
Ordering::Relaxed,
Ordering::Relaxed,
|v| {
Some(v.saturating_add(
SMART_UPGRADE_THRESHOLD,
))
},
);
}
}
}
"src"
} else {
"href"
};
if let Some(href) = el.get_attribute(attribute) {
let base = if relative_directory_url(&href) || base.is_none() {
original_page.as_ref()
} else {
base.as_deref()
};
let base = if base_input_url.initialized() {
base_input_url.get()
} else {
base
};
push_link(
&base,
&href,
&mut inner_map,
&selectors.0,
parent_host,
parent_host_scheme,
base_input_domain,
sub_matcher,
external_domains_caseless,
&mut links_pages,
);
}
Ok(())
}
),
text!("noscript", |el| {
if upgrade_score.load(Ordering::Relaxed) < SMART_UPGRADE_THRESHOLD
&& NO_SCRIPT_JS_REQUIRED.find(el.as_str()).is_some()
{
upgrade_score.store(SMART_UPGRADE_THRESHOLD, Ordering::Relaxed);
}
Ok(())
}),
text!("script", |el| {
let s = el.as_str();
if !s.is_empty()
&& upgrade_score.load(Ordering::Relaxed) < SMART_UPGRADE_THRESHOLD
&& DOM_SCRIPT_WATCH_METHODS.find(s).is_some()
{
let _ = upgrade_score.fetch_update(
Ordering::Relaxed,
Ordering::Relaxed,
|v| Some(v.saturating_add(7)),
);
}
Ok(())
}),
element!("body", |el| {
if upgrade_score.load(Ordering::Relaxed) < SMART_UPGRADE_THRESHOLD {
let mut matched = false;
if let Some(id) = el.get_attribute("id") {
if HYDRATION_IDS.contains(&id) {
upgrade_score
.store(SMART_UPGRADE_THRESHOLD, Ordering::Relaxed);
matched = true;
}
}
if !matched {
for attr in DOM_WATCH_ATTRIBUTE_PATTERNS.iter() {
if el.has_attribute(attr) {
upgrade_score
.store(SMART_UPGRADE_THRESHOLD, Ordering::Relaxed);
break;
}
}
}
}
Ok(())
}),
];
element_content_handlers.extend(metadata_handlers(
&mut meta_title,
&mut meta_description,
&mut meta_og_image,
));
let rewriter_settings = lol_html::Settings {
element_content_handlers,
adjust_charset_on_meta_tag: true,
..lol_html::send::Settings::new_for_handler_types()
};
let mut rewriter =
lol_html::send::HtmlRewriter::new(rewriter_settings, |_c: &[u8]| {});
let mut wrote_error = false;
let should_yield = html_bytes_taken.len() > REWRITER_YIELD_THRESHOLD;
for (i, chunk) in html_bytes_taken.chunks(*STREAMING_CHUNK_SIZE).enumerate() {
if rewriter.write(chunk).is_err() {
wrote_error = true;
break;
}
if should_yield
&& i % REWRITER_YIELD_INTERVAL == REWRITER_YIELD_INTERVAL - 1
{
tokio::task::yield_now().await;
}
}
if !wrote_error {
let _ = rewriter.end();
} else {
drop(rewriter);
}
let mut score = upgrade_score.load(Ordering::Relaxed);
if score < SMART_UPGRADE_THRESHOLD
&& crate::utils::detect_anti_bot_from_body(&html_bytes_taken).is_some()
{
score = SMART_UPGRADE_THRESHOLD;
}
if score >= SMART_UPGRADE_THRESHOLD {
if let Some(browser_controller) = browser
.get_or_init(|| {
crate::website::Website::setup_browser_base(
configuration,
base,
jar,
)
})
.await
{
if let Ok(new_page) = crate::features::chrome::attempt_navigation(
"about:blank",
&browser_controller.browser.0,
&configuration.request_timeout,
&browser_controller.browser.2,
&configuration.viewport,
)
.await
{
let (intercept_handle, _) = tokio::join!(
crate::features::chrome::setup_chrome_interception_base(
&new_page,
configuration.chrome_intercept.enabled,
&configuration.auth_challenge_response,
configuration.chrome_intercept.block_visuals,
parent_host,
),
crate::features::chrome::setup_chrome_events(
&new_page,
configuration,
)
);
if let Some(cookie_jar) = jar {
if let Some(u) = &original_page {
if !configuration.cookie_str.is_empty() {
let _ =
crate::features::chrome::seed_jar_from_cookie_header(
cookie_jar,
&configuration.cookie_str,
u,
);
}
if let Ok(cps) =
crate::features::chrome::cookie_params_from_jar(
cookie_jar, u,
)
{
let _ = crate::features::chrome::set_page_cookies(
&new_page, cps,
)
.await;
}
}
}
let fetch_params = configuration.chrome_fetch_params();
let chrome_parsed_target = original_page.clone();
let chrome_xml_file = self.get_url().ends_with(".xml");
let chrome_base_input_url = tokio::sync::OnceCell::new();
let mut chrome_meta_title_unused: Option<CompactString> = None;
let mut chrome_meta_description_unused: Option<CompactString> =
None;
let mut chrome_meta_og_image_unused: Option<CompactString> = None;
let mut chrome_links_pages_unused: Option<HashSet<A>> = None;
let mut chrome_extracted_links: HashSet<A> =
HashSet::with_capacity(link_set_capacity());
let (page_resource, chrome_extract_succeeded) = {
let chrome_external_domains_caseless =
&self.external_domains_caseless;
let chrome_handlers = build_link_extract_handlers(
LinkExtractCtx {
selectors,
external_domains_caseless:
chrome_external_domains_caseless,
map: &mut chrome_extracted_links,
links_pages: &mut chrome_links_pages_unused,
base_input_url: &chrome_base_input_url,
base: chrome_parsed_target.as_ref(),
original_page: chrome_parsed_target.as_ref(),
ssg_raw_src_cell: None,
ssg_resolved_path_cell: None,
xml_file: chrome_xml_file,
full_resources: true,
skip_links: false,
},
&mut chrome_meta_title_unused,
&mut chrome_meta_description_unused,
&mut chrome_meta_og_image_unused,
);
let mut chrome_extract =
ChromeStreamingExtractor::new(chrome_handlers, None, true);
let resource = crate::utils::fetch_page_html_chrome_base(
&html_bytes_taken,
&new_page,
true,
true,
false,
Some(&self.url),
configuration.referer.clone(),
configuration.max_page_bytes,
configuration.get_cache_options(),
{
#[cfg(feature = "headers")]
{
&self.headers
}
#[cfg(not(feature = "headers"))]
{
&None
}
},
&Some(&configuration.chrome_intercept),
jar,
configuration.cache_namespace_str(),
&fetch_params,
Some(&mut chrome_extract),
)
.await;
let succeeded = chrome_extract.end();
(resource, succeeded)
};
if let Some(h) = intercept_handle {
let abort_handle = h.abort_handle();
if let Err(elapsed) = tokio::time::timeout(
tokio::time::Duration::from_secs(15),
h,
)
.await
{
log::warn!("Handler timeout exceeded {elasped}");
abort_handle.abort();
}
}
if let Ok(v) = page_resource {
bytes_transferred = v.bytes_transferred;
let new_page = build(&self.url, v);
page_assign(self, new_page);
let extended_map: HashSet<A> = if chrome_extract_succeeded {
chrome_extracted_links
} else {
let fallback_bytes = self.html.take();
let m = self
.links_stream_base::<A>(
selectors,
fallback_bytes.as_deref().unwrap_or(&[]),
&base.as_deref().cloned().map(Box::new),
)
.await;
if let Some(b) = fallback_bytes {
self.html = Some(b);
}
m
};
map.extend(extended_map);
}
}
}
}
}
map.extend(inner_map);
self.html = Some(html_bytes_taken);
}
}
if let Some(lp) = links_pages {
let page_links = self.page_links.get_or_insert_with(Default::default);
page_links.extend(lp.into_iter().map(Into::into));
page_links.extend(map.iter().map(|item| item.clone().into()));
}
let valid_meta =
meta_title.is_some() || meta_description.is_some() || meta_og_image.is_some();
if valid_meta {
let mut metadata_inner = Metadata::default();
metadata_inner.title = meta_title;
metadata_inner.description = meta_description;
metadata_inner.image = meta_og_image;
if metadata_inner.exist() {
metadata.replace(Box::new(metadata_inner));
}
if metadata.is_some() {
self.metadata = metadata;
}
}
update_link_capacity_hint(map.len());
(map, bytes_transferred)
}
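/// Extracts links including page resources such as scripts and stylesheets
/// in addition to anchors.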
#[inline(always)]
#[cfg(not(feature = "decentralized"))]
#[cfg_attr(feature = "tracing", tracing::instrument(skip_all,))]
pub async fn links_stream_full_resource<
A: PartialEq
+ Eq
+ Sync
+ Send
+ Clone
+ Default
+ ToString
+ std::hash::Hash
+ From<String>
+ Into<CaseInsensitiveString>
+ for<'a> From<&'a str>,
>(
&mut self,
selectors: &RelativeSelectors,
base: &Option<Box<Url>>,
) -> HashSet<A> {
let mut map: HashSet<A> = HashSet::with_capacity(link_set_capacity());
let mut links_pages: Option<HashSet<A>> = if self.page_links.is_some() {
Some(HashSet::new())
} else {
None
};
let mut metadata: Option<Box<Metadata>> = None;
let mut meta_title: Option<_> = None;
let mut meta_description: Option<_> = None;
let mut meta_og_image: Option<_> = None;
if self.is_xml {
if let Some(html_bytes_taken) = self.html.take() {
self.links_stream_xml_links_stream_base(
selectors,
html_bytes_taken.as_ref(),
&mut map,
base,
)
.await;
self.html = Some(html_bytes_taken);
} else {
#[cfg(all(feature = "balance", not(feature = "decentralized")))]
if let Some(ref guard) = self.html_spool_path {
if let Some(path) = guard.path() {
self.links_stream_xml_from_disk(
selectors,
path.to_path_buf(),
&mut map,
base,
)
.await;
}
}
}
} else {
#[cfg(all(feature = "balance", not(feature = "decentralized")))]
if self.html.is_none() && self.html_spool_path.is_some() {
if let Some(ref guard) = self.html_spool_path {
if let Some(path) = guard.path() {
let disk_links = self
.links_stream_base_from_disk(selectors, path.to_path_buf(), base)
.await;
map.extend(disk_links);
}
}
}
if let Some(html_bytes_taken) = self.html.take() {
{
let base_input_url = tokio::sync::OnceCell::new();
let base = base.as_deref();
let xml_file = self.get_url().ends_with(".xml");
self.set_url_parsed_direct_empty();
let original_page = self.get_url_parsed_ref().as_ref();
let external_domains_caseless = &self.external_domains_caseless;
let element_content_handlers = build_link_extract_handlers(
LinkExtractCtx {
selectors,
external_domains_caseless,
map: &mut map,
links_pages: &mut links_pages,
base_input_url: &base_input_url,
base,
original_page,
ssg_raw_src_cell: None,
ssg_resolved_path_cell: None,
xml_file,
full_resources: true,
skip_links: false,
},
&mut meta_title,
&mut meta_description,
&mut meta_og_image,
);
let settings = lol_html::send::Settings {
element_content_handlers,
adjust_charset_on_meta_tag: true,
..lol_html::send::Settings::new_for_handler_types()
};
let mut rewriter = lol_html::send::HtmlRewriter::new(settings, |_c: &[u8]| {});
let mut wrote_error = false;
let should_yield = html_bytes_taken.len() > REWRITER_YIELD_THRESHOLD;
for (i, chunk) in html_bytes_taken.chunks(*STREAMING_CHUNK_SIZE).enumerate() {
if rewriter.write(chunk).is_err() {
wrote_error = true;
break;
}
if should_yield
&& i % REWRITER_YIELD_INTERVAL == REWRITER_YIELD_INTERVAL - 1
{
tokio::task::yield_now().await;
}
}
if !wrote_error {
let _ = rewriter.end();
}
}
self.html = Some(html_bytes_taken);
}
}
let valid_meta =
meta_title.is_some() || meta_description.is_some() || meta_og_image.is_some();
if valid_meta {
let mut metadata_inner = Metadata::default();
metadata_inner.title = meta_title;
metadata_inner.description = meta_description;
metadata_inner.image = meta_og_image;
if metadata_inner.exist() {
metadata.replace(Box::new(metadata_inner));
}
if metadata.is_some() {
self.metadata = metadata;
}
}
update_link_capacity_hint(map.len());
map
}
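/// Extracts all links including page resources, streaming from memory or
/// from the disk spool when the body has been spooled.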
#[inline(always)]
#[cfg(all(not(feature = "decentralized"), feature = "full_resources"))]
pub async fn links_stream<
A: PartialEq
+ Eq
+ Sync
+ Send
+ Clone
+ Default
+ ToString
+ std::hash::Hash
+ From<String>
+ Into<CaseInsensitiveString>
+ for<'a> From<&'a str>,
>(
&mut self,
selectors: &RelativeSelectors,
base: &Option<Box<Url>>,
) -> HashSet<A> {
if self.is_binary_spool_aware() {
Default::default()
} else {
#[cfg(all(feature = "balance", not(feature = "decentralized")))]
if self.html.is_none() && self.html_spool_path.is_some() {
if let Some(ref guard) = self.html_spool_path {
if let Some(path) = guard.path() {
return self
.links_stream_base_from_disk(selectors, path.to_path_buf(), base)
.await;
}
}
return Default::default();
}
self.links_stream_full_resource(selectors, base).await
}
}
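/// Decentralized builds do not parse HTML locally, so this always returns an
/// empty set.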
#[inline(always)]
#[cfg(feature = "decentralized")]
pub async fn links_stream<
A: PartialEq
+ Eq
+ Sync
+ Send
+ Clone
+ Default
+ ToString
+ std::hash::Hash
+ From<String>
+ Into<CaseInsensitiveString>
+ for<'a> From<&'a str>,
>(
&mut self,
_: &RelativeSelectors,
) -> HashSet<A> {
Default::default()
}
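/// Finds the href links on the page using the configured CSS selectors.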
#[cfg(not(feature = "decentralized"))]
#[inline(always)]
pub async fn links(
&mut self,
selectors: &RelativeSelectors,
base: &Option<Box<Url>>,
) -> HashSet<CaseInsensitiveString> {
let has_html = self.html.is_some();
#[cfg(all(feature = "balance", not(feature = "decentralized")))]
let has_html = has_html || self.html_spool_path.is_some();
match has_html {
false => Default::default(),
true => {
self.links_stream::<CaseInsensitiveString>(selectors, base)
.await
}
}
}
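/// Finds all links on the page, including resource URLs, preferring the disk
/// spool when the body has been spooled.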
#[inline(always)]
#[cfg(not(feature = "decentralized"))]
pub async fn links_full(
&mut self,
selectors: &RelativeSelectors,
base: &Option<Box<Url>>,
) -> HashSet<CaseInsensitiveString> {
let has_html = self.html.is_some();
#[cfg(all(feature = "balance", not(feature = "decentralized")))]
let has_html = has_html || self.html_spool_path.is_some();
match has_html {
false => Default::default(),
true => {
#[cfg(all(feature = "balance", not(feature = "decentralized")))]
if self.html.is_none() && self.html_spool_path.is_some() {
if let Some(ref guard) = self.html_spool_path {
if let Some(path) = guard.path() {
return self
.links_stream_base_from_disk(selectors, path.to_path_buf(), base)
.await;
}
}
return Default::default();
}
if self.is_binary_spool_aware() {
return Default::default();
}
self.links_stream_full_resource::<CaseInsensitiveString>(selectors, base)
.await
}
}
}
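/// Finds the page links with the smart heuristics, upgrading to a headless
/// Chrome render when the HTML looks JavaScript-driven. Spooled bodies are
/// streamed from disk without the Chrome upgrade.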
#[cfg(all(not(feature = "decentralized"), feature = "smart"))]
#[inline(always)]
pub(crate) async fn smart_links(
&mut self,
selectors: &RelativeSelectors,
configuration: &crate::configuration::Configuration,
base: &Option<Box<Url>>,
page: &crate::features::chrome::OnceBrowser,
jar: Option<&std::sync::Arc<crate::client::cookie::Jar>>,
) -> (HashSet<CaseInsensitiveString>, Option<f64>) {
let has_html = self.html.is_some();
#[cfg(all(feature = "balance", not(feature = "decentralized")))]
let has_html = has_html || self.html_spool_path.is_some();
match has_html {
false => Default::default(),
true => {
#[cfg(all(feature = "balance", not(feature = "decentralized")))]
if self.html.is_none() && self.html_spool_path.is_some() {
if let Some(ref guard) = self.html_spool_path {
if let Some(path) = guard.path() {
let links = self
.links_stream_base_from_disk(selectors, path.to_path_buf(), base)
.await;
return (links, None);
}
}
return Default::default();
}
if self.is_binary_spool_aware() {
return Default::default();
}
self.links_stream_smart::<CaseInsensitiveString>(
selectors,
configuration,
base,
page,
jar,
)
.await
}
}
}
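/// Returns a copy of the links already stored on the page.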
#[cfg(feature = "decentralized")]
#[inline(always)]
pub async fn links(
&self,
_: &RelativeSelectors,
_: &Option<Box<Url>>,
) -> HashSet<CaseInsensitiveString> {
self.links.to_owned()
}
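/// Returns a copy of the links already stored on the page.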
#[cfg(feature = "decentralized")]
#[inline(always)]
pub async fn links_full(
&self,
_: &RelativeSelectors,
_: &Option<Box<Url>>,
) -> HashSet<CaseInsensitiveString> {
self.links.to_owned()
}
}
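/// Encode raw HTML bytes into a `String` using the given encoding label.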
pub fn encode_bytes(html: &[u8], label: &str) -> String {
auto_encoder::encode_bytes(html, label)
}
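/// Get the HTML of the page decoded with the given encoding label, or an
/// empty string when no body is present.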
#[cfg(feature = "encoding")]
pub fn get_html_encoded(html: &Option<bytes::Bytes>, label: &str) -> String {
match html.as_ref() {
Some(html) => encode_bytes(html, label),
_ => Default::default(),
}
}
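/// Get the HTML of the page as lossy UTF-8 when the `encoding` feature is
/// disabled, or an empty string when no body is present.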
#[cfg(not(feature = "encoding"))]
pub fn get_html_encoded(html: &Option<bytes::Bytes>, _label: &str) -> String {
match html {
Some(b) => String::from_utf8_lossy(b).into_owned(),
_ => Default::default(),
}
}
#[cfg(all(test, not(feature = "decentralized"), feature = "smart"))]
mod smart_tests {
use super::is_tracker_script;
#[test]
fn tracker_absolute_urls() {
assert!(is_tracker_script(
"https://www.googletagmanager.com/gtm.js?id=GTM-ABC"
));
assert!(is_tracker_script(
"https://www.google-analytics.com/analytics.js"
));
assert!(is_tracker_script(
"https://static.hotjar.com/c/hotjar-123.js"
));
assert!(is_tracker_script(
"https://connect.facebook.net/en_US/fbevents.js"
));
}
#[test]
fn non_tracker_absolute_urls() {
assert!(!is_tracker_script("https://cdn.example.com/app.js"));
assert!(!is_tracker_script(
"https://unpkg.com/react@18/umd/react.production.min.js"
));
}
#[test]
fn tracker_relative_paths() {
assert!(is_tracker_script("/js/analytics.js"));
assert!(is_tracker_script("/scripts/gtm.js?id=GTM-XYZ"));
}
#[test]
fn non_tracker_relative_paths() {
assert!(!is_tracker_script("/assets/app.bundle.js"));
assert!(!is_tracker_script("/_next/static/chunks/pages/index.js"));
assert!(!is_tracker_script("/main.js"));
}
}
#[cfg(test)]
pub const TEST_AGENT_NAME: &str = concat!(env!("CARGO_PKG_NAME"), "/", env!("CARGO_PKG_VERSION"));
#[cfg(all(
feature = "headers",
not(feature = "decentralized"),
not(feature = "cache_request"),
))]
#[tokio::test]
async fn test_headers() {
use crate::utils::PageResponse;
use reqwest::header::HeaderName;
use reqwest::header::HeaderValue;
let mut headers = reqwest::header::HeaderMap::new();
headers.insert(
HeaderName::from_static("server"),
HeaderValue::from_static("GitHub.com"),
);
headers.insert(
HeaderName::from_static("content-type"),
HeaderValue::from_static("text/html; charset=utf-8"),
);
let page = build(
"https://choosealicense.com/",
PageResponse {
content: Some(b"<html></html>".to_vec()),
headers: Some(headers),
status_code: StatusCode::OK,
..Default::default()
},
);
let headers = page.headers.clone().expect("There should be some headers");
assert_eq!(
headers
.get(HeaderName::from_static("server"))
.expect("There should be a server header value"),
HeaderValue::from_static("GitHub.com")
);
assert_eq!(
headers
.get(HeaderName::from_static("content-type"))
.expect("There should be a content-type value"),
HeaderValue::from_static("text/html; charset=utf-8")
);
}
#[tokio::test]
#[cfg(all(
not(feature = "decentralized"),
not(feature = "chrome"),
not(feature = "cache_request")
))]
async fn parse_links() {
use crate::utils::PageResponse;
let link_result = "https://choosealicense.com/";
let html = br#"<html><body><a href="/about/">About</a></body></html>"#;
let mut page = build_with_parse(
link_result,
PageResponse {
content: Some(html.to_vec()),
status_code: StatusCode::OK,
..Default::default()
},
);
let selector = get_page_selectors(link_result, false, false);
let links = page.links(&selector, &None).await;
let about_page = "https://choosealicense.com/about/".into();
assert!(
links.contains::<CaseInsensitiveString>(&about_page),
"Could not find {}. Theses URLs was found {:?}",
about_page,
&links
);
}
#[tokio::test]
#[cfg(all(
not(feature = "decentralized"),
not(feature = "chrome"),
not(feature = "cache_request")
))]
async fn test_status_code() {
use crate::utils::PageResponse;
let page = build(
"https://choosealicense.com/does-not-exist",
PageResponse {
status_code: StatusCode::NOT_FOUND,
..Default::default()
},
);
assert_eq!(page.status_code.as_u16(), 404);
}
#[tokio::test]
#[cfg(all(feature = "time", not(feature = "decentralized")))]
async fn test_duration() {
let client = Client::default();
let link_result = "https://choosealicense.com/";
let page: Page = Page::new_page(link_result, &client).await;
let duration_elapsed = page.get_duration_elapsed().as_millis();
assert!(
duration_elapsed < 6000,
"Duration took longer than expected {}.",
duration_elapsed,
);
}
#[test]
fn test_metadata_exist_empty() {
let metadata = Metadata::default();
assert!(!metadata.exist(), "Empty metadata should not exist");
}
#[test]
fn test_metadata_exist_with_title() {
let metadata = Metadata {
title: Some(CompactString::from("Test Title")),
..Default::default()
};
assert!(metadata.exist(), "Metadata with title should exist");
}
#[test]
fn test_metadata_exist_with_description() {
let metadata = Metadata {
description: Some(CompactString::from("Test Description")),
..Default::default()
};
assert!(metadata.exist(), "Metadata with description should exist");
}
#[test]
fn test_metadata_exist_with_image() {
let metadata = Metadata {
image: Some(CompactString::from("https://example.com/image.png")),
..Default::default()
};
assert!(metadata.exist(), "Metadata with image should exist");
}
#[test]
fn test_metadata_exist_all_fields() {
let metadata = Metadata {
title: Some(CompactString::from("Test Title")),
description: Some(CompactString::from("Test Description")),
image: Some(CompactString::from("https://example.com/image.png")),
#[cfg(feature = "chrome")]
automation: None,
};
assert!(metadata.exist(), "Metadata with all fields should exist");
}
#[test]
#[cfg(not(feature = "decentralized"))]
fn test_metadata_via_build() {
use crate::utils::PageResponse;
let metadata = Metadata {
title: Some(CompactString::from("Build Test Title")),
description: Some(CompactString::from("Build Test Description")),
image: Some(CompactString::from("https://example.com/build-image.png")),
#[cfg(feature = "chrome")]
automation: None,
};
let page_response = PageResponse {
content: Some(b"<html></html>".to_vec()),
status_code: StatusCode::OK,
metadata: Some(Box::new(metadata)),
..Default::default()
};
let page = build("https://example.com", page_response);
let page_metadata = page.get_metadata();
assert!(page_metadata.is_some(), "Page should have metadata");
let meta = page_metadata.as_ref().unwrap();
assert_eq!(
meta.title.as_deref(),
Some("Build Test Title"),
"Title should match"
);
assert_eq!(
meta.description.as_deref(),
Some("Build Test Description"),
"Description should match"
);
assert_eq!(
meta.image.as_deref(),
Some("https://example.com/build-image.png"),
"Image should match"
);
}
#[test]
#[cfg(not(feature = "decentralized"))]
fn test_metadata_via_build_with_parse() {
use crate::utils::PageResponse;
let metadata = Metadata {
title: Some(CompactString::from("Parse Test Title")),
description: Some(CompactString::from("Parse Test Description")),
image: Some(CompactString::from("https://example.com/parse-image.png")),
#[cfg(feature = "chrome")]
automation: None,
};
let page_response = PageResponse {
content: Some(b"<html></html>".to_vec()),
status_code: StatusCode::OK,
metadata: Some(Box::new(metadata)),
..Default::default()
};
let page = build_with_parse("https://example.com/page", page_response);
let page_metadata = page.get_metadata();
assert!(
page_metadata.is_some(),
"Page should have metadata after build_with_parse"
);
let meta = page_metadata.as_ref().unwrap();
assert_eq!(
meta.title.as_deref(),
Some("Parse Test Title"),
"Title should match after build_with_parse"
);
}
#[test]
#[cfg(not(feature = "decentralized"))]
fn test_page_without_metadata() {
use crate::utils::PageResponse;
let page_response = PageResponse {
content: Some(b"<html></html>".to_vec()),
status_code: StatusCode::OK,
metadata: None,
..Default::default()
};
let page = build("https://example.com", page_response);
let page_metadata = page.get_metadata();
assert!(
page_metadata.is_none(),
"Page without metadata should return None"
);
}
#[tokio::test]
#[cfg(all(feature = "cmd", not(feature = "decentralized")))]
async fn test_metadata_from_streaming_bytes() {
let html = br#"<!DOCTYPE html>
<html>
<head>
<title>Streaming Test Title</title>
<meta name="description" content="Streaming Test Description">
<meta property="og:image" content="https://example.com/streaming-image.png">
</head>
<body>
<a href="/page1">Link 1</a>
<a href="/page2">Link 2</a>
</body>
</html>"#;
let url = "https://example.com/test";
let mut selectors = get_page_selectors(url, false, false);
let external_domains: Arc<HashSet<CaseInsensitiveString>> = Default::default();
let r_settings = PageLinkBuildSettings::default();
let mut map: HashSet<CaseInsensitiveString> = HashSet::with_capacity(32);
let prior_domain: Option<Box<Url>> = None;
let mut domain_parsed: Option<Box<Url>> = None;
let mut links_pages: Option<HashSet<CaseInsensitiveString>> = None;
let page = Page::new_page_streaming_from_bytes::<CaseInsensitiveString>(
url,
html,
&mut selectors,
&external_domains,
&r_settings,
&mut map,
None,
&prior_domain,
&mut domain_parsed,
&mut links_pages,
)
.await;
let page_metadata = page.get_metadata();
assert!(
page_metadata.is_some(),
"Page from streaming bytes should have metadata"
);
let meta = page_metadata.as_ref().unwrap();
assert_eq!(
meta.title.as_deref(),
Some("Streaming Test Title"),
"Title should be extracted from streaming bytes"
);
assert_eq!(
meta.description.as_deref(),
Some("Streaming Test Description"),
"Description should be extracted from streaming bytes"
);
assert_eq!(
meta.image.as_deref(),
Some("https://example.com/streaming-image.png"),
"OG image should be extracted from streaming bytes"
);
}
#[tokio::test]
#[cfg(all(feature = "cmd", not(feature = "decentralized")))]
async fn test_metadata_partial_title_only() {
let html = br#"<!DOCTYPE html>
<html>
<head>
<title>Only Title Here</title>
</head>
<body></body>
</html>"#;
let url = "https://example.com/test";
let mut selectors = get_page_selectors(url, false, false);
let external_domains: Arc<HashSet<CaseInsensitiveString>> = Default::default();
let r_settings = PageLinkBuildSettings::default();
let mut map: HashSet<CaseInsensitiveString> = HashSet::with_capacity(32);
let prior_domain: Option<Box<Url>> = None;
let mut domain_parsed: Option<Box<Url>> = None;
let mut links_pages: Option<HashSet<CaseInsensitiveString>> = None;
let page = Page::new_page_streaming_from_bytes::<CaseInsensitiveString>(
url,
html,
&mut selectors,
&external_domains,
&r_settings,
&mut map,
None,
&prior_domain,
&mut domain_parsed,
&mut links_pages,
)
.await;
let page_metadata = page.get_metadata();
assert!(
page_metadata.is_some(),
"Page with only title should have metadata"
);
let meta = page_metadata.as_ref().unwrap();
assert_eq!(
meta.title.as_deref(),
Some("Only Title Here"),
"Title should be extracted"
);
assert!(meta.description.is_none(), "Description should be None");
assert!(meta.image.is_none(), "Image should be None");
}
#[tokio::test]
#[cfg(all(feature = "cmd", not(feature = "decentralized")))]
async fn test_metadata_partial_description_only() {
let html = br#"<!DOCTYPE html>
<html>
<head>
<meta name="description" content="Only Description Here">
</head>
<body></body>
</html>"#;
let url = "https://example.com/test";
let mut selectors = get_page_selectors(url, false, false);
let external_domains: Arc<HashSet<CaseInsensitiveString>> = Default::default();
let r_settings = PageLinkBuildSettings::default();
let mut map: HashSet<CaseInsensitiveString> = HashSet::with_capacity(32);
let prior_domain: Option<Box<Url>> = None;
let mut domain_parsed: Option<Box<Url>> = None;
let mut links_pages: Option<HashSet<CaseInsensitiveString>> = None;
let page = Page::new_page_streaming_from_bytes::<CaseInsensitiveString>(
url,
html,
&mut selectors,
&external_domains,
&r_settings,
&mut map,
None,
&prior_domain,
&mut domain_parsed,
&mut links_pages,
)
.await;
let page_metadata = page.get_metadata();
assert!(
page_metadata.is_some(),
"Page with only description should have metadata"
);
let meta = page_metadata.as_ref().unwrap();
assert!(meta.title.is_none(), "Title should be None");
assert_eq!(
meta.description.as_deref(),
Some("Only Description Here"),
"Description should be extracted"
);
assert!(meta.image.is_none(), "Image should be None");
}
#[tokio::test]
#[cfg(all(feature = "cmd", not(feature = "decentralized")))]
async fn test_metadata_partial_image_only() {
let html = br#"<!DOCTYPE html>
<html>
<head>
<meta property="og:image" content="https://example.com/only-image.png">
</head>
<body></body>
</html>"#;
let url = "https://example.com/test";
let mut selectors = get_page_selectors(url, false, false);
let external_domains: Arc<HashSet<CaseInsensitiveString>> = Default::default();
let r_settings = PageLinkBuildSettings::default();
let mut map: HashSet<CaseInsensitiveString> = HashSet::with_capacity(32);
let prior_domain: Option<Box<Url>> = None;
let mut domain_parsed: Option<Box<Url>> = None;
let mut links_pages: Option<HashSet<CaseInsensitiveString>> = None;
let page = Page::new_page_streaming_from_bytes::<CaseInsensitiveString>(
url,
html,
&mut selectors,
&external_domains,
&r_settings,
&mut map,
None,
&prior_domain,
&mut domain_parsed,
&mut links_pages,
)
.await;
let page_metadata = page.get_metadata();
assert!(
page_metadata.is_some(),
"Page with only og:image should have metadata"
);
let meta = page_metadata.as_ref().unwrap();
assert!(meta.title.is_none(), "Title should be None");
assert!(meta.description.is_none(), "Description should be None");
assert_eq!(
meta.image.as_deref(),
Some("https://example.com/only-image.png"),
"OG image should be extracted"
);
}
#[tokio::test]
#[cfg(all(feature = "cmd", not(feature = "decentralized")))]
async fn test_metadata_empty_html() {
let html = br#"<!DOCTYPE html>
<html>
<head></head>
<body><p>No metadata here</p></body>
</html>"#;
let url = "https://example.com/test";
let mut selectors = get_page_selectors(url, false, false);
let external_domains: Arc<HashSet<CaseInsensitiveString>> = Default::default();
let r_settings = PageLinkBuildSettings::default();
let mut map: HashSet<CaseInsensitiveString> = HashSet::with_capacity(32);
let prior_domain: Option<Box<Url>> = None;
let mut domain_parsed: Option<Box<Url>> = None;
let mut links_pages: Option<HashSet<CaseInsensitiveString>> = None;
let page = Page::new_page_streaming_from_bytes::<CaseInsensitiveString>(
url,
html,
&mut selectors,
&external_domains,
&r_settings,
&mut map,
None,
&prior_domain,
&mut domain_parsed,
&mut links_pages,
)
.await;
let page_metadata = page.get_metadata();
assert!(
page_metadata.is_none(),
"Page without any metadata tags should return None"
);
}
#[tokio::test]
#[cfg(all(feature = "cmd", not(feature = "decentralized")))]
async fn test_metadata_special_characters() {
let html = br#"<!DOCTYPE html>
<html>
<head>
<title>Title with & special <characters></title>
<meta name="description" content="Description with "quotes" and 'apostrophes'">
<meta property="og:image" content="https://example.com/image?param=value&other=1">
</head>
<body></body>
</html>"#;
let url = "https://example.com/test";
let mut selectors = get_page_selectors(url, false, false);
let external_domains: Arc<HashSet<CaseInsensitiveString>> = Default::default();
let r_settings = PageLinkBuildSettings::default();
let mut map: HashSet<CaseInsensitiveString> = HashSet::with_capacity(32);
let prior_domain: Option<Box<Url>> = None;
let mut domain_parsed: Option<Box<Url>> = None;
let mut links_pages: Option<HashSet<CaseInsensitiveString>> = None;
let page = Page::new_page_streaming_from_bytes::<CaseInsensitiveString>(
url,
html,
&mut selectors,
&external_domains,
&r_settings,
&mut map,
None,
&prior_domain,
&mut domain_parsed,
&mut links_pages,
)
.await;
let page_metadata = page.get_metadata();
assert!(
page_metadata.is_some(),
"Page with special characters should have metadata"
);
let meta = page_metadata.as_ref().unwrap();
assert!(
meta.title.is_some(),
"Title with special chars should be extracted"
);
assert!(
meta.description.is_some(),
"Description with special chars should be extracted"
);
assert!(
meta.image.is_some(),
"Image URL with special chars should be extracted"
);
}
#[tokio::test]
#[cfg(all(feature = "cmd", not(feature = "decentralized")))]
async fn test_metadata_unicode() {
let html = r#"<!DOCTYPE html>
<html>
<head>
<title>日本語タイトル - Japanese Title</title>
<meta name="description" content="中文描述 - Chinese Description - Описание на русском">
<meta property="og:image" content="https://example.com/画像.png">
</head>
<body></body>
</html>"#
.as_bytes();
let url = "https://example.com/test";
let mut selectors = get_page_selectors(url, false, false);
let external_domains: Arc<HashSet<CaseInsensitiveString>> = Default::default();
let r_settings = PageLinkBuildSettings::default();
let mut map: HashSet<CaseInsensitiveString> = HashSet::with_capacity(32);
let prior_domain: Option<Box<Url>> = None;
let mut domain_parsed: Option<Box<Url>> = None;
let mut links_pages: Option<HashSet<CaseInsensitiveString>> = None;
let page = Page::new_page_streaming_from_bytes::<CaseInsensitiveString>(
url,
html,
&mut selectors,
&external_domains,
&r_settings,
&mut map,
None,
&prior_domain,
&mut domain_parsed,
&mut links_pages,
)
.await;
let page_metadata = page.get_metadata();
assert!(
page_metadata.is_some(),
"Page with unicode content should have metadata"
);
let meta = page_metadata.as_ref().unwrap();
assert!(
meta.title
.as_ref()
.map(|t| t.contains("日本語"))
.unwrap_or(false),
"Title should contain Japanese characters"
);
assert!(
meta.description
.as_ref()
.map(|d| d.contains("中文"))
.unwrap_or(false),
"Description should contain Chinese characters"
);
}
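// Chrome-only metadata: constructing AutomationResults and attaching them to Metadata.automation.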
#[test]
#[cfg(feature = "chrome")]
fn test_automation_results_structure() {
let automation_result = AutomationResults {
input: "Test prompt".to_string(),
content_output: serde_json::json!({"result": "test"}),
screenshot_output: Some("base64_screenshot_data".to_string()),
error: None,
usage: None,
relevant: None,
steps_executed: None,
reasoning: None,
};
assert_eq!(automation_result.input, "Test prompt");
assert!(automation_result.screenshot_output.is_some());
assert!(automation_result.error.is_none());
}
#[test]
#[cfg(feature = "chrome")]
fn test_metadata_with_automation() {
let automation_results = vec![AutomationResults {
input: "Click the button".to_string(),
content_output: serde_json::json!({"clicked": true}),
screenshot_output: None,
error: None,
usage: None,
relevant: None,
steps_executed: None,
reasoning: None,
}];
let metadata = Metadata {
title: Some(CompactString::from("Automation Test")),
description: None,
image: None,
automation: Some(automation_results),
};
assert!(metadata.exist(), "Metadata with title should exist");
assert!(
metadata.automation.is_some(),
"Automation results should be present"
);
assert_eq!(
metadata.automation.as_ref().unwrap().len(),
1,
"Should have one automation result"
);
}
#[test]
#[cfg(all(feature = "chrome", not(feature = "decentralized")))]
fn test_set_metadata_preserves_automation() {
let automation_results = vec![AutomationResults {
input: "Original automation".to_string(),
content_output: serde_json::json!({"original": true}),
screenshot_output: None,
error: None,
usage: None,
relevant: None,
steps_executed: None,
reasoning: None,
}];
let existing_metadata = Metadata {
title: Some(CompactString::from("Original Title")),
description: None,
image: None,
automation: Some(automation_results),
};
let mut existing = Some(Box::new(existing_metadata));
let mut new_metadata = Metadata {
title: Some(CompactString::from("New Title")),
description: Some(CompactString::from("New Description")),
image: None,
automation: None,
};
set_metadata(&mut existing, &mut new_metadata);
assert!(
new_metadata.automation.is_some(),
"Automation should be preserved from existing metadata"
);
}
#[tokio::test]
#[cfg(all(
feature = "chrome",
not(feature = "decentralized"),
not(feature = "cache_request")
))]
async fn test_metadata_chrome_real_page() {
use crate::utils::PageResponse;
let automation_results = vec![AutomationResults {
input: "Extract CTA".to_string(),
content_output: serde_json::json!({"cta": "Sign up"}),
screenshot_output: Some("base64_screenshot_data".to_string()),
error: None,
usage: None,
relevant: Some(true),
steps_executed: Some(1),
reasoning: Some("CTA extracted from main hero section".to_string()),
}];
let metadata = Metadata {
title: Some(CompactString::from("Chrome Metadata Test")),
description: Some(CompactString::from("Description available")),
image: Some(CompactString::from("https://example.com/image.png")),
automation: Some(automation_results),
};
let page = build(
"https://example.com",
PageResponse {
content: Some(b"<html></html>".to_vec()),
status_code: StatusCode::OK,
metadata: Some(Box::new(metadata)),
..Default::default()
},
);
let meta = page
.get_metadata()
.as_ref()
.expect("metadata should be present for chrome feature test");
assert!(meta.title.as_deref() == Some("Chrome Metadata Test"));
assert!(
meta.automation.is_some(),
"automation metadata should be present"
);
assert_eq!(meta.automation.as_ref().expect("automation data").len(), 1);
}
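// Encoding helpers: encode_bytes and Page::get_html_encoded should preserve UTF-8 text.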
#[test]
#[cfg(feature = "encoding")]
fn test_encoding_get_html_encoded() {
let html_bytes = "こんにちは世界".as_bytes().to_vec();
let encoded = encode_bytes(&html_bytes, "UTF-8");
assert!(
encoded.contains("こんにちは"),
"UTF-8 encoding should preserve Japanese characters"
);
}
#[test]
#[cfg(all(feature = "encoding", not(feature = "decentralized")))]
fn test_encoding_page_get_html_encoded() {
use crate::utils::PageResponse;
let html_content = "Hello World - テスト";
let page_response = PageResponse {
content: Some(html_content.as_bytes().to_vec()),
status_code: StatusCode::OK,
..Default::default()
};
let page = build("https://example.com", page_response);
let encoded = page.get_html_encoded("UTF-8");
assert!(
encoded.contains("Hello World"),
"Encoded content should contain ASCII text"
);
assert!(
encoded.contains("テスト"),
"Encoded content should contain Japanese text"
);
}
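// Feature-gated Page fields (remote_addr, error_status, cookies, screenshot_bytes, duration,
// OpenAI/Gemini usage) carried through build().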
#[test]
#[cfg(all(feature = "remote_addr", not(feature = "decentralized")))]
fn test_remote_addr_field() {
use crate::utils::PageResponse;
use std::net::{IpAddr, Ipv4Addr, SocketAddr};
let page_response = PageResponse {
content: Some(b"<html></html>".to_vec()),
status_code: StatusCode::OK,
remote_addr: Some(SocketAddr::new(
IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)),
8080,
)),
..Default::default()
};
let page = build("https://example.com", page_response);
assert!(
page.remote_addr.is_some(),
"Page should have remote_addr when feature is enabled"
);
let addr = page.remote_addr.unwrap();
assert_eq!(addr.ip(), IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)));
assert_eq!(addr.port(), 8080);
}
#[test]
#[cfg(all(feature = "page_error_status_details", not(feature = "decentralized")))]
fn test_page_error_status_details() {
use crate::utils::PageResponse;
let page_response = PageResponse {
content: None,
status_code: StatusCode::INTERNAL_SERVER_ERROR,
..Default::default()
};
let page = build("https://example.com", page_response);
let _error: &Option<std::sync::Arc<reqwest::Error>> = &page.error_status;
}
#[test]
#[cfg(all(
not(feature = "page_error_status_details"),
not(feature = "decentralized")
))]
fn test_page_error_status_string() {
use crate::utils::PageResponse;
let page_response = PageResponse {
content: None,
status_code: StatusCode::INTERNAL_SERVER_ERROR,
..Default::default()
};
let page = build("https://example.com", page_response);
let _error: &Option<String> = &page.error_status;
}
#[test]
#[cfg(all(feature = "cookies", not(feature = "decentralized")))]
fn test_cookies_field() {
use crate::utils::PageResponse;
let page_response = PageResponse {
content: Some(b"<html></html>".to_vec()),
status_code: StatusCode::OK,
..Default::default()
};
let page = build("https://example.com", page_response);
let _cookies: &Option<reqwest::header::HeaderMap> = &page.cookies;
}
#[test]
#[cfg(all(feature = "chrome", not(feature = "decentralized")))]
fn test_chrome_screenshot_bytes_field() {
use crate::utils::PageResponse;
let screenshot_data = vec![0x89, 0x50, 0x4E, 0x47];
let page_response = PageResponse {
content: Some(b"<html></html>".to_vec()),
status_code: StatusCode::OK,
screenshot_bytes: Some(screenshot_data.clone()),
..Default::default()
};
let page = build("https://example.com", page_response);
assert!(
page.screenshot_bytes.is_some(),
"Page should have screenshot_bytes when chrome feature is enabled"
);
assert_eq!(
page.screenshot_bytes.as_ref().unwrap(),
&screenshot_data,
"Screenshot bytes should match"
);
}
#[test]
#[cfg(all(feature = "time", not(feature = "decentralized")))]
fn test_time_duration_field() {
use crate::utils::PageResponse;
let page_response = PageResponse {
content: Some(b"<html></html>".to_vec()),
status_code: StatusCode::OK,
duration: Some(tokio::time::Instant::now()),
..Default::default()
};
let page = build("https://example.com", page_response);
let duration = page.get_duration_elapsed();
assert!(
duration.as_millis() < 1000,
"Duration should be less than 1 second"
);
}
#[test]
#[cfg(all(feature = "openai", not(feature = "decentralized")))]
fn test_openai_fields() {
use crate::features::openai_common::OpenAIUsage;
use crate::utils::PageResponse;
let page_response = PageResponse {
content: Some(b"<html></html>".to_vec()),
status_code: StatusCode::OK,
..Default::default()
};
let mut page = build("https://example.com", page_response);
assert!(
page.openai_credits_used.is_none(),
"openai_credits_used should be None initially"
);
page.openai_credits_used = Some(vec![OpenAIUsage::default()]);
assert!(
page.openai_credits_used.is_some(),
"openai_credits_used should be set"
);
assert!(
page.extra_ai_data.is_none(),
"extra_ai_data should be None initially"
);
page.extra_ai_data = Some(vec![AIResults::default()]);
assert!(page.extra_ai_data.is_some(), "extra_ai_data should be set");
}
#[test]
#[cfg(all(feature = "gemini", not(feature = "decentralized")))]
fn test_gemini_fields() {
use crate::features::gemini_common::GeminiUsage;
use crate::utils::PageResponse;
let page_response = PageResponse {
content: Some(b"<html></html>".to_vec()),
status_code: StatusCode::OK,
..Default::default()
};
let mut page = build("https://example.com", page_response);
assert!(
page.gemini_credits_used.is_none(),
"gemini_credits_used should be None initially"
);
page.gemini_credits_used = Some(vec![GeminiUsage::default()]);
assert!(
page.gemini_credits_used.is_some(),
"gemini_credits_used should be set"
);
assert!(
page.extra_gemini_data.is_none(),
"extra_gemini_data should be None initially"
);
page.extra_gemini_data = Some(vec![AIResults::default()]);
assert!(
page.extra_gemini_data.is_some(),
"extra_gemini_data should be set"
);
}
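// serde round-trips for Metadata and AIResults.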
#[test]
#[cfg(feature = "serde")]
fn test_metadata_serde() {
let metadata = Metadata {
title: Some("Test Title".into()),
description: Some("Test Description".into()),
image: Some("https://example.com/image.png".into()),
#[cfg(feature = "chrome")]
automation: None,
};
let json = serde_json::to_string(&metadata).expect("Failed to serialize metadata");
assert!(json.contains("Test Title"), "JSON should contain title");
let deserialized: Metadata =
serde_json::from_str(&json).expect("Failed to deserialize metadata");
assert_eq!(
metadata.title, deserialized.title,
"Title should match after deserialization"
);
assert_eq!(
metadata.description, deserialized.description,
"Description should match after deserialization"
);
assert_eq!(
metadata.image, deserialized.image,
"Image should match after deserialization"
);
}
#[test]
#[cfg(feature = "serde")]
fn test_airesults_serde() {
let ai_results = AIResults {
input: "Test prompt".to_string(),
js_output: "console.log('test');".to_string(),
content_output: vec!["Result 1".to_string(), "Result 2".to_string()],
screenshot_output: None,
error: None,
};
let json = serde_json::to_string(&ai_results).expect("Failed to serialize AIResults");
assert!(json.contains("Test prompt"), "JSON should contain input");
let deserialized: AIResults =
serde_json::from_str(&json).expect("Failed to deserialize AIResults");
assert_eq!(
ai_results.input, deserialized.input,
"Input should match after deserialization"
);
assert_eq!(
ai_results.js_output, deserialized.js_output,
"JS output should match after deserialization"
);
assert_eq!(
ai_results.content_output.len(),
deserialized.content_output.len(),
"Content output length should match"
);
}
#[test]
#[cfg(feature = "decentralized")]
fn test_decentralized_page() {
let page = Page::default();
assert!(
page.links.is_empty(),
"Default Page should have empty links"
);
assert!(
page.external_domains_caseless.is_empty(),
"Default Page should have empty external_domains_caseless"
);
}
#[test]
#[cfg(all(feature = "smart", not(feature = "decentralized")))]
fn test_smart_feature() {
use crate::utils::PageResponse;
let page_response = PageResponse {
content: Some(b"<html></html>".to_vec()),
status_code: StatusCode::OK,
..Default::default()
};
let page = build("https://example.com", page_response);
assert!(
page.screenshot_bytes.is_none(),
"screenshot_bytes should be None initially"
);
}
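// spawn_pages must survive build() and be merged by page_assign().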
#[test]
#[cfg(not(feature = "decentralized"))]
fn test_build_preserves_spawn_pages() {
use crate::utils::PageResponse;
let page = build(
"https://example.com",
PageResponse {
status_code: StatusCode::OK,
spawn_pages: Some(vec![
"https://example.com/a".to_string(),
"https://example.com/b".to_string(),
]),
..Default::default()
},
);
let spawn_pages = page
.spawn_pages
.as_ref()
.expect("spawn_pages should be preserved");
assert_eq!(spawn_pages.len(), 2);
assert_eq!(spawn_pages[0], "https://example.com/a");
assert_eq!(spawn_pages[1], "https://example.com/b");
}
#[test]
#[cfg(all(feature = "smart", not(feature = "decentralized")))]
fn test_page_assign_merges_spawn_pages() {
use crate::utils::PageResponse;
let mut page = build(
"https://example.com",
PageResponse {
status_code: StatusCode::OK,
spawn_pages: Some(vec!["https://example.com/root".to_string()]),
..Default::default()
},
);
let new_page = build(
"https://example.com",
PageResponse {
status_code: StatusCode::OK,
spawn_pages: Some(vec![
"https://example.com/x".to_string(),
"https://example.com/y".to_string(),
]),
..Default::default()
},
);
page_assign(&mut page, new_page);
let spawn_pages = page
.spawn_pages
.as_ref()
.expect("spawn_pages should be merged");
assert_eq!(spawn_pages.len(), 3);
assert!(spawn_pages.contains(&"https://example.com/root".to_string()));
assert!(spawn_pages.contains(&"https://example.com/x".to_string()));
assert!(spawn_pages.contains(&"https://example.com/y".to_string()));
}
#[test]
#[cfg(not(feature = "decentralized"))]
fn test_page_links_field() {
use crate::utils::PageResponse;
let page_response = PageResponse {
content: Some(b"<html></html>".to_vec()),
status_code: StatusCode::OK,
..Default::default()
};
let mut page = build("https://example.com", page_response);
assert!(
page.page_links.is_none(),
"page_links should be None initially"
);
let mut links = HashSet::new();
links.insert(CaseInsensitiveString::new("https://example.com/page1"));
page.page_links = Some(Box::new(links));
assert!(page.page_links.is_some(), "page_links should be set");
assert_eq!(
page.page_links.as_ref().unwrap().len(),
1,
"page_links should have 1 link"
);
}
#[test]
#[cfg(not(feature = "decentralized"))]
fn test_bytes_transferred_field() {
use crate::utils::PageResponse;
let page_response = PageResponse {
content: Some(b"<html></html>".to_vec()),
status_code: StatusCode::OK,
..Default::default()
};
let mut page = build("https://example.com", page_response);
assert!(
page.bytes_transferred.is_none(),
"bytes_transferred should be None initially"
);
page.bytes_transferred = Some(1024.0);
assert_eq!(
page.bytes_transferred,
Some(1024.0),
"bytes_transferred should be 1024.0"
);
}
#[test]
#[cfg(not(feature = "decentralized"))]
fn test_waf_and_retry_fields() {
use crate::utils::PageResponse;
let page_response = PageResponse {
content: Some(b"<html></html>".to_vec()),
status_code: StatusCode::OK,
..Default::default()
};
let mut page = build("https://example.com", page_response);
assert!(!page.waf_check, "waf_check should be false initially");
assert!(!page.should_retry, "should_retry should be false initially");
page.waf_check = true;
assert!(page.waf_check, "waf_check should be true");
page.should_retry = true;
assert!(page.should_retry, "should_retry should be true");
}
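// Retry classification: synthetic DNS (525) and redirect-loop (310) codes are permanent;
// 5xx and 429 remain retryable.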
#[test]
fn test_is_retryable_status_excludes_dns() {
assert!(
!is_retryable_status(*DNS_RESOLVE_ERROR),
"DNS_RESOLVE_ERROR (525) must not be retryable"
);
assert!(
is_retryable_status(StatusCode::INTERNAL_SERVER_ERROR),
"500 should be retryable"
);
assert!(
is_retryable_status(StatusCode::BAD_GATEWAY),
"502 should be retryable"
);
assert!(
is_retryable_status(StatusCode::SERVICE_UNAVAILABLE),
"503 should be retryable"
);
assert!(
is_retryable_status(StatusCode::GATEWAY_TIMEOUT),
"504 should be retryable"
);
assert!(
is_retryable_status(StatusCode::TOO_MANY_REQUESTS),
"429 should be retryable"
);
assert!(
!is_retryable_status(StatusCode::OK),
"200 should not be retryable"
);
assert!(
!is_retryable_status(*TOO_MANY_REDIRECTS_ERROR),
"TOO_MANY_REDIRECTS_ERROR (310) must not be retryable — redirect loops are deterministic"
);
}
#[test]
#[cfg(not(feature = "decentralized"))]
fn test_blocked_crawl_field() {
use crate::utils::PageResponse;
let page_response = PageResponse {
content: Some(b"<html></html>".to_vec()),
status_code: StatusCode::OK,
..Default::default()
};
let mut page = build("https://example.com", page_response);
assert!(
!page.blocked_crawl,
"blocked_crawl should be false initially"
);
page.blocked_crawl = true;
assert!(page.blocked_crawl, "blocked_crawl should be true");
}
#[test]
fn test_extract_root_domain() {
assert_eq!(extract_root_domain("example.com"), "example");
assert_eq!(extract_root_domain("example.org"), "example");
assert_eq!(extract_root_domain("sub.example.com"), "example.com");
assert_eq!(extract_root_domain("deep.sub.example.co.uk"), "co.uk");
assert_eq!(extract_root_domain("localhost"), "localhost");
}
#[test]
fn test_is_subdomain_tld_matching() {
assert!(is_subdomain("example.com", "example.org"));
assert!(is_subdomain("example.net", "example.com"));
assert!(is_subdomain("a.example.com", "b.example.com"));
assert!(!is_subdomain("sub.example.com", "example.com"));
assert!(!is_subdomain("example.com", "other.com"));
assert!(!is_subdomain("myexample.com", "example.com"));
}
#[test]
fn test_get_page_selectors_base_tld() {
let selectors = get_page_selectors_base("https://example.com/page", false, true);
assert_eq!(selectors.0.as_str(), "example");
let selectors_no_tld = get_page_selectors_base("https://example.com/page", false, false);
assert!(selectors_no_tld.0.is_empty());
}
#[test]
fn test_parent_host_match_tld() {
let parent_host = CompactString::from("example.com");
let base_host = CompactString::from("example.com");
let sub_matcher = CompactString::from("example");
assert!(parent_host_match(
Some("example.com"),
"example",
&parent_host,
&base_host,
&sub_matcher,
));
assert!(parent_host_match(
Some("example.org"),
"example",
&parent_host,
&base_host,
&sub_matcher,
));
assert!(!parent_host_match(
Some("other.com"),
"example",
&parent_host,
&base_host,
&sub_matcher,
));
}
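// Relative links must resolve against the page's own base URL, not the crawl origin,
// so pages served from a subdomain keep their host.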
#[test]
fn test_validate_link_subdomain_relative_resolution() {
let selectors = get_page_selectors("https://www.example.com/", true, false);
let external_domains: Arc<HashSet<CaseInsensitiveString>> = Arc::new(HashSet::new());
let subdomain_base = url::Url::parse("https://sub.example.com/page").unwrap();
let mut no_page_links: Option<HashSet<CaseInsensitiveString>> = None;
let result = validate_link(
&Some(&subdomain_base),
"/about",
&selectors.0,
&selectors.1[0],
&selectors.2,
&selectors.0,
&external_domains,
&mut no_page_links,
);
assert!(
result.is_some(),
"Relative link on subdomain page should be accepted"
);
assert_eq!(
result.unwrap().as_str(),
"https://sub.example.com/about",
"Relative link should resolve against subdomain host, not crawl origin"
);
let crawl_origin_base = url::Url::parse("https://www.example.com/").unwrap();
let mut no_page_links2: Option<HashSet<CaseInsensitiveString>> = None;
let result_old = validate_link(
&Some(&crawl_origin_base),
"/about",
&selectors.0,
&selectors.1[0],
&selectors.2,
&selectors.0,
&external_domains,
&mut no_page_links2,
);
assert!(result_old.is_some());
assert_eq!(
result_old.unwrap().as_str(),
"https://www.example.com/about",
"With crawl origin as base, link resolves against wrong host"
);
}
#[test]
fn test_validate_link_same_domain_resolution() {
let selectors = get_page_selectors("https://www.example.com/", false, false);
let external_domains: Arc<HashSet<CaseInsensitiveString>> = Arc::new(HashSet::new());
let page_base = url::Url::parse("https://www.example.com/some-page").unwrap();
let mut no_page_links: Option<HashSet<CaseInsensitiveString>> = None;
let result = validate_link(
&Some(&page_base),
"/about",
&selectors.0,
&selectors.1[0],
&selectors.2,
&selectors.0,
&external_domains,
&mut no_page_links,
);
assert!(result.is_some());
assert_eq!(
result.unwrap().as_str(),
"https://www.example.com/about",
"Same-domain relative link should resolve correctly"
);
}
#[tokio::test]
#[cfg(all(
not(feature = "decentralized"),
not(feature = "chrome"),
not(feature = "cache_request")
))]
async fn test_subdomain_page_links_resolution() {
use crate::utils::PageResponse;
let html = br#"<html><body>
<a href="/about">About</a>
<a href="/contact">Contact</a>
<a href="https://sub.example.com/absolute">Absolute</a>
</body></html>"#;
let mut page = build_with_parse(
"https://sub.example.com/page",
PageResponse {
content: Some(html.to_vec()),
status_code: reqwest::StatusCode::OK,
..Default::default()
},
);
let selectors = get_page_selectors("https://www.example.com/", true, false);
let page_base = url::Url::parse("https://sub.example.com/page")
.ok()
.map(Box::new);
let links = page.links(&selectors, &page_base).await;
let expected_about: CaseInsensitiveString = "https://sub.example.com/about".into();
let expected_contact: CaseInsensitiveString = "https://sub.example.com/contact".into();
let expected_absolute: CaseInsensitiveString = "https://sub.example.com/absolute".into();
let wrong_about: CaseInsensitiveString = "https://www.example.com/about".into();
assert!(
links.contains(&expected_about),
"Relative /about should resolve to sub.example.com/about, got: {:?}",
&links
);
assert!(
links.contains(&expected_contact),
"Relative /contact should resolve to sub.example.com/contact, got: {:?}",
&links
);
assert!(
links.contains(&expected_absolute),
"Absolute link should be preserved, got: {:?}",
&links
);
assert!(
!links.contains(&wrong_about),
"Links should NOT resolve against crawl origin www.example.com"
);
}
#[tokio::test]
#[cfg(all(
not(feature = "decentralized"),
not(feature = "chrome"),
not(feature = "cache_request")
))]
async fn test_same_domain_page_links_resolution() {
use crate::utils::PageResponse;
let html = br#"<html><body><a href="/about">About</a></body></html>"#;
let mut page = build_with_parse(
"https://www.example.com/page",
PageResponse {
content: Some(html.to_vec()),
status_code: reqwest::StatusCode::OK,
..Default::default()
},
);
let selectors = get_page_selectors("https://www.example.com/", false, false);
let page_base = url::Url::parse("https://www.example.com/page")
.ok()
.map(Box::new);
let links = page.links(&selectors, &page_base).await;
let expected: CaseInsensitiveString = "https://www.example.com/about".into();
assert!(
links.contains(&expected),
"Same-domain relative link should resolve correctly, got: {:?}",
&links
);
}
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_dns_error_no_retry() {
let res = PageResponse {
status_code: StatusCode::from_u16(525).unwrap(),
content: None,
..Default::default()
};
let page = build("https://nonexistent.invalid", res);
assert!(
!page.should_retry,
"DNS resolve errors (525) must not be retried"
);
assert!(
!page.needs_retry(),
"DNS resolve errors (525) — needs_retry() must be false"
);
}
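// HostDnsCache: bounded FIFO capacity, TTL expiry, in-place key updates, and behaviour under
// concurrent access.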
#[test]
fn test_host_dns_cache_bounded_capacity_evicts_fifo() {
let cache = HostDnsCache::new(3, std::time::Duration::from_secs(60));
assert_eq!(cache.len(), 0);
cache.insert("a".to_string(), LocalDnsState::Resolved);
cache.insert("b".to_string(), LocalDnsState::NxDomain);
cache.insert("c".to_string(), LocalDnsState::Resolved);
assert_eq!(cache.len(), 3, "filled to capacity");
assert_eq!(cache.get("a"), Some(LocalDnsState::Resolved));
assert_eq!(cache.get("b"), Some(LocalDnsState::NxDomain));
assert_eq!(cache.get("c"), Some(LocalDnsState::Resolved));
cache.insert("d".to_string(), LocalDnsState::TimedOut);
assert_eq!(cache.len(), 3, "capacity must not grow past 3");
assert_eq!(cache.get("a"), None, "oldest entry must be evicted (FIFO)");
assert_eq!(cache.get("b"), Some(LocalDnsState::NxDomain));
assert_eq!(cache.get("c"), Some(LocalDnsState::Resolved));
assert_eq!(cache.get("d"), Some(LocalDnsState::TimedOut));
cache.insert("e".to_string(), LocalDnsState::Resolved);
assert_eq!(cache.len(), 3);
assert_eq!(cache.get("b"), None);
for i in 0..1000 {
cache.insert(format!("host{i}"), LocalDnsState::Resolved);
assert!(
cache.len() <= 3,
"capacity must never exceed 3 (saw {})",
cache.len()
);
}
assert_eq!(
cache.len(),
3,
"after stress, exactly capacity entries remain"
);
}
#[test]
fn test_host_dns_cache_ttl_expiration() {
let cache = HostDnsCache::new(8, std::time::Duration::from_millis(50));
cache.insert("expiring".to_string(), LocalDnsState::NxDomain);
assert_eq!(cache.get("expiring"), Some(LocalDnsState::NxDomain));
std::thread::sleep(std::time::Duration::from_millis(70));
assert_eq!(
cache.get("expiring"),
None,
"past-TTL entry must return None (caller re-checks via fresh lookup)"
);
}
#[test]
fn test_host_dns_cache_update_existing_key_no_growth() {
let cache = HostDnsCache::new(2, std::time::Duration::from_secs(60));
cache.insert("a".to_string(), LocalDnsState::Resolved);
cache.insert("b".to_string(), LocalDnsState::Resolved);
assert_eq!(cache.len(), 2);
cache.insert("a".to_string(), LocalDnsState::NxDomain);
assert_eq!(cache.len(), 2);
assert_eq!(cache.get("a"), Some(LocalDnsState::NxDomain));
assert_eq!(cache.get("b"), Some(LocalDnsState::Resolved));
}
#[test]
fn test_host_dns_cache_capacity_clamped_to_one() {
let cache = HostDnsCache::new(0, std::time::Duration::from_secs(60));
cache.insert("a".to_string(), LocalDnsState::Resolved);
cache.insert("b".to_string(), LocalDnsState::Resolved);
assert_eq!(cache.len(), 1, "capacity=0 must be clamped to 1");
assert_eq!(cache.get("a"), None, "old entry evicted at capacity 1");
assert_eq!(cache.get("b"), Some(LocalDnsState::Resolved));
}
#[tokio::test(flavor = "multi_thread", worker_threads = 4)]
async fn test_host_dns_cache_concurrent_no_deadlock() {
use std::sync::Arc;
let cache = Arc::new(HostDnsCache::new(64, std::time::Duration::from_secs(60)));
let start = std::time::Instant::now();
let mut handles = Vec::new();
for t in 0..16 {
let c = Arc::clone(&cache);
handles.push(tokio::spawn(async move {
for i in 0..200 {
let host = format!("t{t}-h{}", i % 32);
if i % 2 == 0 {
let _ = c.get(&host);
} else {
c.insert(host, LocalDnsState::Resolved);
}
}
}));
}
for h in handles {
h.await.expect("task panicked");
}
let elapsed = start.elapsed();
assert!(
elapsed < std::time::Duration::from_secs(5),
"16 tasks × 200 ops must complete fast (saw {:?}) — \
long elapsed indicates lock contention or a deadlock-recovery path",
elapsed
);
assert!(
cache.len() <= 64,
"capacity must hold under concurrent access (saw {})",
cache.len()
);
}
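// host_resolves_locally_cached: TimedOut probes must not be cached, while definitive results are
// served from the cache on the second call.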
#[cfg(not(feature = "decentralized"))]
#[tokio::test(flavor = "current_thread")]
async fn test_host_resolves_locally_cached_does_not_cache_timed_out() {
let host = "this-host-must-not-exist-cache-pollution-test.invalid";
let state1 = host_resolves_locally_cached(host, std::time::Duration::from_micros(1)).await;
assert_eq!(
state1,
LocalDnsState::TimedOut,
"1µs timeout must return TimedOut (sanity check for the repro setup)"
);
let state2 = host_resolves_locally_cached(host, std::time::Duration::from_secs(5)).await;
assert_eq!(
state2,
LocalDnsState::NxDomain,
"second call with longer timeout must re-probe and resolve to \
NxDomain — TimedOut from call 1 must NOT be cached. This is the \
midwestrodding regression fix; cache pollution would have left \
confirm_tunnel unable to upgrade to 525 for 5 minutes."
);
}
#[cfg(not(feature = "decentralized"))]
#[tokio::test(flavor = "current_thread")]
async fn test_host_resolves_locally_cached_second_call_hits_cache() {
let host = "cached-test-unique.invalid";
let start1 = std::time::Instant::now();
let s1 = host_resolves_locally_cached(host, std::time::Duration::from_secs(2)).await;
let cold = start1.elapsed();
let start2 = std::time::Instant::now();
let s2 = host_resolves_locally_cached(host, std::time::Duration::from_secs(2)).await;
let hot = start2.elapsed();
assert_eq!(s1, s2, "cached value must equal cold value");
assert_eq!(s1, LocalDnsState::NxDomain);
assert!(
hot < std::time::Duration::from_micros(500),
"cached lookup must be <500µs (got {:?}); cold was {:?}",
hot,
cold
);
}
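// The tests below spin up a loopback "proxy" thread that answers every request with a bare 502
// so proxy CONNECT tunnel failures can be reproduced deterministically.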
#[cfg(not(feature = "decentralized"))]
#[tokio::test(flavor = "current_thread")]
async fn test_build_respects_pre_classified_525_does_not_force_retry() {
use std::io::{Read, Write};
use std::net::TcpListener;
let listener = TcpListener::bind("127.0.0.1:0").expect("bind");
let proxy_port = listener.local_addr().expect("addr").port();
std::thread::spawn(move || {
for stream in listener.incoming() {
let Ok(mut s) = stream else { continue };
let mut buf = [0u8; 1024];
let _ = s.read(&mut buf);
let _ = s.write_all(b"HTTP/1.1 502 Bad Gateway\r\nContent-Length: 0\r\n\r\n");
let _ = s.flush();
}
});
let proxy_url = format!("http://127.0.0.1:{proxy_port}");
let client = reqwest::Client::builder()
.proxy(reqwest::Proxy::all(&proxy_url).expect("proxy"))
.connect_timeout(std::time::Duration::from_secs(3))
.build()
.expect("client");
let target = "https://this-host-must-not-exist.invalid/";
let err = client.get(target).send().await.expect_err("must err");
#[cfg(feature = "cache_request")]
let wrapped_err = reqwest_middleware::Error::Reqwest(err);
#[cfg(not(feature = "cache_request"))]
let wrapped_err = err;
let res = PageResponse {
status_code: *DNS_RESOLVE_ERROR,
error_for_status: Some(Err(wrapped_err)),
..Default::default()
};
let page = build(target, res);
assert_eq!(
page.status_code, *DNS_RESOLVE_ERROR,
"status must remain 525 after build()"
);
assert!(
!page.should_retry,
"should_retry MUST be false — pre-classified 525 must short-circuit \
get_error_status_base's connect-error retry override. Pre-fix, this \
was true because get_error_status_base re-classified the raw error \
and saw a retryable connect failure (503 catch-all), forcing \
should_retry back to true and triggering the retry loop. The leak \
turned every NXDOMAIN-through-proxy host into minutes of wasted retries."
);
assert!(
!page.needs_retry(),
"needs_retry() MUST be false → retry loop short-circuits"
);
}
#[cfg(not(feature = "decentralized"))]
#[tokio::test(flavor = "current_thread")]
async fn test_build_transient_503_still_retries() {
use std::io::{Read, Write};
use std::net::TcpListener;
let listener = TcpListener::bind("127.0.0.1:0").expect("bind");
let proxy_port = listener.local_addr().expect("addr").port();
std::thread::spawn(move || {
for stream in listener.incoming() {
let Ok(mut s) = stream else { continue };
let mut buf = [0u8; 1024];
let _ = s.read(&mut buf);
let _ = s.write_all(b"HTTP/1.1 502 Bad Gateway\r\nContent-Length: 0\r\n\r\n");
let _ = s.flush();
}
});
let proxy_url = format!("http://127.0.0.1:{proxy_port}");
let client = reqwest::Client::builder()
.proxy(reqwest::Proxy::all(&proxy_url).expect("proxy"))
.connect_timeout(std::time::Duration::from_secs(3))
.build()
.expect("client");
let target = "https://localhost/";
let err = client.get(target).send().await.expect_err("must err");
#[cfg(feature = "cache_request")]
let wrapped_err = reqwest_middleware::Error::Reqwest(err);
#[cfg(not(feature = "cache_request"))]
let wrapped_err = err;
let res = PageResponse {
status_code: *UNREACHABLE_REQUEST_ERROR,
error_for_status: Some(Err(wrapped_err)),
..Default::default()
};
let page = build(target, res);
assert_eq!(page.status_code, *UNREACHABLE_REQUEST_ERROR);
assert!(
page.should_retry,
"transient 503 must still set should_retry=true so legitimate \
transient proxy hiccups continue to retry — preserves the \
v2.51.165→v2.51.167 false-positive fix"
);
assert!(
page.needs_retry(),
"needs_retry() must be true for transient classifications"
);
}
#[cfg(not(feature = "decentralized"))]
#[tokio::test(flavor = "current_thread")]
async fn test_host_resolves_locally_rfc2606_invalid() {
let state = host_resolves_locally(
"this-host-must-not-exist.invalid",
std::time::Duration::from_secs(2),
)
.await;
assert_eq!(
state,
LocalDnsState::NxDomain,
"RFC 2606 .invalid TLD must always classify as NxDomain — \
platform resolver Display strings are caught by typed NotFound \
OR DNS_ERROR_AC scan"
);
}
#[cfg(not(feature = "decentralized"))]
#[tokio::test(flavor = "current_thread")]
async fn test_host_resolves_locally_loopback() {
let state = host_resolves_locally("localhost", std::time::Duration::from_secs(2)).await;
assert_eq!(
state,
LocalDnsState::Resolved,
"localhost must always resolve via the OS resolver"
);
}
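// confirm_tunnel_failure_with_local_dns: a 503 tunnel classification is upgraded to 525 only when
// a local DNS probe independently reports NxDomain; a resolvable host keeps 503.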
#[cfg(all(not(feature = "decentralized"), not(feature = "cache_request")))]
#[tokio::test(flavor = "current_thread")]
async fn test_confirm_tunnel_failure_with_local_dns_nxdomain_upgrades_to_525() {
use std::io::{Read, Write};
use std::net::TcpListener;
let listener = TcpListener::bind("127.0.0.1:0").expect("bind loopback");
let proxy_port = listener.local_addr().expect("local addr").port();
std::thread::spawn(move || {
for stream in listener.incoming() {
let Ok(mut s) = stream else { continue };
let mut buf = [0u8; 1024];
let _ = s.read(&mut buf);
let _ = s.write_all(b"HTTP/1.1 502 Bad Gateway\r\nContent-Length: 0\r\n\r\n");
let _ = s.flush();
}
});
let proxy_url = format!("http://127.0.0.1:{proxy_port}");
let client = reqwest::Client::builder()
.proxy(reqwest::Proxy::all(&proxy_url).expect("valid proxy"))
.connect_timeout(std::time::Duration::from_secs(3))
.build()
.expect("client");
let target = "https://this-host-must-not-exist.invalid/";
let err = client.get(target).send().await.expect_err("must error");
let initial = get_error_http_status_code(&err);
assert_eq!(
initial, *UNREACHABLE_REQUEST_ERROR,
"initial sync classification must be 503 (post-v2.51.167 revert)"
);
let final_status = confirm_tunnel_failure_with_local_dns(
initial,
&err,
target,
std::time::Duration::from_secs(2),
)
.await;
assert_eq!(
final_status, *DNS_RESOLVE_ERROR,
"tunnel failure + local NxDomain must upgrade to 525 — both signals agree"
);
}
#[cfg(all(not(feature = "decentralized"), not(feature = "cache_request")))]
#[tokio::test(flavor = "current_thread")]
async fn test_confirm_tunnel_failure_with_local_dns_resolved_keeps_503() {
use std::io::{Read, Write};
use std::net::TcpListener;
let listener = TcpListener::bind("127.0.0.1:0").expect("bind loopback");
let proxy_port = listener.local_addr().expect("local addr").port();
std::thread::spawn(move || {
for stream in listener.incoming() {
let Ok(mut s) = stream else { continue };
let mut buf = [0u8; 1024];
let _ = s.read(&mut buf);
let _ = s.write_all(b"HTTP/1.1 502 Bad Gateway\r\nContent-Length: 0\r\n\r\n");
let _ = s.flush();
}
});
let proxy_url = format!("http://127.0.0.1:{proxy_port}");
let client = reqwest::Client::builder()
.proxy(reqwest::Proxy::all(&proxy_url).expect("valid proxy"))
.connect_timeout(std::time::Duration::from_secs(3))
.build()
.expect("client");
let target = "https://localhost/";
let err = client.get(target).send().await.expect_err("must error");
let initial = get_error_http_status_code(&err);
let final_status = confirm_tunnel_failure_with_local_dns(
initial,
&err,
target,
std::time::Duration::from_secs(2),
)
.await;
assert_eq!(
final_status, *UNREACHABLE_REQUEST_ERROR,
"tunnel failure + local Resolved must STAY at 503 — proxy issue is \
transient, retry path runs as before. Regression case from v2.51.165: \
example.com / cloudflare.com / github docs must keep retrying."
);
assert!(
is_retryable_status(final_status),
"503 must remain retryable so transient proxy hiccups recover"
);
}
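// Chrome-side equivalent: confirm_chrome_tunnel_failure_with_local_dns upgrades 599/504 error
// pages to 525 only on a confirmed NxDomain, clearing should_retry.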
#[cfg(all(not(feature = "decentralized"), feature = "chrome"))]
#[tokio::test(flavor = "current_thread")]
async fn test_confirm_chrome_tunnel_failure_nxdomain_upgrades_to_525() {
let padding = "x".repeat(1000);
let html = format!(
"<html lang=\"en\" dir=\"ltr\">\n\
<style>{padding}</style>\n\
<div class=\"error-code\">ERR_TUNNEL_CONNECTION_FAILED</div>\n\
<script>var loadTimeDataRaw = {{\"errorCode\":\"ERR_TUNNEL_CONNECTION_FAILED\",\
\"title\":\"this-host-must-not-exist.invalid\"}};</script></html>"
);
let mut page = build(
"https://this-host-must-not-exist.invalid/",
PageResponse {
status_code: StatusCode::OK,
content: Some(html.into_bytes()),
..Default::default()
},
);
assert_eq!(
page.status_code,
StatusCode::from_u16(599).unwrap(),
"build() must classify chrome ERR_TUNNEL_CONNECTION_FAILED to 599 catch-all"
);
assert!(
page.should_retry,
"build()'s should_retry_status sets true for 599 server_error"
);
confirm_chrome_tunnel_failure_with_local_dns(
&mut page,
"https://this-host-must-not-exist.invalid/",
std::time::Duration::from_secs(2),
)
.await;
assert_eq!(
page.status_code, *DNS_RESOLVE_ERROR,
"chrome tunnel + local NxDomain must upgrade to 525 — both signals agree"
);
assert!(
!page.should_retry,
"should_retry must be cleared so chrome_page_fetch! retry loop short-circuits"
);
assert!(
!page.needs_retry(),
"needs_retry() must be false (525 is non-retryable, should_retry cleared)"
);
}
#[cfg(all(not(feature = "decentralized"), feature = "chrome"))]
#[tokio::test(flavor = "current_thread")]
async fn test_confirm_chrome_tunnel_failure_resolved_keeps_599() {
let padding = "x".repeat(1000);
let html = format!(
"<html lang=\"en\" dir=\"ltr\">\n\
<style>{padding}</style>\n\
<script>var loadTimeDataRaw = {{\"errorCode\":\"ERR_TUNNEL_CONNECTION_FAILED\",\
\"title\":\"localhost\"}};</script></html>"
);
let mut page = build(
"https://localhost/",
PageResponse {
status_code: StatusCode::OK,
content: Some(html.into_bytes()),
..Default::default()
},
);
let initial_status = page.status_code;
let initial_should_retry = page.should_retry;
assert_eq!(
initial_status,
StatusCode::from_u16(599).unwrap(),
"build() classification must be 599 (catch-all)"
);
confirm_chrome_tunnel_failure_with_local_dns(
&mut page,
"https://localhost/",
std::time::Duration::from_secs(2),
)
.await;
assert_eq!(
page.status_code, initial_status,
"chrome tunnel + local Resolved must STAY at 599 — proxy issue is \
transient, retry path runs as before. v2.51.166 regression case stays \
retryable."
);
assert_eq!(
page.should_retry, initial_should_retry,
"should_retry must be unchanged when local DNS doesn't confirm NxDomain"
);
assert!(
page.needs_retry(),
"needs_retry() must be true (599 is_retryable, transient proxy hiccup)"
);
}
#[cfg(all(not(feature = "decentralized"), feature = "chrome"))]
#[tokio::test(flavor = "current_thread")]
async fn test_confirm_chrome_tunnel_failure_gates_short_circuit() {
let mut page = build(
"https://example.com/",
PageResponse {
status_code: StatusCode::OK,
content: Some(b"<html><body>fine</body></html>".to_vec()),
..Default::default()
},
);
let initial = page.status_code;
confirm_chrome_tunnel_failure_with_local_dns(
&mut page,
"https://example.com/",
std::time::Duration::from_millis(10),
)
.await;
assert_eq!(
page.status_code, initial,
"non-599 status must be untouched"
);
let mut page2 = Page::default();
page2.status_code = StatusCode::from_u16(599).unwrap();
confirm_chrome_tunnel_failure_with_local_dns(
&mut page2,
"https://example.com/",
std::time::Duration::from_millis(10),
)
.await;
assert_eq!(
page2.status_code,
StatusCode::from_u16(599).unwrap(),
"599 without chrome-error content must stay 599 (gate 2 short-circuits)"
);
let padding = "x".repeat(1000);
let html_reset = format!(
"<html><style>{padding}</style>\
<script>var loadTimeDataRaw = {{\"errorCode\":\"ERR_CONNECTION_RESET\",\
\"title\":\"x\"}};</script></html>"
);
let mut page3 = build(
"https://this-host-must-not-exist.invalid/",
PageResponse {
status_code: StatusCode::OK,
content: Some(html_reset.into_bytes()),
..Default::default()
},
);
let pre_status = page3.status_code;
confirm_chrome_tunnel_failure_with_local_dns(
&mut page3,
"https://this-host-must-not-exist.invalid/",
std::time::Duration::from_millis(10),
)
.await;
assert_eq!(
page3.status_code, pre_status,
"non-tunnel errorCode (ERR_CONNECTION_RESET) must stay at 599 — \
gate 3 short-circuits, no DNS work"
);
}
#[cfg(all(not(feature = "decentralized"), feature = "chrome"))]
#[tokio::test(flavor = "current_thread")]
async fn test_confirm_chrome_tunnel_failure_504_nxdomain_upgrades_to_525() {
let mut page = Page::default();
page.status_code = StatusCode::from_u16(504).unwrap();
page.should_retry = true;
confirm_chrome_tunnel_failure_with_local_dns(
&mut page,
"https://this-host-must-not-exist.invalid/",
std::time::Duration::from_secs(2),
)
.await;
assert_eq!(
page.status_code, *DNS_RESOLVE_ERROR,
"504 + empty body + local NxDomain must upgrade to 525 — closes the \
midwestrodding chrome regression where the helper only fired on 599"
);
assert!(
!page.should_retry,
"should_retry must be cleared after 504 → 525 upgrade so chrome \
retry loop short-circuits"
);
}
#[cfg(all(not(feature = "decentralized"), feature = "chrome"))]
#[tokio::test(flavor = "current_thread")]
async fn test_confirm_chrome_tunnel_failure_504_resolved_keeps_504() {
let mut page = Page::default();
page.status_code = StatusCode::from_u16(504).unwrap();
page.should_retry = true;
confirm_chrome_tunnel_failure_with_local_dns(
&mut page,
"https://localhost/",
std::time::Duration::from_secs(2),
)
.await;
assert_eq!(
page.status_code,
StatusCode::from_u16(504).unwrap(),
"504 with locally-resolvable host must stay 504 (transient hiccup, \
retryable). Required to avoid false-positives on legit hosts."
);
assert!(
page.should_retry,
"should_retry must remain true on resolved hosts — retry can recover"
);
}
#[cfg(all(not(feature = "decentralized"), feature = "chrome"))]
#[tokio::test(flavor = "current_thread")]
async fn test_confirm_chrome_tunnel_failure_504_non_empty_body_stays() {
let big_body = vec![b'x'; 1024];
let mut page = build(
"https://this-host-must-not-exist.invalid/",
PageResponse {
status_code: StatusCode::from_u16(504).unwrap(),
content: Some(big_body),
..Default::default()
},
);
page.should_retry = true;
confirm_chrome_tunnel_failure_with_local_dns(
&mut page,
"https://this-host-must-not-exist.invalid/",
std::time::Duration::from_millis(10),
)
.await;
assert_eq!(
page.status_code,
StatusCode::from_u16(504).unwrap(),
"504 with real (>256B) body must stay 504 even on NxDomain — body \
presence proves chrome got data back; this is the slow-API/captcha \
case, not the dead-host case"
);
}
#[cfg(all(not(feature = "decentralized"), not(feature = "cache_request")))]
#[tokio::test(flavor = "current_thread")]
async fn test_confirm_tunnel_failure_no_op_for_non_tunnel_errors() {
let client = reqwest::Client::builder().build().expect("client");
let err = client
.get("https://orthwestsci.invalid/")
.send()
.await
.expect_err("must error");
let initial = get_error_http_status_code(&err);
assert_eq!(
initial, *DNS_RESOLVE_ERROR,
"pure local DNS error must classify to 525 via is_dns_error"
);
let final_status = confirm_tunnel_failure_with_local_dns(
initial,
&err,
"https://orthwestsci.invalid/",
std::time::Duration::from_millis(100),
)
.await;
assert_eq!(
final_status, initial,
"confirm helper must be no-op for non-tunnel classifications — \
only triggers when initial_status == *UNREACHABLE_REQUEST_ERROR"
);
}
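// cache_request builds: cache-wrapped transport errors classify to 526 and upgrade to 525 only on
// a confirmed NxDomain; unrelated middleware errors are left untouched.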
#[cfg(all(not(feature = "decentralized"), feature = "cache_request"))]
#[tokio::test(flavor = "current_thread")]
async fn test_confirm_tunnel_failure_cache_wrapped_nxdomain_upgrades_to_525() {
let target = "https://this-host-must-not-exist.invalid/";
let io = std::io::Error::new(
std::io::ErrorKind::Other,
format!("Cache error: error sending request for url ({target})"),
);
let err: crate::client::Error = reqwest_middleware::Error::middleware(io);
assert!(
err.is_middleware(),
"constructed wrap must report is_middleware()=true: {err:?}"
);
assert!(
CACHE_WRAPPED_TRANSPORT_AC.is_match(&err.to_string()),
"Display must match the cache wrap pattern: {err}"
);
let initial = get_error_http_status_code(&err);
assert_eq!(
initial, *ADDRESS_UNREACHABLE_ERROR,
"cache shortcut must classify cache-wrapped transport errors to 526"
);
let final_status = confirm_tunnel_failure_with_local_dns(
initial,
&err,
target,
std::time::Duration::from_secs(2),
)
.await;
assert_eq!(
final_status, *DNS_RESOLVE_ERROR,
"cache-wrapped 526 + local NxDomain must upgrade to 525 — confirmed dead \
host fast-fails through the same path as the non-cached tunnel-failure case"
);
}
#[cfg(all(not(feature = "decentralized"), feature = "cache_request"))]
#[tokio::test(flavor = "current_thread")]
async fn test_confirm_tunnel_failure_cache_wrapped_resolved_keeps_526() {
let target = "https://localhost/";
let io = std::io::Error::new(
std::io::ErrorKind::Other,
format!("Cache error: error sending request for url ({target})"),
);
let err: crate::client::Error = reqwest_middleware::Error::middleware(io);
let initial = get_error_http_status_code(&err);
assert_eq!(
initial, *ADDRESS_UNREACHABLE_ERROR,
"cache-wrapped transport error must classify to 526"
);
let final_status = confirm_tunnel_failure_with_local_dns(
initial,
&err,
target,
std::time::Duration::from_secs(2),
)
.await;
assert_eq!(
final_status, *ADDRESS_UNREACHABLE_ERROR,
"cache-wrapped 526 + local Resolved must KEEP 526 — preserves the \
cache shortcut's hang-prevention invariant"
);
assert!(
!is_retryable_status(final_status),
"526 must stay non-retryable so the cache-error retry loop halts"
);
}
#[cfg(all(not(feature = "decentralized"), feature = "cache_request"))]
#[tokio::test(flavor = "current_thread")]
async fn test_confirm_tunnel_failure_cache_wrapped_unrelated_keeps_initial() {
let io = std::io::Error::new(
std::io::ErrorKind::Other,
"cache storage io error: disk full",
);
let err: crate::client::Error = reqwest_middleware::Error::middleware(io);
assert!(err.is_middleware());
assert!(
!CACHE_WRAPPED_TRANSPORT_AC.is_match(&err.to_string()),
"non-transport cache errors must NOT match the AC pattern"
);
let initial = StatusCode::INTERNAL_SERVER_ERROR;
let final_status = confirm_tunnel_failure_with_local_dns(
initial,
&err,
"https://example.com/",
std::time::Duration::from_millis(50),
)
.await;
assert_eq!(
final_status, initial,
"non-cache-wrap middleware errors must short-circuit before any DNS lookup"
);
}
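// Post-v2.51.167 classification: proxy tunnel failures map to 503 (retryable), never 526, so
// transient proxy hiccups on legitimate hosts keep retrying.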
#[cfg(all(not(feature = "decentralized"), not(feature = "cache_request")))]
#[tokio::test(flavor = "current_thread")]
async fn test_proxy_tunnel_failure_no_retry() {
use std::io::{Read, Write};
use std::net::TcpListener;
let listener = TcpListener::bind("127.0.0.1:0").expect("bind loopback");
let proxy_addr = listener.local_addr().expect("local addr");
let proxy_port = proxy_addr.port();
std::thread::spawn(move || {
for stream in listener.incoming() {
let Ok(mut s) = stream else { continue };
let mut buf = [0u8; 1024];
let _ = s.read(&mut buf);
let _ = s.write_all(b"HTTP/1.1 502 Bad Gateway\r\nContent-Length: 0\r\n\r\n");
let _ = s.flush();
}
});
let proxy_url = format!("http://127.0.0.1:{proxy_port}");
let client = reqwest::Client::builder()
.proxy(reqwest::Proxy::all(&proxy_url).expect("valid proxy"))
.connect_timeout(std::time::Duration::from_secs(3))
.build()
.expect("client");
let err = client
.get("https://example.invalid/")
.send()
.await
.expect_err("CONNECT 502 must error");
assert!(
err.is_connect(),
"proxy CONNECT 502 must surface as is_connect()=true; got: {err}"
);
assert!(
is_proxy_tunnel_failure(&err),
"is_proxy_tunnel_failure must catch hyper-util TunnelUnsuccessful via chain walk; \
got: {err:?}"
);
let mapped = get_error_http_status_code(&err);
assert_eq!(
mapped, *UNREACHABLE_REQUEST_ERROR,
"post-v2.51.167: tunnel failures stay at 503 — caller pairs with \
independent signal before declaring permanent. 526 over-classified \
legitimate proxied requests on transient proxy hiccups."
);
assert!(
is_retryable_status(mapped),
"503 must remain retryable so transient proxy issues on legitimate \
hosts (example.com, cloudflare.com, etc.) recover via the retry path"
);
let page = build(
"https://example.invalid",
PageResponse {
status_code: mapped,
content: None,
..Default::default()
},
);
assert!(
page.should_retry,
"Post-revert: 503 is server_error → build()'s should_retry_status sets true"
);
assert!(
page.needs_retry(),
"Post-revert: needs_retry() returns true (503 is_retryable) so the retry \
loop runs as before — preserves transient-proxy recovery"
);
}
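// A custom RetryStrategy must never be consulted for 525 pages; needs_retry() guards the retry
// loop before any strategy lookup.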
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_retry_strategy_not_consulted_for_dns_error() {
use crate::retry_strategy::{AttemptOutcome, RetryDirective, RetryStrategy};
use std::sync::atomic::{AtomicU32, Ordering};
struct AggressiveStrategy {
on_retry_calls: AtomicU32,
}
impl RetryStrategy for AggressiveStrategy {
fn max_retries(&self) -> u32 {
10
}
fn on_retry(&self, _outcome: &AttemptOutcome) -> RetryDirective {
self.on_retry_calls.fetch_add(1, Ordering::SeqCst);
RetryDirective {
should_retry: true,
..Default::default()
}
}
}
let strategy = AggressiveStrategy {
on_retry_calls: AtomicU32::new(0),
};
let dns_page = build(
"https://orthwestsci.invalid",
PageResponse {
status_code: *DNS_RESOLVE_ERROR,
content: None,
..Default::default()
},
);
let mut retry_count: u32 = strategy.max_retries();
let mut iterations: u32 = 0;
while dns_page.needs_retry() && retry_count > 0 {
retry_count -= 1;
iterations += 1;
let outcome = AttemptOutcome {
attempt: iterations,
status_code: dns_page.status_code,
should_retry: dns_page.should_retry,
content_truncated: dns_page.content_truncated,
waf_check: dns_page.waf_check,
anti_bot_tech: &dns_page.anti_bot_tech,
proxy_configured: dns_page.proxy_configured,
url: "https://orthwestsci.invalid",
profile_key: None,
html_length: dns_page.size(),
bytes_transferred: dns_page.bytes_transferred,
error_status: None,
final_redirect_destination: None,
};
let _ = strategy.on_retry(&outcome);
}
assert_eq!(
iterations, 0,
"525 DNS errors must not enter the retry loop body even once \
(custom strategy with max_retries=10 + should_retry=true)"
);
assert_eq!(
strategy.on_retry_calls.load(Ordering::SeqCst),
0,
"RetryStrategy::on_retry must NEVER be consulted on DNS errors — \
needs_retry() guards every retry loop in website.rs before strategy lookup"
);
}
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_server_error_still_retries() {
let res = PageResponse {
status_code: StatusCode::INTERNAL_SERVER_ERROR,
content: Some(Default::default()),
..Default::default()
};
let page = build("https://example.com", res);
assert!(page.should_retry, "500 errors should still be retried");
}
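// Chrome interstitial error pages: structural detection, reclassification to 599 (or 525 for
// name-resolution errors), and errorCode extraction.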
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_chrome_error_page_detected_as_empty() {
let padding = "x".repeat(1000); let chrome_error_html_str = format!(
"<html lang=\"en\" dir=\"ltr\">\n\
<style>{padding}</style>\n\
<div id=\"main-frame-error\" class=\"interstitial-wrapper\">\n\
<h1><span>This site can\u{2019}t be reached</span></h1>\n\
<div class=\"error-code\">ERR_TUNNEL_CONNECTION_FAILED</div>\n\
</div>\n\
<script>var loadTimeDataRaw = {{\"errorCode\":\"ERR_TUNNEL_CONNECTION_FAILED\",\
\"heading\":{{\"msg\":\"This site can't be reached\"}},\
\"title\":\"www.example.com\"}};</script></html>"
);
let chrome_error_html = chrome_error_html_str.as_bytes();
assert!(
is_chrome_error_page(chrome_error_html),
"should detect Chrome error page by structural tail match"
);
assert!(
!validate_empty(&Some(chrome_error_html.to_vec()), true),
"Chrome error page should be treated as empty/invalid content"
);
let res = PageResponse {
status_code: StatusCode::OK,
content: Some(chrome_error_html.to_vec()),
..Default::default()
};
let page = build("https://www.example.com", res);
assert!(
page.should_retry,
"Chrome error page with 200 status should trigger retry"
);
assert_eq!(
page.status_code,
StatusCode::from_u16(599).unwrap(),
"Chrome error page should be reclassified to 599"
);
assert!(
!page.get_html().is_empty(),
"Chrome error page content should be preserved for debugging"
);
}
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_normal_page_not_detected_as_chrome_error() {
let normal_html =
b"<html><head><title>My Blog</title></head><body><p>Hello world</p></body></html>";
assert!(!is_chrome_error_page(normal_html));
assert!(validate_empty(&Some(normal_html.to_vec()), true));
}
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_is_chrome_name_resolution_error_matches_both_forms() {
assert!(is_chrome_name_resolution_error(
"net::ERR_NAME_NOT_RESOLVED"
));
assert!(is_chrome_name_resolution_error("ERR_NAME_NOT_RESOLVED"));
assert!(is_chrome_name_resolution_error(
"net::ERR_NAME_RESOLUTION_FAILED"
));
assert!(is_chrome_name_resolution_error(
"ERR_NAME_RESOLUTION_FAILED"
));
}
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_is_chrome_name_resolution_error_rejects_transient_and_unrelated() {
assert!(!is_chrome_name_resolution_error("net::ERR_DNS_TIMED_OUT"));
assert!(!is_chrome_name_resolution_error(
"net::ERR_DNS_SERVER_FAILED"
));
assert!(!is_chrome_name_resolution_error(
"net::ERR_DNS_MALFORMED_RESPONSE"
));
assert!(!is_chrome_name_resolution_error(
"net::ERR_TUNNEL_CONNECTION_FAILED"
));
assert!(!is_chrome_name_resolution_error("net::ERR_FAILED"));
assert!(!is_chrome_name_resolution_error(
"net::ERR_CONNECTION_REFUSED"
));
assert!(!is_chrome_name_resolution_error(""));
}
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_extract_chrome_error_code_basic() {
let padding = "x".repeat(1000);
let html = format!(
"<html><style>{padding}</style>\
<script>var loadTimeDataRaw = {{\"errorCode\":\"ERR_NAME_NOT_RESOLVED\",\
\"title\":\"a.com\"}};</script></html>"
);
assert_eq!(
extract_chrome_error_code(html.as_bytes()),
Some("ERR_NAME_NOT_RESOLVED")
);
}
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_extract_chrome_error_code_absent() {
let html = b"<html><body><p>no error marker here</p></body></html>";
assert_eq!(extract_chrome_error_code(html), None);
}
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_chrome_error_page_dns_reclassified_to_525() {
let padding = "x".repeat(1000);
let html_str = format!(
"<html lang=\"en\" dir=\"ltr\">\n\
<style>{padding}</style>\n\
<div id=\"main-frame-error\" class=\"interstitial-wrapper\">\n\
<h1><span>This site can\u{2019}t be reached</span></h1>\n\
<div class=\"error-code\">ERR_NAME_NOT_RESOLVED</div>\n\
</div>\n\
<script>var loadTimeDataRaw = {{\"errorCode\":\"ERR_NAME_NOT_RESOLVED\",\
\"title\":\"a.com\"}};</script></html>"
);
let res = PageResponse {
status_code: StatusCode::OK,
content: Some(html_str.into_bytes()),
..Default::default()
};
let page = build("https://a.com", res);
assert_eq!(
page.status_code, *DNS_RESOLVE_ERROR,
"Chrome ERR_NAME_NOT_RESOLVED must reclassify to 525, not 599"
);
assert!(
!page.should_retry,
"DNS resolution failures must not trigger a retry"
);
assert!(
!page.needs_retry(),
"needs_retry() must be false for permanent DNS failures"
);
}
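// Aho-Corasick safety nets: DNS_ERROR_AC and PROXY_TUNNEL_FAILURE_AC must match their own phrases
// and reject the other's.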
#[test]
fn test_dns_error_ac_matches_hickory_strings() {
assert!(
DNS_ERROR_AC.is_match(
"no record found for Query { name: Name(\"a.com.\"), query_type: A, query_class: IN }"
),
"hickory NoRecordsFound Display must be caught by the safety-net AC scan"
);
assert!(
DNS_ERROR_AC.is_match("dns resolution returned no addresses"),
"DnsCacheResolver empty-addresses error must be caught"
);
}
#[test]
fn test_dns_error_ac_matches_existing_resolver_strings() {
assert!(DNS_ERROR_AC
.is_match("dns error: failed to lookup address information: nodename nor servname"));
assert!(
DNS_ERROR_AC.is_match("error trying to connect: Name or service not known (os error -2)")
);
assert!(DNS_ERROR_AC.is_match("No address associated with hostname"));
assert!(DNS_ERROR_AC.is_match("getaddrinfo ENOTFOUND example.invalid"));
}
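// Proxy-tunnel classification: PROXY_TUNNEL_FAILURE_AC must match only tunnel/SOCKS
// failures, never DNS, TLS, timeout, or generic connection errors.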
#[test]
fn test_proxy_tunnel_ac_rejects_unrelated_errors() {
assert!(!PROXY_TUNNEL_FAILURE_AC.is_match("dns error"));
assert!(!PROXY_TUNNEL_FAILURE_AC
.is_match("dns error: failed to lookup address information: nodename nor servname"));
assert!(!PROXY_TUNNEL_FAILURE_AC.is_match("Name or service not known"));
assert!(!PROXY_TUNNEL_FAILURE_AC.is_match("ENOTFOUND"));
assert!(!PROXY_TUNNEL_FAILURE_AC.is_match("connection refused"));
assert!(!PROXY_TUNNEL_FAILURE_AC.is_match("connection reset by peer"));
assert!(!PROXY_TUNNEL_FAILURE_AC.is_match("connection aborted"));
assert!(!PROXY_TUNNEL_FAILURE_AC.is_match("tls handshake failure"));
assert!(!PROXY_TUNNEL_FAILURE_AC.is_match("alert: HandshakeFailure"));
assert!(!PROXY_TUNNEL_FAILURE_AC.is_match("ERR_SSL_VERSION_OR_CIPHER_MISMATCH"));
assert!(!PROXY_TUNNEL_FAILURE_AC.is_match("operation timed out"));
assert!(!PROXY_TUNNEL_FAILURE_AC.is_match("request timed out"));
assert!(!PROXY_TUNNEL_FAILURE_AC.is_match("error sending request for url (https://x.test/)"));
assert!(!PROXY_TUNNEL_FAILURE_AC.is_match("client error (Connect)"));
assert!(!PROXY_TUNNEL_FAILURE_AC.is_match("broken pipe"));
assert!(!PROXY_TUNNEL_FAILURE_AC.is_match(""));
assert!(PROXY_TUNNEL_FAILURE_AC.is_match("tunnel error: unsuccessful"));
assert!(PROXY_TUNNEL_FAILURE_AC.is_match("TunnelUnsuccessful"));
assert!(PROXY_TUNNEL_FAILURE_AC.is_match("error connecting to socks proxy"));
assert!(PROXY_TUNNEL_FAILURE_AC.is_match("SOCKS error: general server failure"));
assert!(PROXY_TUNNEL_FAILURE_AC.is_match("SOCKS error: host unreachable"));
assert!(PROXY_TUNNEL_FAILURE_AC.is_match("SOCKS error: network unreachable"));
}
#[test]
fn test_dns_ac_rejects_tunnel_failure_phrases() {
assert!(!DNS_ERROR_AC.is_match("tunnel error: unsuccessful"));
assert!(!DNS_ERROR_AC.is_match("TunnelUnsuccessful"));
assert!(!DNS_ERROR_AC.is_match("client error (Connect)"));
}
#[test]
fn test_dns_error_ac_rejects_unrelated_errors() {
assert!(!DNS_ERROR_AC.is_match("connection refused"));
assert!(!DNS_ERROR_AC.is_match("connection reset by peer"));
assert!(!DNS_ERROR_AC.is_match("tls handshake failure"));
assert!(!DNS_ERROR_AC.is_match("request timed out"));
assert!(!DNS_ERROR_AC.is_match("broken pipe"));
assert!(!DNS_ERROR_AC.is_match(""));
}
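// End-to-end guard: a live reqwest NXDOMAIN failure must classify as DNS (525) through
// the source-chain walk, not the retryable 503 catch-all, and must never enter the
// website-level retry loop.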
#[cfg(all(not(feature = "decentralized"), not(feature = "cache_request")))]
#[tokio::test(flavor = "current_thread")]
async fn test_dns_error_no_retry_end_to_end_via_real_reqwest() {
let client = reqwest::Client::builder()
.build()
.expect("client should build");
let err = client
.get("https://orthwestsci.invalid/")
.send()
.await
.expect_err("NXDOMAIN must error");
let top_display = err.to_string();
assert!(
!DNS_ERROR_AC.is_match(&top_display),
"guard: top-level Display should not contain DNS phrases — \
this test is meaningful only because per-layer scanning is required. \
Got: {top_display}"
);
assert!(
is_dns_error(&err),
"is_dns_error must classify reqwest NXDOMAIN error via source-chain walk"
);
let mapped = get_error_http_status_code(&err);
assert_eq!(
mapped, *DNS_RESOLVE_ERROR,
"NXDOMAIN must map to 525 (permanent), not 503 catch-all (retryable)"
);
assert_ne!(
mapped, *UNREACHABLE_REQUEST_ERROR,
"NXDOMAIN must NOT fall through to *UNREACHABLE_REQUEST_ERROR (503) — \
that path is retryable and would burn the retry budget"
);
assert!(
!is_retryable_status(mapped),
"is_retryable_status({mapped}) must be false for DNS-classified failures"
);
let dns_page = build(
"https://orthwestsci.invalid",
PageResponse {
status_code: mapped,
content: None,
..Default::default()
},
);
assert!(
!dns_page.should_retry,
"Page.should_retry MUST be false for DNS-classified failures \
(mapped status: {mapped})"
);
assert!(
!dns_page.needs_retry(),
"Page::needs_retry() MUST be false — every retry loop in website.rs \
is gated on `while page.needs_retry() && retry_count > 0`, so this \
is the structural guarantee that DNS errors never enter the loop body"
);
}
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_is_chrome_permanent_failure_accepts_all_target_side_codes() {
for code in [
"net::ERR_NAME_NOT_RESOLVED",
"ERR_NAME_NOT_RESOLVED",
"net::ERR_NAME_RESOLUTION_FAILED",
"ERR_NAME_RESOLUTION_FAILED",
"net::ERR_ADDRESS_UNREACHABLE",
"ERR_ADDRESS_UNREACHABLE",
"net::ERR_CONNECTION_REFUSED",
"ERR_CONNECTION_REFUSED",
] {
assert!(
is_chrome_permanent_failure(code),
"{code} must classify as a permanent target-side failure"
);
}
}
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_is_chrome_permanent_failure_rejects_transient_and_unrelated() {
for code in [
"net::ERR_DNS_TIMED_OUT",
"net::ERR_DNS_SERVER_FAILED",
"net::ERR_DNS_MALFORMED_RESPONSE",
"net::ERR_TUNNEL_CONNECTION_FAILED",
"net::ERR_PROXY_CONNECTION_FAILED",
"net::ERR_CONNECTION_RESET",
"net::ERR_CONNECTION_TIMED_OUT",
"net::ERR_TIMED_OUT",
"net::ERR_CERT_INVALID",
"net::ERR_FAILED",
"",
] {
assert!(
!is_chrome_permanent_failure(code),
"{code} must remain retryable"
);
}
}
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_chrome_permanent_failure_status_maps_dns_to_525() {
assert_eq!(
chrome_permanent_failure_status("net::ERR_NAME_NOT_RESOLVED"),
*DNS_RESOLVE_ERROR
);
assert_eq!(
chrome_permanent_failure_status("ERR_NAME_RESOLUTION_FAILED"),
*DNS_RESOLVE_ERROR
);
}
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_chrome_permanent_failure_status_maps_unreachable_to_526() {
assert_eq!(
chrome_permanent_failure_status("net::ERR_ADDRESS_UNREACHABLE"),
*ADDRESS_UNREACHABLE_ERROR
);
assert_eq!(
chrome_permanent_failure_status("ERR_CONNECTION_REFUSED"),
*ADDRESS_UNREACHABLE_ERROR
);
}
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_is_retryable_status_excludes_address_unreachable() {
assert!(
!is_retryable_status(*ADDRESS_UNREACHABLE_ERROR),
"526 (ADDRESS_UNREACHABLE_ERROR) must not be retryable"
);
assert!(
!is_retryable_status(*DNS_RESOLVE_ERROR),
"525 (DNS_RESOLVE_ERROR) must not be retryable"
);
assert!(is_retryable_status(StatusCode::from_u16(502).unwrap()));
assert!(is_retryable_status(StatusCode::from_u16(503).unwrap()));
assert!(is_retryable_status(StatusCode::from_u16(521).unwrap()));
}
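// SSL handshake classification: SSL_HANDSHAKE_ERROR_AC must match known handshake-failure
// surfaces, but never URLs that merely contain TLS-looking substrings.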
#[test]
fn test_ssl_handshake_error_ac_matches_known_surfaces() {
for s in [
"received fatal alert: HandshakeFailure",
"alert: HandshakeFailure",
"alert: ProtocolVersion",
"error:1408F10B:SSL routines:ssl3_get_record:wrong version number",
"error:1408A0C1:SSL routines:ssl3_get_client_hello:no shared cipher",
"tls error: unsupported protocol",
"tls handshake error",
"TLS handshake failed",
"ssl handshake failed",
"TLS handshake failure",
"net::ERR_SSL_VERSION_OR_CIPHER_MISMATCH",
"net::ERR_SSL_PROTOCOL_ERROR",
] {
assert!(
SSL_HANDSHAKE_ERROR_AC.is_match(s),
"{s:?} must classify as SSL handshake failure"
);
}
}
#[test]
fn test_ssl_handshake_error_ac_rejects_unrelated() {
for s in [
"connection reset by peer",
"broken pipe",
"operation timed out",
"dns error: ENOTFOUND",
"failed to lookup address",
"invalid HTTP response",
"decode error",
"",
"error sending request for url (https://example.com/blog/tls-handshake-overview)",
"error sending request for url (https://example.com/blog/ssl-handshake-explained)",
"error sending request for url (https://example.com/?topic=ProtocolVersion)",
"error sending request for url (https://example.com/HandshakeFailure-explained)",
] {
assert!(
!SSL_HANDSHAKE_ERROR_AC.is_match(s),
"{s:?} must NOT classify as SSL handshake failure"
);
}
}
#[test]
fn test_is_retryable_status_excludes_ssl_handshake_bucket() {
assert!(
!is_retryable_status(*ADDRESS_UNREACHABLE_ERROR),
"526 must remain non-retryable for SSL handshake failures"
);
}
#[test]
fn test_is_retryable_status_excludes_permanent_5xx() {
for code in [
StatusCode::NOT_IMPLEMENTED,
StatusCode::HTTP_VERSION_NOT_SUPPORTED,
StatusCode::NETWORK_AUTHENTICATION_REQUIRED,
] {
assert!(
!is_retryable_status(code),
"{code} must be permanent (server-declared won't-do)"
);
}
for code in [
StatusCode::INTERNAL_SERVER_ERROR,
StatusCode::BAD_GATEWAY,
StatusCode::SERVICE_UNAVAILABLE,
StatusCode::GATEWAY_TIMEOUT,
] {
assert!(is_retryable_status(code), "{code} must remain retryable");
}
}
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_permanent_5xx_pages_are_not_retryable_end_to_end() {
for code in [
StatusCode::NOT_IMPLEMENTED,
StatusCode::HTTP_VERSION_NOT_SUPPORTED,
StatusCode::NETWORK_AUTHENTICATION_REQUIRED,
] {
let res = PageResponse {
status_code: code,
content: None,
..Default::default()
};
let page = build("https://example.invalid", res);
assert_eq!(page.status_code, code);
assert!(
!page.should_retry,
"Page::should_retry must be false for {code}"
);
assert!(
!page.needs_retry(),
"Page::needs_retry() must be false for {code}"
);
}
}
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_is_chrome_permanent_failure_extended_codes() {
for code in [
"ERR_HTTP2_INADEQUATE_TRANSPORT_SECURITY",
"net::ERR_HTTP2_INADEQUATE_TRANSPORT_SECURITY",
"ERR_INVALID_URL",
"ERR_UNSAFE_PORT",
"ERR_DISALLOWED_URL_SCHEME",
"ERR_UNKNOWN_URL_SCHEME",
"net::ERR_INVALID_URL",
] {
assert!(
is_chrome_permanent_failure(code),
"{code} must classify as a permanent Chrome failure"
);
}
}
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_chrome_permanent_failure_status_routes_extended_codes() {
assert_eq!(
chrome_permanent_failure_status("net::ERR_HTTP2_INADEQUATE_TRANSPORT_SECURITY"),
*ADDRESS_UNREACHABLE_ERROR
);
for code in [
"ERR_INVALID_URL",
"ERR_UNSAFE_PORT",
"ERR_DISALLOWED_URL_SCHEME",
"ERR_UNKNOWN_URL_SCHEME",
] {
assert_eq!(
chrome_permanent_failure_status(code),
StatusCode::BAD_REQUEST,
"{code} must route to 400"
);
assert!(
!is_retryable_status(chrome_permanent_failure_status(code)),
"{code} status must not be retryable"
);
}
}
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_chrome_error_page_extended_codes_reclassified() {
let cases: &[(&str, StatusCode)] = &[
(
"ERR_HTTP2_INADEQUATE_TRANSPORT_SECURITY",
*ADDRESS_UNREACHABLE_ERROR,
),
("ERR_INVALID_URL", StatusCode::BAD_REQUEST),
("ERR_UNSAFE_PORT", StatusCode::BAD_REQUEST),
];
let padding = "x".repeat(1000);
for (code, expected) in cases {
let html_str = format!(
"<html><style>{padding}</style>\
<script>var loadTimeDataRaw = {{\"errorCode\":\"{code}\",\
\"title\":\"example.invalid\"}};</script></html>"
);
let res = PageResponse {
status_code: StatusCode::OK,
content: Some(html_str.into_bytes()),
..Default::default()
};
let page = build("https://example.invalid", res);
assert_eq!(
page.status_code, *expected,
"Chrome {code} must reclassify to {expected}"
);
assert!(
!page.should_retry,
"{code} must not flag the page for retry"
);
assert!(
!page.needs_retry(),
"{code} must not trigger website-level retry"
);
}
}
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_ssl_handshake_page_is_not_retryable_end_to_end() {
let res = PageResponse {
status_code: *ADDRESS_UNREACHABLE_ERROR,
content: None,
..Default::default()
};
let page = build("https://www.example.invalid/legal/impressum", res);
assert_eq!(
page.status_code, *ADDRESS_UNREACHABLE_ERROR,
"SSL handshake page must keep its 526 classification through build()"
);
assert!(
!page.should_retry,
"Page::should_retry must be false for SSL handshake failures"
);
assert!(
!page.needs_retry(),
"Page::needs_retry() must be false so the website retry loop skips"
);
}
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_is_chrome_name_resolution_error_backward_compat() {
assert!(is_chrome_name_resolution_error("ERR_NAME_NOT_RESOLVED"));
assert!(is_chrome_name_resolution_error(
"ERR_NAME_RESOLUTION_FAILED"
));
assert!(!is_chrome_name_resolution_error("ERR_ADDRESS_UNREACHABLE"));
assert!(!is_chrome_name_resolution_error("ERR_CONNECTION_REFUSED"));
}
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_chrome_error_page_address_unreachable_reclassified_to_526() {
let padding = "x".repeat(1000);
let html_str = format!(
"<html lang=\"en\" dir=\"ltr\">\n\
<style>{padding}</style>\n\
<div class=\"error-code\">ERR_ADDRESS_UNREACHABLE</div>\n\
<script>var loadTimeDataRaw = {{\"errorCode\":\"ERR_ADDRESS_UNREACHABLE\",\
\"title\":\"example.invalid\"}};</script></html>"
);
let res = PageResponse {
status_code: StatusCode::OK,
content: Some(html_str.into_bytes()),
..Default::default()
};
let page = build("https://example.invalid", res);
assert_eq!(
page.status_code, *ADDRESS_UNREACHABLE_ERROR,
"Chrome ERR_ADDRESS_UNREACHABLE must reclassify to 526, not 599"
);
assert!(
!page.should_retry,
"address-unreachable failures must not trigger a retry"
);
assert!(
!page.needs_retry(),
"needs_retry() must be false for permanent address-unreachable failures"
);
}
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_chrome_error_page_connection_refused_reclassified_to_526() {
let padding = "x".repeat(1000);
let html_str = format!(
"<html><style>{padding}</style>\
<script>var loadTimeDataRaw = {{\"errorCode\":\"ERR_CONNECTION_REFUSED\",\
\"title\":\"example.invalid\"}};</script></html>"
);
let res = PageResponse {
status_code: StatusCode::OK,
content: Some(html_str.into_bytes()),
..Default::default()
};
let page = build("https://example.invalid", res);
assert_eq!(
page.status_code, *ADDRESS_UNREACHABLE_ERROR,
"Chrome ERR_CONNECTION_REFUSED must reclassify to 526"
);
assert!(
!page.should_retry,
"connection-refused failures through proxy chain must not retry"
);
}
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_chrome_error_page_non_dns_stays_599() {
let padding = "x".repeat(1000);
let html_str = format!(
"<html><style>{padding}</style>\
<script>var loadTimeDataRaw = {{\"errorCode\":\"ERR_TUNNEL_CONNECTION_FAILED\",\
\"title\":\"example.com\"}};</script></html>"
);
let res = PageResponse {
status_code: StatusCode::OK,
content: Some(html_str.into_bytes()),
..Default::default()
};
let page = build("https://example.com", res);
assert_eq!(
page.status_code,
StatusCode::from_u16(599).unwrap(),
"non-DNS Chrome error pages must still map to 599"
);
assert!(
page.should_retry,
"599 errors remain retryable for proxy/tunnel rotation"
);
}
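// is_retryable_status matrix: transient server/proxy statuses retry; permanent server
// declarations (501/505/511/525/526), client errors, and successes do not.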
#[test]
fn test_retryable_status_server_errors() {
for code in [500, 502, 503, 504, 521, 522, 523, 524, 598, 599] {
let status = StatusCode::from_u16(code).unwrap();
assert!(is_retryable_status(status), "{code} should be retryable");
}
for code in [501, 505, 511, 525, 526] {
let status = StatusCode::from_u16(code).unwrap();
assert!(
!is_retryable_status(status),
"{code} must NOT be retryable (permanent declaration)"
);
}
}
#[test]
fn test_retryable_status_rate_limit_and_timeout() {
assert!(
is_retryable_status(StatusCode::TOO_MANY_REQUESTS),
"429 retryable"
);
assert!(
is_retryable_status(StatusCode::REQUEST_TIMEOUT),
"408 retryable"
);
}
#[test]
fn test_non_retryable_status_dns_error() {
let dns = StatusCode::from_u16(525).unwrap();
assert!(!is_retryable_status(dns), "525 DNS must never be retried");
}
#[test]
fn test_non_retryable_client_errors() {
for code in [400, 401, 403, 404, 405, 409, 410, 422, 451] {
let status = StatusCode::from_u16(code).unwrap();
assert!(
!is_retryable_status(status),
"{code} should NOT be retryable"
);
}
}
#[test]
fn test_non_retryable_success_codes() {
for code in [200, 201, 204, 301, 302, 304] {
let status = StatusCode::from_u16(code).unwrap();
assert!(
!is_retryable_status(status),
"{code} should NOT be retryable"
);
}
}
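// Page::needs_retry signals: should_retry, content_truncated, and a retryable status
// each independently force a retry; permanent statuses never do.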
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_needs_retry_should_retry_flag_alone() {
let res = PageResponse {
status_code: StatusCode::OK,
content: Some(b"<html><body>ok</body></html>".to_vec()),
..Default::default()
};
let mut page = build("https://example.com", res);
assert!(!page.needs_retry(), "clean 200 page should not need retry");
page.should_retry = true;
assert!(page.needs_retry(), "should_retry flag forces retry");
}
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_needs_retry_content_truncated_alone() {
let res = PageResponse {
status_code: StatusCode::OK,
content: Some(b"<html><body>ok</body></html>".to_vec()),
..Default::default()
};
let mut page = build("https://example.com", res);
page.content_truncated = true;
assert!(page.needs_retry(), "truncated content forces retry");
}
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_needs_retry_retryable_status_alone() {
let res = PageResponse {
status_code: StatusCode::BAD_GATEWAY,
content: Some(Default::default()),
..Default::default()
};
let page = build("https://example.com", res);
assert!(page.needs_retry(), "502 status triggers needs_retry");
}
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_needs_retry_dns_error_not_retried() {
let res = PageResponse {
status_code: StatusCode::from_u16(525).unwrap(),
content: None,
..Default::default()
};
let page = build("https://nonexistent.invalid", res);
assert!(!page.needs_retry(), "DNS 525 must never trigger retry");
}
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_needs_retry_client_error_no_flags() {
let res = PageResponse {
status_code: StatusCode::NOT_FOUND,
content: Some(b"<html>not found</html>".to_vec()),
..Default::default()
};
let page = build("https://example.com/missing", res);
assert!(!page.needs_retry(), "404 should not need retry");
}
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_needs_retry_multiple_flags_combined() {
let res = PageResponse {
status_code: StatusCode::SERVICE_UNAVAILABLE,
content: Some(Default::default()),
..Default::default()
};
let mut page = build("https://example.com", res);
page.content_truncated = true;
assert!(
page.needs_retry(),
"multiple retry signals still returns true"
);
}
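// is_chrome_error_page heuristics: minimum content size, document tail shape, the
// loadTimeDataRaw errorCode needle, and the trailing 4KB scan window.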
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_chrome_error_page_under_500_bytes() {
let short = b"<script>var loadTimeDataRaw = {\"errorCode\":\"ERR_FAIL\"};</script></html>";
assert!(
!is_chrome_error_page(short),
"content < 500 bytes should be rejected"
);
}
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_chrome_error_page_missing_tail() {
let padding = "x".repeat(1000);
let html = format!(
"<html><style>{padding}</style>\
<script>var loadTimeDataRaw = {{\"errorCode\":\"ERR_FAIL\"}};</script></body></html>"
);
assert!(
!is_chrome_error_page(html.as_bytes()),
"wrong tail (has </body>) should not match"
);
}
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_chrome_error_page_missing_error_code_needle() {
let padding = "x".repeat(1000);
let html = format!(
"<html><style>{padding}</style>\
<script>var loadTimeDataRaw = {{\"someKey\":\"value\"}};</script></html>"
);
assert!(
!is_chrome_error_page(html.as_bytes()),
"missing errorCode needle should not match"
);
}
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_chrome_error_page_trailing_whitespace() {
let padding = "x".repeat(1000);
let html = format!(
"<html><style>{padding}</style>\
<script>var loadTimeDataRaw = {{\"errorCode\":\"ERR_TUNNEL_CONNECTION_FAILED\"}};</script></html>\n\r\n "
);
assert!(
is_chrome_error_page(html.as_bytes()),
"trailing whitespace should be trimmed"
);
}
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_chrome_error_page_needle_outside_4kb_window() {
let error_part = r#"<script>var loadTimeDataRaw = {"errorCode":"ERR_FAIL"};</script>"#;
let padding = "x".repeat(5000);
let html = format!(
"<html>{error_part}<style>{padding}</style>\
<script>var more = {{}};</script></html>"
);
assert!(
!is_chrome_error_page(html.as_bytes()),
"needle outside last 4KB window should not match"
);
}
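// Page::get_timeout back-off hints per status code.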
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_get_timeout_rate_limit() {
let res = PageResponse {
status_code: StatusCode::TOO_MANY_REQUESTS,
content: None,
..Default::default()
};
let page = build("https://example.com", res);
let timeout = page.get_timeout();
assert_eq!(
timeout,
Some(std::time::Duration::from_millis(2_500)),
"429 → 2500ms"
);
}
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_get_timeout_gateway_timeout() {
let res = PageResponse {
status_code: StatusCode::GATEWAY_TIMEOUT,
content: None,
..Default::default()
};
let page = build("https://example.com", res);
let timeout = page.get_timeout();
assert_eq!(
timeout,
Some(std::time::Duration::from_millis(1_500)),
"504 → 1500ms"
);
}
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_get_timeout_proxy_errors() {
for code in [598u16, 599] {
let res = PageResponse {
status_code: StatusCode::from_u16(code).unwrap(),
content: None,
..Default::default()
};
let page = build("https://example.com", res);
let timeout = page.get_timeout();
assert_eq!(
timeout,
Some(std::time::Duration::from_millis(1_500)),
"{code} → 500ms"
);
}
}
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_get_timeout_normal_status_none() {
for code in [200u16, 301, 404, 500, 502, 503] {
let res = PageResponse {
status_code: StatusCode::from_u16(code).unwrap(),
content: Some(b"<html></html>".to_vec()),
..Default::default()
};
let page = build("https://example.com", res);
let timeout = page.get_timeout();
assert_eq!(timeout, None, "{code} should have no special timeout");
}
}
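// validate_empty: None, zero-length, and bare HTML shells count as empty; real content passes.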
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_validate_empty_none_content() {
assert!(!validate_empty(&None, true), "None content is empty");
assert!(
!validate_empty(&None, false),
"None content is empty regardless of success"
);
}
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_validate_empty_zero_length() {
assert!(!validate_empty(&Some(vec![]), true), "empty vec is empty");
assert!(
!validate_empty(&Some(vec![]), false),
"empty vec is empty regardless of success"
);
}
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_validate_empty_html_shell() {
let shell = b"<html><head></head><body></body></html>".to_vec();
assert!(
!validate_empty(&Some(shell), true),
"empty HTML shell should be rejected"
);
}
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_validate_empty_valid_content() {
let valid = b"<html><head><title>Test</title></head><body><p>Hello</p></body></html>".to_vec();
assert!(validate_empty(&Some(valid), true), "valid HTML should pass");
}
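// 401 retries only when proxy_configured is set; other client errors ignore the flag and
// server errors retry regardless of it.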
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_401_not_retried_without_proxy() {
let res = PageResponse {
status_code: StatusCode::UNAUTHORIZED,
content: Some(b"unauthorized".to_vec()),
..Default::default()
};
let page = build("https://example.com", res);
assert!(
!page.needs_retry(),
"401 without proxy_configured should NOT trigger retry"
);
}
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_401_retried_with_proxy() {
let res = PageResponse {
status_code: StatusCode::UNAUTHORIZED,
content: Some(b"unauthorized".to_vec()),
..Default::default()
};
let mut page = build("https://example.com", res);
page.proxy_configured = true;
assert!(
page.needs_retry(),
"401 with proxy_configured should trigger retry"
);
}
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_401_proxy_flag_does_not_affect_other_client_errors() {
let res = PageResponse {
status_code: StatusCode::NOT_FOUND,
content: Some(b"<html><body>Not Found</body></html>".to_vec()),
..Default::default()
};
let mut page = build("https://example.com/missing", res);
page.proxy_configured = true;
assert!(
!page.needs_retry(),
"404 should NOT be retried even with proxy"
);
}
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_needs_retry_server_error_regardless_of_proxy() {
let res = PageResponse {
status_code: StatusCode::INTERNAL_SERVER_ERROR,
content: Some(Default::default()),
..Default::default()
};
let page = build("https://example.com", res);
assert!(page.needs_retry(), "500 retried without proxy");
}
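// Trait adapters exposing Page internals through the crate's generic page interfaces.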
impl crate::traits::PageData for Page {
#[inline]
fn url(&self) -> &str {
self.get_url()
}
#[inline]
fn url_final(&self) -> &str {
match self.final_redirect_destination.as_deref() {
Some(u) => u,
_ => &self.url,
}
}
#[inline]
fn bytes(&self) -> Option<&[u8]> {
self.get_bytes()
}
#[inline]
fn html(&self) -> String {
self.get_html()
}
#[inline]
fn html_bytes_u8(&self) -> &[u8] {
self.get_html_bytes_u8()
}
#[inline]
fn status_code(&self) -> StatusCode {
self.status_code
}
#[inline]
fn headers(&self) -> Option<&reqwest::header::HeaderMap> {
self.headers.as_ref()
}
#[inline]
fn is_empty(&self) -> bool {
self.is_empty()
}
}
#[cfg(feature = "time")]
impl crate::traits::PageTimingExt for Page {
#[inline]
fn duration_elapsed(&self) -> tokio::time::Duration {
self.get_duration_elapsed()
}
}
#[cfg(feature = "chrome")]
impl crate::traits::PageChromeExt for Page {
#[inline]
fn chrome_page(&self) -> Option<&chromiumoxide::Page> {
self.get_chrome_page()
}
#[inline]
fn screenshot_bytes(&self) -> Option<&[u8]> {
self.screenshot_bytes.as_deref()
}
}
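// Reclassification of "empty success" responses (2xx with no usable body → 504) and
// spooled-content handling under the `balance` feature.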
#[cfg(all(test, not(feature = "decentralized")))]
mod empty_success_tests {
use super::*;
use crate::utils::PageResponse;
fn pr(status: u16, content: Option<Vec<u8>>) -> PageResponse {
PageResponse {
status_code: StatusCode::from_u16(status).unwrap(),
content,
..Default::default()
}
}
#[test]
fn empty_success_none_content_reclassified_to_504() {
let page = build("https://example.com", pr(200, None));
assert_eq!(page.status_code.as_u16(), 504);
assert!(page.should_retry, "must flag for retry strategy");
}
#[test]
fn empty_success_empty_vec_reclassified_to_504() {
let page = build("https://example.com", pr(200, Some(Vec::new())));
assert_eq!(page.status_code.as_u16(), 504);
assert!(page.should_retry);
}
#[test]
fn empty_success_shell_html_reclassified_to_504() {
let page = build(
"https://example.com",
pr(
200,
Some(b"<html><head></head><body></body></html>".to_vec()),
),
);
assert_eq!(page.status_code.as_u16(), 504);
assert!(page.should_retry);
}
#[test]
fn real_success_preserves_status_200() {
let html = b"<html><head><title>x</title></head><body><h1>hi</h1></body></html>";
let page = build("https://example.com", pr(200, Some(html.to_vec())));
assert_eq!(page.status_code.as_u16(), 200);
}
#[test]
fn not_found_empty_body_keeps_404() {
let page = build("https://example.com", pr(404, None));
assert_eq!(page.status_code.as_u16(), 404);
}
#[test]
fn truncated_success_preserves_status() {
let mut res = pr(200, Some(b"<html><body><p>partial".to_vec()));
res.content_truncated = true;
let page = build("https://example.com", res);
assert_eq!(page.status_code.as_u16(), 200);
}
#[test]
fn chrome_error_page_reclassified_to_599_not_504() {
let body = b"<html><head></head><body><div id=\"main-frame-error\"></div></body><script>loadTimeDataRaw = {\"errorCode\":\"net::ERR_GENERIC\"};</script></html>";
let page = build("https://example.com", pr(200, Some(body.to_vec())));
assert_ne!(page.status_code.as_u16(), 504);
}
#[test]
fn server_error_preserves_status() {
let page = build("https://example.com", pr(503, None));
assert_eq!(page.status_code.as_u16(), 503);
assert!(page.should_retry);
}
#[cfg(feature = "balance")]
#[test]
fn balance_spooled_content_preserves_status_200() {
use crate::utils::html_spool::{SpoolVitals, SpooledContent};
use std::path::PathBuf;
let mut response = pr(200, None);
response.content_spool = Some(SpooledContent {
path: PathBuf::from("/tmp/spider-test-spooled-page.html"),
vitals: SpoolVitals {
byte_len: 70_000,
is_valid_utf8: true,
is_xml: false,
binary_file: false,
},
..Default::default()
});
let page = build("https://example.com", response);
assert_eq!(
page.status_code.as_u16(),
200,
"spooled content must keep its 200 status — bytes are on disk, not empty"
);
}
#[test]
fn content_size_empty_response_returns_zero() {
assert_eq!(pr(200, None).content_size(), 0);
}
#[test]
fn content_size_in_memory_returns_buffer_len() {
let body = b"<html><body>hello</body></html>".to_vec();
let expected = body.len();
assert_eq!(pr(200, Some(body)).content_size(), expected);
}
#[cfg(feature = "balance")]
#[test]
fn content_size_spool_returns_vitals_byte_len() {
use crate::utils::html_spool::{SpoolVitals, SpooledContent};
use std::path::PathBuf;
let mut response = pr(200, None);
response.content_spool = Some(SpooledContent {
path: PathBuf::from("/tmp/spider-test-spool-size.html"),
vitals: SpoolVitals {
byte_len: 4_096,
is_valid_utf8: true,
is_xml: false,
binary_file: false,
},
..Default::default()
});
assert_eq!(response.content_size(), 4_096);
}
#[test]
fn has_content_bytes_empty_response_returns_false() {
assert!(!pr(200, None).has_content_bytes());
assert!(!pr(200, Some(Vec::new())).has_content_bytes());
}
#[test]
fn has_content_bytes_in_memory_returns_true() {
assert!(pr(200, Some(b"x".to_vec())).has_content_bytes());
}
#[cfg(feature = "balance")]
#[test]
fn has_content_bytes_spool_returns_true_even_with_empty_content() {
use crate::utils::html_spool::{SpoolVitals, SpooledContent};
use std::path::PathBuf;
let mut response = pr(200, None);
response.content_spool = Some(SpooledContent {
path: PathBuf::from("/tmp/spider-test-spool-presence.html"),
vitals: SpoolVitals {
byte_len: 100,
is_valid_utf8: true,
is_xml: false,
binary_file: false,
},
..Default::default()
});
assert!(response.has_content_bytes());
}
#[cfg(feature = "balance")]
#[test]
fn balance_spooled_content_does_not_set_should_retry() {
use crate::utils::html_spool::{SpoolVitals, SpooledContent};
use std::path::PathBuf;
let mut response = pr(200, None);
response.content_spool = Some(SpooledContent {
path: PathBuf::from("/tmp/spider-test-spool-no-retry.html"),
vitals: SpoolVitals {
byte_len: 50_000,
is_valid_utf8: true,
is_xml: false,
binary_file: false,
},
..Default::default()
});
let page = build("https://example.com", response);
assert!(
!page.should_retry,
"spooled real content on 2xx must not be flagged for retry"
);
}
#[cfg(feature = "balance")]
#[test]
fn balance_spool_populates_content_byte_len_from_vitals() {
use crate::utils::html_spool::{SpoolVitals, SpooledContent};
use std::path::PathBuf;
let mut response = pr(200, None);
response.content_spool = Some(SpooledContent {
path: PathBuf::from("/tmp/spider-test-spooled-page-vitals.html"),
vitals: SpoolVitals {
byte_len: 12_345,
is_valid_utf8: true,
is_xml: false,
binary_file: false,
},
..Default::default()
});
let page = build("https://example.com", response);
assert_eq!(
page.content_byte_len, 12_345,
"content_byte_len must come from spool vitals — never re-read disk"
);
}
}