use std::time::{Duration, Instant};
use crate::{
challenge::{ChallengeVerdict, EngineClass, engine_classify},
dom::Dom,
host::EngineHandle,
net::HttpClient,
stealth::StealthProfile,
};
/// Wall-clock budget handed to the navigation loop by `Page::navigate` and
/// `Page::navigate_with_solvers`; the loop checks it at the top of each iteration.
const DEFAULT_NAV_BUDGET: Duration = Duration::from_secs(15);
/// Default upper bound on challenge-handling iterations per navigation
/// (the loop raises a value of 0 to 1).
const DEFAULT_MAX_ITERATIONS: u8 = 3;
/// A loaded document: the raw HTML together with its parsed DOM, extracted
/// title and challenge classification.
#[derive(Debug)]
pub struct Page {
/// Engine handle passed to `Page::new` or created via `EngineHandle::new()`;
/// not read by any method visible in this file.
engine: EngineHandle,
/// DOM produced by `crate::html_parser::parse_html` (empty for a fresh page).
dom: Dom,
/// URL the current content is attributed to; `"about:blank"` until set.
url: String,
/// Text of the document's `<title>` element, or empty when none was found.
title: String,
/// Raw HTML source of the current document.
html: String,
/// Result of `engine_classify` for the current HTML.
challenge_class: EngineClass,
/// Stealth profile associated with the page, if any.
profile: Option<StealthProfile>,
}
impl Page {
/// Creates an empty page bound to `engine`.
///
/// The page starts at `about:blank` with an empty DOM, title and HTML, a
/// passing `"L3-RENDERED"` classification of length 0, and no profile.
pub fn new(engine: EngineHandle) -> Self {
    // Placeholder classification used until real content is loaded.
    let challenge_class = EngineClass {
        tag: "L3-RENDERED",
        verdict: ChallengeVerdict::Pass,
        len: 0,
    };
    Self {
        engine,
        dom: Dom::new(),
        url: String::from("about:blank"),
        title: String::new(),
        html: String::new(),
        challenge_class,
        profile: None,
    }
}
/// Builds a page directly from an HTML string, without any network access.
///
/// The HTML is parsed into a DOM, its `<title>` is extracted and the
/// document is classified with `engine_classify`. The page URL is
/// `about:blank` and a fresh `EngineHandle` is created.
///
/// `profile` is stored on the page. It was previously accepted and then
/// silently dropped, leaving the page's profile unset no matter what the
/// caller passed.
///
/// # Errors
///
/// Never fails in the code visible here; the `Result` is part of the
/// established signature.
pub async fn from_html(
    html: &str,
    profile: Option<StealthProfile>,
) -> Result<Self, PageError> {
    let dom = crate::html_parser::parse_html(html);
    let title = extract_title(html);
    let challenge_class = engine_classify(html);
    Ok(Self {
        engine: EngineHandle::new(),
        dom,
        url: "about:blank".to_string(),
        title,
        html: html.to_string(),
        challenge_class,
        profile,
    })
}
/// Builds a page from an HTML string attributed to `url`, carrying the
/// given stealth profile.
///
/// The HTML is parsed into a DOM, its `<title>` is extracted and the
/// document is classified with `engine_classify`. A fresh `EngineHandle`
/// is created.
///
/// `profile` is stored on the page. It was previously discarded, so a page
/// built "with a profile" ended up with none.
///
/// # Errors
///
/// Never fails in the code visible here; the `Result` is part of the
/// established signature.
pub async fn with_profile(
    html: &str,
    url: &str,
    profile: StealthProfile,
) -> Result<Self, PageError> {
    let dom = crate::html_parser::parse_html(html);
    let title = extract_title(html);
    let challenge_class = engine_classify(html);
    Ok(Self {
        engine: EngineHandle::new(),
        dom,
        url: url.to_string(),
        title,
        html: html.to_string(),
        challenge_class,
        profile: Some(profile),
    })
}
/// Replaces the page's content in place with `html`, attributed to `url`.
///
/// Re-parses the DOM, re-extracts the title and re-runs the challenge
/// classification; the engine handle and profile are left untouched.
pub fn reload_html(&mut self, html: &str, url: &str) {
    // Derive everything from the new document first, then commit.
    let parsed = crate::html_parser::parse_html(html);
    let new_title = extract_title(html);
    let classification = engine_classify(html);

    self.dom = parsed;
    self.url = url.to_owned();
    self.html = html.to_owned();
    self.title = new_title;
    self.challenge_class = classification;
}
/// Navigates to `url` with the shared Chrome-profile HTTP client, the
/// default iteration cap and the default time budget, using no solvers.
///
/// # Errors
///
/// Returns `PageError::Net` if the shared client cannot be obtained or a
/// fetch fails.
pub async fn navigate(&mut self, url: &str) -> Result<(), PageError> {
    // `PageError: From<NetError>` (via `#[from]`) performs the same wrapping
    // as an explicit `map_err(PageError::Net)`.
    let shared_client = HttpClient::shared(hpx::BrowserProfile::Chrome)?;
    self.navigate_inner(
        url,
        &shared_client,
        DEFAULT_MAX_ITERATIONS,
        DEFAULT_NAV_BUDGET,
    )
    .await
}
/// Navigates to `url` like `navigate`, but offers every detected challenge
/// to the given `solvers`.
///
/// Uses the shared Chrome-profile HTTP client together with the default
/// iteration cap and time budget.
///
/// # Errors
///
/// Returns `PageError::Net` if the shared client cannot be obtained or a
/// fetch fails.
pub async fn navigate_with_solvers(
    &mut self,
    url: &str,
    solvers: &[&dyn crate::challenge::ChallengeSolver],
) -> Result<(), PageError> {
    // `PageError: From<NetError>` (via `#[from]`) performs the same wrapping
    // as an explicit `map_err(PageError::Net)`.
    let shared_client = HttpClient::shared(hpx::BrowserProfile::Chrome)?;
    let (max_iterations, budget) = (DEFAULT_MAX_ITERATIONS, DEFAULT_NAV_BUDGET);
    self.navigate_with_solvers_inner(url, &shared_client, solvers, max_iterations, budget)
        .await
}
/// Fetches `url` once with the shared Chrome-profile client and loads the
/// response into this page, with no challenge handling or retry loop.
///
/// The page URL becomes the response's URL rather than the requested one.
///
/// # Errors
///
/// Returns `PageError::Net` if the shared client cannot be obtained or the
/// fetch fails.
pub async fn navigate_warm(&mut self, url: &str) -> Result<(), PageError> {
    // `?` wraps `NetError` in `PageError::Net` through the `#[from]` impl.
    let client = HttpClient::shared(hpx::BrowserProfile::Chrome)?;
    let response = client.get_follow(url, 10).await?;
    let body = response.text();
    self.reload_html(&body, &response.url);
    Ok(())
}
async fn navigate_inner(
&mut self,
url: &str,
client: &HttpClient,
max_iterations: u8,
budget: Duration,
) -> Result<(), PageError> {
self.navigate_with_solvers_inner(url, client, &[], max_iterations, budget)
.await
}
/// Core navigation loop behind `navigate` and `navigate_with_solvers`.
///
/// Fetches `url` and loads the response into the page. While the loaded
/// document classifies as a challenge, each solver that reports it can
/// handle the challenge kind is run. The page is then re-fetched when
/// either a solver reported `Solved`, or (on any iteration but the last)
/// the cookies for the current URL changed to a non-empty value since the
/// previous fetch.
///
/// * `max_iterations` - upper bound on loop iterations; 0 is treated as 1.
/// * `budget` - wall-clock limit, checked at the top of every iteration.
///
/// Every fetched response is loaded into the page immediately. Previously a
/// response could be fetched and then dropped when the loop ended right
/// after it (budget exhausted, or a solver-triggered re-fetch on the final
/// iteration), leaving the page on stale content while `Ok(())` was returned.
///
/// Returns `Ok(())` even when the final document is still a challenge;
/// callers can inspect `challenge_verdict()`.
///
/// # Errors
///
/// Returns `PageError::Net` when any fetch fails.
async fn navigate_with_solvers_inner(
    &mut self,
    url: &str,
    client: &HttpClient,
    solvers: &[&dyn crate::challenge::ChallengeSolver],
    max_iterations: u8,
    budget: Duration,
) -> Result<(), PageError> {
    let t0 = Instant::now();
    // Always evaluate the first response at least once.
    let iterations = max_iterations.max(1);

    // NOTE(review): the `10` is presumably the redirect limit — confirm
    // against `HttpClient::get_follow`.
    let resp = client.get_follow(url, 10).await.map_err(PageError::Net)?;
    let html = resp.text();
    let mut current_url = resp.url.clone();
    self.reload_html(&html, &current_url);
    let mut cookies_before = cookie_snapshot(client, &current_url).await;

    for iter in 0..iterations {
        if t0.elapsed() >= budget {
            tracing::warn!(
                iter,
                elapsed_ms = t0.elapsed().as_millis(),
                "navigate budget exhausted"
            );
            break;
        }

        // `reload_html` has already classified the current document, so the
        // stored classification is reused instead of classifying it again.
        if !self.challenge_class.verdict.is_challenge() {
            return Ok(());
        }
        let kind = tag_to_kind(self.challenge_class.tag);

        // Run every capable solver; one success is enough to warrant a retry.
        let mut any_solved = false;
        for solver in solvers {
            if !solver.can_handle(&kind) {
                continue;
            }
            if matches!(
                solver.solve(&kind, self).await,
                crate::challenge::SolveOutcome::Solved
            ) {
                any_solved = true;
            }
        }

        let should_retry = if any_solved {
            true
        } else if iter + 1 < iterations {
            // No solver succeeded: retry only if the cookie jar changed,
            // and only while another iteration is left to inspect the result.
            let cookies_after = cookie_snapshot(client, &current_url).await;
            let changed = cookies_after != cookies_before && !cookies_after.is_empty();
            if changed {
                tracing::info!(iter, "cookie delta detected — retrying navigation");
            }
            changed
        } else {
            false
        };
        if !should_retry {
            break;
        }

        let resp = client
            .get_follow(&current_url, 10)
            .await
            .map_err(PageError::Net)?;
        let html = resp.text();
        current_url = resp.url.clone();
        self.reload_html(&html, &current_url);
        cookies_before = cookie_snapshot(client, &current_url).await;
    }
    Ok(())
}
/// Script evaluation is unavailable in this build: `_script` is ignored and
/// the call always fails.
///
/// # Errors
///
/// Always returns `PageError::Evaluation`.
pub async fn evaluate_async(&mut self, _script: &str) -> Result<serde_json::Value, PageError> {
    let reason = String::from("evaluate_async requires v8 feature");
    Err(PageError::Evaluation(reason))
}
/// Synchronous evaluation stub: `_script` is ignored and the string
/// `"undefined"` is always returned.
pub fn evaluate(&mut self, _script: &str) -> Result<String, PageError> {
    let result = String::from("undefined");
    Ok(result)
}
/// Async counterpart of `title`: returns a copy of the stored title and
/// never fails.
pub async fn title_async(&self) -> Result<String, PageError> {
    let title = self.title.as_str().to_owned();
    Ok(title)
}
/// Returns a copy of the document title (empty when the HTML had none).
pub fn title(&self) -> String {
    self.title.as_str().to_owned()
}
/// Returns the URL the current content is attributed to.
pub fn url(&self) -> &str {
    self.url.as_str()
}
/// Returns a copy of the raw HTML currently loaded in the page.
pub fn content(&self) -> String {
    self.html.as_str().to_owned()
}
/// Returns the text content of the whole document, as computed by the DOM
/// from its document node. Never fails in the code visible here.
pub async fn text_content(&self) -> Result<String, PageError> {
    let root = crate::dom::NodeId::DOCUMENT;
    Ok(self.dom.text_content(root))
}
/// Stub: `_selector` is ignored and an empty string is always returned,
/// whether or not a matching element exists.
pub async fn text_of(&self, _selector: &str) -> Result<String, PageError> {
Ok(String::new())
}
/// Stub: `_selector` is ignored and `false` is always returned, so this
/// cannot be used to test for element presence in this build.
pub fn has_element(&self, _selector: &str) -> bool {
false
}
/// Returns the verdict from the most recent classification of the page's HTML.
pub fn challenge_verdict(&self) -> ChallengeVerdict {
self.challenge_class.verdict
}
/// Returns the full classification record (tag, verdict, length) for the
/// page's current HTML.
pub fn engine_class(&self) -> &EngineClass {
&self.challenge_class
}
/// Borrows the DOM parsed from the page's current HTML.
pub fn dom(&self) -> &Dom {
&self.dom
}
}
/// Maps a classifier tag to a `ChallengeKind`.
///
/// The vendor is derived from the tag: `"datadome"` is matched as a whole,
/// ignoring ASCII case; every other vendor is recognised by a case-sensitive
/// prefix. Unrecognised tags get the vendor `"unknown"`. The tag itself is
/// always passed through as the sub-kind.
fn tag_to_kind(tag: &'static str) -> crate::challenge::ChallengeKind {
    // (tag prefix, vendor). No prefix is a prefix of another, so at most one
    // entry can match and the order is irrelevant.
    const PREFIX_VENDORS: [(&str, &str); 8] = [
        ("cf-", "cloudflare"),
        ("AWS-WAF", "aws-waf"),
        ("akamai", "akamai"),
        ("px-", "perimeterx"),
        ("PXC", "perimeterx"),
        ("kasada", "kasada"),
        ("sec-cpt", "sec-cpt"),
        ("hcaptcha", "hcaptcha"),
    ];

    let vendor: &'static str = if tag.eq_ignore_ascii_case("datadome") {
        "datadome"
    } else {
        PREFIX_VENDORS
            .iter()
            .find(|&&(prefix, _)| tag.starts_with(prefix))
            .map_or("unknown", |&(_, vendor)| vendor)
    };
    crate::challenge::ChallengeKind::new(vendor, tag)
}
/// Returns the client's cookies for `url` as a single string.
///
/// Best-effort: an unparsable URL or a failed cookie lookup both yield an
/// empty string instead of an error.
async fn cookie_snapshot(client: &HttpClient, url: &str) -> String {
    match url::Url::parse(url) {
        Ok(target) => client.cookies_for_url(&target).await.unwrap_or_default(),
        Err(_) => String::new(),
    }
}
/// Extracts the trimmed text of the first `<title>` element in `html`.
///
/// Tag names are matched ASCII-case-insensitively and attributes on the
/// opening tag are skipped. Returns an empty string when there is no opening
/// tag, the opening tag is never closed with `>`, or no `</title>` follows.
///
/// Offsets are located in an ASCII-lowercased copy of the input. ASCII
/// lowercasing never changes byte lengths, so those offsets are valid in the
/// original string. Full Unicode lowercasing can shrink or grow characters
/// (for example U+212A or U+0130), which previously made the offsets point
/// at the wrong bytes and could panic on a non-boundary slice.
fn extract_title(html: &str) -> String {
    /// Locates the raw (untrimmed) title text inside `html`.
    fn title_body(html: &str) -> Option<&str> {
        let lower = html.to_ascii_lowercase();
        let open = lower.find("<title")?;
        // Skip past the end of the opening tag, including any attributes.
        let body_start = open + lower[open..].find('>')? + 1;
        let body_len = lower[body_start..].find("</title>")?;
        html.get(body_start..body_start + body_len)
    }

    title_body(html)
        .map(|text| text.trim().to_string())
        .unwrap_or_default()
}
/// Errors returned by `Page` operations.
#[derive(Debug, thiserror::Error)]
pub enum PageError {
/// Navigation failed for the described reason.
/// NOTE(review): not constructed by any code visible in this file.
#[error("navigation failed: {0}")]
Navigation(String),
/// Script evaluation failed or is unavailable; carries the reason.
#[error("evaluation failed: {0}")]
Evaluation(String),
/// A requested element does not exist.
/// NOTE(review): not constructed by any code visible in this file.
#[error("element not found")]
ElementNotFound,
/// An operation required a loaded page.
/// NOTE(review): not constructed by any code visible in this file.
#[error("page not loaded")]
NotLoaded,
/// An HTTP-layer failure; converts automatically from `NetError` via `?`.
#[error("network error: {0}")]
Net(#[from] crate::net::NetError),
}
// Unit tests for `Page` construction, classification and title extraction.
// None of them touch the network: pages are built from inline HTML.
#[cfg(test)]
mod tests {
use super::*;
// A large, ordinary document keeps its title and content and classifies as `Pass`.
#[tokio::test]
async fn bdd_navigate_to_clean_page() {
let mut body = String::from("Hello World. ");
// Pad the body so the document is large.
// NOTE(review): presumably needed to clear the classifier's thin-page threshold — confirm.
for _ in 0..500 {
body.push_str("This is real rendered content for the test page. ");
}
let html = format!(
r#"<!DOCTYPE html>
<html>
<head><title>Test Page</title></head>
<body>{body}</body>
</html>"#
);
let page = Page::from_html(&html, None).await.unwrap();
assert_eq!(page.title(), "Test Page");
assert!(page.content().contains("Hello World"));
assert_eq!(page.challenge_verdict(), ChallengeVerdict::Pass);
}
// A short page carrying `_cf_chl_opt` is expected to classify as `EdgeBlock`,
// which counts as a challenge.
#[tokio::test]
async fn bdd_navigate_with_challenge_detection() {
let html = r#"<!DOCTYPE html>
<html>
<head><title>Just a moment...</title></head>
<body>
<script>window._cf_chl_opt={cvId:'3',cType:'managed'};</script>
Checking your browser before accessing the site...
</body>
</html>"#;
let page = Page::from_html(html, None).await.unwrap();
assert_eq!(page.challenge_verdict(), ChallengeVerdict::EdgeBlock);
assert!(page.challenge_verdict().is_challenge());
}
// The same `_cf_chl_opt` marker on a document of at least 50,000 bytes is
// expected to classify as `ChallengeIncomplete` instead.
#[tokio::test]
async fn bdd_challenge_incomplete_verdict() {
let mut html = String::from(
r#"<html><head><title>Just a moment...</title></head><body>
<script>window._cf_chl_opt={cvId:'3',cType:'managed'};</script>"#,
);
for _ in 0..2000 {
html.push_str("<div>cf challenge orchestrator shell padding</div>");
}
html.push_str("</body></html>");
// Guard the fixture size this test relies on.
assert!(html.len() >= 50_000);
let page = Page::from_html(&html, None).await.unwrap();
assert_eq!(
page.challenge_verdict(),
ChallengeVerdict::ChallengeIncomplete
);
assert!(page.challenge_verdict().is_challenge());
}
// A document of at least 15,000 bytes with no challenge markers passes.
#[tokio::test]
async fn bdd_clean_page_passes() {
let mut html = String::from("<html><body>");
for _ in 0..400 {
html.push_str("<p>Normal rendered content paragraph with enough text.</p>");
}
html.push_str("</body></html>");
// Guard the fixture size this test relies on.
assert!(html.len() >= 15_000);
let page = Page::from_html(&html, None).await.unwrap();
assert_eq!(page.challenge_verdict(), ChallengeVerdict::Pass);
assert!(!page.challenge_verdict().is_challenge());
}
// `reload_html` swaps title, content and URL on an existing page.
#[tokio::test]
async fn bdd_warm_reuse_reloads_html() {
let html1 =
r#"<!DOCTYPE html><html><head><title>First</title></head><body>Page One</body></html>"#;
let html2 = r#"<!DOCTYPE html><html><head><title>Second</title></head><body>Page Two</body></html>"#;
let mut page = Page::from_html(html1, None).await.unwrap();
assert_eq!(page.title(), "First");
assert!(page.content().contains("Page One"));
page.reload_html(html2, "https://example.com/second");
assert_eq!(page.title(), "Second");
assert!(page.content().contains("Page Two"));
assert_eq!(page.url(), "https://example.com/second");
}
// A tiny document is `RenderIncomplete`, which is not treated as a challenge.
#[tokio::test]
async fn bdd_thin_body_render_incomplete() {
let html = "<html><body>tiny</body></html>";
let page = Page::from_html(html, None).await.unwrap();
assert_eq!(page.challenge_verdict(), ChallengeVerdict::RenderIncomplete);
assert!(!page.challenge_verdict().is_challenge());
}
// A captcha-delivery script plus `ddcaptchaencoded` is detected as a challenge.
#[tokio::test]
async fn bdd_datadome_interstitial() {
let html = r#"<script src="https://geo.captcha-delivery.com/captcha.js"></script>
<div id="ddcaptchaencoded">encoded_payload</div>"#;
let page = Page::from_html(html, None).await.unwrap();
assert!(page.challenge_verdict().is_challenge());
}
// `gokuProps` plus the awswaf challenge script is detected as a challenge.
#[tokio::test]
async fn bdd_awswaf_challenge() {
let html = r#"<html><body>
<script>window.gokuProps={key:'a',context:'b',iv:'c'};</script>
<script>window.awsWafCookieDomainList=["example.com"];</script>
<script src="https://x.token.awswaf.com/challenge.js"></script>
<script>AwsWafIntegration.checkForceRefresh();</script>
</body></html>"#;
let page = Page::from_html(html, None).await.unwrap();
assert!(page.challenge_verdict().is_challenge());
}
// `extract_title` returns the text between the title tags.
#[test]
fn extract_title_basic() {
assert_eq!(
extract_title("<html><head><title>Hello</title></head></html>"),
"Hello"
);
}
// A document without a title yields an empty string.
#[test]
fn extract_title_empty() {
assert_eq!(extract_title("<html><body></body></html>"), "");
}
// Upper-case tag names are matched as well.
#[test]
fn extract_title_case_insensitive() {
assert_eq!(
extract_title("<HTML><HEAD><TITLE>Test</TITLE></HEAD></HTML>"),
"Test"
);
}
}