1use std::time::{Duration, Instant};
4
5use crate::{
6 challenge::{ChallengeVerdict, EngineClass, engine_classify},
7 dom::Dom,
8 host::EngineHandle,
9 net::HttpClient,
10 stealth::StealthProfile,
11};
12
/// Wall-clock budget handed to the navigation loop by the public `navigate*` entry points.
const DEFAULT_NAV_BUDGET: Duration = Duration::from_secs(15);
/// Upper bound on classify/solve/re-fetch passes performed by a single navigation.
const DEFAULT_MAX_ITERATIONS: u8 = 3;
17
18#[derive(Debug)]
20pub struct Page {
21 engine: EngineHandle,
22 dom: Dom,
23 url: String,
24 title: String,
25 html: String,
26 challenge_class: EngineClass,
27 profile: Option<StealthProfile>,
28}
29
30impl Page {
31 pub fn new(engine: EngineHandle) -> Self {
32 Self {
33 engine,
34 dom: Dom::new(),
35 url: "about:blank".to_string(),
36 title: String::new(),
37 html: String::new(),
38 challenge_class: EngineClass {
39 tag: "L3-RENDERED",
40 verdict: ChallengeVerdict::Pass,
41 len: 0,
42 },
43 profile: None,
44 }
45 }
46
47 pub async fn from_html(
49 html: &str,
50 _profile: Option<StealthProfile>,
51 ) -> Result<Self, PageError> {
52 let dom = crate::html_parser::parse_html(html);
53 let title = extract_title(html);
54 let challenge_class = engine_classify(html);
55 Ok(Self {
56 engine: EngineHandle::new(),
57 dom,
58 url: "about:blank".to_string(),
59 title,
60 html: html.to_string(),
61 challenge_class,
62 profile: None,
63 })
64 }
65
66 pub async fn with_profile(
68 html: &str,
69 url: &str,
70 _profile: StealthProfile,
71 ) -> Result<Self, PageError> {
72 let dom = crate::html_parser::parse_html(html);
73 let title = extract_title(html);
74 let challenge_class = engine_classify(html);
75 Ok(Self {
76 engine: EngineHandle::new(),
77 dom,
78 url: url.to_string(),
79 title,
80 html: html.to_string(),
81 challenge_class,
82 profile: None,
83 })
84 }
85
86 pub fn reload_html(&mut self, html: &str, url: &str) {
88 self.dom = crate::html_parser::parse_html(html);
89 self.url = url.to_string();
90 self.html = html.to_string();
91 self.title = extract_title(html);
92 self.challenge_class = engine_classify(html);
93 }
94
95 pub async fn navigate(&mut self, url: &str) -> Result<(), PageError> {
100 let client = HttpClient::shared(hpx::BrowserProfile::Chrome).map_err(PageError::Net)?;
102 self.navigate_inner(url, &client, DEFAULT_MAX_ITERATIONS, DEFAULT_NAV_BUDGET)
103 .await
104 }
105
106 pub async fn navigate_with_solvers(
110 &mut self,
111 url: &str,
112 solvers: &[&dyn crate::challenge::ChallengeSolver],
113 ) -> Result<(), PageError> {
114 let client = HttpClient::shared(hpx::BrowserProfile::Chrome).map_err(PageError::Net)?;
115 self.navigate_with_solvers_inner(
116 url,
117 &client,
118 solvers,
119 DEFAULT_MAX_ITERATIONS,
120 DEFAULT_NAV_BUDGET,
121 )
122 .await
123 }
124
125 pub async fn navigate_warm(&mut self, url: &str) -> Result<(), PageError> {
129 let client = HttpClient::shared(hpx::BrowserProfile::Chrome).map_err(PageError::Net)?;
130 let resp = client.get_follow(url, 10).await.map_err(PageError::Net)?;
131 let html = resp.text();
132 let resp_url = resp.url.clone();
133
134 self.reload_html(&html, &resp_url);
135 Ok(())
136 }
137
138 async fn navigate_inner(
140 &mut self,
141 url: &str,
142 client: &HttpClient,
143 max_iterations: u8,
144 budget: Duration,
145 ) -> Result<(), PageError> {
146 self.navigate_with_solvers_inner(url, client, &[], max_iterations, budget)
147 .await
148 }
149
150 async fn navigate_with_solvers_inner(
152 &mut self,
153 url: &str,
154 client: &HttpClient,
155 solvers: &[&dyn crate::challenge::ChallengeSolver],
156 max_iterations: u8,
157 budget: Duration,
158 ) -> Result<(), PageError> {
159 let t0 = Instant::now();
160 let iterations = max_iterations.max(1);
161
162 let resp = client.get_follow(url, 10).await.map_err(PageError::Net)?;
163 let mut current_html = resp.text();
164 let mut current_url = resp.url.clone();
165 let mut cookies_before = cookie_snapshot(client, ¤t_url).await;
166
167 for iter in 0..iterations {
168 if t0.elapsed() >= budget {
169 tracing::warn!(
170 iter,
171 elapsed_ms = t0.elapsed().as_millis(),
172 "navigate budget exhausted"
173 );
174 break;
175 }
176
177 self.reload_html(¤t_html, ¤t_url);
178
179 let challenge = engine_classify(¤t_html);
180
181 if !challenge.verdict.is_challenge() {
183 return Ok(());
184 }
185
186 let kind = tag_to_kind(challenge.tag);
188 let mut any_solved = false;
189 for solver in solvers {
190 if !solver.can_handle(&kind) {
191 continue;
192 }
193 if matches!(
194 solver.solve(&kind, self).await,
195 crate::challenge::SolveOutcome::Solved
196 ) {
197 any_solved = true;
198 }
199 }
200
201 if any_solved {
202 let resp = client
204 .get_follow(¤t_url, 10)
205 .await
206 .map_err(PageError::Net)?;
207 current_html = resp.text();
208 current_url = resp.url.clone();
209 cookies_before = cookie_snapshot(client, ¤t_url).await;
210 continue;
211 }
212
213 if iter + 1 < iterations {
216 let cookies_after = cookie_snapshot(client, ¤t_url).await;
217 if cookies_after != cookies_before && !cookies_after.is_empty() {
218 tracing::info!(iter, "cookie delta detected — retrying navigation");
219 let resp = client
220 .get_follow(¤t_url, 10)
221 .await
222 .map_err(PageError::Net)?;
223 current_html = resp.text();
224 current_url = resp.url.clone();
225 cookies_before = cookie_snapshot(client, ¤t_url).await;
226 continue;
227 }
228 }
229
230 break;
232 }
233
234 Ok(())
235 }
236
237 pub async fn evaluate_async(&mut self, _script: &str) -> Result<serde_json::Value, PageError> {
238 Err(PageError::Evaluation(
239 "evaluate_async requires v8 feature".into(),
240 ))
241 }
242
243 pub fn evaluate(&mut self, _script: &str) -> Result<String, PageError> {
245 Ok("undefined".to_string())
246 }
247
248 pub async fn title_async(&self) -> Result<String, PageError> {
249 Ok(self.title.clone())
250 }
251
252 pub fn title(&self) -> String {
254 self.title.clone()
255 }
256
257 pub fn url(&self) -> &str {
259 &self.url
260 }
261
262 pub fn content(&self) -> String {
264 self.html.clone()
265 }
266
267 pub async fn text_content(&self) -> Result<String, PageError> {
268 Ok(self.dom.text_content(crate::dom::NodeId::DOCUMENT))
269 }
270
271 pub async fn text_of(&self, _selector: &str) -> Result<String, PageError> {
272 Ok(String::new())
273 }
274
275 pub fn has_element(&self, _selector: &str) -> bool {
277 false
278 }
279
280 pub fn challenge_verdict(&self) -> ChallengeVerdict {
282 self.challenge_class.verdict
283 }
284
285 pub fn engine_class(&self) -> &EngineClass {
287 &self.challenge_class
288 }
289
290 pub fn dom(&self) -> &Dom {
291 &self.dom
292 }
293}
294
295fn tag_to_kind(tag: &'static str) -> crate::challenge::ChallengeKind {
297 let (vendor, sub_kind): (&'static str, &'static str) = if tag.starts_with("cf-") {
298 ("cloudflare", tag)
299 } else if tag.starts_with("AWS-WAF") {
300 ("aws-waf", tag)
301 } else if tag.eq_ignore_ascii_case("datadome") {
302 ("datadome", tag)
303 } else if tag.starts_with("akamai") {
304 ("akamai", tag)
305 } else if tag.starts_with("px-") || tag.starts_with("PXC") {
306 ("perimeterx", tag)
307 } else if tag.starts_with("kasada") {
308 ("kasada", tag)
309 } else if tag.starts_with("sec-cpt") {
310 ("sec-cpt", tag)
311 } else if tag.starts_with("hcaptcha") {
312 ("hcaptcha", tag)
313 } else {
314 ("unknown", tag)
315 };
316 crate::challenge::ChallengeKind::new(vendor, sub_kind)
317}
318
319async fn cookie_snapshot(client: &HttpClient, url: &str) -> String {
321 if let Ok(parsed) = url::Url::parse(url) {
322 client.cookies_for_url(&parsed).await.unwrap_or_default()
323 } else {
324 String::new()
325 }
326}
327
/// Extracts the trimmed text of the first `<title>` element in `html`.
///
/// Tag matching is ASCII case-insensitive and tolerates attributes on the
/// opening tag. Returns an empty string when there is no opening tag, the
/// opening tag is never closed with `>`, or no `</title>` follows it.
fn extract_title(html: &str) -> String {
    // ASCII-only lowercasing never changes byte lengths, so offsets found in
    // `lower` are valid (and on char boundaries) in `html`. Full Unicode
    // lowercasing can shrink or grow characters and would misalign the two.
    let lower = html.to_ascii_lowercase();

    let open = match lower.find("<title") {
        Some(pos) => pos,
        None => return String::new(),
    };
    let body_start = match lower[open..].find('>') {
        Some(gt) => open + gt + 1,
        None => return String::new(),
    };

    match lower[body_start..].find("</title>") {
        Some(len) => html[body_start..body_start + len].trim().to_string(),
        None => String::new(),
    }
}
342
343#[derive(Debug, thiserror::Error)]
344pub enum PageError {
345 #[error("navigation failed: {0}")]
346 Navigation(String),
347 #[error("evaluation failed: {0}")]
348 Evaluation(String),
349 #[error("element not found")]
350 ElementNotFound,
351 #[error("page not loaded")]
352 NotLoaded,
353 #[error("network error: {0}")]
354 Net(#[from] crate::net::NetError),
355}
356
#[cfg(test)]
mod tests {
    use super::*;

    /// A large ordinary document keeps its title and content and passes
    /// classification.
    #[tokio::test]
    async fn bdd_navigate_to_clean_page() {
        let filler = "This is real rendered content for the test page. ".repeat(500);
        let body = format!("Hello World. {filler}");
        let html = format!(
            r#"<!DOCTYPE html>
<html>
<head><title>Test Page</title></head>
<body>{body}</body>
</html>"#
        );

        let page = Page::from_html(&html, None).await.unwrap();

        assert_eq!(page.title(), "Test Page");
        assert!(page.content().contains("Hello World"));
        assert_eq!(page.challenge_verdict(), ChallengeVerdict::Pass);
    }

    /// A short Cloudflare interstitial is reported as an edge block.
    #[tokio::test]
    async fn bdd_navigate_with_challenge_detection() {
        let interstitial = r#"<!DOCTYPE html>
<html>
<head><title>Just a moment...</title></head>
<body>
<script>window._cf_chl_opt={cvId:'3',cType:'managed'};</script>
Checking your browser before accessing the site...
</body>
</html>"#;

        let page = Page::from_html(interstitial, None).await.unwrap();
        let verdict = page.challenge_verdict();

        assert_eq!(verdict, ChallengeVerdict::EdgeBlock);
        assert!(verdict.is_challenge());
    }

    /// A Cloudflare challenge shell padded past 50 kB is reported as an
    /// incomplete challenge.
    #[tokio::test]
    async fn bdd_challenge_incomplete_verdict() {
        let mut html = String::from(
            r#"<html><head><title>Just a moment...</title></head><body>
 <script>window._cf_chl_opt={cvId:'3',cType:'managed'};</script>"#,
        );
        html.push_str(&"<div>cf challenge orchestrator shell padding</div>".repeat(2000));
        html.push_str("</body></html>");
        assert!(html.len() >= 50_000);

        let page = Page::from_html(&html, None).await.unwrap();
        let verdict = page.challenge_verdict();

        assert_eq!(verdict, ChallengeVerdict::ChallengeIncomplete);
        assert!(verdict.is_challenge());
    }

    /// A sizeable document without challenge markers passes.
    #[tokio::test]
    async fn bdd_clean_page_passes() {
        let paragraphs = "<p>Normal rendered content paragraph with enough text.</p>".repeat(400);
        let html = format!("<html><body>{paragraphs}</body></html>");
        assert!(html.len() >= 15_000);

        let page = Page::from_html(&html, None).await.unwrap();
        let verdict = page.challenge_verdict();

        assert_eq!(verdict, ChallengeVerdict::Pass);
        assert!(!verdict.is_challenge());
    }

    /// `reload_html` swaps title, content and URL on an existing page.
    #[tokio::test]
    async fn bdd_warm_reuse_reloads_html() {
        let first =
            r#"<!DOCTYPE html><html><head><title>First</title></head><body>Page One</body></html>"#;
        let second = r#"<!DOCTYPE html><html><head><title>Second</title></head><body>Page Two</body></html>"#;

        let mut page = Page::from_html(first, None).await.unwrap();
        assert_eq!(page.title(), "First");
        assert!(page.content().contains("Page One"));

        page.reload_html(second, "https://example.com/second");

        assert_eq!(page.title(), "Second");
        assert!(page.content().contains("Page Two"));
        assert_eq!(page.url(), "https://example.com/second");
    }

    /// A tiny body is flagged as an incomplete render, which is not a
    /// challenge.
    #[tokio::test]
    async fn bdd_thin_body_render_incomplete() {
        let page = Page::from_html("<html><body>tiny</body></html>", None)
            .await
            .unwrap();
        let verdict = page.challenge_verdict();

        assert_eq!(verdict, ChallengeVerdict::RenderIncomplete);
        assert!(!verdict.is_challenge());
    }

    /// DataDome captcha markup is classified as a challenge.
    #[tokio::test]
    async fn bdd_datadome_interstitial() {
        let markup = r#"<script src="https://geo.captcha-delivery.com/captcha.js"></script>
<div id="ddcaptchaencoded">encoded_payload</div>"#;

        let page = Page::from_html(markup, None).await.unwrap();

        assert!(page.challenge_verdict().is_challenge());
    }

    /// AWS WAF challenge markup is classified as a challenge.
    #[tokio::test]
    async fn bdd_awswaf_challenge() {
        let markup = r#"<html><body>
<script>window.gokuProps={key:'a',context:'b',iv:'c'};</script>
<script>window.awsWafCookieDomainList=["example.com"];</script>
<script src="https://x.token.awswaf.com/challenge.js"></script>
<script>AwsWafIntegration.checkForceRefresh();</script>
</body></html>"#;

        let page = Page::from_html(markup, None).await.unwrap();

        assert!(page.challenge_verdict().is_challenge());
    }

    /// The title text is returned for a plain lowercase document.
    #[test]
    fn extract_title_basic() {
        let title = extract_title("<html><head><title>Hello</title></head></html>");
        assert_eq!(title, "Hello");
    }

    /// A document without a `<title>` yields an empty string.
    #[test]
    fn extract_title_empty() {
        let title = extract_title("<html><body></body></html>");
        assert_eq!(title, "");
    }

    /// Upper-case tags are matched as well.
    #[test]
    fn extract_title_case_insensitive() {
        let title = extract_title("<HTML><HEAD><TITLE>Test</TITLE></HEAD></HTML>");
        assert_eq!(title, "Test");
    }
}