1use std::time::{Duration, Instant};
4
5#[cfg(feature = "v8")]
6use crate::js_runtime::runtime::BrowserJsRuntime;
7use crate::{
8 challenge::{ChallengeVerdict, EngineClass, engine_classify},
9 dom::Dom,
10 host::EngineHandle,
11 net::{HttpClient, RedirectPolicy},
12 stealth::StealthProfile,
13};
14
/// Wall-clock budget that the public `navigate*` entry points pass to the
/// navigation loop.
const DEFAULT_NAV_BUDGET: Duration = Duration::from_secs(15);
/// Maximum number of load/classify passes that the public `navigate*` entry
/// points pass to the navigation loop.
const DEFAULT_MAX_ITERATIONS: u8 = 3;
19
/// A loaded document together with its parsed DOM and challenge classification.
pub struct Page {
    /// Engine handle: supplied by the caller in `new`, created fresh by the
    /// HTML-based constructors.
    engine: EngineHandle,
    /// DOM produced by `crate::html_parser::parse_html` for the current HTML.
    dom: Dom,
    /// URL of the current document; `"about:blank"` until one is provided.
    url: String,
    /// Title extracted from the current HTML (empty when none is found).
    title: String,
    /// Raw HTML of the current document.
    html: String,
    /// Result of `engine_classify` for the current HTML.
    challenge_class: EngineClass,
    /// Stealth profile attached to the page, if any.
    profile: Option<StealthProfile>,
    /// Whether the page was created in stealth mode.
    stealth: bool,
}
31
32impl std::fmt::Debug for Page {
33 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
34 f.debug_struct("Page")
35 .field("url", &self.url)
36 .field("title", &self.title)
37 .field("stealth", &self.stealth)
38 .field("challenge_class", &self.challenge_class)
39 .field("profile", &self.profile.is_some())
40 .finish()
41 }
42}
43
44impl Page {
45 pub fn new(engine: EngineHandle) -> Self {
46 Self {
47 engine,
48 dom: Dom::new(),
49 url: "about:blank".to_string(),
50 title: String::new(),
51 html: String::new(),
52 challenge_class: EngineClass {
53 tag: "L3-RENDERED",
54 verdict: ChallengeVerdict::Pass,
55 len: 0,
56 },
57 profile: None,
58 stealth: false,
59 }
60 }
61
62 pub async fn from_html(html: &str, stealth: bool) -> Result<Self, PageError> {
64 let dom = crate::html_parser::parse_html(html);
65 let title = extract_title(html);
66 let challenge_class = engine_classify(html);
67 Ok(Self {
68 engine: EngineHandle::new(),
69 dom,
70 url: "about:blank".to_string(),
71 title,
72 html: html.to_string(),
73 challenge_class,
74 profile: None,
75 stealth,
76 })
77 }
78
79 pub async fn with_profile(
81 html: &str,
82 url: &str,
83 _profile: StealthProfile,
84 ) -> Result<Self, PageError> {
85 let dom = crate::html_parser::parse_html(html);
86 let title = extract_title(html);
87 let challenge_class = engine_classify(html);
88 Ok(Self {
89 engine: EngineHandle::new(),
90 dom,
91 url: url.to_string(),
92 title,
93 html: html.to_string(),
94 challenge_class,
95 profile: None,
96 stealth: true,
97 })
98 }
99
100 pub fn reload_html(&mut self, html: &str, url: &str) {
102 self.dom = crate::html_parser::parse_html(html);
103 self.url = url.to_string();
104 self.html = html.to_string();
105 self.title = extract_title(html);
106 self.challenge_class = engine_classify(html);
107 }
108
109 pub async fn navigate(&mut self, url: &str) -> Result<(), PageError> {
114 let client = HttpClient::new(hpx::BrowserProfile::Chrome).map_err(PageError::Net)?;
116 self.navigate_inner(url, &client, DEFAULT_MAX_ITERATIONS, DEFAULT_NAV_BUDGET)
117 .await
118 }
119
120 pub async fn navigate_with_solvers(
124 &mut self,
125 url: &str,
126 solvers: &[&dyn crate::challenge::ChallengeSolver],
127 ) -> Result<(), PageError> {
128 let client = HttpClient::new(hpx::BrowserProfile::Chrome).map_err(PageError::Net)?;
129 self.navigate_with_solvers_inner(
130 url,
131 &client,
132 solvers,
133 DEFAULT_MAX_ITERATIONS,
134 DEFAULT_NAV_BUDGET,
135 )
136 .await
137 }
138
139 pub async fn navigate_warm(&mut self, url: &str) -> Result<(), PageError> {
143 let client = HttpClient::new(hpx::BrowserProfile::Chrome).map_err(PageError::Net)?;
144 let resp = client
145 .request("GET", url, None, &[], RedirectPolicy::Follow(10))
146 .await
147 .map_err(PageError::Net)?;
148 let html = resp.text();
149 let resp_url = resp.url.clone();
150
151 self.reload_html(&html, &resp_url);
152 Ok(())
153 }
154
155 async fn navigate_inner(
157 &mut self,
158 url: &str,
159 client: &HttpClient,
160 max_iterations: u8,
161 budget: Duration,
162 ) -> Result<(), PageError> {
163 self.navigate_with_solvers_inner(url, client, &[], max_iterations, budget)
164 .await
165 }
166
167 async fn navigate_with_solvers_inner(
169 &mut self,
170 url: &str,
171 client: &HttpClient,
172 solvers: &[&dyn crate::challenge::ChallengeSolver],
173 max_iterations: u8,
174 budget: Duration,
175 ) -> Result<(), PageError> {
176 let t0 = Instant::now();
177 let iterations = max_iterations.max(1);
178
179 let resp = client
180 .request("GET", url, None, &[], RedirectPolicy::Follow(10))
181 .await
182 .map_err(PageError::Net)?;
183 let mut current_html = resp.text();
184 let mut current_url = resp.url.clone();
185 let mut cookies_before = cookie_snapshot(client, ¤t_url).await;
186
187 for iter in 0..iterations {
188 if t0.elapsed() >= budget {
189 tracing::warn!(
190 iter,
191 elapsed_ms = t0.elapsed().as_millis(),
192 "navigate budget exhausted"
193 );
194 break;
195 }
196
197 self.reload_html(¤t_html, ¤t_url);
198
199 let challenge = engine_classify(¤t_html);
200
201 if !challenge.verdict.is_challenge() {
203 return Ok(());
204 }
205
206 let kind = tag_to_kind(challenge.tag);
208 let mut any_solved = false;
209 for solver in solvers {
210 if !solver.can_handle(&kind) {
211 continue;
212 }
213 if matches!(
214 solver.solve(&kind, self).await,
215 crate::challenge::SolveOutcome::Solved
216 ) {
217 any_solved = true;
218 }
219 }
220
221 if any_solved {
222 let resp = client
224 .request("GET", ¤t_url, None, &[], RedirectPolicy::Follow(10))
225 .await
226 .map_err(PageError::Net)?;
227 current_html = resp.text();
228 current_url = resp.url.clone();
229 cookies_before = cookie_snapshot(client, ¤t_url).await;
230 continue;
231 }
232
233 if iter + 1 < iterations {
236 let cookies_after = cookie_snapshot(client, ¤t_url).await;
237 if cookies_after != cookies_before && !cookies_after.is_empty() {
238 tracing::info!(iter, "cookie delta detected — retrying navigation");
239 let resp = client
240 .request("GET", ¤t_url, None, &[], RedirectPolicy::Follow(10))
241 .await
242 .map_err(PageError::Net)?;
243 current_html = resp.text();
244 current_url = resp.url.clone();
245 cookies_before = cookie_snapshot(client, ¤t_url).await;
246 continue;
247 }
248 }
249
250 break;
252 }
253
254 Ok(())
255 }
256
257 pub async fn evaluate_async(&mut self, _script: &str) -> Result<serde_json::Value, PageError> {
258 Err(PageError::Evaluation(
259 "evaluate_async requires v8 feature".into(),
260 ))
261 }
262
263 pub fn evaluate(&mut self, _script: &str) -> Result<String, PageError> {
265 Ok("undefined".to_string())
266 }
267
268 pub async fn title_async(&self) -> Result<String, PageError> {
269 Ok(self.title.clone())
270 }
271
272 pub fn title(&self) -> String {
274 self.title.clone()
275 }
276
277 pub fn url(&self) -> &str {
279 &self.url
280 }
281
/// Returns whether the page was created in stealth mode.
pub fn stealth(&self) -> bool {
    self.stealth
}
286
/// Attaches `profile` to the page (only compiled with the `v8` feature).
///
/// Before storing the profile, a `BrowserJsRuntime` is created over a new
/// `Dom`, configured with the profile's user agent and platform fields,
/// switched to stealth, and its page-init step is run.
///
/// NOTE(review): `rt` is a local built on `Dom::new()` rather than
/// `self.dom`, and it is dropped when this function returns. Confirm that
/// `run_page_init` is being called for a lasting side effect; otherwise the
/// runtime setup here has no effect on the page.
#[cfg(feature = "v8")]
pub fn set_profile(&mut self, profile: StealthProfile) {
    let mut rt = BrowserJsRuntime::new(crate::dom::Dom::new());
    rt.set_user_agent(&profile.user_agent);
    rt.set_platform(&profile.platform, &profile.os_name, &profile.os_version);
    rt.set_stealth(true);
    rt.run_page_init();
    self.profile = Some(profile);
}
302
303 pub fn content(&self) -> String {
305 self.html.clone()
306 }
307
308 pub async fn text_content(&self) -> Result<String, PageError> {
309 Ok(self.dom.text_content(crate::dom::NodeId::DOCUMENT))
310 }
311
312 pub async fn text_of(&self, _selector: &str) -> Result<String, PageError> {
313 Ok(String::new())
314 }
315
/// Selector-based existence check stub: `_selector` is ignored and the
/// result is always `false`.
pub fn has_element(&self, _selector: &str) -> bool {
    false
}
320
321 pub fn challenge_verdict(&self) -> ChallengeVerdict {
323 self.challenge_class.verdict
324 }
325
/// Borrows the full classification computed for the current HTML.
pub fn engine_class(&self) -> &EngineClass {
    &self.challenge_class
}
330
/// Borrows the DOM parsed from the current HTML.
pub fn dom(&self) -> &Dom {
    &self.dom
}
334}
335
336fn tag_to_kind(tag: &'static str) -> crate::challenge::ChallengeKind {
338 let (vendor, sub_kind): (&'static str, &'static str) = if tag.starts_with("cf-") {
339 ("cloudflare", tag)
340 } else if tag.starts_with("AWS-WAF") {
341 ("aws-waf", tag)
342 } else if tag.eq_ignore_ascii_case("datadome") {
343 ("datadome", tag)
344 } else if tag.starts_with("akamai") {
345 ("akamai", tag)
346 } else if tag.starts_with("px-") || tag.starts_with("PXC") {
347 ("perimeterx", tag)
348 } else if tag.starts_with("kasada") {
349 ("kasada", tag)
350 } else if tag.starts_with("sec-cpt") {
351 ("sec-cpt", tag)
352 } else if tag.starts_with("hcaptcha") {
353 ("hcaptcha", tag)
354 } else {
355 ("unknown", tag)
356 };
357 crate::challenge::ChallengeKind::new(vendor, sub_kind)
358}
359
360async fn cookie_snapshot(client: &HttpClient, url: &str) -> String {
362 if let Ok(parsed) = url::Url::parse(url) {
363 client.cookies_for_url(&parsed).await.unwrap_or_default()
364 } else {
365 String::new()
366 }
367}
368
/// Extracts the trimmed text of the first `<title …>…</title>` element.
///
/// Tag matching is ASCII case-insensitive. Returns an empty string when there
/// is no opening `<title`, no `>` closing the opening tag, or no `</title>`
/// after it.
///
/// Offsets are located in an ASCII-lowercased copy of `html`. ASCII
/// lowercasing never changes byte lengths, so those offsets are valid in the
/// original string. Full Unicode `to_lowercase` can shrink or grow characters
/// (for example U+212A or U+0130), which previously caused wrong slices or
/// panics on non-ASCII documents.
fn extract_title(html: &str) -> String {
    let lower = html.to_ascii_lowercase();

    let title = lower.find("<title").and_then(|open| {
        // Skip any attributes on the opening tag.
        let gt = lower[open..].find('>')?;
        let start = open + gt + 1;
        let len = lower[start..].find("</title>")?;
        // Slice the original so the title keeps its own casing.
        Some(html[start..start + len].trim())
    });

    title.map(str::to_string).unwrap_or_default()
}
383
/// Errors returned by `Page` operations.
#[derive(Debug, thiserror::Error)]
pub enum PageError {
    /// Navigation failure with a free-form message (not constructed in the
    /// visible part of this file).
    #[error("navigation failed: {0}")]
    Navigation(String),
    /// Script evaluation failure with a free-form message; returned by
    /// `Page::evaluate_async`.
    #[error("evaluation failed: {0}")]
    Evaluation(String),
    /// A requested element was not found (not constructed in the visible
    /// part of this file).
    #[error("element not found")]
    ElementNotFound,
    /// The page has no loaded document (not constructed in the visible part
    /// of this file).
    #[error("page not loaded")]
    NotLoaded,
    /// Error from the `crate::net` layer; `#[from]` lets `?` convert a
    /// `NetError` directly.
    #[error("network error: {0}")]
    Net(#[from] crate::net::NetError),
}
397
// Offline tests: pages are built from HTML strings, so no network is used.
#[cfg(test)]
mod tests {
    use super::*;

    // A long, ordinary document keeps its title and content and is
    // classified as `Pass`.
    #[tokio::test]
    async fn bdd_navigate_to_clean_page() {
        let mut body = String::from("Hello World. ");
        for _ in 0..500 {
            body.push_str("This is real rendered content for the test page. ");
        }
        let html = format!(
            r#"<!DOCTYPE html>
<html>
<head><title>Test Page</title></head>
<body>{body}</body>
</html>"#
        );
        let page = Page::from_html(&html, false).await.unwrap();

        assert_eq!(page.title(), "Test Page");
        assert!(page.content().contains("Hello World"));
        assert_eq!(page.challenge_verdict(), ChallengeVerdict::Pass);
    }

    // A short page carrying the `_cf_chl_opt` script is classified as
    // `EdgeBlock`, which counts as a challenge.
    #[tokio::test]
    async fn bdd_navigate_with_challenge_detection() {
        let html = r#"<!DOCTYPE html>
<html>
<head><title>Just a moment...</title></head>
<body>
<script>window._cf_chl_opt={cvId:'3',cType:'managed'};</script>
Checking your browser before accessing the site...
</body>
</html>"#;
        let page = Page::from_html(html, false).await.unwrap();

        assert_eq!(page.challenge_verdict(), ChallengeVerdict::EdgeBlock);
        assert!(page.challenge_verdict().is_challenge());
    }

    // The same marker on a document padded past 50 000 bytes is classified
    // as `ChallengeIncomplete`, which also counts as a challenge.
    #[tokio::test]
    async fn bdd_challenge_incomplete_verdict() {
        let mut html = String::from(
            r#"<html><head><title>Just a moment...</title></head><body>
        <script>window._cf_chl_opt={cvId:'3',cType:'managed'};</script>"#,
        );
        for _ in 0..2000 {
            html.push_str("<div>cf challenge orchestrator shell padding</div>");
        }
        html.push_str("</body></html>");
        assert!(html.len() >= 50_000);

        let page = Page::from_html(&html, false).await.unwrap();
        assert_eq!(
            page.challenge_verdict(),
            ChallengeVerdict::ChallengeIncomplete
        );
        assert!(page.challenge_verdict().is_challenge());
    }

    // A marker-free document of at least 15 000 bytes is classified as
    // `Pass` and is not a challenge.
    #[tokio::test]
    async fn bdd_clean_page_passes() {
        let mut html = String::from("<html><body>");
        for _ in 0..400 {
            html.push_str("<p>Normal rendered content paragraph with enough text.</p>");
        }
        html.push_str("</body></html>");
        assert!(html.len() >= 15_000);

        let page = Page::from_html(&html, false).await.unwrap();
        assert_eq!(page.challenge_verdict(), ChallengeVerdict::Pass);
        assert!(!page.challenge_verdict().is_challenge());
    }

    // `reload_html` swaps the title, content and URL of an existing page.
    #[tokio::test]
    async fn bdd_warm_reuse_reloads_html() {
        let html1 =
            r#"<!DOCTYPE html><html><head><title>First</title></head><body>Page One</body></html>"#;
        let html2 = r#"<!DOCTYPE html><html><head><title>Second</title></head><body>Page Two</body></html>"#;

        let mut page = Page::from_html(html1, false).await.unwrap();
        assert_eq!(page.title(), "First");
        assert!(page.content().contains("Page One"));

        page.reload_html(html2, "https://example.com/second");
        assert_eq!(page.title(), "Second");
        assert!(page.content().contains("Page Two"));
        assert_eq!(page.url(), "https://example.com/second");
    }

    // A tiny marker-free body is classified as `RenderIncomplete`, which is
    // not treated as a challenge.
    #[tokio::test]
    async fn bdd_thin_body_render_incomplete() {
        let html = "<html><body>tiny</body></html>";
        let page = Page::from_html(html, false).await.unwrap();
        assert_eq!(page.challenge_verdict(), ChallengeVerdict::RenderIncomplete);
        assert!(!page.challenge_verdict().is_challenge());
    }

    // A DataDome captcha-delivery snippet is detected as a challenge.
    #[tokio::test]
    async fn bdd_datadome_interstitial() {
        let html = r#"<script src="https://geo.captcha-delivery.com/captcha.js"></script>
<div id="ddcaptchaencoded">encoded_payload</div>"#;
        let page = Page::from_html(html, false).await.unwrap();
        assert!(page.challenge_verdict().is_challenge());
    }

    // An AWS WAF challenge page (gokuProps plus the awswaf.com script) is
    // detected as a challenge.
    #[tokio::test]
    async fn bdd_awswaf_challenge() {
        let html = r#"<html><body>
<script>window.gokuProps={key:'a',context:'b',iv:'c'};</script>
<script>window.awsWafCookieDomainList=["example.com"];</script>
<script src="https://x.token.awswaf.com/challenge.js"></script>
<script>AwsWafIntegration.checkForceRefresh();</script>
</body></html>"#;
        let page = Page::from_html(html, false).await.unwrap();
        assert!(page.challenge_verdict().is_challenge());
    }

    // Plain lowercase `<title>` is extracted.
    #[test]
    fn extract_title_basic() {
        assert_eq!(
            extract_title("<html><head><title>Hello</title></head></html>"),
            "Hello"
        );
    }

    // A document without a title yields an empty string.
    #[test]
    fn extract_title_empty() {
        assert_eq!(extract_title("<html><body></body></html>"), "");
    }

    // Uppercase tags are matched and the title text keeps its own casing.
    #[test]
    fn extract_title_case_insensitive() {
        assert_eq!(
            extract_title("<HTML><HEAD><TITLE>Test</TITLE></HEAD></HTML>"),
            "Test"
        );
    }
}