1use std::time::{Duration, Instant};
4
5#[cfg(feature = "v8")]
6use crate::js_runtime::runtime::BrowserJsRuntime;
7use crate::{
8 challenge::{ChallengeVerdict, EngineClass, engine_classify},
9 dom::Dom,
10 host::EngineHandle,
11 net::HttpClient,
12 stealth::StealthProfile,
13};
14
/// Wall-clock budget handed to the navigation loop by the public `navigate*` entry points.
const DEFAULT_NAV_BUDGET: Duration = Duration::from_secs(15);
/// Maximum number of classify/solve/retry rounds used by the public `navigate*` entry points.
const DEFAULT_MAX_ITERATIONS: u8 = 3;
19
20pub struct Page {
22 engine: EngineHandle,
23 dom: Dom,
24 url: String,
25 title: String,
26 html: String,
27 challenge_class: EngineClass,
28 profile: Option<StealthProfile>,
29 stealth: bool,
30}
31
32impl std::fmt::Debug for Page {
33 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
34 f.debug_struct("Page")
35 .field("url", &self.url)
36 .field("title", &self.title)
37 .field("stealth", &self.stealth)
38 .field("challenge_class", &self.challenge_class)
39 .field("profile", &self.profile.is_some())
40 .finish()
41 }
42}
43
44impl Page {
45 pub fn new(engine: EngineHandle) -> Self {
46 Self {
47 engine,
48 dom: Dom::new(),
49 url: "about:blank".to_string(),
50 title: String::new(),
51 html: String::new(),
52 challenge_class: EngineClass {
53 tag: "L3-RENDERED",
54 verdict: ChallengeVerdict::Pass,
55 len: 0,
56 },
57 profile: None,
58 stealth: false,
59 }
60 }
61
62 pub async fn from_html(html: &str, stealth: bool) -> Result<Self, PageError> {
64 let dom = crate::html_parser::parse_html(html);
65 let title = extract_title(html);
66 let challenge_class = engine_classify(html);
67 Ok(Self {
68 engine: EngineHandle::new(),
69 dom,
70 url: "about:blank".to_string(),
71 title,
72 html: html.to_string(),
73 challenge_class,
74 profile: None,
75 stealth,
76 })
77 }
78
79 pub async fn with_profile(
81 html: &str,
82 url: &str,
83 _profile: StealthProfile,
84 ) -> Result<Self, PageError> {
85 let dom = crate::html_parser::parse_html(html);
86 let title = extract_title(html);
87 let challenge_class = engine_classify(html);
88 Ok(Self {
89 engine: EngineHandle::new(),
90 dom,
91 url: url.to_string(),
92 title,
93 html: html.to_string(),
94 challenge_class,
95 profile: None,
96 stealth: true,
97 })
98 }
99
100 pub fn reload_html(&mut self, html: &str, url: &str) {
102 self.dom = crate::html_parser::parse_html(html);
103 self.url = url.to_string();
104 self.html = html.to_string();
105 self.title = extract_title(html);
106 self.challenge_class = engine_classify(html);
107 }
108
109 pub async fn navigate(&mut self, url: &str) -> Result<(), PageError> {
114 let client = HttpClient::shared(hpx::BrowserProfile::Chrome).map_err(PageError::Net)?;
116 self.navigate_inner(url, &client, DEFAULT_MAX_ITERATIONS, DEFAULT_NAV_BUDGET)
117 .await
118 }
119
120 pub async fn navigate_with_solvers(
124 &mut self,
125 url: &str,
126 solvers: &[&dyn crate::challenge::ChallengeSolver],
127 ) -> Result<(), PageError> {
128 let client = HttpClient::shared(hpx::BrowserProfile::Chrome).map_err(PageError::Net)?;
129 self.navigate_with_solvers_inner(
130 url,
131 &client,
132 solvers,
133 DEFAULT_MAX_ITERATIONS,
134 DEFAULT_NAV_BUDGET,
135 )
136 .await
137 }
138
139 pub async fn navigate_warm(&mut self, url: &str) -> Result<(), PageError> {
143 let client = HttpClient::shared(hpx::BrowserProfile::Chrome).map_err(PageError::Net)?;
144 let resp = client.get_follow(url, 10).await.map_err(PageError::Net)?;
145 let html = resp.text();
146 let resp_url = resp.url.clone();
147
148 self.reload_html(&html, &resp_url);
149 Ok(())
150 }
151
152 async fn navigate_inner(
154 &mut self,
155 url: &str,
156 client: &HttpClient,
157 max_iterations: u8,
158 budget: Duration,
159 ) -> Result<(), PageError> {
160 self.navigate_with_solvers_inner(url, client, &[], max_iterations, budget)
161 .await
162 }
163
164 async fn navigate_with_solvers_inner(
166 &mut self,
167 url: &str,
168 client: &HttpClient,
169 solvers: &[&dyn crate::challenge::ChallengeSolver],
170 max_iterations: u8,
171 budget: Duration,
172 ) -> Result<(), PageError> {
173 let t0 = Instant::now();
174 let iterations = max_iterations.max(1);
175
176 let resp = client.get_follow(url, 10).await.map_err(PageError::Net)?;
177 let mut current_html = resp.text();
178 let mut current_url = resp.url.clone();
179 let mut cookies_before = cookie_snapshot(client, ¤t_url).await;
180
181 for iter in 0..iterations {
182 if t0.elapsed() >= budget {
183 tracing::warn!(
184 iter,
185 elapsed_ms = t0.elapsed().as_millis(),
186 "navigate budget exhausted"
187 );
188 break;
189 }
190
191 self.reload_html(¤t_html, ¤t_url);
192
193 let challenge = engine_classify(¤t_html);
194
195 if !challenge.verdict.is_challenge() {
197 return Ok(());
198 }
199
200 let kind = tag_to_kind(challenge.tag);
202 let mut any_solved = false;
203 for solver in solvers {
204 if !solver.can_handle(&kind) {
205 continue;
206 }
207 if matches!(
208 solver.solve(&kind, self).await,
209 crate::challenge::SolveOutcome::Solved
210 ) {
211 any_solved = true;
212 }
213 }
214
215 if any_solved {
216 let resp = client
218 .get_follow(¤t_url, 10)
219 .await
220 .map_err(PageError::Net)?;
221 current_html = resp.text();
222 current_url = resp.url.clone();
223 cookies_before = cookie_snapshot(client, ¤t_url).await;
224 continue;
225 }
226
227 if iter + 1 < iterations {
230 let cookies_after = cookie_snapshot(client, ¤t_url).await;
231 if cookies_after != cookies_before && !cookies_after.is_empty() {
232 tracing::info!(iter, "cookie delta detected — retrying navigation");
233 let resp = client
234 .get_follow(¤t_url, 10)
235 .await
236 .map_err(PageError::Net)?;
237 current_html = resp.text();
238 current_url = resp.url.clone();
239 cookies_before = cookie_snapshot(client, ¤t_url).await;
240 continue;
241 }
242 }
243
244 break;
246 }
247
248 Ok(())
249 }
250
251 pub async fn evaluate_async(&mut self, _script: &str) -> Result<serde_json::Value, PageError> {
252 Err(PageError::Evaluation(
253 "evaluate_async requires v8 feature".into(),
254 ))
255 }
256
257 pub fn evaluate(&mut self, _script: &str) -> Result<String, PageError> {
259 Ok("undefined".to_string())
260 }
261
262 pub async fn title_async(&self) -> Result<String, PageError> {
263 Ok(self.title.clone())
264 }
265
266 pub fn title(&self) -> String {
268 self.title.clone()
269 }
270
271 pub fn url(&self) -> &str {
273 &self.url
274 }
275
276 pub fn stealth(&self) -> bool {
278 self.stealth
279 }
280
281 #[cfg(feature = "v8")]
288 pub fn set_profile(&mut self, profile: StealthProfile) {
289 let mut rt = BrowserJsRuntime::new(crate::dom::Dom::new());
290 rt.set_user_agent(&profile.user_agent);
291 rt.set_platform(&profile.platform, &profile.os_name, &profile.os_version);
292 rt.set_stealth(true);
293 rt.run_page_init();
294 self.profile = Some(profile);
295 }
296
297 pub fn content(&self) -> String {
299 self.html.clone()
300 }
301
302 pub async fn text_content(&self) -> Result<String, PageError> {
303 Ok(self.dom.text_content(crate::dom::NodeId::DOCUMENT))
304 }
305
306 pub async fn text_of(&self, _selector: &str) -> Result<String, PageError> {
307 Ok(String::new())
308 }
309
310 pub fn has_element(&self, _selector: &str) -> bool {
312 false
313 }
314
315 pub fn challenge_verdict(&self) -> ChallengeVerdict {
317 self.challenge_class.verdict
318 }
319
320 pub fn engine_class(&self) -> &EngineClass {
322 &self.challenge_class
323 }
324
325 pub fn dom(&self) -> &Dom {
326 &self.dom
327 }
328}
329
330fn tag_to_kind(tag: &'static str) -> crate::challenge::ChallengeKind {
332 let (vendor, sub_kind): (&'static str, &'static str) = if tag.starts_with("cf-") {
333 ("cloudflare", tag)
334 } else if tag.starts_with("AWS-WAF") {
335 ("aws-waf", tag)
336 } else if tag.eq_ignore_ascii_case("datadome") {
337 ("datadome", tag)
338 } else if tag.starts_with("akamai") {
339 ("akamai", tag)
340 } else if tag.starts_with("px-") || tag.starts_with("PXC") {
341 ("perimeterx", tag)
342 } else if tag.starts_with("kasada") {
343 ("kasada", tag)
344 } else if tag.starts_with("sec-cpt") {
345 ("sec-cpt", tag)
346 } else if tag.starts_with("hcaptcha") {
347 ("hcaptcha", tag)
348 } else {
349 ("unknown", tag)
350 };
351 crate::challenge::ChallengeKind::new(vendor, sub_kind)
352}
353
354async fn cookie_snapshot(client: &HttpClient, url: &str) -> String {
356 if let Ok(parsed) = url::Url::parse(url) {
357 client.cookies_for_url(&parsed).await.unwrap_or_default()
358 } else {
359 String::new()
360 }
361}
362
/// Extracts the trimmed text of the first `<title>` element in `html`.
///
/// Tag matching is ASCII case-insensitive. Returns an empty string when there
/// is no `<title` opening, the opening tag is never closed with `>`, or no
/// `</title>` follows it.
///
/// The search runs on an ASCII-lowercased copy of the input. ASCII folding
/// never changes byte lengths, so offsets found in the copy are valid in the
/// original. Full Unicode lowercasing can grow or shrink the text (e.g.
/// U+0130 becomes two code points), which shifted the offsets and produced
/// wrong titles or out-of-range slice panics.
fn extract_title(html: &str) -> String {
    let lower = html.to_ascii_lowercase();
    let title = lower.find("<title").and_then(|open| {
        // End of the opening tag (it may carry attributes).
        let gt = lower[open..].find('>')?;
        let body_start = open + gt + 1;
        let body_len = lower[body_start..].find("</title>")?;
        // Slice the ORIGINAL text so the title keeps its casing.
        Some(html[body_start..body_start + body_len].trim().to_string())
    });
    title.unwrap_or_default()
}
377
378#[derive(Debug, thiserror::Error)]
379pub enum PageError {
380 #[error("navigation failed: {0}")]
381 Navigation(String),
382 #[error("evaluation failed: {0}")]
383 Evaluation(String),
384 #[error("element not found")]
385 ElementNotFound,
386 #[error("page not loaded")]
387 NotLoaded,
388 #[error("network error: {0}")]
389 Net(#[from] crate::net::NetError),
390}
391
#[cfg(test)]
mod tests {
    use super::*;

    /// A long plain-content document keeps its title and content and is classified `Pass`.
    #[tokio::test]
    async fn bdd_navigate_to_clean_page() {
        let filler = "This is real rendered content for the test page. ".repeat(500);
        let body = format!("Hello World. {filler}");
        let html = format!(
            r#"<!DOCTYPE html>
<html>
<head><title>Test Page</title></head>
<body>{body}</body>
</html>"#
        );

        let page = Page::from_html(&html, false).await.unwrap();

        assert_eq!(page.title(), "Test Page");
        assert!(page.content().contains("Hello World"));
        assert_eq!(page.challenge_verdict(), ChallengeVerdict::Pass);
    }

    /// A short Cloudflare interstitial is classified as an edge block.
    #[tokio::test]
    async fn bdd_navigate_with_challenge_detection() {
        let html = r#"<!DOCTYPE html>
<html>
<head><title>Just a moment...</title></head>
<body>
<script>window._cf_chl_opt={cvId:'3',cType:'managed'};</script>
Checking your browser before accessing the site...
</body>
</html>"#;

        let verdict = Page::from_html(html, false)
            .await
            .unwrap()
            .challenge_verdict();

        assert_eq!(verdict, ChallengeVerdict::EdgeBlock);
        assert!(verdict.is_challenge());
    }

    /// A large document carrying the Cloudflare marker is classified `ChallengeIncomplete`.
    #[tokio::test]
    async fn bdd_challenge_incomplete_verdict() {
        let head = r#"<html><head><title>Just a moment...</title></head><body>
 <script>window._cf_chl_opt={cvId:'3',cType:'managed'};</script>"#;
        let padding = "<div>cf challenge orchestrator shell padding</div>".repeat(2000);
        let html = format!("{head}{padding}</body></html>");
        assert!(html.len() >= 50_000);

        let verdict = Page::from_html(&html, false)
            .await
            .unwrap()
            .challenge_verdict();

        assert_eq!(verdict, ChallengeVerdict::ChallengeIncomplete);
        assert!(verdict.is_challenge());
    }

    /// A large document without challenge markers passes.
    #[tokio::test]
    async fn bdd_clean_page_passes() {
        let paragraphs =
            "<p>Normal rendered content paragraph with enough text.</p>".repeat(400);
        let html = format!("<html><body>{paragraphs}</body></html>");
        assert!(html.len() >= 15_000);

        let verdict = Page::from_html(&html, false)
            .await
            .unwrap()
            .challenge_verdict();

        assert_eq!(verdict, ChallengeVerdict::Pass);
        assert!(!verdict.is_challenge());
    }

    /// `reload_html` swaps title, content and URL on an existing page.
    #[tokio::test]
    async fn bdd_warm_reuse_reloads_html() {
        let first =
            r#"<!DOCTYPE html><html><head><title>First</title></head><body>Page One</body></html>"#;
        let second = r#"<!DOCTYPE html><html><head><title>Second</title></head><body>Page Two</body></html>"#;

        let mut page = Page::from_html(first, false).await.unwrap();
        assert_eq!(page.title(), "First");
        assert!(page.content().contains("Page One"));

        page.reload_html(second, "https://example.com/second");

        assert_eq!(page.title(), "Second");
        assert!(page.content().contains("Page Two"));
        assert_eq!(page.url(), "https://example.com/second");
    }

    /// A tiny body is reported as `RenderIncomplete`, which is not a challenge.
    #[tokio::test]
    async fn bdd_thin_body_render_incomplete() {
        let verdict = Page::from_html("<html><body>tiny</body></html>", false)
            .await
            .unwrap()
            .challenge_verdict();

        assert_eq!(verdict, ChallengeVerdict::RenderIncomplete);
        assert!(!verdict.is_challenge());
    }

    /// A DataDome captcha interstitial is detected as a challenge.
    #[tokio::test]
    async fn bdd_datadome_interstitial() {
        let html = r#"<script src="https://geo.captcha-delivery.com/captcha.js"></script>
<div id="ddcaptchaencoded">encoded_payload</div>"#;

        let page = Page::from_html(html, false).await.unwrap();

        assert!(page.challenge_verdict().is_challenge());
    }

    /// An AWS WAF challenge page is detected as a challenge.
    #[tokio::test]
    async fn bdd_awswaf_challenge() {
        let html = r#"<html><body>
<script>window.gokuProps={key:'a',context:'b',iv:'c'};</script>
<script>window.awsWafCookieDomainList=["example.com"];</script>
<script src="https://x.token.awswaf.com/challenge.js"></script>
<script>AwsWafIntegration.checkForceRefresh();</script>
</body></html>"#;

        let page = Page::from_html(html, false).await.unwrap();

        assert!(page.challenge_verdict().is_challenge());
    }

    /// The title text is returned for a plain lowercase document.
    #[test]
    fn extract_title_basic() {
        let title = extract_title("<html><head><title>Hello</title></head></html>");
        assert_eq!(title, "Hello");
    }

    /// A document without a title yields an empty string.
    #[test]
    fn extract_title_empty() {
        let title = extract_title("<html><body></body></html>");
        assert_eq!(title, "");
    }

    /// Upper-case tags are matched as well.
    #[test]
    fn extract_title_case_insensitive() {
        let title = extract_title("<HTML><HEAD><TITLE>Test</TITLE></HEAD></HTML>");
        assert_eq!(title, "Test");
    }
}