Skip to main content

hpx_browser/
page.rs

1//! Browser page abstraction with challenge-aware navigation.
2
3use std::time::{Duration, Instant};
4
5use crate::{
6    challenge::{ChallengeVerdict, EngineClass, engine_classify},
7    dom::Dom,
8    host::EngineHandle,
9    net::HttpClient,
10    stealth::StealthProfile,
11};
12
/// Default wall-clock budget (15 seconds) for a challenge-aware navigation.
///
/// Used by `Page::navigate` and `Page::navigate_with_solvers`. The retry loop
/// compares elapsed time against it at the start of each iteration only.
const DEFAULT_NAV_BUDGET: Duration = Duration::from_secs(15);
/// Default cap on the number of iterations of the challenge retry loop.
const DEFAULT_MAX_ITERATIONS: u8 = 3;
17
/// A browser page/tab.
///
/// Holds the most recently loaded document (raw HTML, parsed DOM, title and
/// URL) together with the challenge classification computed for that HTML.
#[derive(Debug)]
pub struct Page {
    /// Engine handle: supplied to `Page::new`, or freshly created by the
    /// HTML-based constructors.
    engine: EngineHandle,
    /// DOM for the current HTML (empty for a page created with `Page::new`).
    dom: Dom,
    /// URL of the current document; `about:blank` until one is supplied.
    url: String,
    /// Title found by `extract_title` in the current HTML, or empty.
    title: String,
    /// Raw HTML of the current document.
    html: String,
    /// Result of `engine_classify` for the current HTML.
    challenge_class: EngineClass,
    /// Stealth profile associated with the page, if any.
    profile: Option<StealthProfile>,
}
29
30impl Page {
31    pub fn new(engine: EngineHandle) -> Self {
32        Self {
33            engine,
34            dom: Dom::new(),
35            url: "about:blank".to_string(),
36            title: String::new(),
37            html: String::new(),
38            challenge_class: EngineClass {
39                tag: "L3-RENDERED",
40                verdict: ChallengeVerdict::Pass,
41                len: 0,
42            },
43            profile: None,
44        }
45    }
46
47    /// Create a page from raw HTML (no network).
48    pub async fn from_html(
49        html: &str,
50        _profile: Option<StealthProfile>,
51    ) -> Result<Self, PageError> {
52        let dom = crate::html_parser::parse_html(html);
53        let title = extract_title(html);
54        let challenge_class = engine_classify(html);
55        Ok(Self {
56            engine: EngineHandle::new(),
57            dom,
58            url: "about:blank".to_string(),
59            title,
60            html: html.to_string(),
61            challenge_class,
62            profile: None,
63        })
64    }
65
66    /// Create a page with profile and URL (no network).
67    pub async fn with_profile(
68        html: &str,
69        url: &str,
70        _profile: StealthProfile,
71    ) -> Result<Self, PageError> {
72        let dom = crate::html_parser::parse_html(html);
73        let title = extract_title(html);
74        let challenge_class = engine_classify(html);
75        Ok(Self {
76            engine: EngineHandle::new(),
77            dom,
78            url: url.to_string(),
79            title,
80            html: html.to_string(),
81            challenge_class,
82            profile: None,
83        })
84    }
85
86    /// Reload the page with new HTML (reuses V8 isolate in v8 mode).
87    pub fn reload_html(&mut self, html: &str, url: &str) {
88        self.dom = crate::html_parser::parse_html(html);
89        self.url = url.to_string();
90        self.html = html.to_string();
91        self.title = extract_title(html);
92        self.challenge_class = engine_classify(html);
93    }
94
95    /// Navigate to a URL with challenge-aware retry loop.
96    ///
97    /// Fetch → classify → if challenge detected, retry up to `max_iterations`.
98    /// Uses 15s budget by default.
99    pub async fn navigate(&mut self, url: &str) -> Result<(), PageError> {
100        // ponytail: always Chrome profile; per-profile routing via tls_impersonate
101        let client = HttpClient::shared(hpx::BrowserProfile::Chrome).map_err(PageError::Net)?;
102        self.navigate_inner(url, &client, DEFAULT_MAX_ITERATIONS, DEFAULT_NAV_BUDGET)
103            .await
104    }
105
106    /// Navigate with a custom solver list.
107    ///
108    /// Same as `navigate()` but accepts external challenge solvers.
109    pub async fn navigate_with_solvers(
110        &mut self,
111        url: &str,
112        solvers: &[&dyn crate::challenge::ChallengeSolver],
113    ) -> Result<(), PageError> {
114        let client = HttpClient::shared(hpx::BrowserProfile::Chrome).map_err(PageError::Net)?;
115        self.navigate_with_solvers_inner(
116            url,
117            &client,
118            solvers,
119            DEFAULT_MAX_ITERATIONS,
120            DEFAULT_NAV_BUDGET,
121        )
122        .await
123    }
124
125    /// Warm navigation — reuse existing page state, fetch new URL.
126    ///
127    /// Faster than cold `navigate()` because it skips profile setup.
128    pub async fn navigate_warm(&mut self, url: &str) -> Result<(), PageError> {
129        let client = HttpClient::shared(hpx::BrowserProfile::Chrome).map_err(PageError::Net)?;
130        let resp = client.get_follow(url, 10).await.map_err(PageError::Net)?;
131        let html = resp.text();
132        let resp_url = resp.url.clone();
133
134        self.reload_html(&html, &resp_url);
135        Ok(())
136    }
137
138    /// Core navigate loop with budget and cookie-diff retry.
139    async fn navigate_inner(
140        &mut self,
141        url: &str,
142        client: &HttpClient,
143        max_iterations: u8,
144        budget: Duration,
145    ) -> Result<(), PageError> {
146        self.navigate_with_solvers_inner(url, client, &[], max_iterations, budget)
147            .await
148    }
149
150    /// Core navigate loop with solver support.
151    async fn navigate_with_solvers_inner(
152        &mut self,
153        url: &str,
154        client: &HttpClient,
155        solvers: &[&dyn crate::challenge::ChallengeSolver],
156        max_iterations: u8,
157        budget: Duration,
158    ) -> Result<(), PageError> {
159        let t0 = Instant::now();
160        let iterations = max_iterations.max(1);
161
162        let resp = client.get_follow(url, 10).await.map_err(PageError::Net)?;
163        let mut current_html = resp.text();
164        let mut current_url = resp.url.clone();
165        let mut cookies_before = cookie_snapshot(client, &current_url).await;
166
167        for iter in 0..iterations {
168            if t0.elapsed() >= budget {
169                tracing::warn!(
170                    iter,
171                    elapsed_ms = t0.elapsed().as_millis(),
172                    "navigate budget exhausted"
173                );
174                break;
175            }
176
177            self.reload_html(&current_html, &current_url);
178
179            let challenge = engine_classify(&current_html);
180
181            // Clean page — no challenge markers, return immediately.
182            if !challenge.verdict.is_challenge() {
183                return Ok(());
184            }
185
186            // Try registered solvers.
187            let kind = tag_to_kind(challenge.tag);
188            let mut any_solved = false;
189            for solver in solvers {
190                if !solver.can_handle(&kind) {
191                    continue;
192                }
193                if matches!(
194                    solver.solve(&kind, self).await,
195                    crate::challenge::SolveOutcome::Solved
196                ) {
197                    any_solved = true;
198                }
199            }
200
201            if any_solved {
202                // Re-fetch after solver ran.
203                let resp = client
204                    .get_follow(&current_url, 10)
205                    .await
206                    .map_err(PageError::Net)?;
207                current_html = resp.text();
208                current_url = resp.url.clone();
209                cookies_before = cookie_snapshot(client, &current_url).await;
210                continue;
211            }
212
213            // Cookie-diff retry: if cookies changed during this iteration,
214            // the challenge script may have self-solved.
215            if iter + 1 < iterations {
216                let cookies_after = cookie_snapshot(client, &current_url).await;
217                if cookies_after != cookies_before && !cookies_after.is_empty() {
218                    tracing::info!(iter, "cookie delta detected — retrying navigation");
219                    let resp = client
220                        .get_follow(&current_url, 10)
221                        .await
222                        .map_err(PageError::Net)?;
223                    current_html = resp.text();
224                    current_url = resp.url.clone();
225                    cookies_before = cookie_snapshot(client, &current_url).await;
226                    continue;
227                }
228            }
229
230            // Challenge still present, no solver helped, no cookie change.
231            break;
232        }
233
234        Ok(())
235    }
236
237    pub async fn evaluate_async(&mut self, _script: &str) -> Result<serde_json::Value, PageError> {
238        Err(PageError::Evaluation(
239            "evaluate_async requires v8 feature".into(),
240        ))
241    }
242
243    /// Synchronous evaluate — DOM-level only without v8.
244    pub fn evaluate(&mut self, _script: &str) -> Result<String, PageError> {
245        Ok("undefined".to_string())
246    }
247
248    pub async fn title_async(&self) -> Result<String, PageError> {
249        Ok(self.title.clone())
250    }
251
252    /// Synchronous title.
253    pub fn title(&self) -> String {
254        self.title.clone()
255    }
256
257    /// Current URL.
258    pub fn url(&self) -> &str {
259        &self.url
260    }
261
262    /// Page HTML content.
263    pub fn content(&self) -> String {
264        self.html.clone()
265    }
266
267    pub async fn text_content(&self) -> Result<String, PageError> {
268        Ok(self.dom.text_content(crate::dom::NodeId::DOCUMENT))
269    }
270
271    pub async fn text_of(&self, _selector: &str) -> Result<String, PageError> {
272        Ok(String::new())
273    }
274
275    /// Synchronous element check.
276    pub fn has_element(&self, _selector: &str) -> bool {
277        false
278    }
279
280    /// Challenge classification result.
281    pub fn challenge_verdict(&self) -> ChallengeVerdict {
282        self.challenge_class.verdict
283    }
284
285    /// Full challenge classification.
286    pub fn engine_class(&self) -> &EngineClass {
287        &self.challenge_class
288    }
289
290    pub fn dom(&self) -> &Dom {
291        &self.dom
292    }
293}
294
295/// Map an `engine_classify` tag to a `ChallengeKind` for solver dispatch.
296fn tag_to_kind(tag: &'static str) -> crate::challenge::ChallengeKind {
297    let (vendor, sub_kind): (&'static str, &'static str) = if tag.starts_with("cf-") {
298        ("cloudflare", tag)
299    } else if tag.starts_with("AWS-WAF") {
300        ("aws-waf", tag)
301    } else if tag.eq_ignore_ascii_case("datadome") {
302        ("datadome", tag)
303    } else if tag.starts_with("akamai") {
304        ("akamai", tag)
305    } else if tag.starts_with("px-") || tag.starts_with("PXC") {
306        ("perimeterx", tag)
307    } else if tag.starts_with("kasada") {
308        ("kasada", tag)
309    } else if tag.starts_with("sec-cpt") {
310        ("sec-cpt", tag)
311    } else if tag.starts_with("hcaptcha") {
312        ("hcaptcha", tag)
313    } else {
314        ("unknown", tag)
315    };
316    crate::challenge::ChallengeKind::new(vendor, sub_kind)
317}
318
319/// Snapshot cookie jar for a URL (empty string if none).
320async fn cookie_snapshot(client: &HttpClient, url: &str) -> String {
321    if let Ok(parsed) = url::Url::parse(url) {
322        client.cookies_for_url(&parsed).await.unwrap_or_default()
323    } else {
324        String::new()
325    }
326}
327
/// Extract the text of the first `<title>` element from HTML (cheap string
/// scan, no full parse).
///
/// Tag matching is ASCII case-insensitive and attributes on the opening tag
/// are skipped. The returned text is trimmed; an empty string is returned
/// when there is no opening tag, no `>` after it, or no closing `</title>`.
fn extract_title(html: &str) -> String {
    // ASCII-only lowercasing never changes byte lengths, so offsets found in
    // `lower` are valid in `html`. Unicode `to_lowercase` can shrink or grow
    // characters (e.g. U+212A, U+0130), which would shift every later offset
    // and could split a character when slicing.
    let lower = html.to_ascii_lowercase();

    let title = lower.find("<title").and_then(|open| {
        // Skip past the end of the opening tag, attributes included.
        let body_start = open + lower[open..].find('>')? + 1;
        let body_len = lower[body_start..].find("</title>")?;
        Some(html[body_start..body_start + body_len].trim())
    });

    title.map(str::to_string).unwrap_or_default()
}
342
/// Errors produced by `Page` operations.
#[derive(Debug, thiserror::Error)]
pub enum PageError {
    /// Navigation failed; the payload is the message shown after
    /// "navigation failed: ".
    #[error("navigation failed: {0}")]
    Navigation(String),
    /// Script evaluation failed or is unavailable; the payload is the message
    /// shown after "evaluation failed: ".
    #[error("evaluation failed: {0}")]
    Evaluation(String),
    /// A requested element was not found.
    #[error("element not found")]
    ElementNotFound,
    /// The page has not been loaded.
    #[error("page not loaded")]
    NotLoaded,
    /// Network-layer failure; converts automatically from
    /// `crate::net::NetError` via the generated `From` impl.
    #[error("network error: {0}")]
    Net(#[from] crate::net::NetError),
}
356
/// Unit tests for `Page` construction/classification and for `extract_title`.
///
/// None of these tests touch the network: pages are built from literal HTML.
#[cfg(test)]
mod tests {
    use super::*;

    // ── BDD Scenario 1: Navigate to clean page ──────────────────────────

    /// A large, ordinary document yields its title, its content and a `Pass`
    /// verdict.
    #[tokio::test]
    async fn bdd_navigate_to_clean_page() {
        let mut body = String::from("Hello World. ");
        // Push past THIN_BODY_MAX_BYTES (1000) and THIN_SHELL_MAX_BYTES (15KB)
        for _ in 0..500 {
            body.push_str("This is real rendered content for the test page. ");
        }
        let html = format!(
            r#"<!DOCTYPE html>
<html>
<head><title>Test Page</title></head>
<body>{body}</body>
</html>"#
        );
        let page = Page::from_html(&html, None).await.unwrap();

        assert_eq!(page.title(), "Test Page");
        assert!(page.content().contains("Hello World"));
        assert_eq!(page.challenge_verdict(), ChallengeVerdict::Pass);
    }

    // ── BDD Scenario 2: Navigate with challenge detection ────────────────

    /// A small Cloudflare "Just a moment..." page is classified as
    /// `EdgeBlock`, which counts as a challenge.
    #[tokio::test]
    async fn bdd_navigate_with_challenge_detection() {
        // Simulate a Cloudflare challenge response
        let html = r#"<!DOCTYPE html>
<html>
<head><title>Just a moment...</title></head>
<body>
<script>window._cf_chl_opt={cvId:'3',cType:'managed'};</script>
Checking your browser before accessing the site...
</body>
</html>"#;
        let page = Page::from_html(html, None).await.unwrap();

        assert_eq!(page.challenge_verdict(), ChallengeVerdict::EdgeBlock);
        assert!(page.challenge_verdict().is_challenge());
    }

    // ── BDD Scenario 3: ChallengeIncomplete for large managed shell ──────

    /// The same Cloudflare markers inside a document of at least 50,000 bytes
    /// are classified as `ChallengeIncomplete`, which counts as a challenge.
    #[tokio::test]
    async fn bdd_challenge_incomplete_verdict() {
        let mut html = String::from(
            r#"<html><head><title>Just a moment...</title></head><body>
            <script>window._cf_chl_opt={cvId:'3',cType:'managed'};</script>"#,
        );
        for _ in 0..2000 {
            html.push_str("<div>cf challenge orchestrator shell padding</div>");
        }
        html.push_str("</body></html>");
        assert!(html.len() >= 50_000);

        let page = Page::from_html(&html, None).await.unwrap();
        assert_eq!(
            page.challenge_verdict(),
            ChallengeVerdict::ChallengeIncomplete
        );
        assert!(page.challenge_verdict().is_challenge());
    }

    // ── BDD: Clean page with substantial content passes ──────────────────

    /// A marker-free document of at least 15,000 bytes passes and is not a
    /// challenge.
    #[tokio::test]
    async fn bdd_clean_page_passes() {
        let mut html = String::from("<html><body>");
        for _ in 0..400 {
            html.push_str("<p>Normal rendered content paragraph with enough text.</p>");
        }
        html.push_str("</body></html>");
        assert!(html.len() >= 15_000);

        let page = Page::from_html(&html, None).await.unwrap();
        assert_eq!(page.challenge_verdict(), ChallengeVerdict::Pass);
        assert!(!page.challenge_verdict().is_challenge());
    }

    // ── BDD: Warm reuse reloads HTML ─────────────────────────────────────

    /// `reload_html` replaces the title, content and URL of an existing page.
    #[tokio::test]
    async fn bdd_warm_reuse_reloads_html() {
        let html1 =
            r#"<!DOCTYPE html><html><head><title>First</title></head><body>Page One</body></html>"#;
        let html2 = r#"<!DOCTYPE html><html><head><title>Second</title></head><body>Page Two</body></html>"#;

        let mut page = Page::from_html(html1, None).await.unwrap();
        assert_eq!(page.title(), "First");
        assert!(page.content().contains("Page One"));

        // Warm reuse: reload with new HTML
        page.reload_html(html2, "https://example.com/second");
        assert_eq!(page.title(), "Second");
        assert!(page.content().contains("Page Two"));
        assert_eq!(page.url(), "https://example.com/second");
    }

    // ── BDD: Thin body is RenderIncomplete ───────────────────────────────

    /// A tiny document is `RenderIncomplete`, which does not count as a
    /// challenge.
    #[tokio::test]
    async fn bdd_thin_body_render_incomplete() {
        let html = "<html><body>tiny</body></html>";
        let page = Page::from_html(html, None).await.unwrap();
        assert_eq!(page.challenge_verdict(), ChallengeVerdict::RenderIncomplete);
        assert!(!page.challenge_verdict().is_challenge());
    }

    // ── BDD: DataDome interstitial detected ──────────────────────────────

    /// DataDome captcha markup is classified as a challenge.
    #[tokio::test]
    async fn bdd_datadome_interstitial() {
        let html = r#"<script src="https://geo.captcha-delivery.com/captcha.js"></script>
<div id="ddcaptchaencoded">encoded_payload</div>"#;
        let page = Page::from_html(html, None).await.unwrap();
        assert!(page.challenge_verdict().is_challenge());
    }

    // ── BDD: AWS-WAF challenge detected ──────────────────────────────────

    /// AWS WAF challenge markup is classified as a challenge.
    #[tokio::test]
    async fn bdd_awswaf_challenge() {
        let html = r#"<html><body>
<script>window.gokuProps={key:'a',context:'b',iv:'c'};</script>
<script>window.awsWafCookieDomainList=["example.com"];</script>
<script src="https://x.token.awswaf.com/challenge.js"></script>
<script>AwsWafIntegration.checkForceRefresh();</script>
</body></html>"#;
        let page = Page::from_html(html, None).await.unwrap();
        assert!(page.challenge_verdict().is_challenge());
    }

    // ── extract_title tests ──────────────────────────────────────────────

    /// A lowercase `<title>` element is extracted.
    #[test]
    fn extract_title_basic() {
        assert_eq!(
            extract_title("<html><head><title>Hello</title></head></html>"),
            "Hello"
        );
    }

    /// A document without a `<title>` yields an empty string.
    #[test]
    fn extract_title_empty() {
        assert_eq!(extract_title("<html><body></body></html>"), "");
    }

    /// Uppercase tags are matched and the title text keeps its original case.
    #[test]
    fn extract_title_case_insensitive() {
        assert_eq!(
            extract_title("<HTML><HEAD><TITLE>Test</TITLE></HEAD></HTML>"),
            "Test"
        );
    }
}