Skip to main content

hpx_browser/
page.rs

1//! Browser page abstraction with challenge-aware navigation.
2
3use std::time::{Duration, Instant};
4
5#[cfg(feature = "v8")]
6use crate::js_runtime::runtime::BrowserJsRuntime;
7use crate::{
8    challenge::{ChallengeVerdict, EngineClass, engine_classify},
9    dom::Dom,
10    host::EngineHandle,
11    net::{HttpClient, RedirectPolicy},
12    stealth::StealthProfile,
13};
14
/// Default navigation budget.
///
/// Wall-clock limit (15 seconds) that `Page::navigate` and
/// `Page::navigate_with_solvers` pass as the `budget` of the navigate loop.
const DEFAULT_NAV_BUDGET: Duration = Duration::from_secs(15);
/// Default max iterations for challenge retry loops.
///
/// Passed as `max_iterations` by `Page::navigate` and
/// `Page::navigate_with_solvers`.
const DEFAULT_MAX_ITERATIONS: u8 = 3;
19
/// A browser page/tab.
///
/// Keeps the parsed DOM next to the raw HTML it was built from, plus the
/// URL, title and challenge classification of the current document.
pub struct Page {
    /// Engine handle given to `new`, or freshly created by the HTML constructors.
    // NOTE(review): never read in this file — confirm it is used elsewhere.
    engine: EngineHandle,
    /// DOM produced by `html_parser::parse_html` for the current HTML.
    dom: Dom,
    /// URL of the current document; `"about:blank"` until a URL is supplied.
    url: String,
    /// Text of the `<title>` element as found by `extract_title` (may be empty).
    title: String,
    /// Raw HTML of the current document.
    html: String,
    /// Result of `engine_classify` for the current HTML.
    challenge_class: EngineClass,
    /// Stealth profile attached to the page, if any.
    profile: Option<StealthProfile>,
    /// Flag returned by `stealth()`.
    stealth: bool,
}
31
32impl std::fmt::Debug for Page {
33    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
34        f.debug_struct("Page")
35            .field("url", &self.url)
36            .field("title", &self.title)
37            .field("stealth", &self.stealth)
38            .field("challenge_class", &self.challenge_class)
39            .field("profile", &self.profile.is_some())
40            .finish()
41    }
42}
43
44impl Page {
45    pub fn new(engine: EngineHandle) -> Self {
46        Self {
47            engine,
48            dom: Dom::new(),
49            url: "about:blank".to_string(),
50            title: String::new(),
51            html: String::new(),
52            challenge_class: EngineClass {
53                tag: "L3-RENDERED",
54                verdict: ChallengeVerdict::Pass,
55                len: 0,
56            },
57            profile: None,
58            stealth: false,
59        }
60    }
61
62    /// Create a page from raw HTML (no network).
63    pub async fn from_html(html: &str, stealth: bool) -> Result<Self, PageError> {
64        let dom = crate::html_parser::parse_html(html);
65        let title = extract_title(html);
66        let challenge_class = engine_classify(html);
67        Ok(Self {
68            engine: EngineHandle::new(),
69            dom,
70            url: "about:blank".to_string(),
71            title,
72            html: html.to_string(),
73            challenge_class,
74            profile: None,
75            stealth,
76        })
77    }
78
79    /// Create a page with profile and URL (no network).
80    pub async fn with_profile(
81        html: &str,
82        url: &str,
83        _profile: StealthProfile,
84    ) -> Result<Self, PageError> {
85        let dom = crate::html_parser::parse_html(html);
86        let title = extract_title(html);
87        let challenge_class = engine_classify(html);
88        Ok(Self {
89            engine: EngineHandle::new(),
90            dom,
91            url: url.to_string(),
92            title,
93            html: html.to_string(),
94            challenge_class,
95            profile: None,
96            stealth: true,
97        })
98    }
99
100    /// Reload the page with new HTML (reuses V8 isolate in v8 mode).
101    pub fn reload_html(&mut self, html: &str, url: &str) {
102        self.dom = crate::html_parser::parse_html(html);
103        self.url = url.to_string();
104        self.html = html.to_string();
105        self.title = extract_title(html);
106        self.challenge_class = engine_classify(html);
107    }
108
109    /// Navigate to a URL with challenge-aware retry loop.
110    ///
111    /// Fetch → classify → if challenge detected, retry up to `max_iterations`.
112    /// Uses 15s budget by default.
113    pub async fn navigate(&mut self, url: &str) -> Result<(), PageError> {
114        // ponytail: always Chrome profile; per-profile routing via tls_impersonate
115        let client = HttpClient::new(hpx::BrowserProfile::Chrome).map_err(PageError::Net)?;
116        self.navigate_inner(url, &client, DEFAULT_MAX_ITERATIONS, DEFAULT_NAV_BUDGET)
117            .await
118    }
119
120    /// Navigate with a custom solver list.
121    ///
122    /// Same as `navigate()` but accepts external challenge solvers.
123    pub async fn navigate_with_solvers(
124        &mut self,
125        url: &str,
126        solvers: &[&dyn crate::challenge::ChallengeSolver],
127    ) -> Result<(), PageError> {
128        let client = HttpClient::new(hpx::BrowserProfile::Chrome).map_err(PageError::Net)?;
129        self.navigate_with_solvers_inner(
130            url,
131            &client,
132            solvers,
133            DEFAULT_MAX_ITERATIONS,
134            DEFAULT_NAV_BUDGET,
135        )
136        .await
137    }
138
139    /// Warm navigation — reuse existing page state, fetch new URL.
140    ///
141    /// Faster than cold `navigate()` because it skips profile setup.
142    pub async fn navigate_warm(&mut self, url: &str) -> Result<(), PageError> {
143        let client = HttpClient::new(hpx::BrowserProfile::Chrome).map_err(PageError::Net)?;
144        let resp = client
145            .request("GET", url, None, &[], RedirectPolicy::Follow(10))
146            .await
147            .map_err(PageError::Net)?;
148        let html = resp.text();
149        let resp_url = resp.url.clone();
150
151        self.reload_html(&html, &resp_url);
152        Ok(())
153    }
154
155    /// Core navigate loop with budget and cookie-diff retry.
156    async fn navigate_inner(
157        &mut self,
158        url: &str,
159        client: &HttpClient,
160        max_iterations: u8,
161        budget: Duration,
162    ) -> Result<(), PageError> {
163        self.navigate_with_solvers_inner(url, client, &[], max_iterations, budget)
164            .await
165    }
166
167    /// Core navigate loop with solver support.
168    async fn navigate_with_solvers_inner(
169        &mut self,
170        url: &str,
171        client: &HttpClient,
172        solvers: &[&dyn crate::challenge::ChallengeSolver],
173        max_iterations: u8,
174        budget: Duration,
175    ) -> Result<(), PageError> {
176        let t0 = Instant::now();
177        let iterations = max_iterations.max(1);
178
179        let resp = client
180            .request("GET", url, None, &[], RedirectPolicy::Follow(10))
181            .await
182            .map_err(PageError::Net)?;
183        let mut current_html = resp.text();
184        let mut current_url = resp.url.clone();
185        let mut cookies_before = cookie_snapshot(client, &current_url).await;
186
187        for iter in 0..iterations {
188            if t0.elapsed() >= budget {
189                tracing::warn!(
190                    iter,
191                    elapsed_ms = t0.elapsed().as_millis(),
192                    "navigate budget exhausted"
193                );
194                break;
195            }
196
197            self.reload_html(&current_html, &current_url);
198
199            let challenge = engine_classify(&current_html);
200
201            // Clean page — no challenge markers, return immediately.
202            if !challenge.verdict.is_challenge() {
203                return Ok(());
204            }
205
206            // Try registered solvers.
207            let kind = tag_to_kind(challenge.tag);
208            let mut any_solved = false;
209            for solver in solvers {
210                if !solver.can_handle(&kind) {
211                    continue;
212                }
213                if matches!(
214                    solver.solve(&kind, self).await,
215                    crate::challenge::SolveOutcome::Solved
216                ) {
217                    any_solved = true;
218                }
219            }
220
221            if any_solved {
222                // Re-fetch after solver ran.
223                let resp = client
224                    .request("GET", &current_url, None, &[], RedirectPolicy::Follow(10))
225                    .await
226                    .map_err(PageError::Net)?;
227                current_html = resp.text();
228                current_url = resp.url.clone();
229                cookies_before = cookie_snapshot(client, &current_url).await;
230                continue;
231            }
232
233            // Cookie-diff retry: if cookies changed during this iteration,
234            // the challenge script may have self-solved.
235            if iter + 1 < iterations {
236                let cookies_after = cookie_snapshot(client, &current_url).await;
237                if cookies_after != cookies_before && !cookies_after.is_empty() {
238                    tracing::info!(iter, "cookie delta detected — retrying navigation");
239                    let resp = client
240                        .request("GET", &current_url, None, &[], RedirectPolicy::Follow(10))
241                        .await
242                        .map_err(PageError::Net)?;
243                    current_html = resp.text();
244                    current_url = resp.url.clone();
245                    cookies_before = cookie_snapshot(client, &current_url).await;
246                    continue;
247                }
248            }
249
250            // Challenge still present, no solver helped, no cookie change.
251            break;
252        }
253
254        Ok(())
255    }
256
257    pub async fn evaluate_async(&mut self, _script: &str) -> Result<serde_json::Value, PageError> {
258        Err(PageError::Evaluation(
259            "evaluate_async requires v8 feature".into(),
260        ))
261    }
262
263    /// Synchronous evaluate — DOM-level only without v8.
264    pub fn evaluate(&mut self, _script: &str) -> Result<String, PageError> {
265        Ok("undefined".to_string())
266    }
267
268    pub async fn title_async(&self) -> Result<String, PageError> {
269        Ok(self.title.clone())
270    }
271
272    /// Synchronous title.
273    pub fn title(&self) -> String {
274        self.title.clone()
275    }
276
277    /// Current URL.
278    pub fn url(&self) -> &str {
279        &self.url
280    }
281
282    /// Whether stealth globals are enabled for this page.
283    pub fn stealth(&self) -> bool {
284        self.stealth
285    }
286
    /// Apply a stealth profile and remember it on the page.
    ///
    /// Builds a `BrowserJsRuntime` over a fresh, empty `Dom`, sets the user
    /// agent and the platform / OS name / OS version from `profile`, enables
    /// stealth on that runtime, runs its page init, and finally stores the
    /// profile in `self.profile`.
    ///
    /// Only compiled with the `v8` feature.
    // NOTE(review): `rt` is a local that is dropped when this function returns
    // and it is built on a new empty `Dom`, not `self.dom` — unless
    // `BrowserJsRuntime` writes to shared state, none of this configuration
    // reaches the page. Confirm against `BrowserJsRuntime`.
    // NOTE(review): only user agent, platform and OS fields are applied here;
    // no screen or GPU fields are read from the profile in this function.
    #[cfg(feature = "v8")]
    pub fn set_profile(&mut self, profile: StealthProfile) {
        let mut rt = BrowserJsRuntime::new(crate::dom::Dom::new());
        rt.set_user_agent(&profile.user_agent);
        rt.set_platform(&profile.platform, &profile.os_name, &profile.os_version);
        rt.set_stealth(true);
        rt.run_page_init();
        self.profile = Some(profile);
    }
302
303    /// Page HTML content.
304    pub fn content(&self) -> String {
305        self.html.clone()
306    }
307
308    pub async fn text_content(&self) -> Result<String, PageError> {
309        Ok(self.dom.text_content(crate::dom::NodeId::DOCUMENT))
310    }
311
312    pub async fn text_of(&self, _selector: &str) -> Result<String, PageError> {
313        Ok(String::new())
314    }
315
316    /// Synchronous element check.
317    pub fn has_element(&self, _selector: &str) -> bool {
318        false
319    }
320
321    /// Challenge classification result.
322    pub fn challenge_verdict(&self) -> ChallengeVerdict {
323        self.challenge_class.verdict
324    }
325
326    /// Full challenge classification.
327    pub fn engine_class(&self) -> &EngineClass {
328        &self.challenge_class
329    }
330
331    pub fn dom(&self) -> &Dom {
332        &self.dom
333    }
334}
335
336/// Map an `engine_classify` tag to a `ChallengeKind` for solver dispatch.
337fn tag_to_kind(tag: &'static str) -> crate::challenge::ChallengeKind {
338    let (vendor, sub_kind): (&'static str, &'static str) = if tag.starts_with("cf-") {
339        ("cloudflare", tag)
340    } else if tag.starts_with("AWS-WAF") {
341        ("aws-waf", tag)
342    } else if tag.eq_ignore_ascii_case("datadome") {
343        ("datadome", tag)
344    } else if tag.starts_with("akamai") {
345        ("akamai", tag)
346    } else if tag.starts_with("px-") || tag.starts_with("PXC") {
347        ("perimeterx", tag)
348    } else if tag.starts_with("kasada") {
349        ("kasada", tag)
350    } else if tag.starts_with("sec-cpt") {
351        ("sec-cpt", tag)
352    } else if tag.starts_with("hcaptcha") {
353        ("hcaptcha", tag)
354    } else {
355        ("unknown", tag)
356    };
357    crate::challenge::ChallengeKind::new(vendor, sub_kind)
358}
359
360/// Snapshot cookie jar for a URL (empty string if none).
361async fn cookie_snapshot(client: &HttpClient, url: &str) -> String {
362    if let Ok(parsed) = url::Url::parse(url) {
363        client.cookies_for_url(&parsed).await.unwrap_or_default()
364    } else {
365        String::new()
366    }
367}
368
/// Extract the text of the first `<title>` element from HTML (cheap string
/// scan, no full parse).
///
/// Tag matching ignores ASCII case and tolerates attributes on the opening
/// tag. The returned text keeps its original casing and is trimmed of
/// surrounding whitespace. Returns an empty string when there is no complete
/// `<title ...>...</title>` pair.
fn extract_title(html: &str) -> String {
    /// Locate the raw (untrimmed) title text inside `html`.
    fn locate(html: &str) -> Option<&str> {
        // ASCII-only lowercasing keeps every byte offset identical to `html`.
        // Full Unicode `to_lowercase` can change byte lengths (for example
        // U+212A or U+0130), which made offsets found in the lowered copy
        // point at the wrong place, or inside a character, in the original.
        let lower = html.to_ascii_lowercase();
        let open = lower.find("<title")?;
        let body_start = open + lower[open..].find('>')? + 1;
        let body_len = lower[body_start..].find("</title>")?;
        Some(&html[body_start..body_start + body_len])
    }

    locate(html)
        .map(|raw| raw.trim().to_string())
        .unwrap_or_default()
}
383
/// Errors returned by [`Page`] operations.
#[derive(Debug, thiserror::Error)]
pub enum PageError {
    /// Navigation failed; the payload is a human-readable reason.
    // NOTE(review): not constructed anywhere in this file.
    #[error("navigation failed: {0}")]
    Navigation(String),
    /// Script evaluation failed; the payload is a human-readable reason.
    #[error("evaluation failed: {0}")]
    Evaluation(String),
    /// A requested element was not found.
    // NOTE(review): not constructed anywhere in this file.
    #[error("element not found")]
    ElementNotFound,
    /// The page has no loaded document.
    // NOTE(review): not constructed anywhere in this file.
    #[error("page not loaded")]
    NotLoaded,
    /// An error from the network layer; converts from `NetError` via `From`.
    #[error("network error: {0}")]
    Net(#[from] crate::net::NetError),
}
397
// Unit tests for `Page` and `extract_title`.
//
// None of these tests touch the network: pages are built with
// `Page::from_html` / `Page::reload_html` from inline HTML fixtures, and the
// assertions check the title, content and challenge verdict derived from them.
#[cfg(test)]
mod tests {
    use super::*;

    // ── BDD Scenario 1: Navigate to clean page ──────────────────────────

    // A large, ordinary document yields its title, its content and a `Pass` verdict.
    #[tokio::test]
    async fn bdd_navigate_to_clean_page() {
        let mut body = String::from("Hello World. ");
        // Push past THIN_BODY_MAX_BYTES (1000) and THIN_SHELL_MAX_BYTES (15KB)
        for _ in 0..500 {
            body.push_str("This is real rendered content for the test page. ");
        }
        let html = format!(
            r#"<!DOCTYPE html>
<html>
<head><title>Test Page</title></head>
<body>{body}</body>
</html>"#
        );
        let page = Page::from_html(&html, false).await.unwrap();

        assert_eq!(page.title(), "Test Page");
        assert!(page.content().contains("Hello World"));
        assert_eq!(page.challenge_verdict(), ChallengeVerdict::Pass);
    }

    // ── BDD Scenario 2: Navigate with challenge detection ────────────────

    // A small page carrying `_cf_chl_opt` is expected to classify as `EdgeBlock`.
    #[tokio::test]
    async fn bdd_navigate_with_challenge_detection() {
        // Simulate a Cloudflare challenge response
        let html = r#"<!DOCTYPE html>
<html>
<head><title>Just a moment...</title></head>
<body>
<script>window._cf_chl_opt={cvId:'3',cType:'managed'};</script>
Checking your browser before accessing the site...
</body>
</html>"#;
        let page = Page::from_html(html, false).await.unwrap();

        assert_eq!(page.challenge_verdict(), ChallengeVerdict::EdgeBlock);
        assert!(page.challenge_verdict().is_challenge());
    }

    // ── BDD Scenario 3: ChallengeIncomplete for large managed shell ──────

    // The same challenge marker inside a document of at least 50,000 bytes is
    // expected to classify as `ChallengeIncomplete` instead.
    #[tokio::test]
    async fn bdd_challenge_incomplete_verdict() {
        let mut html = String::from(
            r#"<html><head><title>Just a moment...</title></head><body>
            <script>window._cf_chl_opt={cvId:'3',cType:'managed'};</script>"#,
        );
        for _ in 0..2000 {
            html.push_str("<div>cf challenge orchestrator shell padding</div>");
        }
        html.push_str("</body></html>");
        // Guard the fixture size the assertion below relies on.
        assert!(html.len() >= 50_000);

        let page = Page::from_html(&html, false).await.unwrap();
        assert_eq!(
            page.challenge_verdict(),
            ChallengeVerdict::ChallengeIncomplete
        );
        assert!(page.challenge_verdict().is_challenge());
    }

    // ── BDD: Clean page with substantial content passes ──────────────────

    // A marker-free document of at least 15,000 bytes passes and is not a challenge.
    #[tokio::test]
    async fn bdd_clean_page_passes() {
        let mut html = String::from("<html><body>");
        for _ in 0..400 {
            html.push_str("<p>Normal rendered content paragraph with enough text.</p>");
        }
        html.push_str("</body></html>");
        // Guard the fixture size the assertion below relies on.
        assert!(html.len() >= 15_000);

        let page = Page::from_html(&html, false).await.unwrap();
        assert_eq!(page.challenge_verdict(), ChallengeVerdict::Pass);
        assert!(!page.challenge_verdict().is_challenge());
    }

    // ── BDD: Warm reuse reloads HTML ─────────────────────────────────────

    // `reload_html` must replace title, content and URL on an existing page.
    #[tokio::test]
    async fn bdd_warm_reuse_reloads_html() {
        let html1 =
            r#"<!DOCTYPE html><html><head><title>First</title></head><body>Page One</body></html>"#;
        let html2 = r#"<!DOCTYPE html><html><head><title>Second</title></head><body>Page Two</body></html>"#;

        let mut page = Page::from_html(html1, false).await.unwrap();
        assert_eq!(page.title(), "First");
        assert!(page.content().contains("Page One"));

        // Warm reuse: reload with new HTML
        page.reload_html(html2, "https://example.com/second");
        assert_eq!(page.title(), "Second");
        assert!(page.content().contains("Page Two"));
        assert_eq!(page.url(), "https://example.com/second");
    }

    // ── BDD: Thin body is RenderIncomplete ───────────────────────────────

    // A tiny document is `RenderIncomplete`, which does not count as a challenge.
    #[tokio::test]
    async fn bdd_thin_body_render_incomplete() {
        let html = "<html><body>tiny</body></html>";
        let page = Page::from_html(html, false).await.unwrap();
        assert_eq!(page.challenge_verdict(), ChallengeVerdict::RenderIncomplete);
        assert!(!page.challenge_verdict().is_challenge());
    }

    // ── BDD: DataDome interstitial detected ──────────────────────────────

    // A captcha-delivery script plus `ddcaptchaencoded` must count as a challenge.
    #[tokio::test]
    async fn bdd_datadome_interstitial() {
        let html = r#"<script src="https://geo.captcha-delivery.com/captcha.js"></script>
<div id="ddcaptchaencoded">encoded_payload</div>"#;
        let page = Page::from_html(html, false).await.unwrap();
        assert!(page.challenge_verdict().is_challenge());
    }

    // ── BDD: AWS-WAF challenge detected ──────────────────────────────────

    // The AWS WAF markers in this fixture must count as a challenge.
    #[tokio::test]
    async fn bdd_awswaf_challenge() {
        let html = r#"<html><body>
<script>window.gokuProps={key:'a',context:'b',iv:'c'};</script>
<script>window.awsWafCookieDomainList=["example.com"];</script>
<script src="https://x.token.awswaf.com/challenge.js"></script>
<script>AwsWafIntegration.checkForceRefresh();</script>
</body></html>"#;
        let page = Page::from_html(html, false).await.unwrap();
        assert!(page.challenge_verdict().is_challenge());
    }

    // ── extract_title tests ──────────────────────────────────────────────

    // Plain lowercase `<title>` element.
    #[test]
    fn extract_title_basic() {
        assert_eq!(
            extract_title("<html><head><title>Hello</title></head></html>"),
            "Hello"
        );
    }

    // No `<title>` element at all yields an empty string.
    #[test]
    fn extract_title_empty() {
        assert_eq!(extract_title("<html><body></body></html>"), "");
    }

    // Upper-case tags are matched; the title text keeps its own casing.
    #[test]
    fn extract_title_case_insensitive() {
        assert_eq!(
            extract_title("<HTML><HEAD><TITLE>Test</TITLE></HEAD></HTML>"),
            "Test"
        );
    }
}