Skip to main content

hpx_browser/
page.rs

1//! Browser page abstraction with challenge-aware navigation.
2
3use std::time::{Duration, Instant};
4
5#[cfg(feature = "v8")]
6use crate::js_runtime::runtime::BrowserJsRuntime;
7use crate::{
8    challenge::{ChallengeVerdict, EngineClass, engine_classify},
9    dom::Dom,
10    host::EngineHandle,
11    net::HttpClient,
12    stealth::StealthProfile,
13};
14
/// Default navigation budget.
///
/// Passed as `budget` by `Page::navigate` and `Page::navigate_with_solvers`;
/// the retry loop compares elapsed time against it once per iteration.
const DEFAULT_NAV_BUDGET: Duration = Duration::from_secs(15);
/// Default max iterations for challenge retry loops.
///
/// The loop clamps the value it receives to at least 1.
const DEFAULT_MAX_ITERATIONS: u8 = 3;
19
/// A browser page/tab.
///
/// Holds the most recently loaded document (raw HTML plus parsed DOM)
/// together with the metadata derived from it: title, URL and the
/// challenge classification produced by `engine_classify`.
pub struct Page {
    /// Engine handle supplied to `new` or created by the HTML constructors.
    engine: EngineHandle,
    /// DOM produced by `html_parser::parse_html` for the current HTML.
    dom: Dom,
    /// URL of the current document; `about:blank` until one is loaded.
    url: String,
    /// Text of the document's `<title>`, or empty when none was found.
    title: String,
    /// Raw HTML of the current document.
    html: String,
    /// Classification of the current HTML as returned by `engine_classify`.
    challenge_class: EngineClass,
    /// Stealth profile attached to the page, if any.
    profile: Option<StealthProfile>,
    /// Flag reported by `stealth()`.
    stealth: bool,
}
31
32impl std::fmt::Debug for Page {
33    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
34        f.debug_struct("Page")
35            .field("url", &self.url)
36            .field("title", &self.title)
37            .field("stealth", &self.stealth)
38            .field("challenge_class", &self.challenge_class)
39            .field("profile", &self.profile.is_some())
40            .finish()
41    }
42}
43
44impl Page {
45    pub fn new(engine: EngineHandle) -> Self {
46        Self {
47            engine,
48            dom: Dom::new(),
49            url: "about:blank".to_string(),
50            title: String::new(),
51            html: String::new(),
52            challenge_class: EngineClass {
53                tag: "L3-RENDERED",
54                verdict: ChallengeVerdict::Pass,
55                len: 0,
56            },
57            profile: None,
58            stealth: false,
59        }
60    }
61
62    /// Create a page from raw HTML (no network).
63    pub async fn from_html(html: &str, stealth: bool) -> Result<Self, PageError> {
64        let dom = crate::html_parser::parse_html(html);
65        let title = extract_title(html);
66        let challenge_class = engine_classify(html);
67        Ok(Self {
68            engine: EngineHandle::new(),
69            dom,
70            url: "about:blank".to_string(),
71            title,
72            html: html.to_string(),
73            challenge_class,
74            profile: None,
75            stealth,
76        })
77    }
78
79    /// Create a page with profile and URL (no network).
80    pub async fn with_profile(
81        html: &str,
82        url: &str,
83        _profile: StealthProfile,
84    ) -> Result<Self, PageError> {
85        let dom = crate::html_parser::parse_html(html);
86        let title = extract_title(html);
87        let challenge_class = engine_classify(html);
88        Ok(Self {
89            engine: EngineHandle::new(),
90            dom,
91            url: url.to_string(),
92            title,
93            html: html.to_string(),
94            challenge_class,
95            profile: None,
96            stealth: true,
97        })
98    }
99
100    /// Reload the page with new HTML (reuses V8 isolate in v8 mode).
101    pub fn reload_html(&mut self, html: &str, url: &str) {
102        self.dom = crate::html_parser::parse_html(html);
103        self.url = url.to_string();
104        self.html = html.to_string();
105        self.title = extract_title(html);
106        self.challenge_class = engine_classify(html);
107    }
108
109    /// Navigate to a URL with challenge-aware retry loop.
110    ///
111    /// Fetch → classify → if challenge detected, retry up to `max_iterations`.
112    /// Uses 15s budget by default.
113    pub async fn navigate(&mut self, url: &str) -> Result<(), PageError> {
114        // ponytail: always Chrome profile; per-profile routing via tls_impersonate
115        let client = HttpClient::shared(hpx::BrowserProfile::Chrome).map_err(PageError::Net)?;
116        self.navigate_inner(url, &client, DEFAULT_MAX_ITERATIONS, DEFAULT_NAV_BUDGET)
117            .await
118    }
119
120    /// Navigate with a custom solver list.
121    ///
122    /// Same as `navigate()` but accepts external challenge solvers.
123    pub async fn navigate_with_solvers(
124        &mut self,
125        url: &str,
126        solvers: &[&dyn crate::challenge::ChallengeSolver],
127    ) -> Result<(), PageError> {
128        let client = HttpClient::shared(hpx::BrowserProfile::Chrome).map_err(PageError::Net)?;
129        self.navigate_with_solvers_inner(
130            url,
131            &client,
132            solvers,
133            DEFAULT_MAX_ITERATIONS,
134            DEFAULT_NAV_BUDGET,
135        )
136        .await
137    }
138
139    /// Warm navigation — reuse existing page state, fetch new URL.
140    ///
141    /// Faster than cold `navigate()` because it skips profile setup.
142    pub async fn navigate_warm(&mut self, url: &str) -> Result<(), PageError> {
143        let client = HttpClient::shared(hpx::BrowserProfile::Chrome).map_err(PageError::Net)?;
144        let resp = client.get_follow(url, 10).await.map_err(PageError::Net)?;
145        let html = resp.text();
146        let resp_url = resp.url.clone();
147
148        self.reload_html(&html, &resp_url);
149        Ok(())
150    }
151
152    /// Core navigate loop with budget and cookie-diff retry.
153    async fn navigate_inner(
154        &mut self,
155        url: &str,
156        client: &HttpClient,
157        max_iterations: u8,
158        budget: Duration,
159    ) -> Result<(), PageError> {
160        self.navigate_with_solvers_inner(url, client, &[], max_iterations, budget)
161            .await
162    }
163
164    /// Core navigate loop with solver support.
165    async fn navigate_with_solvers_inner(
166        &mut self,
167        url: &str,
168        client: &HttpClient,
169        solvers: &[&dyn crate::challenge::ChallengeSolver],
170        max_iterations: u8,
171        budget: Duration,
172    ) -> Result<(), PageError> {
173        let t0 = Instant::now();
174        let iterations = max_iterations.max(1);
175
176        let resp = client.get_follow(url, 10).await.map_err(PageError::Net)?;
177        let mut current_html = resp.text();
178        let mut current_url = resp.url.clone();
179        let mut cookies_before = cookie_snapshot(client, &current_url).await;
180
181        for iter in 0..iterations {
182            if t0.elapsed() >= budget {
183                tracing::warn!(
184                    iter,
185                    elapsed_ms = t0.elapsed().as_millis(),
186                    "navigate budget exhausted"
187                );
188                break;
189            }
190
191            self.reload_html(&current_html, &current_url);
192
193            let challenge = engine_classify(&current_html);
194
195            // Clean page — no challenge markers, return immediately.
196            if !challenge.verdict.is_challenge() {
197                return Ok(());
198            }
199
200            // Try registered solvers.
201            let kind = tag_to_kind(challenge.tag);
202            let mut any_solved = false;
203            for solver in solvers {
204                if !solver.can_handle(&kind) {
205                    continue;
206                }
207                if matches!(
208                    solver.solve(&kind, self).await,
209                    crate::challenge::SolveOutcome::Solved
210                ) {
211                    any_solved = true;
212                }
213            }
214
215            if any_solved {
216                // Re-fetch after solver ran.
217                let resp = client
218                    .get_follow(&current_url, 10)
219                    .await
220                    .map_err(PageError::Net)?;
221                current_html = resp.text();
222                current_url = resp.url.clone();
223                cookies_before = cookie_snapshot(client, &current_url).await;
224                continue;
225            }
226
227            // Cookie-diff retry: if cookies changed during this iteration,
228            // the challenge script may have self-solved.
229            if iter + 1 < iterations {
230                let cookies_after = cookie_snapshot(client, &current_url).await;
231                if cookies_after != cookies_before && !cookies_after.is_empty() {
232                    tracing::info!(iter, "cookie delta detected — retrying navigation");
233                    let resp = client
234                        .get_follow(&current_url, 10)
235                        .await
236                        .map_err(PageError::Net)?;
237                    current_html = resp.text();
238                    current_url = resp.url.clone();
239                    cookies_before = cookie_snapshot(client, &current_url).await;
240                    continue;
241                }
242            }
243
244            // Challenge still present, no solver helped, no cookie change.
245            break;
246        }
247
248        Ok(())
249    }
250
251    pub async fn evaluate_async(&mut self, _script: &str) -> Result<serde_json::Value, PageError> {
252        Err(PageError::Evaluation(
253            "evaluate_async requires v8 feature".into(),
254        ))
255    }
256
257    /// Synchronous evaluate — DOM-level only without v8.
258    pub fn evaluate(&mut self, _script: &str) -> Result<String, PageError> {
259        Ok("undefined".to_string())
260    }
261
262    pub async fn title_async(&self) -> Result<String, PageError> {
263        Ok(self.title.clone())
264    }
265
266    /// Synchronous title.
267    pub fn title(&self) -> String {
268        self.title.clone()
269    }
270
271    /// Current URL.
272    pub fn url(&self) -> &str {
273        &self.url
274    }
275
276    /// Whether stealth globals are enabled for this page.
277    pub fn stealth(&self) -> bool {
278        self.stealth
279    }
280
    /// Apply a stealth profile to a JS runtime, run page init, and store
    /// the profile on the page.
    ///
    /// The fields forwarded to the runtime here are the user agent and the
    /// platform triple (`platform`, `os_name`, `os_version`); stealth mode
    /// is then enabled and `run_page_init()` is called.
    ///
    /// NOTE(review): the runtime is created locally around a fresh empty
    /// `Dom` and goes out of scope when this function returns; it is not
    /// stored on `self`. Confirm whether the configuration is meant to
    /// persist beyond this call.
    ///
    /// Only compiled with the `v8` feature.
    #[cfg(feature = "v8")]
    pub fn set_profile(&mut self, profile: StealthProfile) {
        let mut rt = BrowserJsRuntime::new(crate::dom::Dom::new());
        rt.set_user_agent(&profile.user_agent);
        rt.set_platform(&profile.platform, &profile.os_name, &profile.os_version);
        rt.set_stealth(true);
        // Run init only after all of the setters above have been applied.
        rt.run_page_init();
        self.profile = Some(profile);
    }
296
297    /// Page HTML content.
298    pub fn content(&self) -> String {
299        self.html.clone()
300    }
301
302    pub async fn text_content(&self) -> Result<String, PageError> {
303        Ok(self.dom.text_content(crate::dom::NodeId::DOCUMENT))
304    }
305
306    pub async fn text_of(&self, _selector: &str) -> Result<String, PageError> {
307        Ok(String::new())
308    }
309
310    /// Synchronous element check.
311    pub fn has_element(&self, _selector: &str) -> bool {
312        false
313    }
314
315    /// Challenge classification result.
316    pub fn challenge_verdict(&self) -> ChallengeVerdict {
317        self.challenge_class.verdict
318    }
319
320    /// Full challenge classification.
321    pub fn engine_class(&self) -> &EngineClass {
322        &self.challenge_class
323    }
324
325    pub fn dom(&self) -> &Dom {
326        &self.dom
327    }
328}
329
330/// Map an `engine_classify` tag to a `ChallengeKind` for solver dispatch.
331fn tag_to_kind(tag: &'static str) -> crate::challenge::ChallengeKind {
332    let (vendor, sub_kind): (&'static str, &'static str) = if tag.starts_with("cf-") {
333        ("cloudflare", tag)
334    } else if tag.starts_with("AWS-WAF") {
335        ("aws-waf", tag)
336    } else if tag.eq_ignore_ascii_case("datadome") {
337        ("datadome", tag)
338    } else if tag.starts_with("akamai") {
339        ("akamai", tag)
340    } else if tag.starts_with("px-") || tag.starts_with("PXC") {
341        ("perimeterx", tag)
342    } else if tag.starts_with("kasada") {
343        ("kasada", tag)
344    } else if tag.starts_with("sec-cpt") {
345        ("sec-cpt", tag)
346    } else if tag.starts_with("hcaptcha") {
347        ("hcaptcha", tag)
348    } else {
349        ("unknown", tag)
350    };
351    crate::challenge::ChallengeKind::new(vendor, sub_kind)
352}
353
354/// Snapshot cookie jar for a URL (empty string if none).
355async fn cookie_snapshot(client: &HttpClient, url: &str) -> String {
356    if let Ok(parsed) = url::Url::parse(url) {
357        client.cookies_for_url(&parsed).await.unwrap_or_default()
358    } else {
359        String::new()
360    }
361}
362
/// Extract the text of the first `<title>` element (cheap string scan, no
/// full parse).
///
/// Tag matching is ASCII case-insensitive. The returned text keeps its
/// original casing and is trimmed of surrounding whitespace. Returns an
/// empty string when there is no `<title` opener, no `>` closing the
/// opening tag, or no `</title>` after it.
fn extract_title(html: &str) -> String {
    // ASCII-only lowercasing leaves every byte offset identical to `html`.
    // Full Unicode lowercasing can change byte lengths (e.g. U+0130 grows,
    // U+212A shrinks), which would make offsets found in the lowered copy
    // point at the wrong place — or into the middle of a character — in
    // the original string.
    let lower = html.to_ascii_lowercase();

    let title = lower.find("<title").and_then(|open| {
        // End of the opening tag; this also skips any attributes on it.
        let gt = lower[open..].find('>')?;
        let body_start = open + gt + 1;
        let body_len = lower[body_start..].find("</title>")?;
        // Slice the ORIGINAL text so the title's casing is preserved.
        Some(html[body_start..body_start + body_len].trim())
    });

    title.unwrap_or_default().to_string()
}
377
/// Errors returned by `Page` operations.
#[derive(Debug, thiserror::Error)]
pub enum PageError {
    /// Navigation failed; the payload is a human-readable reason.
    #[error("navigation failed: {0}")]
    Navigation(String),
    /// Script evaluation failed; the payload is a human-readable reason.
    #[error("evaluation failed: {0}")]
    Evaluation(String),
    /// A requested element was not found.
    #[error("element not found")]
    ElementNotFound,
    /// The page has not been loaded.
    #[error("page not loaded")]
    NotLoaded,
    /// A network-layer error; `#[from]` lets `?` convert a `NetError` directly.
    #[error("network error: {0}")]
    Net(#[from] crate::net::NetError),
}
391
// Unit tests for `Page` and `extract_title`.
//
// Every page test builds its page from an in-memory HTML string via
// `Page::from_html`; no test here performs a network request. The expected
// verdicts depend on the size and markers of each fixture, so the fixtures
// must not be shortened or reformatted.
#[cfg(test)]
mod tests {
    use super::*;

    // ── BDD Scenario 1: Navigate to clean page ──────────────────────────

    // A large, marker-free document keeps its title and content and is
    // classified as `Pass`.
    #[tokio::test]
    async fn bdd_navigate_to_clean_page() {
        let mut body = String::from("Hello World. ");
        // Push past THIN_BODY_MAX_BYTES (1000) and THIN_SHELL_MAX_BYTES (15KB)
        for _ in 0..500 {
            body.push_str("This is real rendered content for the test page. ");
        }
        let html = format!(
            r#"<!DOCTYPE html>
<html>
<head><title>Test Page</title></head>
<body>{body}</body>
</html>"#
        );
        let page = Page::from_html(&html, false).await.unwrap();

        assert_eq!(page.title(), "Test Page");
        assert!(page.content().contains("Hello World"));
        assert_eq!(page.challenge_verdict(), ChallengeVerdict::Pass);
    }

    // ── BDD Scenario 2: Navigate with challenge detection ────────────────

    // A small page carrying the `_cf_chl_opt` marker is expected to be an
    // `EdgeBlock`, which counts as a challenge.
    #[tokio::test]
    async fn bdd_navigate_with_challenge_detection() {
        // Simulate a Cloudflare challenge response
        let html = r#"<!DOCTYPE html>
<html>
<head><title>Just a moment...</title></head>
<body>
<script>window._cf_chl_opt={cvId:'3',cType:'managed'};</script>
Checking your browser before accessing the site...
</body>
</html>"#;
        let page = Page::from_html(html, false).await.unwrap();

        assert_eq!(page.challenge_verdict(), ChallengeVerdict::EdgeBlock);
        assert!(page.challenge_verdict().is_challenge());
    }

    // ── BDD Scenario 3: ChallengeIncomplete for large managed shell ──────

    // The same marker inside a document of at least 50,000 bytes is expected
    // to be `ChallengeIncomplete`, which also counts as a challenge.
    #[tokio::test]
    async fn bdd_challenge_incomplete_verdict() {
        let mut html = String::from(
            r#"<html><head><title>Just a moment...</title></head><body>
            <script>window._cf_chl_opt={cvId:'3',cType:'managed'};</script>"#,
        );
        for _ in 0..2000 {
            html.push_str("<div>cf challenge orchestrator shell padding</div>");
        }
        html.push_str("</body></html>");
        // Guard the fixture size the expected verdict relies on.
        assert!(html.len() >= 50_000);

        let page = Page::from_html(&html, false).await.unwrap();
        assert_eq!(
            page.challenge_verdict(),
            ChallengeVerdict::ChallengeIncomplete
        );
        assert!(page.challenge_verdict().is_challenge());
    }

    // ── BDD: Clean page with substantial content passes ──────────────────

    // A marker-free document of at least 15,000 bytes is `Pass` and is not
    // a challenge.
    #[tokio::test]
    async fn bdd_clean_page_passes() {
        let mut html = String::from("<html><body>");
        for _ in 0..400 {
            html.push_str("<p>Normal rendered content paragraph with enough text.</p>");
        }
        html.push_str("</body></html>");
        // Guard the fixture size the expected verdict relies on.
        assert!(html.len() >= 15_000);

        let page = Page::from_html(&html, false).await.unwrap();
        assert_eq!(page.challenge_verdict(), ChallengeVerdict::Pass);
        assert!(!page.challenge_verdict().is_challenge());
    }

    // ── BDD: Warm reuse reloads HTML ─────────────────────────────────────

    // `reload_html` replaces the title, content and URL of an existing page.
    #[tokio::test]
    async fn bdd_warm_reuse_reloads_html() {
        let html1 =
            r#"<!DOCTYPE html><html><head><title>First</title></head><body>Page One</body></html>"#;
        let html2 = r#"<!DOCTYPE html><html><head><title>Second</title></head><body>Page Two</body></html>"#;

        let mut page = Page::from_html(html1, false).await.unwrap();
        assert_eq!(page.title(), "First");
        assert!(page.content().contains("Page One"));

        // Warm reuse: reload with new HTML
        page.reload_html(html2, "https://example.com/second");
        assert_eq!(page.title(), "Second");
        assert!(page.content().contains("Page Two"));
        assert_eq!(page.url(), "https://example.com/second");
    }

    // ── BDD: Thin body is RenderIncomplete ───────────────────────────────

    // A tiny document is `RenderIncomplete`, which is NOT treated as a
    // challenge.
    #[tokio::test]
    async fn bdd_thin_body_render_incomplete() {
        let html = "<html><body>tiny</body></html>";
        let page = Page::from_html(html, false).await.unwrap();
        assert_eq!(page.challenge_verdict(), ChallengeVerdict::RenderIncomplete);
        assert!(!page.challenge_verdict().is_challenge());
    }

    // ── BDD: DataDome interstitial detected ──────────────────────────────

    // Only checks that the fixture is some kind of challenge; the exact
    // verdict is not pinned.
    #[tokio::test]
    async fn bdd_datadome_interstitial() {
        let html = r#"<script src="https://geo.captcha-delivery.com/captcha.js"></script>
<div id="ddcaptchaencoded">encoded_payload</div>"#;
        let page = Page::from_html(html, false).await.unwrap();
        assert!(page.challenge_verdict().is_challenge());
    }

    // ── BDD: AWS-WAF challenge detected ──────────────────────────────────

    // Only checks that the fixture is some kind of challenge; the exact
    // verdict is not pinned.
    #[tokio::test]
    async fn bdd_awswaf_challenge() {
        let html = r#"<html><body>
<script>window.gokuProps={key:'a',context:'b',iv:'c'};</script>
<script>window.awsWafCookieDomainList=["example.com"];</script>
<script src="https://x.token.awswaf.com/challenge.js"></script>
<script>AwsWafIntegration.checkForceRefresh();</script>
</body></html>"#;
        let page = Page::from_html(html, false).await.unwrap();
        assert!(page.challenge_verdict().is_challenge());
    }

    // ── extract_title tests ──────────────────────────────────────────────

    // Plain lowercase `<title>` element.
    #[test]
    fn extract_title_basic() {
        assert_eq!(
            extract_title("<html><head><title>Hello</title></head></html>"),
            "Hello"
        );
    }

    // No `<title>` element at all yields an empty string.
    #[test]
    fn extract_title_empty() {
        assert_eq!(extract_title("<html><body></body></html>"), "");
    }

    // Upper-case tags are matched; the title text keeps its own casing.
    #[test]
    fn extract_title_case_insensitive() {
        assert_eq!(
            extract_title("<HTML><HEAD><TITLE>Test</TITLE></HEAD></HTML>"),
            "Test"
        );
    }
}