Skip to main content

spider_browser/
page.rs

1//! SpiderPage -- deterministic browser tab abstraction.
2//!
3//! All standard browser automation methods (no LLM required).
4//! Works over both CDP (Chrome/Servo/LightPanda) and BiDi (Firefox)
5//! through the [`ProtocolAdapter`].
6
7use crate::errors::{Result, SpiderError};
8use crate::protocol::protocol_adapter::ProtocolAdapter;
9use arc_swap::ArcSwap;
10use serde_json::Value;
11use std::sync::Arc;
12use std::time::{Duration, Instant};
13use tokio::time::sleep;
14
15/// Browser tab abstraction with full automation API.
16///
17/// Wraps a [`ProtocolAdapter`] and exposes high-level navigation, content
18/// extraction, click/input/scroll primitives, wait helpers, and viewport
19/// control. The adapter can be swapped atomically via [`set_adapter`] during
20/// browser rotation without dropping inflight references.
21pub struct SpiderPage {
22    adapter: ArcSwap<ProtocolAdapter>,
23}
24
/// Describes how one field is pulled out of the page by
/// [`SpiderPage::extract_fields`].
///
/// A plain `&str` converts into the [`FieldSelector::Text`] variant, which
/// covers the common text-content case:
/// ```ignore
/// page.extract_fields(&[
///     ("title", "#productTitle".into()),
///     ("image", FieldSelector::Attr { selector: "#img", attribute: "src" }),
/// ]).await?;
/// ```
#[derive(Debug, Clone)]
pub enum FieldSelector<'a> {
    /// Take the trimmed `textContent` of the element matched by this CSS
    /// selector.
    Text(&'a str),
    /// Take the value of a named attribute from the matched element.
    Attr {
        /// CSS selector locating the element.
        selector: &'a str,
        /// Name of the attribute to read.
        attribute: &'a str,
    },
}
45
46impl<'a> From<&'a str> for FieldSelector<'a> {
47    fn from(s: &'a str) -> Self {
48        Self::Text(s)
49    }
50}
51
52impl SpiderPage {
53    // -----------------------------------------------------------------
54    // Construction
55    // -----------------------------------------------------------------
56
57    /// Create a new `SpiderPage` wrapping the given protocol adapter.
58    pub fn new(adapter: ProtocolAdapter) -> Self {
59        Self {
60            adapter: ArcSwap::from_pointee(adapter),
61        }
62    }
63
64    /// Create a new `SpiderPage` from an already-`Arc`-wrapped adapter.
65    pub fn from_arc(adapter: Arc<ProtocolAdapter>) -> Self {
66        Self {
67            adapter: ArcSwap::from(adapter),
68        }
69    }
70
71    /// Load the current adapter snapshot. All methods go through this.
72    #[inline]
73    pub(crate) fn adapter(&self) -> arc_swap::Guard<Arc<ProtocolAdapter>> {
74        self.adapter.load()
75    }
76
77    // =================================================================
78    // Navigation
79    // =================================================================
80
81    /// Navigate to a URL and wait for load.
82    pub async fn goto(&self, url: &str) -> Result<()> {
83        self.adapter().navigate(url).await
84    }
85
86    /// Navigate without waiting for full page load (5 s max wait).
87    /// Use with [`content_with_early_return`] for SPAs that never fire
88    /// `loadEventFired`.
89    pub async fn goto_fast(&self, url: &str) -> Result<()> {
90        self.adapter().navigate_fast(url).await
91    }
92
93    /// Navigate and return as soon as `DOMContentLoaded` fires (3 s max).
94    /// Fastest option -- the DOM shell is ready but subresources may still
95    /// load. Pair with [`content_with_early_return`] or
96    /// [`content_with_network_idle`] for best results.
97    pub async fn goto_dom(&self, url: &str) -> Result<()> {
98        self.adapter().navigate_dom(url).await
99    }
100
101    /// Go back in browser history.
102    pub async fn go_back(&self) -> Result<()> {
103        self.adapter().evaluate("window.history.back()").await?;
104        Ok(())
105    }
106
107    /// Go forward in browser history.
108    pub async fn go_forward(&self) -> Result<()> {
109        self.adapter().evaluate("window.history.forward()").await?;
110        Ok(())
111    }
112
113    /// Reload the page.
114    pub async fn reload(&self) -> Result<()> {
115        self.adapter().evaluate("window.location.reload()").await?;
116        Ok(())
117    }
118
119    // =================================================================
120    // Content
121    // =================================================================
122
123    /// Get the full page HTML, ensuring the page is ready first.
124    ///
125    /// Waits for network idle + DOM stability, then checks content quality.
126    /// If the content seems incomplete (too short or looks like a loading
127    /// state), does incremental waits with exponential backoff before
128    /// returning.
129    ///
130    /// * `wait_ms`   -- Max time to wait for readiness (default 8000).
131    ///                  Pass 0 to skip readiness checks and return
132    ///                  immediately.
133    /// * `min_length` -- Minimum content length to consider "good"
134    ///                   (default 1000).
135    pub async fn content(&self, wait_ms: u64, min_length: usize) -> Result<String> {
136        // ---- SSR fast path ----
137        // Check if content is already sufficient before waiting for network
138        // idle.  SSR pages have full HTML available immediately after
139        // navigation, so this skips the expensive networkIdle wait.
140        if wait_ms > 0 {
141            let early_html = self.adapter().get_html().await.unwrap_or_default();
142            if early_html.len() >= min_length
143                && !Self::is_interstitial_content(&early_html)
144                && !Self::is_rate_limit_content(&early_html)
145            {
146                return Ok(early_html);
147            }
148            self.wait_for_network_idle(wait_ms).await?;
149        }
150
151        let mut html = self.adapter().get_html().await.unwrap_or_default();
152
153        // ---- Interstitial detection ----
154        // Cloudflare "Just a moment...", PerimeterX "Verifying the
155        // device...", and similar interstitials auto-resolve after a few
156        // seconds.  PerimeterX can take 30-45 s.
157        // Graduated waits: 2+2+3+4+5+7+7 = 30 s max.
158        // No no-growth early exit: PerimeterX pages stay identical during
159        // JS verification then suddenly redirect when challenge passes.
160        // Must wait the full budget.
161        if wait_ms > 0 && Self::is_interstitial_content(&html) {
162            let interstitial_waits: &[u64] = &[2000, 2000, 3000, 4000, 5000, 7000, 7000];
163            for &wait in interstitial_waits {
164                sleep(Duration::from_millis(wait)).await;
165                html = self.adapter().get_html().await.unwrap_or_default();
166                if !Self::is_interstitial_content(&html) {
167                    break;
168                }
169                // Content-growth early exit: real page rendered
170                if html.len() > 15_000 {
171                    break;
172                }
173            }
174            // If still an interstitial after all waits, throw Blocked so
175            // retry engine rotates browser.
176            if Self::is_interstitial_content(&html) {
177                return Err(SpiderError::Blocked(
178                    "Page stuck on interstitial challenge".into(),
179                ));
180            }
181        }
182
183        // ---- Site-level rate limiting ----
184        // Throw Blocked so retry engine rotates browser (new profile).
185        if wait_ms > 0 && Self::is_rate_limit_content(&html) {
186            return Err(SpiderError::Blocked(
187                "Rate limit exceeded (site-level)".into(),
188            ));
189        }
190
191        // ---- Incremental quality check ----
192        // If content seems incomplete, wait progressively.  After
193        // incremental waits, fall back to polling (catches SPAs that never
194        // fire load but have content available via client-side rendering).
195        if wait_ms > 0 && html.len() < min_length {
196            let increments: &[u64] = &[300, 500, 800, 1200];
197            for &extra in increments {
198                sleep(Duration::from_millis(extra)).await;
199                let updated = self.adapter().get_html().await.unwrap_or_default();
200                if updated.len() > html.len() {
201                    html = updated;
202                }
203                if html.len() >= min_length {
204                    break;
205                }
206            }
207            // If still short after incremental waits, do a brief polling
208            // phase. This catches SPAs that render content asynchronously
209            // after page load.
210            if html.len() < min_length {
211                let poll_deadline = Instant::now() + Duration::from_millis(3000);
212                while Instant::now() < poll_deadline {
213                    sleep(Duration::from_millis(1000)).await;
214                    let polled = self.adapter().get_html().await.unwrap_or_default();
215                    if polled.len() > html.len() {
216                        html = polled;
217                    }
218                    if html.len() >= min_length {
219                        break;
220                    }
221                }
222            }
223        }
224
225        Ok(html)
226    }
227
228    /// Get the raw page HTML without any readiness waiting.
229    /// Use this when you need immediate access or have already waited.
230    pub async fn raw_content(&self) -> Result<String> {
231        self.adapter().get_html().await
232    }
233
234    /// Poll for content with early return -- for SPAs that never fire
235    /// `loadEventFired`.
236    ///
237    /// Instead of waiting for a full page load event, this polls for HTML
238    /// content at regular intervals and returns as soon as sufficient
239    /// content is available.  Useful for timeout retries where the page
240    /// loads data asynchronously.
241    ///
242    /// * `max_wait_ms`        -- Max time to poll (default 15 s).
243    /// * `min_content_length` -- Minimum HTML length to accept (default 500).
244    /// * `poll_interval_ms`   -- Interval between polls (default 2 s).
245    pub async fn content_with_early_return(
246        &self,
247        max_wait_ms: u64,
248        min_content_length: usize,
249        poll_interval_ms: u64,
250    ) -> Result<String> {
251        let deadline = Instant::now() + Duration::from_millis(max_wait_ms);
252        while Instant::now() < deadline {
253            let html = self.adapter().get_html().await.unwrap_or_default();
254            if html.len() >= min_content_length
255                && !Self::is_interstitial_content(&html)
256                && !Self::is_rate_limit_content(&html)
257            {
258                return Ok(html);
259            }
260            let remaining = deadline.saturating_duration_since(Instant::now());
261            if remaining.is_zero() {
262                break;
263            }
264            let wait = Duration::from_millis(poll_interval_ms).min(remaining);
265            sleep(wait).await;
266        }
267        // Final attempt -- return whatever we have
268        Ok(self.adapter().get_html().await.unwrap_or_default())
269    }
270
271    /// Get content using network idle detection + polling hybrid approach.
272    ///
273    /// Best for heavy SPAs: uses `PerformanceObserver` + `MutationObserver`
274    /// to detect when the page stops loading, combined with content-length
275    /// thresholds.
276    ///
277    /// Strategy:
278    /// 1. Wait for `readyState=interactive` (DOM parsed)
279    /// 2. Start network+DOM idle monitoring (400 ms silence threshold)
280    /// 3. Poll HTML length -- return early if sufficient + idle
281    /// 4. Interstitial detection with configurable wait budget
282    ///
283    /// * `max_wait_ms`           -- Max total time to wait (default 20 s).
284    /// * `min_content_length`    -- Minimum HTML length to accept (default 1000).
285    /// * `interstitial_budget_ms` -- Max time to wait for interstitials to
286    ///                               resolve (default 16 s, use 30 s for
287    ///                               retries).
288    pub async fn content_with_network_idle(
289        &self,
290        max_wait_ms: u64,
291        min_content_length: usize,
292        interstitial_budget_ms: u64,
293    ) -> Result<String> {
294        let deadline = Instant::now() + Duration::from_millis(max_wait_ms);
295
296        // Phase 1: Quick check -- SSR pages have content immediately
297        let mut html = self.adapter().get_html().await.unwrap_or_default();
298        if html.len() >= min_content_length
299            && !Self::is_interstitial_content(&html)
300            && !Self::is_rate_limit_content(&html)
301        {
302            return Ok(html);
303        }
304
305        // Phase 2: Wait for readyState=interactive or complete (DOM parsed)
306        let dom_deadline = deadline.min(Instant::now() + Duration::from_millis(5000));
307        while Instant::now() < dom_deadline {
308            let state = self.adapter().evaluate("document.readyState").await;
309            if let Ok(val) = state {
310                let s = val.as_str().unwrap_or("");
311                if s == "interactive" || s == "complete" {
312                    break;
313                }
314            }
315            sleep(Duration::from_millis(200)).await;
316        }
317
318        // Phase 3: Network + DOM idle monitoring with content polling.
319        // Inject a combined observer that tracks resource loads and DOM
320        // mutations.
321        let idle_ms: u64 = 400;
322        let idle_check_ms = {
323            let remaining = deadline.saturating_duration_since(Instant::now());
324            remaining.as_millis().min(8000) as u64
325        };
326        if idle_check_ms > 500 {
327            let js = format!(
328                r#"
329                new Promise((resolve) => {{
330                    let lastActivity = Date.now();
331                    const idleThreshold = {idle_ms};
332                    const deadline = Date.now() + {idle_check_ms};
333                    const perfObs = new PerformanceObserver(() => {{ lastActivity = Date.now(); }});
334                    try {{ perfObs.observe({{ entryTypes: ['resource'] }}); }} catch(e) {{}}
335                    const mutObs = new MutationObserver(() => {{ lastActivity = Date.now(); }});
336                    mutObs.observe(document.documentElement, {{ childList: true, subtree: true, attributes: true }});
337                    const check = () => {{
338                        const now = Date.now();
339                        if (now >= deadline || (now - lastActivity >= idleThreshold)) {{
340                            perfObs.disconnect(); mutObs.disconnect(); resolve(true); return;
341                        }}
342                        setTimeout(check, 100);
343                    }};
344                    setTimeout(check, idleThreshold);
345                }})
346                "#
347            );
348            if self.adapter().evaluate(&js).await.is_err() {
349                sleep(Duration::from_millis(500)).await;
350            }
351        }
352
353        // Check content after idle
354        html = self.adapter().get_html().await.unwrap_or_default();
355        if html.len() >= min_content_length
356            && !Self::is_interstitial_content(&html)
357            && !Self::is_rate_limit_content(&html)
358        {
359            return Ok(html);
360        }
361
362        // Phase 4: Interstitial handling with configurable budget.
363        // No no-growth early exit: PerimeterX/Akamai pages stay identical
364        // during JS verification then suddenly redirect. Must wait the
365        // full budget.
366        if Self::is_interstitial_content(&html) {
367            let i_deadline =
368                deadline.min(Instant::now() + Duration::from_millis(interstitial_budget_ms));
369            let waits: &[u64] = &[2000, 2000, 3000, 4000, 5000, 7000, 10000];
370            for &wait in waits {
371                if Instant::now() >= i_deadline {
372                    break;
373                }
374                let remaining = i_deadline.saturating_duration_since(Instant::now());
375                let actual_wait = Duration::from_millis(wait).min(remaining);
376                sleep(actual_wait).await;
377                html = self.adapter().get_html().await.unwrap_or_default();
378                if !Self::is_interstitial_content(&html) {
379                    break;
380                }
381                if html.len() > 15_000 {
382                    break;
383                }
384            }
385            if Self::is_interstitial_content(&html) {
386                return Err(SpiderError::Blocked(
387                    "Page stuck on interstitial challenge".into(),
388                ));
389            }
390        }
391
392        if Self::is_rate_limit_content(&html) {
393            return Err(SpiderError::Blocked(
394                "Rate limit exceeded (site-level)".into(),
395            ));
396        }
397
398        // Phase 5: Final polling for async content
399        if html.len() < min_content_length {
400            while Instant::now() < deadline {
401                sleep(Duration::from_millis(1000)).await;
402                let polled = self.adapter().get_html().await.unwrap_or_default();
403                if polled.len() > html.len() {
404                    html = polled;
405                }
406                if html.len() >= min_content_length {
407                    break;
408                }
409            }
410        }
411
412        Ok(html)
413    }
414
415    // =================================================================
416    // Info
417    // =================================================================
418
419    /// Get the page title.
420    pub async fn title(&self) -> Result<String> {
421        let val = self.adapter().evaluate("document.title").await?;
422        Ok(val.as_str().unwrap_or("").to_string())
423    }
424
425    /// Get the current page URL.
426    pub async fn url(&self) -> Result<String> {
427        let val = self.adapter().evaluate("window.location.href").await?;
428        Ok(val.as_str().unwrap_or("").to_string())
429    }
430
431    /// Capture a screenshot as base64 PNG.
432    pub async fn screenshot(&self) -> Result<String> {
433        self.adapter().capture_screenshot().await
434    }
435
436    /// Evaluate arbitrary JavaScript and return the result.
437    pub async fn evaluate(&self, expression: &str) -> Result<Value> {
438        self.adapter().evaluate(expression).await
439    }
440
441    // =================================================================
442    // Click Actions
443    // =================================================================
444
445    /// Click an element by CSS selector.
446    pub async fn click(&self, selector: &str) -> Result<()> {
447        let (x, y) = self.get_element_center(selector).await?;
448        self.adapter().click_point(x, y).await
449    }
450
451    /// Click at specific viewport coordinates.
452    pub async fn click_at(&self, x: f64, y: f64) -> Result<()> {
453        self.adapter().click_point(x, y).await
454    }
455
456    /// Double-click an element by CSS selector.
457    pub async fn dblclick(&self, selector: &str) -> Result<()> {
458        let (x, y) = self.get_element_center(selector).await?;
459        self.adapter().double_click_point(x, y).await
460    }
461
462    /// Right-click an element by CSS selector.
463    pub async fn right_click(&self, selector: &str) -> Result<()> {
464        let (x, y) = self.get_element_center(selector).await?;
465        self.adapter().right_click_point(x, y).await
466    }
467
468    /// Click and hold an element for a duration.
469    ///
470    /// Useful for long-press interactions, drag initiation, and
471    /// mobile-style gestures.
472    ///
473    /// * `selector` -- CSS selector of the element.
474    /// * `hold_ms`  -- Duration in milliseconds to hold (default 1000).
475    pub async fn click_and_hold(&self, selector: &str, hold_ms: u64) -> Result<()> {
476        let (x, y) = self.get_element_center(selector).await?;
477        self.adapter().click_hold_point(x, y, hold_ms).await
478    }
479
480    /// Click and hold at specific viewport coordinates for a duration.
481    ///
482    /// * `x`       -- X coordinate (CSS pixels).
483    /// * `y`       -- Y coordinate (CSS pixels).
484    /// * `hold_ms` -- Duration in milliseconds to hold (default 1000).
485    pub async fn click_and_hold_at(&self, x: f64, y: f64, hold_ms: u64) -> Result<()> {
486        self.adapter().click_hold_point(x, y, hold_ms).await
487    }
488
489    /// Click all elements matching a selector.
490    pub async fn click_all(&self, selector: &str) -> Result<()> {
491        let escaped = serde_json::to_string(selector).unwrap_or_else(|_| format!("\"{}\"", selector));
492        let js = format!(
493            r#"
494            (function() {{
495                const els = document.querySelectorAll({escaped});
496                return Array.from(els).map(el => {{
497                    const r = el.getBoundingClientRect();
498                    return {{ x: r.x + r.width / 2, y: r.y + r.height / 2 }};
499                }});
500            }})()
501            "#
502        );
503        let result = self.adapter().evaluate(&js).await?;
504        if let Some(points) = result.as_array() {
505            for pt in points {
506                let x = pt.get("x").and_then(|v| v.as_f64()).unwrap_or(0.0);
507                let y = pt.get("y").and_then(|v| v.as_f64()).unwrap_or(0.0);
508                self.adapter().click_point(x, y).await?;
509                sleep(Duration::from_millis(100)).await;
510            }
511        }
512        Ok(())
513    }
514
515    // =================================================================
516    // Input Actions
517    // =================================================================
518
519    /// Fill a form field -- focus, clear existing value, type new value.
520    pub async fn fill(&self, selector: &str, value: &str) -> Result<()> {
521        let escaped_sel =
522            serde_json::to_string(selector).unwrap_or_else(|_| format!("\"{}\"", selector));
523        // Clear via JS
524        let clear_js = format!(
525            r#"
526            (function() {{
527                const el = document.querySelector({escaped_sel});
528                if (el) {{ el.focus(); el.value = ''; }}
529            }})()
530            "#
531        );
532        self.adapter().evaluate(&clear_js).await?;
533
534        // Click to ensure focus with real browser event
535        if let Ok((x, y)) = self.get_element_center(selector).await {
536            let _ = self.adapter().click_point(x, y).await;
537        }
538
539        // Insert text
540        self.adapter().insert_text(value).await?;
541
542        // Dispatch input + change events
543        let dispatch_js = format!(
544            r#"
545            (function() {{
546                const el = document.querySelector({escaped_sel});
547                if (el) {{
548                    el.dispatchEvent(new Event('input', {{ bubbles: true }}));
549                    el.dispatchEvent(new Event('change', {{ bubbles: true }}));
550                }}
551            }})()
552            "#
553        );
554        self.adapter().evaluate(&dispatch_js).await?;
555        Ok(())
556    }
557
558    /// Type text into the currently focused element.
559    pub async fn type_text(&self, value: &str) -> Result<()> {
560        self.adapter().insert_text(value).await
561    }
562
563    /// Press a named key (e.g. "Enter", "Tab", "Escape").
564    pub async fn press(&self, key: &str) -> Result<()> {
565        self.adapter().press_key(key).await
566    }
567
568    /// Clear an input field.
569    pub async fn clear(&self, selector: &str) -> Result<()> {
570        let escaped =
571            serde_json::to_string(selector).unwrap_or_else(|_| format!("\"{}\"", selector));
572        let js = format!("document.querySelector({escaped}).value = ''");
573        self.adapter().evaluate(&js).await?;
574        Ok(())
575    }
576
577    /// Select an option in a `<select>` element.
578    pub async fn select(&self, selector: &str, value: &str) -> Result<()> {
579        let escaped_sel =
580            serde_json::to_string(selector).unwrap_or_else(|_| format!("\"{}\"", selector));
581        let escaped_val =
582            serde_json::to_string(value).unwrap_or_else(|_| format!("\"{}\"", value));
583        let js = format!(
584            r#"
585            (function() {{
586                const el = document.querySelector({escaped_sel});
587                if (el) {{
588                    el.value = {escaped_val};
589                    el.dispatchEvent(new Event('change', {{ bubbles: true }}));
590                }}
591            }})()
592            "#
593        );
594        self.adapter().evaluate(&js).await?;
595        Ok(())
596    }
597
598    // =================================================================
599    // Focus & Hover
600    // =================================================================
601
602    /// Focus an element.
603    pub async fn focus(&self, selector: &str) -> Result<()> {
604        let escaped =
605            serde_json::to_string(selector).unwrap_or_else(|_| format!("\"{}\"", selector));
606        let js = format!("document.querySelector({escaped})?.focus()");
607        self.adapter().evaluate(&js).await?;
608        Ok(())
609    }
610
611    /// Blur (unfocus) an element.
612    pub async fn blur(&self, selector: &str) -> Result<()> {
613        let escaped =
614            serde_json::to_string(selector).unwrap_or_else(|_| format!("\"{}\"", selector));
615        let js = format!("document.querySelector({escaped})?.blur()");
616        self.adapter().evaluate(&js).await?;
617        Ok(())
618    }
619
620    /// Hover over an element.
621    pub async fn hover(&self, selector: &str) -> Result<()> {
622        let (x, y) = self.get_element_center(selector).await?;
623        self.adapter().hover_point(x, y).await
624    }
625
626    // =================================================================
627    // Drag
628    // =================================================================
629
630    /// Drag from one element to another.
631    pub async fn drag(&self, from_selector: &str, to_selector: &str) -> Result<()> {
632        let (fx, fy) = self.get_element_center(from_selector).await?;
633        let (tx, ty) = self.get_element_center(to_selector).await?;
634        self.adapter().drag_point(fx, fy, tx, ty).await
635    }
636
637    // =================================================================
638    // Scroll
639    // =================================================================
640
641    /// Scroll vertically by pixels (positive = down).
642    pub async fn scroll_y(&self, pixels: i64) -> Result<()> {
643        let js = format!("window.scrollBy(0, {pixels})");
644        self.adapter().evaluate(&js).await?;
645        Ok(())
646    }
647
648    /// Scroll horizontally by pixels (positive = right).
649    pub async fn scroll_x(&self, pixels: i64) -> Result<()> {
650        let js = format!("window.scrollBy({pixels}, 0)");
651        self.adapter().evaluate(&js).await?;
652        Ok(())
653    }
654
655    /// Scroll an element into view.
656    pub async fn scroll_to(&self, selector: &str) -> Result<()> {
657        let escaped =
658            serde_json::to_string(selector).unwrap_or_else(|_| format!("\"{}\"", selector));
659        let js = format!(
660            "document.querySelector({escaped})?.scrollIntoView({{ behavior: 'smooth', block: 'center' }})"
661        );
662        self.adapter().evaluate(&js).await?;
663        Ok(())
664    }
665
666    /// Scroll to absolute page coordinates.
667    pub async fn scroll_to_point(&self, x: f64, y: f64) -> Result<()> {
668        let js = format!("window.scrollTo({x}, {y})");
669        self.adapter().evaluate(&js).await?;
670        Ok(())
671    }
672
673    // =================================================================
674    // Wait
675    // =================================================================
676
677    /// Wait for a CSS selector to appear in the DOM.
678    pub async fn wait_for_selector(&self, selector: &str, timeout_ms: u64) -> Result<()> {
679        let interval: u64 = 100;
680        let max_iter = (timeout_ms + interval - 1) / interval; // ceil division
681        let escaped =
682            serde_json::to_string(selector).unwrap_or_else(|_| format!("\"{}\"", selector));
683        let check_js = format!("!!document.querySelector({escaped})");
684        for _ in 0..max_iter {
685            let found = self.adapter().evaluate(&check_js).await?;
686            if found.as_bool().unwrap_or(false) {
687                return Ok(());
688            }
689            sleep(Duration::from_millis(interval)).await;
690        }
691        Err(SpiderError::Timeout(format!(
692            "Timeout waiting for selector: {selector}"
693        )))
694    }
695
696    /// Wait for navigation/page load (simple delay).
697    pub async fn wait_for_navigation(&self, timeout_ms: u64) -> Result<()> {
698        let wait = timeout_ms.min(1000);
699        sleep(Duration::from_millis(wait)).await;
700        Ok(())
701    }
702
703    /// Wait until the page is fully loaded and DOM is stable.
704    ///
705    /// Checks:
706    /// 1. `document.readyState === 'complete'`
707    /// 2. DOM content length stabilizes (no changes for 500 ms)
708    ///
709    /// Use after `goto()` for SPAs and dynamic pages to ensure all
710    /// content is rendered before extracting HTML.
711    pub async fn wait_for_ready(&self, timeout_ms: u64) -> Result<()> {
712        let start = Instant::now();
713        let poll_interval: u64 = 200;
714        let stable_threshold = Duration::from_millis(500);
715        let timeout = Duration::from_millis(timeout_ms);
716
717        // Phase 1: wait for document.readyState === 'complete'
718        while start.elapsed() < timeout {
719            let state = self.adapter().evaluate("document.readyState").await;
720            if let Ok(val) = state {
721                if val.as_str() == Some("complete") {
722                    break;
723                }
724            }
725            sleep(Duration::from_millis(poll_interval)).await;
726        }
727
728        // Phase 2: wait for DOM content length to stabilize
729        let mut last_length: i64 = 0;
730        let mut stable_since = Instant::now();
731
732        while start.elapsed() < timeout {
733            let length = self
734                .adapter()
735                .evaluate("document.documentElement.innerHTML.length")
736                .await
737                .ok()
738                .and_then(|v| v.as_i64())
739                .unwrap_or(0);
740
741            if length != last_length {
742                last_length = length;
743                stable_since = Instant::now();
744            } else if stable_since.elapsed() >= stable_threshold {
745                return Ok(());
746            }
747
748            sleep(Duration::from_millis(poll_interval)).await;
749        }
750
751        Ok(())
752    }
753
754    /// Wait until page content exceeds a minimum length.
755    /// Useful for SPAs where content loads asynchronously.
756    pub async fn wait_for_content(&self, min_length: usize, timeout_ms: u64) -> Result<()> {
757        let start = Instant::now();
758        let timeout = Duration::from_millis(timeout_ms);
759        while start.elapsed() < timeout {
760            let length = self
761                .adapter()
762                .evaluate("document.documentElement.innerHTML.length")
763                .await
764                .ok()
765                .and_then(|v| v.as_u64())
766                .unwrap_or(0) as usize;
767            if length >= min_length {
768                return Ok(());
769            }
770            sleep(Duration::from_millis(200)).await;
771        }
772        Ok(())
773    }
774
775    /// Wait for network idle + DOM stability (cross-platform).
776    ///
777    /// Uses the Performance/Resource Timing API and `MutationObserver`
778    /// (works in both Chrome/CDP and Firefox/BiDi) to detect when:
779    /// 1. `document.readyState === 'complete'`
780    /// 2. No new network resources loading (`PerformanceObserver`)
781    /// 3. DOM mutations have settled
782    ///
783    /// This is more comprehensive than [`wait_for_ready`] -- it also
784    /// catches lazy-loaded images, XHR/fetch requests, and
785    /// script-injected content.
786    pub async fn wait_for_network_idle(&self, timeout_ms: u64) -> Result<()> {
787        let start = Instant::now();
788        let poll_interval: u64 = 250;
789        let timeout = Duration::from_millis(timeout_ms);
790
791        // Phase 1: wait for document.readyState === 'complete'
792        while start.elapsed() < timeout {
793            let state = self.adapter().evaluate("document.readyState").await;
794            if let Ok(val) = state {
795                if val.as_str() == Some("complete") {
796                    break;
797                }
798            }
799            sleep(Duration::from_millis(poll_interval)).await;
800        }
801
802        // Phase 2: inject a combined network + DOM stability checker.
803        // Uses PerformanceObserver for resource timing + MutationObserver
804        // for DOM changes. Returns a promise that resolves when both are
805        // quiet for `idle_ms`.
806        let idle_ms: u64 = 400;
807        let remaining = {
808            let elapsed = start.elapsed();
809            if timeout > elapsed {
810                (timeout - elapsed).as_millis().max(1000) as u64
811            } else {
812                1000
813            }
814        };
815        let js = format!(
816            r#"
817            new Promise((resolve) => {{
818                let lastActivity = Date.now();
819                const idleThreshold = {idle_ms};
820                const deadline = Date.now() + {remaining};
821
822                const perfObs = new PerformanceObserver(() => {{ lastActivity = Date.now(); }});
823                try {{ perfObs.observe({{ entryTypes: ['resource'] }}); }} catch(e) {{}}
824
825                const mutObs = new MutationObserver(() => {{ lastActivity = Date.now(); }});
826                mutObs.observe(document.documentElement, {{
827                    childList: true, subtree: true, attributes: true
828                }});
829
830                const check = () => {{
831                    const now = Date.now();
832                    if (now >= deadline || (now - lastActivity >= idleThreshold)) {{
833                        perfObs.disconnect();
834                        mutObs.disconnect();
835                        resolve(true);
836                        return;
837                    }}
838                    setTimeout(check, 100);
839                }};
840                setTimeout(check, idleThreshold);
841            }})
842            "#
843        );
844        if self.adapter().evaluate(&js).await.is_err() {
845            // If the evaluate fails (e.g. page navigated away), just
846            // continue.
847            sleep(Duration::from_millis(500)).await;
848        }
849
850        Ok(())
851    }
852
853    // =================================================================
854    // Viewport
855    // =================================================================
856
857    /// Set the viewport dimensions.
858    pub async fn set_viewport(
859        &self,
860        width: u32,
861        height: u32,
862        device_scale_factor: f64,
863        mobile: bool,
864    ) -> Result<()> {
865        self.adapter()
866            .set_viewport(width, height, device_scale_factor, mobile)
867            .await
868    }
869
870    // =================================================================
871    // DOM Queries
872    // =================================================================
873
874    /// Query a single element and return its outer HTML.
875    pub async fn query_selector(&self, selector: &str) -> Result<Option<String>> {
876        let escaped =
877            serde_json::to_string(selector).unwrap_or_else(|_| format!("\"{}\"", selector));
878        let js = format!("document.querySelector({escaped})?.outerHTML ?? null");
879        let val = self.adapter().evaluate(&js).await?;
880        if val.is_null() {
881            Ok(None)
882        } else {
883            Ok(val.as_str().map(|s| s.to_string()))
884        }
885    }
886
887    /// Query all matching elements and return their outer HTML.
888    pub async fn query_selector_all(&self, selector: &str) -> Result<Vec<String>> {
889        let escaped =
890            serde_json::to_string(selector).unwrap_or_else(|_| format!("\"{}\"", selector));
891        let js = format!(
892            "Array.from(document.querySelectorAll({escaped})).map(el => el.outerHTML)"
893        );
894        let val = self.adapter().evaluate(&js).await?;
895        let items = val
896            .as_array()
897            .map(|arr| {
898                arr.iter()
899                    .filter_map(|v| v.as_str().map(|s| s.to_string()))
900                    .collect()
901            })
902            .unwrap_or_default();
903        Ok(items)
904    }
905
906    /// Get text content of an element.
907    pub async fn text_content(&self, selector: &str) -> Result<Option<String>> {
908        let escaped =
909            serde_json::to_string(selector).unwrap_or_else(|_| format!("\"{}\"", selector));
910        let js = format!("document.querySelector({escaped})?.textContent ?? null");
911        let val = self.adapter().evaluate(&js).await?;
912        if val.is_null() {
913            Ok(None)
914        } else {
915            Ok(val.as_str().map(|s| s.to_string()))
916        }
917    }
918
919    /// Extract multiple fields from the page in a single evaluate call.
920    ///
921    /// Each entry maps a key name to a [`FieldSelector`]. Returns a map of
922    /// key → value (or `None` if the element was not found).
923    ///
924    /// # Example
925    ///
926    /// ```ignore
927    /// use std::collections::HashMap;
928    /// use spider_browser::page::FieldSelector;
929    ///
930    /// let data = page.extract_fields(&[
931    ///     ("title", "#productTitle".into()),
932    ///     ("price", ".a-price .a-offscreen".into()),
933    ///     ("image", FieldSelector::Attr {
934    ///         selector: "#main-image",
935    ///         attribute: "src",
936    ///     }),
937    /// ]).await?;
938    /// println!("{:?}", data.get("title"));
939    /// ```
940    pub async fn extract_fields(
941        &self,
942        fields: &[(&str, FieldSelector<'_>)],
943    ) -> Result<std::collections::HashMap<String, Option<String>>> {
944        // Build the field map as a JSON array for the browser JS.
945        let field_map: Vec<Value> = fields
946            .iter()
947            .map(|(key, sel)| {
948                let (css, attr) = match sel {
949                    FieldSelector::Text(s) => (*s, None),
950                    FieldSelector::Attr {
951                        selector,
952                        attribute,
953                    } => (*selector, Some(*attribute)),
954                };
955                serde_json::json!({
956                    "key": key,
957                    "selector": css,
958                    "attribute": attr,
959                })
960            })
961            .collect();
962
963        let field_json = serde_json::to_string(&field_map)
964            .unwrap_or_else(|_| "[]".to_string());
965
966        let js = format!(
967            r#"
968            (() => {{
969                const fields = {field_json};
970                const result = {{}};
971                for (const f of fields) {{
972                    const el = document.querySelector(f.selector);
973                    result[f.key] = el
974                        ? (f.attribute ? el.getAttribute(f.attribute) : el.textContent?.trim()) ?? null
975                        : null;
976                }}
977                return JSON.stringify(result);
978            }})()
979            "#
980        );
981
982        let val = self.adapter().evaluate(&js).await?;
983        let raw = val.as_str().unwrap_or("{}");
984        let parsed: std::collections::HashMap<String, Option<String>> =
985            serde_json::from_str(raw).unwrap_or_default();
986        Ok(parsed)
987    }
988
989    // =================================================================
990    // Internals
991    // =================================================================
992
993    /// Get the center coordinates of a DOM element (scrolls into view
994    /// first).
995    async fn get_element_center(&self, selector: &str) -> Result<(f64, f64)> {
996        let escaped =
997            serde_json::to_string(selector).unwrap_or_else(|_| format!("\"{}\"", selector));
998        let js = format!(
999            r#"
1000            (function() {{
1001                const el = document.querySelector({escaped});
1002                if (!el) return null;
1003                el.scrollIntoView({{ block: 'center', behavior: 'instant' }});
1004                const r = el.getBoundingClientRect();
1005                return {{ x: r.x + r.width / 2, y: r.y + r.height / 2 }};
1006            }})()
1007            "#
1008        );
1009        let result = self.adapter().evaluate(&js).await?;
1010
1011        if result.is_null() {
1012            return Err(SpiderError::Other(format!(
1013                "Element not found: {selector}"
1014            )));
1015        }
1016
1017        let x = result
1018            .get("x")
1019            .and_then(|v| v.as_f64())
1020            .ok_or_else(|| SpiderError::Other(format!("Element not found: {selector}")))?;
1021        let y = result
1022            .get("y")
1023            .and_then(|v| v.as_f64())
1024            .ok_or_else(|| SpiderError::Other(format!("Element not found: {selector}")))?;
1025
1026        Ok((x, y))
1027    }
1028
1029    /// Route an incoming WebSocket message to the underlying protocol session.
1030    pub fn route_message(&self, data: &str) {
1031        self.adapter.load().route_message(data);
1032    }
1033
1034    /// Clean up protocol resources.
1035    pub fn destroy(&self) {
1036        self.adapter.load().destroy();
1037    }
1038
1039    /// Replace the adapter (used during browser switching).
1040    ///
1041    /// Atomically swaps the underlying [`ProtocolAdapter`] so that
1042    /// inflight operations on the old adapter can finish while new
1043    /// operations use the replacement.
1044    pub fn set_adapter(&self, adapter: ProtocolAdapter) {
1045        self.adapter.store(Arc::new(adapter));
1046    }
1047
1048    /// Replace the adapter with an already-`Arc`-wrapped instance.
1049    pub fn set_adapter_arc(&self, adapter: Arc<ProtocolAdapter>) {
1050        self.adapter.store(adapter);
1051    }
1052
1053    /// Detect challenge interstitials that may auto-resolve (e.g.
1054    /// Cloudflare "Just a moment...").
1055    ///
1056    /// These pages show briefly before redirecting to the real content.
1057    pub fn is_interstitial_content(html: &str) -> bool {
1058        if html.len() > 15_000 {
1059            return false; // Real pages are larger
1060        }
1061        let lower = html.to_lowercase();
1062
1063        // Challenge / WAF interstitials
1064        if lower.contains("just a moment")
1065            || lower.contains("checking your browser")
1066            || lower.contains("please wait while we verify")
1067            || lower.contains("verifying the device")
1068            || lower.contains("available after verification")
1069            || lower.contains("ddos-guard")
1070            || lower.contains("challenge-platform")
1071            || lower.contains("px-captcha")
1072            || lower.contains("_cf_chl_opt")
1073            || lower.contains("managed_challenge")
1074            || lower.contains("datadome")
1075            || lower.contains("ak_bmsc")
1076            || lower.contains("please enable cookies")
1077        {
1078            return true;
1079        }
1080
1081        // SPA loading states -- page shell rendered but content still
1082        // loading.  These auto-resolve once JS fetches actual data.
1083        // Only match on very small pages to avoid false positives on
1084        // real pages that mention "loading".
1085        if html.len() < 5_000 {
1086            if lower.contains("loading...") || lower.contains("loading results") {
1087                return true;
1088            }
1089            if lower.contains("please wait") && !lower.contains("article") {
1090                return true;
1091            }
1092        }
1093
1094        false
1095    }
1096
1097    /// Detect site-level rate limiting in page content.
1098    ///
1099    /// Browser rotation gives a new profile which bypasses per-session
1100    /// rate limits.
1101    pub fn is_rate_limit_content(html: &str) -> bool {
1102        if html.len() > 20_000 {
1103            return false; // Real pages won't be just a rate limit message
1104        }
1105        let lower = html.to_lowercase();
1106        lower.contains("rate limit exceeded")
1107            || lower.contains("too many requests")
1108            || (lower.contains("rate limit") && lower.contains("please try again"))
1109    }
1110}
1111
#[cfg(test)]
mod tests {
    // Unit tests for the pure, HTML-based detectors on `SpiderPage`.
    // They need no browser: both functions only inspect the given string.

    use super::*;

    /// Build a string that starts with `prefix` and is exactly `total_len`
    /// bytes long, padded with a filler that matches none of the markers.
    fn padded(prefix: &str, total_len: usize) -> String {
        format!("{prefix}{}", "x".repeat(total_len - prefix.len()))
    }

    // -----------------------------------------------------------------
    // is_interstitial_content tests
    // -----------------------------------------------------------------

    #[test]
    fn interstitial_cloudflare() {
        let html = "<html><body>Just a moment...</body></html>";
        assert!(SpiderPage::is_interstitial_content(html));
    }

    #[test]
    fn interstitial_checking_browser() {
        let html = "<html><body>Checking your browser before accessing</body></html>";
        assert!(SpiderPage::is_interstitial_content(html));
    }

    #[test]
    fn interstitial_perimeterx() {
        let html = "<html><body>Verifying the device...</body></html>";
        assert!(SpiderPage::is_interstitial_content(html));
    }

    #[test]
    fn interstitial_ddos_guard() {
        let html = "<html><head></head><body>ddos-guard check</body></html>";
        assert!(SpiderPage::is_interstitial_content(html));
    }

    #[test]
    fn interstitial_challenge_platform() {
        let html = "<html><body class='challenge-platform'>wait</body></html>";
        assert!(SpiderPage::is_interstitial_content(html));
    }

    #[test]
    fn interstitial_px_captcha() {
        let html = "<html><body><div id='px-captcha'></div></body></html>";
        assert!(SpiderPage::is_interstitial_content(html));
    }

    #[test]
    fn interstitial_cf_chl_opt() {
        let html = "<html><body><script>var _cf_chl_opt={}</script></body></html>";
        assert!(SpiderPage::is_interstitial_content(html));
    }

    #[test]
    fn interstitial_managed_challenge() {
        let html = "<html><body>managed_challenge page</body></html>";
        assert!(SpiderPage::is_interstitial_content(html));
    }

    #[test]
    fn interstitial_datadome() {
        let html = "<html><body>DataDome verification</body></html>";
        assert!(SpiderPage::is_interstitial_content(html));
    }

    #[test]
    fn interstitial_akamai() {
        let html = "<html><body><script>ak_bmsc=cookie</script></body></html>";
        assert!(SpiderPage::is_interstitial_content(html));
    }

    #[test]
    fn interstitial_enable_cookies() {
        let html = "<html><body>Please enable cookies to continue</body></html>";
        assert!(SpiderPage::is_interstitial_content(html));
    }

    #[test]
    fn interstitial_loading_small() {
        let html = "<html><body>Loading...</body></html>";
        assert!(SpiderPage::is_interstitial_content(html));
    }

    #[test]
    fn interstitial_loading_results() {
        let html = "<html><body>Loading results</body></html>";
        assert!(SpiderPage::is_interstitial_content(html));
    }

    #[test]
    fn interstitial_please_wait_small() {
        let html = "<html><body>Please wait</body></html>";
        assert!(SpiderPage::is_interstitial_content(html));
    }

    #[test]
    fn interstitial_please_wait_with_article_not_detected() {
        let html = "<html><body>Please wait for this article</body></html>";
        assert!(!SpiderPage::is_interstitial_content(html));
    }

    #[test]
    fn interstitial_large_page_not_detected() {
        let html = "x".repeat(16_000);
        assert!(!SpiderPage::is_interstitial_content(&html));
    }

    #[test]
    fn interstitial_normal_page_not_detected() {
        let html = "<html><body><h1>Welcome</h1><p>Normal content here.</p></body></html>";
        assert!(!SpiderPage::is_interstitial_content(html));
    }

    #[test]
    fn interstitial_empty_not_detected() {
        assert!(!SpiderPage::is_interstitial_content(""));
    }

    #[test]
    fn interstitial_is_case_insensitive() {
        let html = "<html><body>JUST A MOMENT</body></html>";
        assert!(SpiderPage::is_interstitial_content(html));
    }

    #[test]
    fn interstitial_size_cutoff_is_exclusive() {
        // Exactly 15 000 bytes is still inspected; one byte more is not.
        let at_limit = padded("just a moment", 15_000);
        assert_eq!(at_limit.len(), 15_000);
        assert!(SpiderPage::is_interstitial_content(&at_limit));

        let over_limit = padded("just a moment", 15_001);
        assert!(!SpiderPage::is_interstitial_content(&over_limit));
    }

    #[test]
    fn interstitial_loading_only_on_small_pages() {
        // SPA loading phrases count only below 5 000 bytes.
        let small = padded("Loading...", 4_999);
        assert!(SpiderPage::is_interstitial_content(&small));

        let not_small = padded("Loading...", 5_000);
        assert!(!SpiderPage::is_interstitial_content(&not_small));
    }

    #[test]
    fn interstitial_challenge_marker_on_mid_sized_page() {
        // Challenge markers are not subject to the 5 000-byte SPA limit.
        let html = padded("<div id='px-captcha'></div>", 10_000);
        assert!(SpiderPage::is_interstitial_content(&html));
    }

    // -----------------------------------------------------------------
    // is_rate_limit_content tests
    // -----------------------------------------------------------------

    #[test]
    fn rate_limit_exceeded() {
        let html = "<html><body>Rate limit exceeded</body></html>";
        assert!(SpiderPage::is_rate_limit_content(html));
    }

    #[test]
    fn rate_limit_too_many_requests() {
        let html = "<html><body>Too many requests</body></html>";
        assert!(SpiderPage::is_rate_limit_content(html));
    }

    #[test]
    fn rate_limit_try_again() {
        let html = "<html><body>Rate limit hit. Please try again later.</body></html>";
        assert!(SpiderPage::is_rate_limit_content(html));
    }

    #[test]
    fn rate_limit_large_page_not_detected() {
        let html = format!(
            "<html><body>{}</body></html>",
            "x".repeat(21_000)
        );
        assert!(!SpiderPage::is_rate_limit_content(&html));
    }

    #[test]
    fn rate_limit_normal_page_not_detected() {
        let html = "<html><body><h1>Normal page</h1></body></html>";
        assert!(!SpiderPage::is_rate_limit_content(html));
    }

    #[test]
    fn rate_limit_empty_not_detected() {
        assert!(!SpiderPage::is_rate_limit_content(""));
    }

    #[test]
    fn rate_limit_phrase_alone_not_detected() {
        // "rate limit" needs "please try again" alongside it ...
        let html = "<html><body>Rate limit reached</body></html>";
        assert!(!SpiderPage::is_rate_limit_content(html));
        // ... and the retry prompt alone is not enough either.
        let html = "<html><body>Please try again later.</body></html>";
        assert!(!SpiderPage::is_rate_limit_content(html));
    }

    #[test]
    fn rate_limit_is_case_insensitive() {
        let html = "<html><body>TOO MANY REQUESTS</body></html>";
        assert!(SpiderPage::is_rate_limit_content(html));
    }

    #[test]
    fn rate_limit_size_cutoff_is_exclusive() {
        // Exactly 20 000 bytes is still inspected; one byte more is not.
        let at_limit = padded("too many requests", 20_000);
        assert_eq!(at_limit.len(), 20_000);
        assert!(SpiderPage::is_rate_limit_content(&at_limit));

        let over_limit = padded("too many requests", 20_001);
        assert!(!SpiderPage::is_rate_limit_content(&over_limit));
    }
}