Skip to main content

scrapling_browser/
fetcher.rs

1//! High-level browser session types that drive page fetching.
2//!
3//! This is the main module most users interact with. It provides two session types:
4//!
5//! - [`DynamicSession`] -- a standard browser automation session. It launches (or
6//!   connects to) a Chromium instance, navigates to URLs, waits for the page to
7//!   stabilise, and returns a [`scrapling_fetch::Response`] containing the rendered
8//!   HTML, status code, headers, and cookies. Use this for sites that render content
9//!   with JavaScript but do not actively block bots.
10//!
11//! - [`StealthySession`] -- an anti-detection session that wraps `DynamicSession`
12//!   behaviour with stealth Chromium flags, optional WebRTC/canvas/WebGL
13//!   countermeasures, and an automatic Cloudflare Turnstile solver. Use this for
14//!   sites protected by Cloudflare, DataDome, or similar services.
15//!
16//! # Lifecycle
17//!
//! Both sessions follow the same four-step lifecycle:
19//!
20//! 1. **Construct** -- `::new(config)` validates the configuration.
21//! 2. **Start** -- `.start().await` launches the browser and creates a context.
22//! 3. **Fetch** -- `.fetch(url, params).await` navigates and returns a response.
23//!    You can call `fetch` multiple times on the same session.
24//! 4. **Close** -- `.close().await` tears down the browser and driver.
25//!
26//! Each `fetch` call automatically retries up to `config.retries` times with a
27//! configurable delay between attempts.
28
29use std::collections::{HashMap, HashSet};
30use std::time::Duration;
31
32use tracing::{debug, error, info, warn};
33
34use crate::config::{BrowserConfig, FetchParams, ResolvedFetchParams, StealthConfig, WaitState};
35use crate::engine::{build_launch_options, launch_playwright};
36use crate::error::{BrowserError, Result};
37use crate::intercept::should_block_request;
38use crate::page_pool::PagePool;
39use crate::response_factory;
40
41use scrapling_fetch::Response;
42
43// ---------------------------------------------------------------------------
44// Shared helpers
45// ---------------------------------------------------------------------------
46
47async fn setup_page(
48    page: &playwright_rs::Page,
49    timeout_ms: f64,
50    extra_headers: &HashMap<String, String>,
51    disable_resources: bool,
52    blocked_domains: &HashSet<String>,
53) -> Result<()> {
54    page.set_default_timeout(timeout_ms).await;
55    page.set_default_navigation_timeout(timeout_ms).await;
56
57    if !extra_headers.is_empty() {
58        page.set_extra_http_headers(extra_headers.clone()).await?;
59    }
60
61    if disable_resources || !blocked_domains.is_empty() {
62        let disable = disable_resources;
63        let domains = blocked_domains.clone();
64
65        page.route("**/*", move |route| {
66            let domains = domains.clone();
67            async move {
68                let request = route.request();
69                let resource_type = request.resource_type();
70                let req_url = request.url();
71
72                if should_block_request(resource_type, req_url, disable, &domains) {
73                    route.abort(Some("blockedbyclient")).await
74                } else {
75                    route.continue_(None).await
76                }
77            }
78        })
79        .await?;
80    }
81
82    Ok(())
83}
84
85async fn wait_for_stability(
86    page: &playwright_rs::Page,
87    load_dom: bool,
88    network_idle: bool,
89) -> Result<()> {
90    let _ = page
91        .wait_for_load_state(Some(playwright_rs::WaitUntil::Load))
92        .await;
93
94    if load_dom {
95        let _ = page
96            .wait_for_load_state(Some(playwright_rs::WaitUntil::DomContentLoaded))
97            .await;
98    }
99
100    if network_idle {
101        let _ = page
102            .wait_for_load_state(Some(playwright_rs::WaitUntil::NetworkIdle))
103            .await;
104    }
105
106    Ok(())
107}
108
109async fn wait_for_selector(
110    page: &playwright_rs::Page,
111    selector: &str,
112    _state: WaitState,
113    timeout_ms: f64,
114) -> Result<()> {
115    let locator = page.locator(selector).await;
116    let wait_opts = playwright_rs::protocol::WaitForOptions::builder()
117        .timeout(timeout_ms)
118        .build();
119    locator
120        .wait_for(Some(wait_opts))
121        .await
122        .map_err(|e| BrowserError::Timeout(format!("wait_for_selector({selector}): {e}")))?;
123    Ok(())
124}
125
126async fn initialize_context(
127    context: &playwright_rs::BrowserContext,
128    config: &BrowserConfig,
129) -> Result<()> {
130    if let Some(ref script_path) = config.init_script {
131        let script = std::fs::read_to_string(script_path)
132            .map_err(|e| BrowserError::Config(format!("failed to read init_script: {e}")))?;
133        context.add_init_script(&script).await?;
134    }
135
136    if !config.cookies.is_empty() {
137        let pw_cookies: Vec<playwright_rs::protocol::Cookie> = config
138            .cookies
139            .iter()
140            .map(|c| playwright_rs::protocol::Cookie {
141                name: c.name.clone(),
142                value: c.value.clone(),
143                domain: c.domain.clone().unwrap_or_default(),
144                path: c.path.clone().unwrap_or_else(|| "/".into()),
145                expires: -1.0,
146                http_only: false,
147                secure: false,
148                same_site: None,
149            })
150            .collect();
151        context.add_cookies(&pw_cookies).await?;
152    }
153
154    Ok(())
155}
156
157fn make_goto_options(timeout_ms: f64, network_idle: bool) -> playwright_rs::GotoOptions {
158    let mut opts =
159        playwright_rs::GotoOptions::new().timeout(Duration::from_millis(timeout_ms as u64));
160    if network_idle {
161        opts = opts.wait_until(playwright_rs::WaitUntil::NetworkIdle);
162    }
163    opts
164}
165
166// ---------------------------------------------------------------------------
167// DynamicSession — standard Playwright browser automation
168// ---------------------------------------------------------------------------
169
170/// Standard Playwright browser automation session without stealth measures.
171///
172/// `DynamicSession` manages the full lifecycle of a Chromium browser: launching the
173/// Playwright driver, creating a browser context with cookies and init scripts,
174/// navigating to pages, waiting for load events, and extracting the rendered HTML
175/// into a [`Response`]. It supports static proxies, rotating proxies, CDP connections,
176/// resource blocking, domain blocking, and custom page callbacks.
177///
178/// For sites with bot detection, use [`StealthySession`] instead.
179pub struct DynamicSession {
180    config: BrowserConfig,
181    playwright: Option<playwright_rs::Playwright>,
182    browser: Option<playwright_rs::Browser>,
183    context: Option<playwright_rs::BrowserContext>,
184    page_pool: PagePool,
185    is_alive: bool,
186}
187
188impl DynamicSession {
189    /// Create a new `DynamicSession` from the given configuration, validating it upfront.
190    /// The browser is *not* launched yet -- call [`start`](Self::start) to do that.
191    /// Returns an error if the configuration fails validation.
192    pub fn new(mut config: BrowserConfig) -> Result<Self> {
193        config.validate()?;
194        let max_pages = config.max_pages;
195        Ok(Self {
196            config,
197            playwright: None,
198            browser: None,
199            context: None,
200            page_pool: PagePool::new(max_pages),
201            is_alive: false,
202        })
203    }
204
205    /// Launch the browser and create the initial browser context.
206    ///
207    /// Depending on the configuration this will either launch a new Chromium process,
208    /// connect to an existing one via CDP, or launch with a rotating proxy provider.
209    /// Init scripts and cookies from the config are applied to the context.
210    /// You must call this before calling [`fetch`](Self::fetch).
211    pub async fn start(&mut self) -> Result<()> {
212        let pw = launch_playwright().await?;
213        let chromium = pw.chromium();
214        let launch_opts = build_launch_options(&self.config, false, &[]);
215
216        if let Some(ref cdp_url) = self.config.cdp_url {
217            let browser = chromium.connect_over_cdp(cdp_url, None).await?;
218            if !self.config.has_proxy_rotator() {
219                let ctx = browser.new_context().await?;
220                initialize_context(&ctx, &self.config).await?;
221                self.context = Some(ctx);
222            }
223            self.browser = Some(browser);
224        } else if self.config.has_proxy_rotator() {
225            let browser = chromium.launch_with_options(launch_opts).await?;
226            self.browser = Some(browser);
227        } else {
228            let browser = chromium.launch_with_options(launch_opts).await?;
229            let ctx = browser.new_context().await?;
230            initialize_context(&ctx, &self.config).await?;
231            self.context = Some(ctx);
232            self.browser = Some(browser);
233        }
234
235        self.playwright = Some(pw);
236        self.is_alive = true;
237        info!("DynamicSession started");
238        Ok(())
239    }
240
241    /// Navigate to `url`, wait for stability, and return the page response with retries.
242    ///
243    /// Pass an optional [`FetchParams`] to override session-level settings for this
244    /// single request. The method retries up to `config.retries` times on failure,
245    /// sleeping `config.retry_delay_secs` between attempts.
246    pub async fn fetch(&self, url: &str, params: Option<FetchParams>) -> Result<Response> {
247        if !self.is_alive {
248            return Err(BrowserError::Config("session not started".into()));
249        }
250
251        let resolved = params.unwrap_or_default().merge_with_config(&self.config);
252        let mut last_error = None;
253
254        for attempt in 0..self.config.retries {
255            match self.do_fetch(url, &resolved).await {
256                Ok(response) => return Ok(response),
257                Err(e) => {
258                    if attempt < self.config.retries - 1 {
259                        warn!(attempt = attempt + 1, error = %e, "fetch failed, retrying");
260                        tokio::time::sleep(Duration::from_secs_f64(self.config.retry_delay_secs))
261                            .await;
262                    } else {
263                        error!(attempts = self.config.retries, "all retries exhausted");
264                    }
265                    last_error = Some(e);
266                }
267            }
268        }
269
270        Err(last_error.unwrap_or(BrowserError::Other("unknown error".into())))
271    }
272
273    async fn do_fetch(&self, url: &str, params: &ResolvedFetchParams) -> Result<Response> {
274        let page = self.get_page().await?;
275
276        setup_page(
277            &page,
278            params.timeout_ms,
279            &params.extra_headers,
280            params.disable_resources,
281            &params.blocked_domains,
282        )
283        .await?;
284
285        if let Some(ref cb) = self.config.page_setup {
286            cb(page.clone()).await?;
287        }
288
289        let goto_opts = make_goto_options(params.timeout_ms, params.network_idle);
290
291        debug!(url = %url, "navigating");
292        let nav_response = page.goto(url, Some(goto_opts)).await?;
293
294        wait_for_stability(&page, params.load_dom, params.network_idle).await?;
295
296        if let Some(ref cb) = self.config.page_action {
297            cb(page.clone()).await?;
298        }
299
300        if let Some(ref selector) = params.wait_selector {
301            wait_for_selector(
302                &page,
303                selector,
304                params.wait_selector_state,
305                params.timeout_ms,
306            )
307            .await?;
308        }
309
310        if params.wait_ms > 0 {
311            tokio::time::sleep(Duration::from_millis(params.wait_ms)).await;
312        }
313
314        let response = response_factory::from_browser_page(
315            &page,
316            nav_response.as_ref(),
317            nav_response.as_ref(),
318            HashMap::new(),
319            Vec::new(),
320        )
321        .await?;
322
323        page.close().await?;
324        info!(status = response.status, url = url, "fetch complete");
325        Ok(response)
326    }
327
328    async fn get_page(&self) -> Result<playwright_rs::Page> {
329        if let Some(ref ctx) = self.context {
330            ctx.new_page().await.map_err(Into::into)
331        } else if let Some(ref browser) = self.browser {
332            let ctx = browser.new_context().await?;
333            ctx.new_page().await.map_err(Into::into)
334        } else {
335            Err(BrowserError::Config("no browser available".into()))
336        }
337    }
338
339    /// Close the browser context, browser, and Playwright driver.
340    ///
341    /// This shuts down resources in reverse order: context, then browser, then
342    /// driver. After calling this, [`is_alive`](Self::is_alive) returns `false`
343    /// and any subsequent [`fetch`](Self::fetch) calls will fail.
344    pub async fn close(&mut self) -> Result<()> {
345        if let Some(ctx) = self.context.take() {
346            let _ = ctx.close().await;
347        }
348        if let Some(browser) = self.browser.take() {
349            let _ = browser.close().await;
350        }
351        self.playwright = None;
352        self.is_alive = false;
353        info!("DynamicSession closed");
354        Ok(())
355    }
356
357    /// Returns `true` if the session has been started and not yet closed.
358    /// Use this to guard against calling `fetch` on a session that has not been
359    /// started or has already been shut down.
360    pub fn is_alive(&self) -> bool {
361        self.is_alive
362    }
363
364    /// Return a snapshot of the page pool's current statistics.
365    /// See [`PoolStats`](crate::page_pool::PoolStats) for what is included.
366    pub fn pool_stats(&self) -> crate::page_pool::PoolStats {
367        self.page_pool.stats()
368    }
369}
370
371// ---------------------------------------------------------------------------
372// StealthySession — anti-detection browser automation with Cloudflare solver
373// ---------------------------------------------------------------------------
374
375/// Anti-detection browser automation session with optional Cloudflare challenge solving.
376///
377/// `StealthySession` is the stealth counterpart to [`DynamicSession`]. It uses the
378/// same Playwright infrastructure but launches Chromium with additional flags that
379/// remove automation indicators, block WebRTC IP leaks, inject canvas noise, and
380/// disable WebGL. It also includes a built-in Cloudflare Turnstile solver that
381/// detects non-interactive, managed, interactive, and embedded challenges and
382/// attempts to click through them automatically.
383pub struct StealthySession {
384    config: StealthConfig,
385    playwright: Option<playwright_rs::Playwright>,
386    browser: Option<playwright_rs::Browser>,
387    context: Option<playwright_rs::BrowserContext>,
388    page_pool: PagePool,
389    is_alive: bool,
390}
391
392impl StealthySession {
393    /// Create a new `StealthySession` from the given configuration, validating it upfront.
394    /// The browser is *not* launched yet -- call [`start`](Self::start) to do that.
395    /// Returns an error if the configuration fails validation.
396    pub fn new(mut config: StealthConfig) -> Result<Self> {
397        config.validate()?;
398        let max_pages = config.base.max_pages;
399        Ok(Self {
400            config,
401            playwright: None,
402            browser: None,
403            context: None,
404            page_pool: PagePool::new(max_pages),
405            is_alive: false,
406        })
407    }
408
409    /// Launch the browser with stealth flags and create the initial browser context.
410    ///
411    /// This works like [`DynamicSession::start`] but additionally applies the
412    /// stealth CLI flags from [`StealthConfig::extra_stealth_args`] and the
413    /// full [`constants::STEALTH_ARGS`] list. You must call this before calling
414    /// [`fetch`](Self::fetch).
415    pub async fn start(&mut self) -> Result<()> {
416        let pw = launch_playwright().await?;
417        let chromium = pw.chromium();
418        let extra = self.config.extra_stealth_args();
419        let launch_opts = build_launch_options(&self.config.base, true, &extra);
420
421        if let Some(ref cdp_url) = self.config.base.cdp_url {
422            let browser = chromium.connect_over_cdp(cdp_url, None).await?;
423            if !self.config.base.has_proxy_rotator() {
424                let ctx = browser.new_context().await?;
425                initialize_context(&ctx, &self.config.base).await?;
426                self.context = Some(ctx);
427            }
428            self.browser = Some(browser);
429        } else if self.config.base.has_proxy_rotator() {
430            let browser = chromium.launch_with_options(launch_opts).await?;
431            self.browser = Some(browser);
432        } else {
433            let browser = chromium.launch_with_options(launch_opts).await?;
434            let ctx = browser.new_context().await?;
435            initialize_context(&ctx, &self.config.base).await?;
436            self.context = Some(ctx);
437            self.browser = Some(browser);
438        }
439
440        self.playwright = Some(pw);
441        self.is_alive = true;
442        info!("StealthySession started");
443        Ok(())
444    }
445
446    /// Navigate to `url` with stealth measures, solve challenges if enabled, and return the response.
447    ///
448    /// If `solve_cloudflare` is enabled (either in the config or the per-request params),
449    /// the Cloudflare Turnstile solver runs after navigation. The method retries up to
450    /// `config.base.retries` times on failure.
451    pub async fn fetch(&self, url: &str, params: Option<FetchParams>) -> Result<Response> {
452        if !self.is_alive {
453            return Err(BrowserError::Config("session not started".into()));
454        }
455
456        let mut resolved = params
457            .unwrap_or_default()
458            .merge_with_config(&self.config.base);
459        if self.config.solve_cloudflare {
460            resolved.solve_cloudflare = true;
461        }
462
463        let mut last_error = None;
464
465        for attempt in 0..self.config.base.retries {
466            match self.do_fetch(url, &resolved).await {
467                Ok(response) => return Ok(response),
468                Err(e) => {
469                    if attempt < self.config.base.retries - 1 {
470                        warn!(attempt = attempt + 1, error = %e, "stealth fetch failed, retrying");
471                        tokio::time::sleep(Duration::from_secs_f64(
472                            self.config.base.retry_delay_secs,
473                        ))
474                        .await;
475                    }
476                    last_error = Some(e);
477                }
478            }
479        }
480
481        Err(last_error.unwrap_or(BrowserError::Other("unknown error".into())))
482    }
483
484    async fn do_fetch(&self, url: &str, params: &ResolvedFetchParams) -> Result<Response> {
485        let page = self.get_page().await?;
486
487        setup_page(
488            &page,
489            params.timeout_ms,
490            &params.extra_headers,
491            params.disable_resources,
492            &params.blocked_domains,
493        )
494        .await?;
495
496        if let Some(ref cb) = self.config.base.page_setup {
497            cb(page.clone()).await?;
498        }
499
500        let goto_opts = make_goto_options(params.timeout_ms, params.network_idle);
501
502        debug!(url = %url, "stealth navigating");
503        let nav_response = page.goto(url, Some(goto_opts)).await?;
504
505        wait_for_stability(&page, params.load_dom, params.network_idle).await?;
506
507        if params.solve_cloudflare {
508            self.cloudflare_solver(&page).await?;
509        }
510
511        if let Some(ref cb) = self.config.base.page_action {
512            cb(page.clone()).await?;
513        }
514
515        if let Some(ref selector) = params.wait_selector {
516            wait_for_selector(
517                &page,
518                selector,
519                params.wait_selector_state,
520                params.timeout_ms,
521            )
522            .await?;
523        }
524
525        if params.wait_ms > 0 {
526            tokio::time::sleep(Duration::from_millis(params.wait_ms)).await;
527        }
528
529        let response = response_factory::from_browser_page(
530            &page,
531            nav_response.as_ref(),
532            nav_response.as_ref(),
533            HashMap::new(),
534            Vec::new(),
535        )
536        .await?;
537
538        page.close().await?;
539        info!(
540            status = response.status,
541            url = url,
542            "stealth fetch complete"
543        );
544        Ok(response)
545    }
546
547    async fn get_page(&self) -> Result<playwright_rs::Page> {
548        if let Some(ref ctx) = self.context {
549            ctx.new_page().await.map_err(Into::into)
550        } else if let Some(ref browser) = self.browser {
551            let ctx = browser.new_context().await?;
552            ctx.new_page().await.map_err(Into::into)
553        } else {
554            Err(BrowserError::Config("no browser available".into()))
555        }
556    }
557
558    /// Detect and solve Cloudflare Turnstile challenges.
559    ///
560    /// Inspects the current page content for Cloudflare challenge markers. For
561    /// non-interactive challenges it polls the page title for up to 60 seconds. For
562    /// managed/interactive/embedded challenges it searches for the Turnstile iframe
563    /// and clicks it, retrying up to 10 times with 2-second delays.
564    async fn cloudflare_solver(&self, page: &playwright_rs::Page) -> Result<()> {
565        let _ = page
566            .wait_for_load_state(Some(playwright_rs::WaitUntil::NetworkIdle))
567            .await;
568        tokio::time::sleep(Duration::from_secs(2)).await;
569
570        let content = page
571            .content()
572            .await
573            .map_err(|e| BrowserError::Navigation(format!("cloudflare solver: {e}")))?;
574
575        let Some(challenge) = detect_cloudflare_challenge(&content) else {
576            debug!("no Cloudflare challenge detected");
577            return Ok(());
578        };
579
580        info!(challenge = %challenge, "Cloudflare challenge detected");
581
582        match challenge.as_str() {
583            "non-interactive" => {
584                for _ in 0..30 {
585                    tokio::time::sleep(Duration::from_secs(2)).await;
586                    let title = page.title().await.unwrap_or_default();
587                    if !title.contains("Just a moment") {
588                        debug!("Cloudflare non-interactive challenge resolved");
589                        return Ok(());
590                    }
591                }
592                warn!("Cloudflare non-interactive challenge did not resolve");
593            }
594            "managed" | "interactive" | "embedded" => {
595                let selectors = [
596                    "iframe[src*='challenges.cloudflare.com']",
597                    "#turnstile-wrapper iframe",
598                    ".cf-turnstile iframe",
599                ];
600
601                for _ in 0..10 {
602                    for selector in &selectors {
603                        let locator = page.locator(selector).await;
604                        if let Ok(count) = locator.count().await {
605                            if count > 0 {
606                                debug!(selector, "found Cloudflare iframe, clicking");
607                                let _ = locator.first().click(None).await;
608                                tokio::time::sleep(Duration::from_secs(3)).await;
609
610                                let new_content = page.content().await.unwrap_or_default();
611                                if detect_cloudflare_challenge(&new_content).is_none() {
612                                    info!("Cloudflare challenge solved");
613                                    return Ok(());
614                                }
615                            }
616                        }
617                    }
618                    tokio::time::sleep(Duration::from_secs(2)).await;
619                }
620                warn!("Cloudflare challenge could not be solved after retries");
621            }
622            _ => {}
623        }
624
625        Ok(())
626    }
627
628    /// Close the browser context, browser, and Playwright driver.
629    ///
630    /// This shuts down resources in reverse order: context, then browser, then
631    /// driver. After calling this, [`is_alive`](Self::is_alive) returns `false`
632    /// and any subsequent [`fetch`](Self::fetch) calls will fail.
633    pub async fn close(&mut self) -> Result<()> {
634        if let Some(ctx) = self.context.take() {
635            let _ = ctx.close().await;
636        }
637        if let Some(browser) = self.browser.take() {
638            let _ = browser.close().await;
639        }
640        self.playwright = None;
641        self.is_alive = false;
642        info!("StealthySession closed");
643        Ok(())
644    }
645
646    /// Returns `true` if the session has been started and not yet closed.
647    /// Use this to guard against calling `fetch` on a session that has not been
648    /// started or has already been shut down.
649    pub fn is_alive(&self) -> bool {
650        self.is_alive
651    }
652
653    /// Return a snapshot of the page pool's current statistics.
654    /// See [`PoolStats`](crate::page_pool::PoolStats) for what is included.
655    pub fn pool_stats(&self) -> crate::page_pool::PoolStats {
656        self.page_pool.stats()
657    }
658}
659
/// Classify the Cloudflare challenge present in `content`, if any.
///
/// The page source is scanned for a fixed list of markers in priority order and
/// the label of the first one found is returned: `"non-interactive"`,
/// `"managed"`, `"interactive"`, or `"embedded"` (a Turnstile script URL).
/// Returns `None` when no marker is present.
fn detect_cloudflare_challenge(content: &str) -> Option<String> {
    // (marker to search for, label to report); earlier entries win.
    const MARKERS: [(&str, &str); 4] = [
        ("cType: 'non-interactive'", "non-interactive"),
        ("cType: 'managed'", "managed"),
        ("cType: 'interactive'", "interactive"),
        ("challenges.cloudflare.com/turnstile/v", "embedded"),
    ];

    MARKERS
        .iter()
        .find(|&&(needle, _)| content.contains(needle))
        .map(|&(_, label)| label.to_owned())
}