Skip to main content

adler_core/browser/
mod.rs

1//! Browser backend for pages that are unusable from raw HTTP.
2//!
3//! A handful of sites (`bot-protected` tag — Instagram, X/Twitter, `TikTok`,
4//! Facebook, Threads, Snapchat, Weibo, …) refuse to render anything useful
5//! to a plain `reqwest` call: they ship a JavaScript login wall, a
6//! Cloudflare challenge, or a TLS-fingerprint check. From Adler's signal
7//! perspective the response looks identical for an existing account and a
8//! missing one, so the verdict is always `Uncertain`.
9//!
10//! This module adds a thin abstraction over a *real* browser that can
11//! execute JS, accept cookies, present a residential / mobile IP, and
12//! return the final post-JS DOM. The existing detection signals
13//! (`status_found`, `body_*`, `redirect_absent`) then work on the rendered
14//! page exactly as they do on a raw HTTP response.
15//!
16//! ## Backends
17//!
18//! - [`local::LocalBackend`] launches a headless Chrome/Chromium process
19//!   via [`chromiumoxide`]. Free, runs on the user's machine, requires
20//!   Chrome to be installed.
21//! - [`browserbase::BrowserbaseBackend`] creates a remote session on
22//!   <https://browserbase.com> and connects to it via the CDP WebSocket
23//!   the service exposes. Pays per session-minute, no local setup, comes
24//!   with a residential / mobile proxy pool out of the box.
25//!
26//! Both backends drive Chrome through the same chromiumoxide [`Browser`]
27//! handle — only the transport (process vs. WebSocket) differs.
28//!
29//! [`Browser`]: chromiumoxide::Browser
30
31pub mod browserbase;
32pub mod budget;
33pub mod cdp;
34pub mod flaresolverr;
35pub mod local;
36
37#[cfg(test)]
38pub(crate) mod mock_cdp;
39
40use std::collections::BTreeMap;
41use std::time::Duration;
42
43use async_trait::async_trait;
44use url::Url;
45
46use crate::Result;
47
48pub use browserbase::{BrowserbaseBackend, BrowserbaseConfig};
49pub use budget::BrowserBudget;
50pub use flaresolverr::FlareSolverrBackend;
51pub use local::{LocalBackend, LocalConfig};
52
53/// Page state captured after the backend finished loading and JS
54/// settled. Fed into the same `Signal` pipeline as a raw HTTP response.
55#[derive(Debug, Clone)]
56#[non_exhaustive]
57pub struct RenderedPage {
58    /// Final HTTP response status (after redirects).
59    pub status: u16,
60    /// Final URL the browser ended up on (after redirects + any
61    /// client-side navigation).
62    pub final_url: Url,
63    /// Outer HTML of the document at the end of the wait.
64    pub body: String,
65    /// Wall-clock time from `fetch` entry to `Ok`/`Err`, in milliseconds.
66    pub elapsed_ms: u64,
67}
68
69/// Abstraction over a real browser. Implemented by [`LocalBackend`] and
70/// [`BrowserbaseBackend`].
71///
72/// Backends are reused across many fetches for the lifetime of a scan —
73/// they own a long-lived [`chromiumoxide::Browser`] internally. Drop the
74/// backend to release the underlying resources (kill the local process or
75/// close the remote session).
76#[async_trait]
77pub trait BrowserBackend: Send + Sync {
78    /// Render `url` and return the final page state.
79    ///
80    /// `headers` are applied to *every* request the page issues (sent via
81    /// `Network.setExtraHTTPHeaders` before navigation). The map is keyed
82    /// by header name; empty means "no overrides, use defaults". Used by
83    /// sites whose JSON APIs require app-id or custom UA — e.g.
84    /// Instagram's `web_profile_info` endpoint needs `X-IG-App-ID`.
85    ///
86    /// Failures (timeout, navigation error, JS crash, etc.) should be
87    /// returned as `Err`; the caller will convert them into a
88    /// per-site `Uncertain` verdict so a single flaky site can't abort the
89    /// scan.
90    ///
91    /// # Errors
92    /// Returns [`Error::BrowserSetup`](crate::Error::BrowserSetup) on
93    /// connection / lifecycle problems and a generic browser error string
94    /// on per-fetch failures.
95    async fn fetch(
96        &self,
97        url: &Url,
98        headers: &BTreeMap<String, String>,
99        timeout: Duration,
100    ) -> Result<RenderedPage>;
101}