Skip to main content

adler_core/browser/
mod.rs

1//! Browser backend for pages that are unusable from raw HTTP.
2//!
3//! A handful of sites (`bot-protected` tag — Instagram, X/Twitter, `TikTok`,
4//! Facebook, Threads, Snapchat, Weibo, …) refuse to render anything useful
5//! to a plain `reqwest` call: they ship a JavaScript login wall, a
6//! Cloudflare challenge, or a TLS-fingerprint check. From Adler's signal
7//! perspective the response looks identical for an existing account and a
8//! missing one, so the verdict is always `Uncertain`.
9//!
10//! This module adds a thin abstraction over a *real* browser that can
11//! execute JS, accept cookies, present a residential / mobile IP, and
12//! return the final post-JS DOM. The existing detection signals
13//! (`status_found`, `body_*`, `redirect_absent`) then work on the rendered
14//! page exactly as they do on a raw HTTP response.
15//!
16//! ## Backends
17//!
18//! - [`local::LocalBackend`] launches a headless Chrome/Chromium process
19//!   via [`chromiumoxide`]. Free, runs on the user's machine, requires
20//!   Chrome to be installed.
21//! - [`browserbase::BrowserbaseBackend`] creates a remote session on
22//!   <https://browserbase.com> and connects to it via the CDP WebSocket
23//!   the service exposes. Billed per session-minute, needs no local
24//!   setup, and comes with a residential / mobile proxy pool out of the box.
25//!
26//! Both backends drive Chrome through the same chromiumoxide [`Browser`]
27//! handle — only the transport (process vs. WebSocket) differs.
28//!
29//! [`Browser`]: chromiumoxide::Browser
30
31pub mod browserbase;
32pub mod budget;
33pub mod cdp;
34pub mod local;
35
36#[cfg(test)]
37pub(crate) mod mock_cdp;
38
39use std::collections::BTreeMap;
40use std::time::Duration;
41
42use async_trait::async_trait;
43use url::Url;
44
45use crate::Result;
46
47pub use browserbase::{BrowserbaseBackend, BrowserbaseConfig};
48pub use budget::BrowserBudget;
49pub use local::{LocalBackend, LocalConfig};
50
51/// Page state captured after the backend finished loading and JS
52/// settled. Fed into the same `Signal` pipeline as a raw HTTP response.
53#[derive(Debug, Clone)]
54#[non_exhaustive]
55pub struct RenderedPage {
56    /// Final HTTP response status (after redirects).
57    pub status: u16,
58    /// Final URL the browser ended up on (after redirects + any
59    /// client-side navigation).
60    pub final_url: Url,
61    /// Outer HTML of the document at the end of the wait.
62    pub body: String,
63    /// Wall-clock time from `fetch` entry to `Ok`/`Err`, in milliseconds.
64    pub elapsed_ms: u64,
65}
66
67/// Abstraction over a real browser. Implemented by [`LocalBackend`] and
68/// [`BrowserbaseBackend`].
69///
70/// Backends are reused across many fetches for the lifetime of a scan —
71/// they own a long-lived [`chromiumoxide::Browser`] internally. Drop the
72/// backend to release the underlying resources (kill the local process or
73/// close the remote session).
74#[async_trait]
75pub trait BrowserBackend: Send + Sync {
76    /// Render `url` and return the final page state.
77    ///
78    /// `headers` are applied to *every* request the page issues (sent via
79    /// `Network.setExtraHTTPHeaders` before navigation). The map is keyed
80    /// by header name; empty means "no overrides, use defaults". Used by
81    /// sites whose JSON APIs require app-id or custom UA — e.g.
82    /// Instagram's `web_profile_info` endpoint needs `X-IG-App-ID`.
83    ///
84    /// Failures (timeout, navigation error, JS crash, etc.) should be
85    /// returned as `Err`; the caller will convert them into a
86    /// per-site `Uncertain` verdict so a single flaky site can't abort the
87    /// scan.
88    ///
89    /// # Errors
90    /// Returns [`Error::BrowserSetup`](crate::Error::BrowserSetup) on
91    /// connection / lifecycle problems and a generic browser error string
92    /// on per-fetch failures.
93    async fn fetch(
94        &self,
95        url: &Url,
96        headers: &BTreeMap<String, String>,
97        timeout: Duration,
98    ) -> Result<RenderedPage>;
99}