adler_core/browser/mod.rs
1//! Browser backend for pages that are unusable from raw HTTP.
2//!
3//! A handful of sites (`bot-protected` tag — Instagram, X/Twitter, `TikTok`,
4//! Facebook, Threads, Snapchat, Weibo, …) refuse to render anything useful
5//! to a plain `reqwest` call: they ship a JavaScript login wall, a
6//! Cloudflare challenge, or a TLS-fingerprint check. From Adler's signal
7//! perspective the response looks identical for an existing account and a
8//! missing one, so the verdict is always `Uncertain`.
9//!
10//! This module adds a thin abstraction over a *real* browser that can
11//! execute JS, accept cookies, present a residential / mobile IP, and
12//! return the final post-JS DOM. The existing detection signals
13//! (`status_found`, `body_*`, `redirect_absent`) then work on the rendered
14//! page exactly as they do on a raw HTTP response.
15//!
16//! ## Backends
17//!
18//! - [`local::LocalBackend`] launches a headless Chrome/Chromium process
19//! via [`chromiumoxide`]. Free, runs on the user's machine, requires
20//! Chrome to be installed.
21//! - [`browserbase::BrowserbaseBackend`] creates a remote session on
22//! <https://browserbase.com> and connects to it via the CDP WebSocket
23//! the service exposes. Pays per session-minute, no local setup, comes
24//! with a residential / mobile proxy pool out of the box.
25//!
26//! Both backends drive Chrome through the same chromiumoxide [`Browser`]
27//! handle — only the transport (process vs. WebSocket) differs.
28//!
29//! [`Browser`]: chromiumoxide::Browser
30
31pub mod browserbase;
32pub mod budget;
33pub mod cdp;
34pub mod flaresolverr;
35pub mod local;
36
37#[cfg(test)]
38pub(crate) mod mock_cdp;
39
40use std::collections::BTreeMap;
41use std::time::Duration;
42
43use async_trait::async_trait;
44use url::Url;
45
46use crate::Result;
47
48pub use browserbase::{BrowserbaseBackend, BrowserbaseConfig};
49pub use budget::BrowserBudget;
50pub use flaresolverr::FlareSolverrBackend;
51pub use local::{LocalBackend, LocalConfig};
52
53/// Page state captured after the backend finished loading and JS
54/// settled. Fed into the same `Signal` pipeline as a raw HTTP response.
55#[derive(Debug, Clone)]
56#[non_exhaustive]
57pub struct RenderedPage {
58 /// Final HTTP response status (after redirects).
59 pub status: u16,
60 /// Final URL the browser ended up on (after redirects + any
61 /// client-side navigation).
62 pub final_url: Url,
63 /// Outer HTML of the document at the end of the wait.
64 pub body: String,
65 /// Wall-clock time from `fetch` entry to `Ok`/`Err`, in milliseconds.
66 pub elapsed_ms: u64,
67}
68
69/// Abstraction over a real browser. Implemented by [`LocalBackend`] and
70/// [`BrowserbaseBackend`].
71///
72/// Backends are reused across many fetches for the lifetime of a scan —
73/// they own a long-lived [`chromiumoxide::Browser`] internally. Drop the
74/// backend to release the underlying resources (kill the local process or
75/// close the remote session).
76#[async_trait]
77pub trait BrowserBackend: Send + Sync {
78 /// Render `url` and return the final page state.
79 ///
80 /// `headers` are applied to *every* request the page issues (sent via
81 /// `Network.setExtraHTTPHeaders` before navigation). The map is keyed
82 /// by header name; empty means "no overrides, use defaults". Used by
83 /// sites whose JSON APIs require app-id or custom UA — e.g.
84 /// Instagram's `web_profile_info` endpoint needs `X-IG-App-ID`.
85 ///
86 /// Failures (timeout, navigation error, JS crash, etc.) should be
87 /// returned as `Err`; the caller will convert them into a
88 /// per-site `Uncertain` verdict so a single flaky site can't abort the
89 /// scan.
90 ///
91 /// # Errors
92 /// Returns [`Error::BrowserSetup`](crate::Error::BrowserSetup) on
93 /// connection / lifecycle problems and a generic browser error string
94 /// on per-fetch failures.
95 async fn fetch(
96 &self,
97 url: &Url,
98 headers: &BTreeMap<String, String>,
99 timeout: Duration,
100 ) -> Result<RenderedPage>;
101}