adler_core/browser/mod.rs
1//! Browser backend for pages that are unusable from raw HTTP.
2//!
3//! A handful of sites (`bot-protected` tag — Instagram, X/Twitter, `TikTok`,
4//! Facebook, Threads, Snapchat, Weibo, …) refuse to render anything useful
5//! to a plain `reqwest` call: they ship a JavaScript login wall, a
6//! Cloudflare challenge, or a TLS-fingerprint check. From Adler's signal
7//! perspective the response looks identical for an existing account and a
8//! missing one, so the verdict is always `Uncertain`.
9//!
10//! This module adds a thin abstraction over a *real* browser that can
11//! execute JS, accept cookies, present a residential / mobile IP, and
12//! return the final post-JS DOM. The existing detection signals
13//! (`status_found`, `body_*`, `redirect_absent`) then work on the rendered
14//! page exactly as they do on a raw HTTP response.
15//!
16//! ## Backends
17//!
18//! - [`local::LocalBackend`] launches a headless Chrome/Chromium process
19//! via [`chromiumoxide`]. Free, runs on the user's machine, requires
20//! Chrome to be installed.
21//! - [`browserbase::BrowserbaseBackend`] creates a remote session on
22//! <https://browserbase.com> and connects to it via the CDP WebSocket
23//! the service exposes. Pays per session-minute, no local setup, comes
24//! with a residential / mobile proxy pool out of the box.
25//!
26//! Both backends drive Chrome through the same chromiumoxide [`Browser`]
27//! handle — only the transport (process vs. WebSocket) differs.
28//!
29//! [`Browser`]: chromiumoxide::Browser
30
31pub mod browserbase;
32pub mod budget;
33pub mod cdp;
34pub mod local;
35
36#[cfg(test)]
37pub(crate) mod mock_cdp;
38
39use std::collections::BTreeMap;
40use std::time::Duration;
41
42use async_trait::async_trait;
43use url::Url;
44
45use crate::Result;
46
47pub use browserbase::{BrowserbaseBackend, BrowserbaseConfig};
48pub use budget::BrowserBudget;
49pub use local::{LocalBackend, LocalConfig};
50
51/// Page state captured after the backend finished loading and JS
52/// settled. Fed into the same `Signal` pipeline as a raw HTTP response.
53#[derive(Debug, Clone)]
54#[non_exhaustive]
55pub struct RenderedPage {
56 /// Final HTTP response status (after redirects).
57 pub status: u16,
58 /// Final URL the browser ended up on (after redirects + any
59 /// client-side navigation).
60 pub final_url: Url,
61 /// Outer HTML of the document at the end of the wait.
62 pub body: String,
63 /// Wall-clock time from `fetch` entry to `Ok`/`Err`, in milliseconds.
64 pub elapsed_ms: u64,
65}
66
67/// Abstraction over a real browser. Implemented by [`LocalBackend`] and
68/// [`BrowserbaseBackend`].
69///
70/// Backends are reused across many fetches for the lifetime of a scan —
71/// they own a long-lived [`chromiumoxide::Browser`] internally. Drop the
72/// backend to release the underlying resources (kill the local process or
73/// close the remote session).
74#[async_trait]
75pub trait BrowserBackend: Send + Sync {
76 /// Render `url` and return the final page state.
77 ///
78 /// `headers` are applied to *every* request the page issues (sent via
79 /// `Network.setExtraHTTPHeaders` before navigation). The map is keyed
80 /// by header name; empty means "no overrides, use defaults". Used by
81 /// sites whose JSON APIs require app-id or custom UA — e.g.
82 /// Instagram's `web_profile_info` endpoint needs `X-IG-App-ID`.
83 ///
84 /// Failures (timeout, navigation error, JS crash, etc.) should be
85 /// returned as `Err`; the caller will convert them into a
86 /// per-site `Uncertain` verdict so a single flaky site can't abort the
87 /// scan.
88 ///
89 /// # Errors
90 /// Returns [`Error::BrowserSetup`](crate::Error::BrowserSetup) on
91 /// connection / lifecycle problems and a generic browser error string
92 /// on per-fetch failures.
93 async fn fetch(
94 &self,
95 url: &Url,
96 headers: &BTreeMap<String, String>,
97 timeout: Duration,
98 ) -> Result<RenderedPage>;
99}