use std::{
env, fmt,
path::PathBuf,
sync::{
Arc, Once,
atomic::{AtomicBool, Ordering},
},
time::Duration,
};
use chromiumoxide::{
browser::{Browser, BrowserConfig},
cdp::browser_protocol::target::TargetId,
handler::Handler,
};
use rustls::crypto::ring::default_provider as ring_crypto_provider;
use serde_json::Value;
use tokio::{sync::Mutex, task::JoinHandle, time};
use crate::{
error::{Result, VoidCrawlError},
page::Page,
stealth::StealthConfig,
};
/// Chrome command-line switches used as the baseline for every locally
/// launched browser.
///
/// Entries are written without a leading `--`; when a switch carries a value
/// it follows the first `=`. [`assemble_chrome_args`] starts from this list and
/// lets a caller-supplied switch with the same key replace the default in
/// place, so each key should appear here at most once.
pub(crate) const DEFAULT_CHROME_ARGS: &[&str] = &[
// The unit tests in this file require this switch to stay in the defaults.
"disable-blink-features=AutomationControlled",
"disable-infobars",
"disable-features=IsolateOrigins,site-per-process,TranslateUI",
"disable-background-networking",
"disable-background-timer-throttling",
"disable-backgrounding-occluded-windows",
"disable-breakpad",
"disable-client-side-phishing-detection",
"disable-component-extensions-with-background-pages",
"disable-default-apps",
"disable-dev-shm-usage",
"disable-hang-monitor",
"disable-ipc-flooding-protection",
"disable-popup-blocking",
"disable-prompt-on-repost",
"disable-renderer-backgrounding",
"disable-sync",
"force-color-profile=srgb",
"metrics-recording-only",
"no-first-run",
"password-store=basic",
"use-mock-keychain",
"no-service-autorun",
"no-default-browser-check",
"no-pings",
"disable-component-update",
"disable-session-crashed-bubble",
"disable-search-engine-choice-screen",
"homepage=about:blank",
// GPU-related switches; the unit tests in this file pin all four as
// required defaults, and callers can override e.g. `use-angle`.
"enable-gpu",
"ignore-gpu-blocklist",
"use-angle=vulkan",
"disable-gpu-sandbox",
];
/// Launch timeout, in seconds, used when `CHROME_LAUNCH_TIMEOUT_SECS` is unset
/// or does not hold a usable value.
const DEFAULT_LAUNCH_TIMEOUT_SECS: u64 = 45;

/// Returns how long a browser launch may take before it is abandoned.
///
/// The value is read from the `CHROME_LAUNCH_TIMEOUT_SECS` environment
/// variable on every call. Surrounding whitespace is ignored so that values
/// such as `"30\n"` (typical of `.env` files and shell exports) are honoured
/// rather than silently discarded. An unset, non-numeric, negative, or zero
/// value falls back to [`DEFAULT_LAUNCH_TIMEOUT_SECS`].
fn launch_timeout() -> Duration {
    let secs = env::var("CHROME_LAUNCH_TIMEOUT_SECS")
        .ok()
        .and_then(|raw| raw.trim().parse::<u64>().ok())
        // A zero timeout could never succeed, so treat it as "not configured".
        .filter(|&n| n > 0)
        .unwrap_or(DEFAULT_LAUNCH_TIMEOUT_SECS);
    Duration::from_secs(secs)
}
/// Drops one leading `--` from a switch, if present, so caller-supplied
/// switches use the same spelling as the built-in defaults.
fn normalize_flag(arg: &str) -> &str {
    match arg.strip_prefix("--") {
        Some(bare) => bare,
        None => arg,
    }
}
/// Returns the part of a switch before its first `=`, or the whole switch
/// when it carries no value.
fn switch_key(arg: &str) -> &str {
    match arg.find('=') {
        Some(eq) => &arg[..eq],
        None => arg,
    }
}
/// Builds the final switch list for a launch: the built-in defaults with the
/// caller's `extra_args` layered on top.
///
/// Each extra argument is first stripped of a leading `--`. If a switch with
/// the same key (the text before the first `=`) is already in the list, it is
/// overwritten in place, keeping its position; otherwise the argument is
/// appended. Later extras can therefore also override earlier extras.
pub(crate) fn assemble_chrome_args(extra_args: &[String]) -> Vec<String> {
    let mut merged: Vec<String> = DEFAULT_CHROME_ARGS.iter().copied().map(String::from).collect();

    for raw in extra_args {
        let switch = normalize_flag(raw);
        let wanted = switch_key(switch);
        // Only the first entry with a matching key is replaced.
        match merged.iter().position(|existing| switch_key(existing) == wanted) {
            Some(idx) => merged[idx] = switch.to_owned(),
            None => merged.push(switch.to_owned()),
        }
    }

    merged
}
/// How a [`BrowserSession`] obtains its browser.
#[derive(Debug, Clone, Default)]
pub enum BrowserMode {
/// Launch a local browser in headless mode. This is the default.
#[default]
Headless,
/// Launch a local browser with its head (window) enabled.
Headful,
/// Connect to an already-running browser instead of launching one.
///
/// `ws_url` is used as-is when it starts with `ws://` or `wss://`;
/// otherwise it is treated as an HTTP base URL whose `/json/version`
/// endpoint is queried for the WebSocket debugger URL.
RemoteDebug { ws_url: String },
}
/// Collects launch/connect options for a [`BrowserSession`].
///
/// Apart from `mode` and `stealth`, the options only affect locally launched
/// browsers; the `RemoteDebug` path uses nothing but the WebSocket URL.
#[derive(Debug, Clone)]
#[must_use]
pub struct BrowserSessionBuilder {
// Whether to launch (headless/headful) or connect to a running browser.
mode: BrowserMode,
// Stealth settings stored on the session and applied to pages it creates.
stealth: StealthConfig,
// Caller switches layered over `DEFAULT_CHROME_ARGS` at launch.
extra_args: Vec<String>,
// Explicit browser binary path; left to the launcher when `None`.
chrome_executable: Option<String>,
// Proxy URL, passed to the browser as `--proxy-server=<url>`.
proxy: Option<String>,
// When true, the browser is launched with its sandbox disabled.
no_sandbox: bool,
// Browser window size as `(width, height)`.
window_size: Option<(u32, u32)>,
// Port forwarded to the launcher configuration.
port: Option<u16>,
// Profile directory to use; a temporary one is created when `None`.
user_data_dir: Option<PathBuf>,
}
impl Default for BrowserSessionBuilder {
    /// A headless, sandboxed launch with the Chrome-like stealth profile and
    /// every optional setting left unset.
    fn default() -> Self {
        let stealth = StealthConfig::chrome_like();
        let mode = BrowserMode::Headless;
        Self {
            mode,
            stealth,
            extra_args: vec![],
            no_sandbox: false,
            // Everything below is opt-in through the builder methods.
            chrome_executable: None,
            proxy: None,
            window_size: None,
            port: None,
            user_data_dir: None,
        }
    }
}
impl BrowserSessionBuilder {
    /// Creates a builder with the default settings (see [`Default`]).
    pub fn new() -> Self {
        Self::default()
    }

    /// Selects how the session obtains its browser.
    pub fn mode(self, mode: BrowserMode) -> Self {
        Self { mode, ..self }
    }

    /// Shorthand for `mode(BrowserMode::Headless)`.
    pub fn headless(self) -> Self {
        Self { mode: BrowserMode::Headless, ..self }
    }

    /// Shorthand for `mode(BrowserMode::Headful)`.
    pub fn headful(self) -> Self {
        Self { mode: BrowserMode::Headful, ..self }
    }

    /// Connects to a running browser at `ws_url` instead of launching one.
    pub fn remote_debug(self, ws_url: impl Into<String>) -> Self {
        let ws_url = ws_url.into();
        Self { mode: BrowserMode::RemoteDebug { ws_url }, ..self }
    }

    /// Replaces the stealth configuration.
    pub fn stealth(self, config: StealthConfig) -> Self {
        Self { stealth: config, ..self }
    }

    /// Replaces the stealth configuration with `StealthConfig::none()`.
    pub fn no_stealth(self) -> Self {
        Self { stealth: StealthConfig::none(), ..self }
    }

    /// Adds one extra browser switch; may be called repeatedly.
    pub fn arg(mut self, arg: impl Into<String>) -> Self {
        let switch: String = arg.into();
        self.extra_args.push(switch);
        self
    }

    /// Sets the browser binary to launch.
    pub fn chrome_executable(self, path: impl Into<String>) -> Self {
        Self { chrome_executable: Some(path.into()), ..self }
    }

    /// Sets the proxy server URL for the launched browser.
    pub fn proxy(self, proxy_url: impl Into<String>) -> Self {
        Self { proxy: Some(proxy_url.into()), ..self }
    }

    /// Launches the browser with its sandbox disabled.
    pub fn no_sandbox(self) -> Self {
        Self { no_sandbox: true, ..self }
    }

    /// Sets the browser window size.
    pub fn window_size(self, width: u32, height: u32) -> Self {
        Self { window_size: Some((width, height)), ..self }
    }

    /// Sets the port passed to the launcher configuration.
    pub fn port(self, port: u16) -> Self {
        Self { port: Some(port), ..self }
    }

    /// Uses `path` as the profile directory instead of a temporary one.
    pub fn user_data_dir(self, path: impl Into<PathBuf>) -> Self {
        Self { user_data_dir: Some(path.into()), ..self }
    }

    /// Overrides the viewport dimensions stored in the stealth configuration.
    pub fn viewport(mut self, width: u32, height: u32) -> Self {
        let stealth = &mut self.stealth;
        stealth.viewport_width = width;
        stealth.viewport_height = height;
        self
    }

    /// Consumes the builder and launches (or connects to) the browser.
    ///
    /// # Errors
    ///
    /// Propagates whatever `BrowserSession::connect_or_launch` reports.
    pub async fn launch(self) -> Result<BrowserSession> {
        let Self {
            mode,
            stealth,
            extra_args,
            chrome_executable,
            proxy,
            no_sandbox,
            window_size,
            port,
            user_data_dir,
        } = self;

        BrowserSession::connect_or_launch(
            mode,
            stealth,
            extra_args,
            chrome_executable,
            proxy,
            no_sandbox,
            window_size,
            port,
            user_data_dir,
        )
        .await
    }
}
/// A launched or attached browser together with the background task that
/// drives its handler stream.
pub struct BrowserSession {
// The browser handle; every operation takes this async mutex first.
browser: Arc<Mutex<Browser>>,
// Handle of the task started by `spawn_handler`; stored but never awaited
// in this file.
_handler_task: JoinHandle<()>,
// Set to `false` by the handler task once its stream ends.
handler_alive: Arc<AtomicBool>,
// Stealth settings applied to pages created through this session.
stealth: StealthConfig,
// True when connected via `BrowserMode::RemoteDebug`; `close` is then a no-op.
attached: bool,
// Temporary profile directory created when the caller supplied none. Held
// only so it lives as long as the session (presumably removed on drop by
// `tempfile` — confirm against its docs).
_user_data_dir: Option<tempfile::TempDir>,
}
impl fmt::Debug for BrowserSession {
    /// Prints only the stealth configuration; the remaining fields are
    /// elided and the output is marked non-exhaustive.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let mut out = f.debug_struct("BrowserSession");
        out.field("stealth", &self.stealth);
        out.finish_non_exhaustive()
    }
}
impl BrowserSession {
    /// Returns `true` until the background handler task has observed the end
    /// of the handler stream.
    pub fn is_alive(&self) -> bool {
        self.handler_alive.load(Ordering::Acquire)
    }

    /// Fails with [`VoidCrawlError::BrowserClosed`] once the handler task has
    /// ended, so callers get a clear error instead of a stalled command.
    fn check_alive(&self) -> Result<()> {
        if self.is_alive() {
            Ok(())
        } else {
            Err(VoidCrawlError::BrowserClosed)
        }
    }

    /// Starts a [`BrowserSessionBuilder`] with default settings.
    pub fn builder() -> BrowserSessionBuilder {
        BrowserSessionBuilder::new()
    }

    /// Launches a local headless browser with default settings.
    pub async fn launch_headless() -> Result<Self> {
        Self::builder().headless().launch().await
    }

    /// Launches a local headful browser with default settings.
    pub async fn launch_headful() -> Result<Self> {
        Self::builder().headful().launch().await
    }

    /// Attaches to an already-running browser.
    ///
    /// `ws_url` may be a `ws://`/`wss://` URL or an HTTP base URL exposing
    /// `/json/version`.
    pub async fn connect(ws_url: impl Into<String>) -> Result<Self> {
        Self::builder().remote_debug(ws_url).launch().await
    }

    /// Connects to or launches a browser according to `mode` and wraps it in
    /// a session.
    ///
    /// For `RemoteDebug` only the WebSocket URL is used; every other option
    /// applies to local launches only.
    ///
    /// # Errors
    ///
    /// * [`VoidCrawlError::ConnectionFailed`] when resolving or connecting to
    ///   a remote browser fails.
    /// * [`VoidCrawlError::LaunchFailed`] when the temporary profile directory
    ///   cannot be created, the configuration is rejected, or the browser
    ///   fails to start.
    #[allow(clippy::too_many_arguments, reason = "builder forwards all options at once")]
    async fn connect_or_launch(
        mode: BrowserMode,
        stealth: StealthConfig,
        extra_args: Vec<String>,
        chrome_executable: Option<String>,
        proxy: Option<String>,
        no_sandbox: bool,
        window_size: Option<(u32, u32)>,
        port: Option<u16>,
        persistent_user_data_dir: Option<PathBuf>,
    ) -> Result<Self> {
        // Only populated when we create the profile directory ourselves; it
        // is then stored on the session so it lives as long as the browser.
        let mut owned_user_data_dir: Option<tempfile::TempDir> = None;

        let (browser, handler) = match &mode {
            BrowserMode::RemoteDebug { ws_url } => {
                let ws = resolve_ws_url(ws_url).await?;
                Browser::connect(&ws)
                    .await
                    .map_err(|e| VoidCrawlError::ConnectionFailed(e.to_string()))?
            }
            BrowserMode::Headless | BrowserMode::Headful => {
                // The library's own default switches are disabled; the full
                // list comes from `assemble_chrome_args` below.
                let mut builder = BrowserConfig::builder().disable_default_args();

                if let Some(ref path) = persistent_user_data_dir {
                    builder = builder.user_data_dir(path);
                } else {
                    let tmp = tempfile::tempdir()
                        .map_err(|e| VoidCrawlError::LaunchFailed(format!("tmpdir: {e}")))?;
                    builder = builder.user_data_dir(tmp.path());
                    owned_user_data_dir = Some(tmp);
                }

                if matches!(mode, BrowserMode::Headful) {
                    builder = builder.with_head();
                } else {
                    builder = builder.new_headless_mode();
                }
                if let Some(ref exe) = chrome_executable {
                    builder = builder.chrome_executable(exe);
                }
                if no_sandbox {
                    builder = builder.no_sandbox();
                }
                if let Some((w, h)) = window_size {
                    builder = builder.window_size(w, h);
                }
                if let Some(p) = port {
                    builder = builder.port(p);
                }
                if let Some(ref p) = proxy {
                    builder = builder.arg(format!("--proxy-server={p}"));
                }
                for a in assemble_chrome_args(&extra_args) {
                    builder = builder.arg(a);
                }
                builder = builder.launch_timeout(launch_timeout());

                let config = builder.build().map_err(VoidCrawlError::LaunchFailed)?;
                Browser::launch(config)
                    .await
                    .map_err(|e| VoidCrawlError::LaunchFailed(e.to_string()))?
            }
        };

        let alive = Arc::new(AtomicBool::new(true));
        let handler_task = spawn_handler(handler, Arc::clone(&alive));

        Ok(Self {
            browser: Arc::new(Mutex::new(browser)),
            _handler_task: handler_task,
            handler_alive: alive,
            stealth,
            attached: matches!(mode, BrowserMode::RemoteDebug { .. }),
            _user_data_dir: owned_user_data_dir,
        })
    }

    /// Opens a new page, applies the session's stealth configuration while it
    /// is still on `about:blank`, then navigates it to `url`.
    ///
    /// # Errors
    ///
    /// Everything [`Self::new_blank_page`] can return, plus any navigation
    /// error.
    pub async fn new_page(&self, url: &str) -> Result<Page> {
        // Same steps as a blank page; only the final navigation differs.
        let page = self.new_blank_page().await?;
        page.navigate(url).await?;
        Ok(page)
    }

    /// Opens a new page on `about:blank` and applies the session's stealth
    /// configuration to it.
    ///
    /// # Errors
    ///
    /// [`VoidCrawlError::BrowserClosed`] if the handler task has ended,
    /// [`VoidCrawlError::PageError`] if the page cannot be created, or any
    /// error from applying the stealth configuration.
    pub async fn new_blank_page(&self) -> Result<Page> {
        self.check_alive()?;
        let page = {
            // Scope the guard so the lock is released before stealth setup.
            let browser = self.browser.lock().await;
            let cdp_page = browser
                .new_page("about:blank")
                .await
                .map_err(|e| VoidCrawlError::PageError(e.to_string()))?;
            Page::new(cdp_page)
        };
        page.apply_stealth(&self.stealth).await?;
        Ok(page)
    }

    /// Lists the pages currently known to the browser handle.
    ///
    /// # Errors
    ///
    /// [`VoidCrawlError::BrowserClosed`] or [`VoidCrawlError::PageError`].
    pub async fn pages(&self) -> Result<Vec<Page>> {
        self.check_alive()?;
        let browser = self.browser.lock().await;
        let cdp_pages =
            browser.pages().await.map_err(|e| VoidCrawlError::PageError(e.to_string()))?;
        Ok(cdp_pages.into_iter().map(Page::new).collect())
    }

    /// Returns the WebSocket address the browser handle is connected to.
    pub async fn websocket_url(&self) -> String {
        self.browser.lock().await.websocket_address().clone()
    }

    /// Wraps an existing browser target, identified by `target_id`, as a
    /// [`Page`]. No stealth configuration is applied.
    ///
    /// # Errors
    ///
    /// [`VoidCrawlError::BrowserClosed`] if the handler task has ended, or
    /// [`VoidCrawlError::PageError`] if the targets cannot be fetched or the
    /// target is not found.
    pub async fn attach_page(&self, target_id: &str) -> Result<Page> {
        self.check_alive()?;
        {
            let mut browser = self.browser.lock().await;
            browser.fetch_targets().await.map_err(|e| VoidCrawlError::PageError(e.to_string()))?;
        }
        // Short pause between fetching targets and looking one up (presumably
        // to let the handler register them). The lock is deliberately not
        // held here so other session operations are not stalled meanwhile.
        time::sleep(Duration::from_millis(100)).await;
        let browser = self.browser.lock().await;
        let cdp_page = browser
            .get_page(TargetId::new(target_id))
            .await
            .map_err(|e| VoidCrawlError::PageError(e.to_string()))?;
        Ok(Page::new(cdp_page))
    }

    /// Returns the browser's product string as reported by its version info.
    ///
    /// # Errors
    ///
    /// [`VoidCrawlError::BrowserClosed`] or [`VoidCrawlError::Other`].
    pub async fn version(&self) -> Result<String> {
        self.check_alive()?;
        let browser = self.browser.lock().await;
        let info = browser.version().await.map_err(|e| VoidCrawlError::Other(e.to_string()))?;
        Ok(info.product)
    }

    /// Asks the browser to close.
    ///
    /// Does nothing for attached sessions: a browser this session did not
    /// launch is left running.
    ///
    /// # Errors
    ///
    /// [`VoidCrawlError::Other`] if the close request fails.
    pub async fn close(&self) -> Result<()> {
        if self.attached {
            return Ok(());
        }
        let mut browser = self.browser.lock().await;
        // NOTE(review): this only issues the close request; whether the child
        // process should also be awaited here is worth confirming against the
        // chromiumoxide `Browser` documentation.
        browser.close().await.map_err(|e| VoidCrawlError::Other(e.to_string()))?;
        Ok(())
    }

    /// Returns `true` when this session attached to a remote browser rather
    /// than launching one.
    #[must_use]
    pub fn is_attached(&self) -> bool {
        self.attached
    }

    /// Returns the stealth configuration applied to pages created by this
    /// session.
    pub fn stealth_config(&self) -> &StealthConfig {
        &self.stealth
    }
}
/// Spawns the task that drives the browser's handler stream.
///
/// The task polls `handler` until the stream yields `None`, discarding every
/// item (including errors), and then clears `alive` so the session can
/// report that the browser connection is gone.
fn spawn_handler(mut handler: Handler, alive: Arc<AtomicBool>) -> JoinHandle<()> {
    use futures::StreamExt;

    let drive = async move {
        loop {
            if handler.next().await.is_none() {
                break;
            }
        }
        alive.store(false, Ordering::Release);
    };
    tokio::spawn(drive)
}
/// Turns a user-supplied endpoint into a WebSocket debugger URL.
///
/// A `ws://` or `wss://` URL is returned unchanged. Anything else is treated
/// as an HTTP base URL: `<url>/json/version` is fetched and its
/// `webSocketDebuggerUrl` field returned.
///
/// # Errors
///
/// [`VoidCrawlError::ConnectionFailed`] when the request fails, the endpoint
/// answers with a non-success HTTP status, the body is not JSON, or the
/// `webSocketDebuggerUrl` field is missing or not a string.
async fn resolve_ws_url(url: &str) -> Result<String> {
    if url.starts_with("ws://") || url.starts_with("wss://") {
        return Ok(url.to_string());
    }

    // Install the ring crypto provider as the process default exactly once
    // before the first HTTP request. The result is ignored on purpose
    // (presumably because a provider may already have been installed).
    static CRYPTO_INIT: Once = Once::new();
    CRYPTO_INIT.call_once(|| {
        let _ = ring_crypto_provider().install_default();
    });

    let version_url = format!("{}/json/version", url.trim_end_matches('/'));
    let resp: Value = reqwest::get(&version_url)
        .await
        // Reject 4xx/5xx here so an error page is reported as a failed GET
        // with its status code rather than as a confusing JSON parse error.
        .and_then(reqwest::Response::error_for_status)
        .map_err(|e| VoidCrawlError::ConnectionFailed(format!("GET {version_url}: {e}")))?
        .json()
        .await
        .map_err(|e| VoidCrawlError::ConnectionFailed(format!("parse {version_url}: {e}")))?;

    resp.get("webSocketDebuggerUrl").and_then(|v| v.as_str()).map(ToString::to_string).ok_or_else(
        || {
            VoidCrawlError::ConnectionFailed(
                "webSocketDebuggerUrl not found in /json/version response".into(),
            )
        },
    )
}
// Unit tests for the switch-merging logic; none of them launch a browser.
#[cfg(test)]
mod tests {
use super::{DEFAULT_CHROME_ARGS, assemble_chrome_args};
// Defaults must be stored without `--`, matching what `assemble_chrome_args`
// produces for caller-supplied switches.
#[test]
fn defaults_have_no_leading_double_dash() {
for f in DEFAULT_CHROME_ARGS {
assert!(!f.starts_with("--"), "default flag must not start with --: {f}");
}
}
// Guards against accidentally dropping the GPU and anti-automation defaults.
#[test]
fn defaults_enable_hardware_gpu_and_antiautomation() {
let args = assemble_chrome_args(&[]);
for expected in [
"use-angle=vulkan",
"enable-gpu",
"ignore-gpu-blocklist",
"disable-gpu-sandbox",
"disable-blink-features=AutomationControlled",
] {
assert!(args.iter().any(|a| a == expected), "missing default flag: {expected}");
}
}
// Switches whose key is not among the defaults are appended, in order, with
// any leading `--` removed.
#[test]
fn novel_extra_args_are_normalized_and_appended() {
let extra = vec!["--proxy-bypass-list=*".to_string(), "lang=fr".to_string()];
let args = assemble_chrome_args(&extra);
assert_eq!(
&args[args.len() - 2..],
&["proxy-bypass-list=*".to_string(), "lang=fr".to_string()][..]
);
assert_eq!(args.len(), DEFAULT_CHROME_ARGS.len() + extra.len());
}
// A caller switch with the same key as a default replaces it rather than
// adding a duplicate, so the list length is unchanged.
#[test]
fn caller_value_replaces_default_same_switch() {
let args = assemble_chrome_args(&["--use-angle=swiftshader".to_string()]);
let angle: Vec<&String> = args.iter().filter(|a| a.starts_with("use-angle")).collect();
assert_eq!(angle.len(), 1, "exactly one use-angle flag");
assert_eq!(angle[0], "use-angle=swiftshader");
assert!(!args.iter().any(|a| a == "use-angle=vulkan"), "default value must be gone");
assert_eq!(args.len(), DEFAULT_CHROME_ARGS.len());
}
// Overriding one switch must leave unrelated defaults untouched.
#[test]
fn override_is_in_place_and_leaves_other_defaults() {
let args = assemble_chrome_args(&["--use-angle=gl".to_string()]);
assert!(args.iter().any(|a| a == "enable-gpu"));
assert!(args.iter().any(|a| a == "disable-blink-features=AutomationControlled"));
}
// With no extras the result is exactly the default list, in the same order.
#[test]
fn no_extra_args_is_just_defaults() {
let args = assemble_chrome_args(&[]);
let defaults: Vec<String> = DEFAULT_CHROME_ARGS.iter().map(ToString::to_string).collect();
assert_eq!(args, defaults);
}
}