chromiumoxide/
browser.rs

1use hashbrown::HashMap;
2use reqwest::header::{HeaderMap, HeaderValue, CONTENT_TYPE};
3use std::future::Future;
4use std::time::Duration;
5use std::{
6    io,
7    path::{Path, PathBuf},
8};
9
10use futures::channel::mpsc::{channel, unbounded, Sender};
11use futures::channel::oneshot::channel as oneshot_channel;
12use futures::select;
13use futures::SinkExt;
14
15use crate::async_process::{self, Child, ExitStatus, Stdio};
16use crate::cmd::{to_command_response, CommandMessage};
17use crate::conn::Connection;
18use crate::detection::{self, DetectionOptions};
19use crate::error::{BrowserStderr, CdpError, Result};
20use crate::handler::browser::BrowserContext;
21use crate::handler::viewport::Viewport;
22use crate::handler::{Handler, HandlerConfig, HandlerMessage, REQUEST_TIMEOUT};
23use crate::listeners::{EventListenerRequest, EventStream};
24use crate::page::Page;
25use crate::utils;
26use chromiumoxide_cdp::cdp::browser_protocol::browser::{
27    BrowserContextId, CloseReturns, GetVersionParams, GetVersionReturns,
28};
29use chromiumoxide_cdp::cdp::browser_protocol::network::{Cookie, CookieParam};
30use chromiumoxide_cdp::cdp::browser_protocol::storage::{
31    ClearCookiesParams, GetCookiesParams, SetCookiesParams,
32};
33use chromiumoxide_cdp::cdp::browser_protocol::target::{
34    CreateBrowserContextParams, CreateTargetParams, DisposeBrowserContextParams,
35    GetBrowserContextsParams, GetBrowserContextsReturns, TargetId, TargetInfo,
36};
37use chromiumoxide_cdp::cdp::{CdpEventMessage, IntoEventKind};
38use chromiumoxide_types::*;
39use spider_network_blocker::intercept_manager::NetworkInterceptManager;
40
/// Default timeout for `Browser::launch`, expressed in milliseconds.
pub const LAUNCH_TIMEOUT: u64 = 20_000;
43
44lazy_static::lazy_static! {
45    /// The request client to get the web socket url.
46    static ref REQUEST_CLIENT: reqwest::Client = reqwest::Client::builder()
47        .timeout(Duration::from_secs(60))
48        .default_headers({
49            let mut m = HeaderMap::new();
50
51            m.insert(CONTENT_TYPE, HeaderValue::from_static("application/json"));
52
53            m
54        })
55        .tcp_keepalive(Some(Duration::from_secs(5)))
56        .pool_idle_timeout(Some(Duration::from_secs(60)))
57        .pool_max_idle_per_host(10)
58        .build()
59        .expect("client to build");
60}
61
/// A [`Browser`] is created when chromiumoxide connects to a Chromium instance.
///
/// It is obtained either by attaching to a running instance (`Browser::connect`,
/// `Browser::connect_with_config`) or by spawning a new one (`Browser::launch`).
/// All requests are forwarded as [`HandlerMessage`]s to the `Handler` returned
/// alongside the browser.
#[derive(Debug)]
pub struct Browser {
    /// The `Sender` to send messages to the connection handler that drives the
    /// websocket
    pub(crate) sender: Sender<HandlerMessage>,
    /// The configuration associated with this browser: the launch configuration
    /// of a spawned instance, or a configuration derived from the
    /// `HandlerConfig` when attaching to an existing instance
    config: Option<BrowserConfig>,
    /// The spawned chromium instance; `None` when attached to an existing one
    child: Option<Child>,
    /// The debug web socket url of the chromium instance
    debug_ws_url: String,
    /// The browser context that newly created pages are placed in
    pub browser_context: BrowserContext,
}
77
78/// Browser connection information.
79#[derive(serde::Deserialize, Debug, Default)]
80pub struct BrowserConnection {
81    #[serde(rename = "Browser")]
82    /// The browser name
83    pub browser: String,
84    #[serde(rename = "Protocol-Version")]
85    /// Browser version
86    pub protocol_version: String,
87    #[serde(rename = "User-Agent")]
88    /// User Agent used by default.
89    pub user_agent: String,
90    #[serde(rename = "V8-Version")]
91    /// The v8 engine version
92    pub v8_version: String,
93    #[serde(rename = "WebKit-Version")]
94    /// Webkit version
95    pub webkit_version: String,
96    #[serde(rename = "webSocketDebuggerUrl")]
97    /// Remote debugging address
98    pub web_socket_debugger_url: String,
99}
100
101impl Browser {
102    /// Connect to an already running chromium instance via the given URL.
103    ///
104    /// If the URL is a http(s) URL, it will first attempt to retrieve the Websocket URL from the `json/version` endpoint.
105    pub async fn connect(url: impl Into<String>) -> Result<(Self, Handler)> {
106        Self::connect_with_config(url, HandlerConfig::default()).await
107    }
108
109    // Connect to an already running chromium instance with a given `HandlerConfig`.
110    ///
111    /// If the URL is a http URL, it will first attempt to retrieve the Websocket URL from the `json/version` endpoint.
112    pub async fn connect_with_config(
113        url: impl Into<String>,
114        config: HandlerConfig,
115    ) -> Result<(Self, Handler)> {
116        let mut debug_ws_url = url.into();
117
118        if debug_ws_url.starts_with("http") {
119            match REQUEST_CLIENT
120                .get(
121                    if debug_ws_url.ends_with("/json/version")
122                        || debug_ws_url.ends_with("/json/version/")
123                    {
124                        debug_ws_url.to_owned()
125                    } else {
126                        format!(
127                            "{}{}json/version",
128                            &debug_ws_url,
129                            if debug_ws_url.ends_with('/') { "" } else { "/" }
130                        )
131                    },
132                )
133                .send()
134                .await
135            {
136                Ok(req) => {
137                    if let Ok(b) = req.bytes().await {
138                        if let Ok(connection) =
139                            crate::serde_json::from_slice::<Box<BrowserConnection>>(&b)
140                        {
141                            if !connection.web_socket_debugger_url.is_empty() {
142                                debug_ws_url = connection.web_socket_debugger_url;
143                            }
144                        }
145                    }
146                }
147                Err(_) => return Err(CdpError::NoResponse),
148            }
149        }
150
151        let conn = Connection::<CdpEventMessage>::connect(&debug_ws_url).await?;
152
153        let (tx, rx) = channel(1000);
154
155        let handler_config = BrowserConfig {
156            ignore_https_errors: config.ignore_https_errors,
157            viewport: config.viewport.clone(),
158            request_timeout: config.request_timeout,
159            request_intercept: config.request_intercept,
160            cache_enabled: config.cache_enabled,
161            ignore_visuals: config.ignore_visuals,
162            ignore_stylesheets: config.ignore_stylesheets,
163            ignore_javascript: config.ignore_javascript,
164            ignore_analytics: config.ignore_analytics,
165            ignore_ads: config.ignore_ads,
166            extra_headers: config.extra_headers.clone(),
167            only_html: config.only_html,
168            service_worker_enabled: config.service_worker_enabled,
169            intercept_manager: config.intercept_manager,
170            max_bytes_allowed: config.max_bytes_allowed,
171            ..Default::default()
172        };
173
174        let fut = Handler::new(conn, rx, config);
175        let browser_context = fut.default_browser_context().clone();
176
177        let browser = Self {
178            sender: tx,
179            config: Some(handler_config),
180            child: None,
181            debug_ws_url,
182            browser_context,
183        };
184
185        Ok((browser, fut))
186    }
187
188    /// Launches a new instance of `chromium` in the background and attaches to
189    /// its debug web socket.
190    ///
191    /// This fails when no chromium executable could be detected.
192    ///
193    /// This fails if no web socket url could be detected from the child
194    /// processes stderr for more than the configured `launch_timeout`
195    /// (20 seconds by default).
196    pub async fn launch(mut config: BrowserConfig) -> Result<(Self, Handler)> {
197        // Canonalize paths to reduce issues with sandboxing
198        config.executable = utils::canonicalize_except_snap(config.executable).await?;
199
200        // Launch a new chromium instance
201        let mut child = config.launch()?;
202
203        /// Faillible initialization to run once the child process is created.
204        ///
205        /// All faillible calls must be executed inside this function. This ensures that all
206        /// errors are caught and that the child process is properly cleaned-up.
207        async fn with_child(
208            config: &BrowserConfig,
209            child: &mut Child,
210        ) -> Result<(String, Connection<CdpEventMessage>)> {
211            let dur = config.launch_timeout;
212            let timeout_fut = Box::pin(tokio::time::sleep(dur));
213
214            // extract the ws:
215            let debug_ws_url = ws_url_from_output(child, timeout_fut).await?;
216            let conn = Connection::<CdpEventMessage>::connect(&debug_ws_url).await?;
217            Ok((debug_ws_url, conn))
218        }
219
220        let (debug_ws_url, conn) = match with_child(&config, &mut child).await {
221            Ok(conn) => conn,
222            Err(e) => {
223                // An initialization error occurred, clean up the process
224                if let Ok(Some(_)) = child.try_wait() {
225                    // already exited, do nothing, may happen if the browser crashed
226                } else {
227                    // the process is still alive, kill it and wait for exit (avoid zombie processes)
228                    child.kill().await.expect("`Browser::launch` failed but could not clean-up the child process (`kill`)");
229                    child.wait().await.expect("`Browser::launch` failed but could not clean-up the child process (`wait`)");
230                }
231                return Err(e);
232            }
233        };
234
235        // Only infaillible calls are allowed after this point to avoid clean-up issues with the
236        // child process.
237
238        let (tx, rx) = channel(1000);
239
240        let handler_config = HandlerConfig {
241            ignore_https_errors: config.ignore_https_errors,
242            viewport: config.viewport.clone(),
243            context_ids: Vec::new(),
244            request_timeout: config.request_timeout,
245            request_intercept: config.request_intercept,
246            cache_enabled: config.cache_enabled,
247            ignore_visuals: config.ignore_visuals,
248            ignore_stylesheets: config.ignore_stylesheets,
249            ignore_javascript: config.ignore_javascript,
250            ignore_analytics: config.ignore_analytics,
251            ignore_ads: config.ignore_ads,
252            extra_headers: config.extra_headers.clone(),
253            only_html: config.only_html,
254            service_worker_enabled: config.service_worker_enabled,
255            created_first_target: false,
256            intercept_manager: config.intercept_manager,
257            max_bytes_allowed: config.max_bytes_allowed,
258        };
259
260        let fut = Handler::new(conn, rx, handler_config);
261        let browser_context = fut.default_browser_context().clone();
262
263        let browser = Self {
264            sender: tx,
265            config: Some(config),
266            child: Some(child),
267            debug_ws_url,
268            browser_context,
269        };
270
271        Ok((browser, fut))
272    }
273
274    /// Request to fetch all existing browser targets.
275    ///
276    /// By default, only targets launched after the browser connection are tracked
277    /// when connecting to a existing browser instance with the devtools websocket url
278    /// This function fetches existing targets on the browser and adds them as pages internally
279    ///
280    /// The pages are not guaranteed to be ready as soon as the function returns
281    /// You should wait a few millis if you need to use a page
282    /// Returns [TargetInfo]
283    pub async fn fetch_targets(&mut self) -> Result<Vec<TargetInfo>> {
284        let (tx, rx) = oneshot_channel();
285
286        self.sender
287            .clone()
288            .send(HandlerMessage::FetchTargets(tx))
289            .await?;
290
291        rx.await?
292    }
293
294    /// Request for the browser to close completely.
295    ///
296    /// If the browser was spawned by [`Browser::launch`], it is recommended to wait for the
297    /// spawned instance exit, to avoid "zombie" processes ([`Browser::wait`],
298    /// [`Browser::wait_sync`], [`Browser::try_wait`]).
299    /// [`Browser::drop`] waits automatically if needed.
300    pub async fn close(&self) -> Result<CloseReturns> {
301        let (tx, rx) = oneshot_channel();
302
303        self.sender
304            .clone()
305            .send(HandlerMessage::CloseBrowser(tx))
306            .await?;
307
308        rx.await?
309    }
310
311    /// Asynchronously wait for the spawned chromium instance to exit completely.
312    ///
313    /// The instance is spawned by [`Browser::launch`]. `wait` is usually called after
314    /// [`Browser::close`]. You can call this explicitly to collect the process and avoid
315    /// "zombie" processes.
316    ///
317    /// This call has no effect if this [`Browser`] did not spawn any chromium instance (e.g.
318    /// connected to an existing browser through [`Browser::connect`])
319    pub async fn wait(&mut self) -> io::Result<Option<ExitStatus>> {
320        if let Some(child) = self.child.as_mut() {
321            Ok(Some(child.wait().await?))
322        } else {
323            Ok(None)
324        }
325    }
326
327    /// If the spawned chromium instance has completely exited, wait for it.
328    ///
329    /// The instance is spawned by [`Browser::launch`]. `try_wait` is usually called after
330    /// [`Browser::close`]. You can call this explicitly to collect the process and avoid
331    /// "zombie" processes.
332    ///
333    /// This call has no effect if this [`Browser`] did not spawn any chromium instance (e.g.
334    /// connected to an existing browser through [`Browser::connect`])
335    pub fn try_wait(&mut self) -> io::Result<Option<ExitStatus>> {
336        if let Some(child) = self.child.as_mut() {
337            child.try_wait()
338        } else {
339            Ok(None)
340        }
341    }
342
343    /// Get the spawned chromium instance
344    ///
345    /// The instance is spawned by [`Browser::launch`]. The result is a [`async_process::Child`]
346    /// value. It acts as a compat wrapper for an `async-std` or `tokio` child process.
347    ///
348    /// You may use [`async_process::Child::as_mut_inner`] to retrieve the concrete implementation
349    /// for the selected runtime.
350    ///
351    /// This call has no effect if this [`Browser`] did not spawn any chromium instance (e.g.
352    /// connected to an existing browser through [`Browser::connect`])
353    pub fn get_mut_child(&mut self) -> Option<&mut Child> {
354        self.child.as_mut()
355    }
356
357    /// Has a browser instance launched on system.
358    pub fn has_child(&self) -> bool {
359        self.child.is_some()
360    }
361
362    /// Forcibly kill the spawned chromium instance
363    ///
364    /// The instance is spawned by [`Browser::launch`]. `kill` will automatically wait for the child
365    /// process to exit to avoid "zombie" processes.
366    ///
367    /// This method is provided to help if the browser does not close by itself. You should prefer
368    /// to use [`Browser::close`].
369    ///
370    /// This call has no effect if this [`Browser`] did not spawn any chromium instance (e.g.
371    /// connected to an existing browser through [`Browser::connect`])
372    pub async fn kill(&mut self) -> Option<io::Result<()>> {
373        match self.child.as_mut() {
374            Some(child) => Some(child.kill().await),
375            None => None,
376        }
377    }
378
379    /// If not launched as incognito this creates a new incognito browser
380    /// context. After that this browser exists within the incognito session.
381    /// New pages created while being in incognito mode will also run in the
382    /// incognito context. Incognito contexts won't share cookies/cache with
383    /// other browser contexts.
384    pub async fn start_incognito_context(&mut self) -> Result<&mut Self> {
385        if !self.is_incognito_configured() {
386            let browser_context_id = self
387                .create_browser_context(CreateBrowserContextParams::default())
388                .await?;
389            self.browser_context = BrowserContext::from(browser_context_id);
390            self.sender
391                .clone()
392                .send(HandlerMessage::InsertContext(self.browser_context.clone()))
393                .await?;
394        }
395
396        Ok(self)
397    }
398
399    /// If a incognito session was created with
400    /// `Browser::start_incognito_context` this disposes this context.
401    ///
402    /// # Note This will also dispose all pages that were running within the
403    /// incognito context.
404    pub async fn quit_incognito_context_base(
405        &self,
406        browser_context_id: BrowserContextId,
407    ) -> Result<&Self> {
408        self.dispose_browser_context(browser_context_id.clone())
409            .await?;
410        self.sender
411            .clone()
412            .send(HandlerMessage::DisposeContext(BrowserContext::from(
413                browser_context_id,
414            )))
415            .await?;
416        Ok(self)
417    }
418
419    /// If a incognito session was created with
420    /// `Browser::start_incognito_context` this disposes this context.
421    ///
422    /// # Note This will also dispose all pages that were running within the
423    /// incognito context.
424    pub async fn quit_incognito_context(&mut self) -> Result<&mut Self> {
425        if let Some(id) = self.browser_context.take() {
426            let _ = self.quit_incognito_context_base(id).await;
427        }
428        Ok(self)
429    }
430
431    /// Whether incognito mode was configured from the start
432    fn is_incognito_configured(&self) -> bool {
433        self.config
434            .as_ref()
435            .map(|c| c.incognito)
436            .unwrap_or_default()
437    }
438
439    /// Returns the address of the websocket this browser is attached to
440    pub fn websocket_address(&self) -> &String {
441        &self.debug_ws_url
442    }
443
444    /// Whether the BrowserContext is incognito.
445    pub fn is_incognito(&self) -> bool {
446        self.is_incognito_configured() || self.browser_context.is_incognito()
447    }
448
449    /// The config of the spawned chromium instance if any.
450    pub fn config(&self) -> Option<&BrowserConfig> {
451        self.config.as_ref()
452    }
453
454    /// Create a new browser page
455    pub async fn new_page(&self, params: impl Into<CreateTargetParams>) -> Result<Page> {
456        let (tx, rx) = oneshot_channel();
457        let mut params = params.into();
458
459        if let Some(id) = self.browser_context.id() {
460            if params.browser_context_id.is_none() {
461                params.browser_context_id = Some(id.clone());
462            }
463        }
464
465        let _ = self
466            .sender
467            .clone()
468            .send(HandlerMessage::CreatePage(params, tx))
469            .await;
470
471        rx.await?
472    }
473
474    /// Version information about the browser
475    pub async fn version(&self) -> Result<GetVersionReturns> {
476        Ok(self.execute(GetVersionParams::default()).await?.result)
477    }
478
479    /// Returns the user agent of the browser
480    pub async fn user_agent(&self) -> Result<String> {
481        Ok(self.version().await?.user_agent)
482    }
483
484    /// Call a browser method.
485    pub async fn execute<T: Command>(&self, cmd: T) -> Result<CommandResponse<T::Response>> {
486        let (tx, rx) = oneshot_channel();
487        let method = cmd.identifier();
488        let msg = CommandMessage::new(cmd, tx)?;
489
490        self.sender
491            .clone()
492            .send(HandlerMessage::Command(msg))
493            .await?;
494        let resp = rx.await??;
495        to_command_response::<T>(resp, method)
496    }
497
498    /// Return all of the pages of the browser
499    pub async fn pages(&self) -> Result<Vec<Page>> {
500        let (tx, rx) = oneshot_channel();
501        self.sender
502            .clone()
503            .send(HandlerMessage::GetPages(tx))
504            .await?;
505        Ok(rx.await?)
506    }
507
508    /// Return page of given target_id
509    pub async fn get_page(&self, target_id: TargetId) -> Result<Page> {
510        let (tx, rx) = oneshot_channel();
511        self.sender
512            .clone()
513            .send(HandlerMessage::GetPage(target_id, tx))
514            .await?;
515        rx.await?.ok_or(CdpError::NotFound)
516    }
517
518    /// Set listener for browser event
519    pub async fn event_listener<T: IntoEventKind>(&self) -> Result<EventStream<T>> {
520        let (tx, rx) = unbounded();
521        self.sender
522            .clone()
523            .send(HandlerMessage::AddEventListener(
524                EventListenerRequest::new::<T>(tx),
525            ))
526            .await?;
527
528        Ok(EventStream::new(rx))
529    }
530
531    /// Creates a new empty browser context.
532    pub async fn create_browser_context(
533        &mut self,
534        params: CreateBrowserContextParams,
535    ) -> Result<BrowserContextId> {
536        let response = self.execute(params).await?;
537
538        Ok(response.result.browser_context_id)
539    }
540
541    /// Returns all browser contexts created with Target.createBrowserContext method.
542    pub async fn get_browser_contexts(
543        &mut self,
544        params: GetBrowserContextsParams,
545    ) -> Result<GetBrowserContextsReturns> {
546        let response = self.execute(params).await?;
547        Ok(response.result)
548    }
549
550    /// Send a new empty browser context.
551    pub async fn send_new_context(
552        &mut self,
553        browser_context_id: BrowserContextId,
554    ) -> Result<&Self> {
555        self.browser_context = BrowserContext::from(browser_context_id);
556        self.sender
557            .clone()
558            .send(HandlerMessage::InsertContext(self.browser_context.clone()))
559            .await?;
560        Ok(self)
561    }
562
563    /// Deletes a browser context.
564    pub async fn dispose_browser_context(
565        &self,
566        browser_context_id: impl Into<BrowserContextId>,
567    ) -> Result<&Self> {
568        self.execute(DisposeBrowserContextParams::new(browser_context_id))
569            .await?;
570
571        Ok(self)
572    }
573
574    /// Clears cookies.
575    pub async fn clear_cookies(&self) -> Result<&Self> {
576        self.execute(ClearCookiesParams::default()).await?;
577        Ok(self)
578    }
579
580    /// Returns all browser cookies.
581    pub async fn get_cookies(&self) -> Result<Vec<Cookie>> {
582        let mut cmd = GetCookiesParams::default();
583
584        cmd.browser_context_id = self.browser_context.id.clone();
585
586        Ok(self.execute(cmd).await?.result.cookies)
587    }
588
589    /// Sets given cookies.
590    pub async fn set_cookies(&self, mut cookies: Vec<CookieParam>) -> Result<&Self> {
591        for cookie in &mut cookies {
592            if let Some(url) = cookie.url.as_ref() {
593                crate::page::validate_cookie_url(url)?;
594            }
595        }
596
597        let mut cookies_param = SetCookiesParams::new(cookies);
598
599        cookies_param.browser_context_id = self.browser_context.id.clone();
600
601        self.execute(cookies_param).await?;
602        Ok(self)
603    }
604}
605
606impl Drop for Browser {
607    fn drop(&mut self) {
608        if let Some(child) = self.child.as_mut() {
609            if let Ok(Some(_)) = child.try_wait() {
610                // Already exited, do nothing. Usually occurs after using the method close or kill.
611            } else {
612                // We set the `kill_on_drop` property for the child process, so no need to explicitely
613                // kill it here. It can't really be done anyway since the method is async.
614                //
615                // On Unix, the process will be reaped in the background by the runtime automatically
616                // so it won't leave any resources locked. It is, however, a better practice for the user to
617                // do it himself since the runtime doesn't provide garantees as to when the reap occurs, so we
618                // warn him here.
619                tracing::warn!("Browser was not closed manually, it will be killed automatically in the background");
620            }
621        }
622    }
623}
624
625/// Resolve devtools WebSocket URL from the provided browser process
626///
627/// If an error occurs, it returns the browser's stderr output.
628///
629/// The URL resolution fails if:
630/// - [`CdpError::LaunchTimeout`]: `timeout_fut` completes, this corresponds to a timeout
631/// - [`CdpError::LaunchExit`]: the browser process exits (or is killed)
632/// - [`CdpError::LaunchIo`]: an input/output error occurs when await the process exit or reading
633///   the browser's stderr: end of stream, invalid UTF-8, other
634async fn ws_url_from_output(
635    child_process: &mut Child,
636    timeout_fut: impl Future<Output = ()> + Unpin,
637) -> Result<String> {
638    use futures::{AsyncBufReadExt, FutureExt};
639    let mut timeout_fut = timeout_fut.fuse();
640    let stderr = child_process.stderr.take().expect("no stderror");
641    let mut stderr_bytes = Vec::<u8>::new();
642    let mut exit_status_fut = Box::pin(child_process.wait()).fuse();
643    let mut buf = futures::io::BufReader::new(stderr);
644    loop {
645        select! {
646            _ = timeout_fut => return Err(CdpError::LaunchTimeout(BrowserStderr::new(stderr_bytes))),
647            exit_status = exit_status_fut => {
648                return Err(match exit_status {
649                    Err(e) => CdpError::LaunchIo(e, BrowserStderr::new(stderr_bytes)),
650                    Ok(exit_status) => CdpError::LaunchExit(exit_status, BrowserStderr::new(stderr_bytes)),
651                })
652            },
653            read_res = buf.read_until(b'\n', &mut stderr_bytes).fuse() => {
654                match read_res {
655                    Err(e) => return Err(CdpError::LaunchIo(e, BrowserStderr::new(stderr_bytes))),
656                    Ok(byte_count) => {
657                        if byte_count == 0 {
658                            let e = io::Error::new(io::ErrorKind::UnexpectedEof, "unexpected end of stream");
659                            return Err(CdpError::LaunchIo(e, BrowserStderr::new(stderr_bytes)));
660                        }
661                        let start_offset = stderr_bytes.len() - byte_count;
662                        let new_bytes = &stderr_bytes[start_offset..];
663                        match std::str::from_utf8(new_bytes) {
664                            Err(_) => {
665                                let e = io::Error::new(io::ErrorKind::InvalidData, "stream did not contain valid UTF-8");
666                                return Err(CdpError::LaunchIo(e, BrowserStderr::new(stderr_bytes)));
667                            }
668                            Ok(line) => {
669                                if let Some((_, ws)) = line.rsplit_once("listening on ") {
670                                    if ws.starts_with("ws") && ws.contains("devtools/browser") {
671                                        return Ok(ws.trim().to_string());
672                                    }
673                                }
674                            }
675                        }
676                    }
677                }
678            }
679        }
680    }
681}
682
/// The headless mode a browser is run in.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum HeadlessMode {
    /// Not headless at all: the "headful" mode.
    False,
    /// The old headless mode; this is the default.
    #[default]
    True,
    /// The new headless mode.
    /// See also: <https://developer.chrome.com/docs/chromium/new-headless>
    New,
}
693
694#[derive(Debug, Clone, Default)]
695pub struct BrowserConfig {
696    /// Determines whether to run headless version of the browser. Defaults to
697    /// true.
698    headless: HeadlessMode,
699    /// Determines whether to run the browser with a sandbox.
700    sandbox: bool,
701    /// Launch the browser with a specific window width and height.
702    window_size: Option<(u32, u32)>,
703    /// Launch the browser with a specific debugging port.
704    port: u16,
705    /// Path for Chrome or Chromium.
706    ///
707    /// If unspecified, the create will try to automatically detect a suitable
708    /// binary.
709    executable: std::path::PathBuf,
710
711    /// A list of Chrome extensions to load.
712    ///
713    /// An extension should be a path to a folder containing the extension code.
714    /// CRX files cannot be used directly and must be first extracted.
715    ///
716    /// Note that Chrome does not support loading extensions in headless-mode.
717    /// See https://bugs.chromium.org/p/chromium/issues/detail?id=706008#c5
718    extensions: Vec<String>,
719
720    /// Environment variables to set for the Chromium process.
721    /// Passes value through to std::process::Command::envs.
722    pub process_envs: Option<HashMap<String, String>>,
723
724    /// Data dir for user data
725    pub user_data_dir: Option<PathBuf>,
726
727    /// Whether to launch the `Browser` in incognito mode.
728    incognito: bool,
729
730    /// Timeout duration for `Browser::launch`.
731    launch_timeout: Duration,
732
733    /// Ignore https errors, default is true.
734    ignore_https_errors: bool,
735    pub viewport: Option<Viewport>,
736    /// The duration after a request with no response should time out.
737    request_timeout: Duration,
738
739    /// Additional command line arguments to pass to the browser instance.
740    args: Vec<String>,
741
742    /// Whether to disable DEFAULT_ARGS or not, default is false.
743    disable_default_args: bool,
744
745    /// Whether to enable request interception.
746    pub request_intercept: bool,
747
748    /// Whether to enable cache.
749    pub cache_enabled: bool,
750    /// Whether to enable or disable Service Workers.
751    /// Disabling may reduce background network activity and caching effects.
752    pub service_worker_enabled: bool,
753    /// Whether to ignore image/visual requests during interception.
754    /// Can reduce bandwidth and speed up crawling when visuals are unnecessary.
755    pub ignore_visuals: bool,
756    /// Whether to ignore stylesheet (CSS) requests during interception.
757    /// Useful for content-only crawls.
758    pub ignore_stylesheets: bool,
759    /// Whether to ignore JavaScript requests during interception.
760    /// This still allows critical framework bundles to pass when applicable.
761    pub ignore_javascript: bool,
762    /// Whether to ignore analytics/telemetry requests during interception.
763    pub ignore_analytics: bool,
764    /// Whether to ignore ad network requests during interception.
765    pub ignore_ads: bool,
766    /// Extra headers.
767    pub extra_headers: Option<std::collections::HashMap<String, String>>,
768    /// Only html
769    pub only_html: bool,
770    /// The interception intercept manager.
771    pub intercept_manager: NetworkInterceptManager,
772    /// The max bytes to receive.
773    pub max_bytes_allowed: Option<u64>,
774}
775
/// Builder for [`BrowserConfig`].
///
/// Obtain one via [`BrowserConfig::builder`] (or `Default::default()`), adjust
/// it with the chained setter methods and finish with `build`, which resolves
/// the browser executable and produces the final [`BrowserConfig`].
#[derive(Debug, Clone)]
pub struct BrowserConfigBuilder {
    /// Headless mode the browser is started in.
    headless: HeadlessMode,
    /// Whether to run the browser with its sandbox enabled.
    sandbox: bool,
    /// Optional initial browser window size as `(width, height)`.
    window_size: Option<(u32, u32)>,
    /// DevTools remote debugging port to bind to.
    port: u16,
    /// Optional explicit path to the Chrome/Chromium executable.
    /// If `None`, `build` falls back to auto-detection driven by
    /// `executation_detection`.
    executable: Option<PathBuf>,
    /// Options controlling auto-detection of a Chrome/Chromium binary; only
    /// consulted by `build` when `executable` is `None`.
    executation_detection: DetectionOptions,
    /// Unpacked extensions (directories) to load at startup.
    extensions: Vec<String>,
    /// Environment variables to set on the spawned browser process.
    process_envs: Option<HashMap<String, String>>,
    /// User data directory used to persist browser state, or `None` to let
    /// the launcher pick its own directory.
    user_data_dir: Option<PathBuf>,
    /// Whether to start the browser in incognito (off-the-record) mode.
    incognito: bool,
    /// Maximum time to wait for the browser to launch and become ready.
    launch_timeout: Duration,
    /// Whether to ignore HTTPS/TLS errors during navigation and requests.
    ignore_https_errors: bool,
    /// Default page viewport; `None` disables viewport emulation.
    viewport: Option<Viewport>,
    /// How long a request may stay without a response before it times out.
    request_timeout: Duration,
    /// Additional command-line flags passed directly to the browser process.
    args: Vec<String>,
    /// When `true`, the built-in default arguments are omitted and only the
    /// provided `args` (plus flags derived from other options) are used.
    disable_default_args: bool,
    /// Whether request interception is enabled.
    request_intercept: bool,
    /// Whether the browser cache is enabled.
    cache_enabled: bool,
    /// Whether Service Workers are enabled.
    service_worker_enabled: bool,
    /// Drop image/visual requests when interception is enabled.
    ignore_visuals: bool,
    /// Drop ad network requests when interception is enabled.
    ignore_ads: bool,
    /// Drop JavaScript requests when interception is enabled.
    ignore_javascript: bool,
    /// Drop stylesheet (CSS) requests when interception is enabled.
    ignore_stylesheets: bool,
    /// Drop analytics/telemetry requests when interception is enabled.
    ignore_analytics: bool,
    /// If `true`, limit fetching to HTML documents.
    only_html: bool,
    /// Extra HTTP headers to include with requests.
    extra_headers: Option<std::collections::HashMap<String, String>>,
    /// Network interception manager used to configure filtering behavior.
    intercept_manager: NetworkInterceptManager,
    /// Optional upper bound on the number of bytes that may be received.
    max_bytes_allowed: Option<u64>,
}
836
837impl BrowserConfig {
838    /// Browser builder default config.
839    pub fn builder() -> BrowserConfigBuilder {
840        BrowserConfigBuilder::default()
841    }
842
843    /// Launch with the executable path.
844    pub fn with_executable(path: impl AsRef<Path>) -> Self {
845        Self::builder()
846            .chrome_executable(path)
847            .build()
848            .expect("path to executable exist")
849    }
850}
851
852impl Default for BrowserConfigBuilder {
853    fn default() -> Self {
854        Self {
855            headless: HeadlessMode::True,
856            sandbox: true,
857            window_size: None,
858            port: 0,
859            executable: None,
860            executation_detection: DetectionOptions::default(),
861            extensions: Vec::new(),
862            process_envs: None,
863            user_data_dir: None,
864            incognito: false,
865            launch_timeout: Duration::from_millis(LAUNCH_TIMEOUT),
866            ignore_https_errors: true,
867            viewport: Some(Default::default()),
868            request_timeout: Duration::from_millis(REQUEST_TIMEOUT),
869            args: Vec::new(),
870            disable_default_args: false,
871            request_intercept: false,
872            cache_enabled: true,
873            ignore_visuals: false,
874            ignore_ads: false,
875            ignore_javascript: false,
876            ignore_analytics: false,
877            ignore_stylesheets: false,
878            only_html: false,
879            extra_headers: Default::default(),
880            service_worker_enabled: true,
881            intercept_manager: NetworkInterceptManager::Unknown,
882            max_bytes_allowed: None,
883        }
884    }
885}
886
887impl BrowserConfigBuilder {
888    /// Configure window size.
889    pub fn window_size(mut self, width: u32, height: u32) -> Self {
890        self.window_size = Some((width, height));
891        self
892    }
893    /// Configure sandboxing.
894    pub fn no_sandbox(mut self) -> Self {
895        self.sandbox = false;
896        self
897    }
898    /// Configure the launch to start non headless.
899    pub fn with_head(mut self) -> Self {
900        self.headless = HeadlessMode::False;
901        self
902    }
903    /// Configure the launch with the new headless mode.
904    pub fn new_headless_mode(mut self) -> Self {
905        self.headless = HeadlessMode::New;
906        self
907    }
908    /// Configure the launch with headless.
909    pub fn headless_mode(mut self, mode: HeadlessMode) -> Self {
910        self.headless = mode;
911        self
912    }
913    /// Configure the launch in incognito.
914    pub fn incognito(mut self) -> Self {
915        self.incognito = true;
916        self
917    }
918
919    pub fn respect_https_errors(mut self) -> Self {
920        self.ignore_https_errors = false;
921        self
922    }
923
924    pub fn port(mut self, port: u16) -> Self {
925        self.port = port;
926        self
927    }
928
929    pub fn with_max_bytes_allowed(mut self, max_bytes_allowed: Option<u64>) -> Self {
930        self.max_bytes_allowed = max_bytes_allowed;
931        self
932    }
933
934    pub fn launch_timeout(mut self, timeout: Duration) -> Self {
935        self.launch_timeout = timeout;
936        self
937    }
938
939    pub fn request_timeout(mut self, timeout: Duration) -> Self {
940        self.request_timeout = timeout;
941        self
942    }
943
944    /// Configures the viewport of the browser, which defaults to `800x600`.
945    /// `None` disables viewport emulation (i.e., it uses the browsers default
946    /// configuration, which fills the available space. This is similar to what
947    /// Playwright does when you provide `null` as the value of its `viewport`
948    /// option).
949    pub fn viewport(mut self, viewport: impl Into<Option<Viewport>>) -> Self {
950        self.viewport = viewport.into();
951        self
952    }
953
954    pub fn user_data_dir(mut self, data_dir: impl AsRef<Path>) -> Self {
955        self.user_data_dir = Some(data_dir.as_ref().to_path_buf());
956        self
957    }
958
959    pub fn chrome_executable(mut self, path: impl AsRef<Path>) -> Self {
960        self.executable = Some(path.as_ref().to_path_buf());
961        self
962    }
963
964    pub fn chrome_detection(mut self, options: DetectionOptions) -> Self {
965        self.executation_detection = options;
966        self
967    }
968
969    pub fn extension(mut self, extension: impl Into<String>) -> Self {
970        self.extensions.push(extension.into());
971        self
972    }
973
974    pub fn extensions<I, S>(mut self, extensions: I) -> Self
975    where
976        I: IntoIterator<Item = S>,
977        S: Into<String>,
978    {
979        for ext in extensions {
980            self.extensions.push(ext.into());
981        }
982        self
983    }
984
985    pub fn env(mut self, key: impl Into<String>, val: impl Into<String>) -> Self {
986        self.process_envs
987            .get_or_insert(HashMap::new())
988            .insert(key.into(), val.into());
989        self
990    }
991
992    pub fn envs<I, K, V>(mut self, envs: I) -> Self
993    where
994        I: IntoIterator<Item = (K, V)>,
995        K: Into<String>,
996        V: Into<String>,
997    {
998        self.process_envs
999            .get_or_insert(HashMap::new())
1000            .extend(envs.into_iter().map(|(k, v)| (k.into(), v.into())));
1001        self
1002    }
1003
1004    pub fn arg(mut self, arg: impl Into<String>) -> Self {
1005        self.args.push(arg.into());
1006        self
1007    }
1008
1009    pub fn args<I, S>(mut self, args: I) -> Self
1010    where
1011        I: IntoIterator<Item = S>,
1012        S: Into<String>,
1013    {
1014        for arg in args {
1015            self.args.push(arg.into());
1016        }
1017        self
1018    }
1019
1020    pub fn disable_default_args(mut self) -> Self {
1021        self.disable_default_args = true;
1022        self
1023    }
1024
1025    pub fn enable_request_intercept(mut self) -> Self {
1026        self.request_intercept = true;
1027        self
1028    }
1029
1030    pub fn disable_request_intercept(mut self) -> Self {
1031        self.request_intercept = false;
1032        self
1033    }
1034
1035    pub fn enable_cache(mut self) -> Self {
1036        self.cache_enabled = true;
1037        self
1038    }
1039
1040    pub fn disable_cache(mut self) -> Self {
1041        self.cache_enabled = false;
1042        self
1043    }
1044
1045    pub fn set_service_worker_enabled(mut self, bypass: bool) -> Self {
1046        self.service_worker_enabled = bypass;
1047        self
1048    }
1049
1050    pub fn set_extra_headers(
1051        mut self,
1052        headers: Option<std::collections::HashMap<String, String>>,
1053    ) -> Self {
1054        self.extra_headers = headers;
1055        self
1056    }
1057
1058    pub fn build(self) -> std::result::Result<BrowserConfig, String> {
1059        let executable = if let Some(e) = self.executable {
1060            e
1061        } else {
1062            detection::default_executable(self.executation_detection)?
1063        };
1064
1065        Ok(BrowserConfig {
1066            headless: self.headless,
1067            sandbox: self.sandbox,
1068            window_size: self.window_size,
1069            port: self.port,
1070            executable,
1071            extensions: self.extensions,
1072            process_envs: self.process_envs,
1073            user_data_dir: self.user_data_dir,
1074            incognito: self.incognito,
1075            launch_timeout: self.launch_timeout,
1076            ignore_https_errors: self.ignore_https_errors,
1077            viewport: self.viewport,
1078            request_timeout: self.request_timeout,
1079            args: self.args,
1080            disable_default_args: self.disable_default_args,
1081            request_intercept: self.request_intercept,
1082            cache_enabled: self.cache_enabled,
1083            ignore_visuals: self.ignore_visuals,
1084            ignore_ads: self.ignore_ads,
1085            ignore_javascript: self.ignore_javascript,
1086            ignore_analytics: self.ignore_analytics,
1087            ignore_stylesheets: self.ignore_stylesheets,
1088            extra_headers: self.extra_headers,
1089            only_html: self.only_html,
1090            intercept_manager: self.intercept_manager,
1091            service_worker_enabled: self.service_worker_enabled,
1092            max_bytes_allowed: self.max_bytes_allowed,
1093        })
1094    }
1095}
1096
1097impl BrowserConfig {
1098    pub fn launch(&self) -> io::Result<Child> {
1099        let mut cmd = async_process::Command::new(&self.executable);
1100
1101        if self.disable_default_args {
1102            cmd.args(&self.args);
1103        } else {
1104            cmd.args(DEFAULT_ARGS).args(&self.args);
1105        }
1106
1107        if !self
1108            .args
1109            .iter()
1110            .any(|arg| arg.contains("--remote-debugging-port="))
1111        {
1112            cmd.arg(format!("--remote-debugging-port={}", self.port));
1113        }
1114
1115        cmd.args(
1116            self.extensions
1117                .iter()
1118                .map(|e| format!("--load-extension={e}")),
1119        );
1120
1121        if let Some(ref user_data) = self.user_data_dir {
1122            cmd.arg(format!("--user-data-dir={}", user_data.display()));
1123        } else {
1124            // If the user did not specify a data directory, this would default to the systems default
1125            // data directory. In most cases, we would rather have a fresh instance of Chromium. Specify
1126            // a temp dir just for chromiumoxide instead.
1127            cmd.arg(format!(
1128                "--user-data-dir={}",
1129                std::env::temp_dir().join("chromiumoxide-runner").display()
1130            ));
1131        }
1132
1133        if let Some((width, height)) = self.window_size {
1134            cmd.arg(format!("--window-size={width},{height}"));
1135        }
1136
1137        if !self.sandbox {
1138            cmd.args(["--no-sandbox", "--disable-setuid-sandbox"]);
1139        }
1140
1141        match self.headless {
1142            HeadlessMode::False => (),
1143            HeadlessMode::True => {
1144                cmd.args(["--headless", "--hide-scrollbars", "--mute-audio"]);
1145            }
1146            HeadlessMode::New => {
1147                cmd.args(["--headless=new", "--hide-scrollbars", "--mute-audio"]);
1148            }
1149        }
1150
1151        if self.incognito {
1152            cmd.arg("--incognito");
1153        }
1154
1155        if let Some(ref envs) = self.process_envs {
1156            cmd.envs(envs);
1157        }
1158        cmd.stderr(Stdio::piped()).spawn()
1159    }
1160}
1161
1162/// Returns the path to Chrome's executable.
1163///
1164/// If the `CHROME` environment variable is set, `default_executable` will
1165/// use it as the default path. Otherwise, the filenames `google-chrome-stable`
1166/// `chromium`, `chromium-browser`, `chrome` and `chrome-browser` are
1167/// searched for in standard places. If that fails,
1168/// `/Applications/Google Chrome.app/...` (on MacOS) or the registry (on
1169/// Windows) is consulted. If all of the above fail, an error is returned.
1170#[deprecated(note = "Use detection::default_executable instead")]
1171pub fn default_executable() -> Result<std::path::PathBuf, String> {
1172    let options = DetectionOptions {
1173        msedge: false,
1174        unstable: false,
1175    };
1176    detection::default_executable(options)
1177}
1178
/// Command-line arguments passed to the Chrome binary by default.
///
/// They are prepended to the user-supplied arguments unless default
/// arguments are disabled in the configuration.
///
/// Based on Puppeteer's launcher defaults:
/// <https://github.com/puppeteer/puppeteer/blob/4846b8723cf20d3551c0d755df394cc5e0c82a94/src/node/Launcher.ts#L157>
// NOTE(review): `--disable-extensions` is part of this list while the launcher
// can also pass `--load-extension`; confirm the two interact as intended when
// extensions are configured together with the default arguments.
static DEFAULT_ARGS: [&str; 26] = [
    "--disable-background-networking",
    "--enable-features=NetworkService,NetworkServiceInProcess",
    "--disable-background-timer-throttling",
    "--disable-backgrounding-occluded-windows",
    "--disable-breakpad",
    "--disable-client-side-phishing-detection",
    "--disable-component-extensions-with-background-pages",
    "--disable-default-apps",
    "--disable-dev-shm-usage",
    "--disable-extensions",
    "--disable-features=TranslateUI",
    "--disable-hang-monitor",
    "--disable-ipc-flooding-protection",
    "--disable-popup-blocking",
    "--disable-prompt-on-repost",
    "--disable-renderer-backgrounding",
    "--disable-sync",
    "--force-color-profile=srgb",
    "--metrics-recording-only",
    "--no-first-run",
    "--enable-automation",
    "--password-store=basic",
    "--use-mock-keychain",
    "--enable-blink-features=IdleDetection",
    "--lang=en_US",
    "--disable-blink-features=AutomationControlled",
];