chromiumoxide/
browser.rs

1use hashbrown::HashMap;
2use reqwest::header::{HeaderMap, HeaderValue, CONTENT_TYPE};
3use std::future::Future;
4use std::time::Duration;
5use std::{
6    io,
7    path::{Path, PathBuf},
8};
9
10use futures::channel::mpsc::{channel, unbounded, Sender};
11use futures::channel::oneshot::channel as oneshot_channel;
12use futures::select;
13use futures::SinkExt;
14
15use crate::async_process::{self, Child, ExitStatus, Stdio};
16use crate::cmd::{to_command_response, CommandMessage};
17use crate::conn::Connection;
18use crate::detection::{self, DetectionOptions};
19use crate::error::{BrowserStderr, CdpError, Result};
20use crate::handler::browser::BrowserContext;
21use crate::handler::viewport::Viewport;
22use crate::handler::{Handler, HandlerConfig, HandlerMessage, REQUEST_TIMEOUT};
23use crate::listeners::{EventListenerRequest, EventStream};
24use crate::page::Page;
25use crate::utils;
26use chromiumoxide_cdp::cdp::browser_protocol::browser::{
27    BrowserContextId, CloseReturns, GetVersionParams, GetVersionReturns,
28};
29use chromiumoxide_cdp::cdp::browser_protocol::browser::{
30    PermissionDescriptor, PermissionSetting, SetPermissionParams,
31};
32use chromiumoxide_cdp::cdp::browser_protocol::network::{Cookie, CookieParam};
33use chromiumoxide_cdp::cdp::browser_protocol::storage::{
34    ClearCookiesParams, GetCookiesParams, SetCookiesParams,
35};
36use chromiumoxide_cdp::cdp::browser_protocol::target::{
37    CreateBrowserContextParams, CreateTargetParams, DisposeBrowserContextParams,
38    GetBrowserContextsParams, GetBrowserContextsReturns, TargetId, TargetInfo,
39};
40
41use chromiumoxide_cdp::cdp::{CdpEventMessage, IntoEventKind};
42use chromiumoxide_types::*;
43use spider_network_blocker::intercept_manager::NetworkInterceptManager;
44
/// Default timeout for `Browser::launch`, expressed in milliseconds
/// (20 seconds).
pub const LAUNCH_TIMEOUT: u64 = 20 * 1_000;
47
48lazy_static::lazy_static! {
49    /// The request client to get the web socket url.
50    static ref REQUEST_CLIENT: reqwest::Client = reqwest::Client::builder()
51        .timeout(Duration::from_secs(60))
52        .default_headers({
53            let mut m = HeaderMap::new();
54
55            m.insert(CONTENT_TYPE, HeaderValue::from_static("application/json"));
56
57            m
58        })
59        .tcp_keepalive(Some(Duration::from_secs(5)))
60        .pool_idle_timeout(Some(Duration::from_secs(60)))
61        .pool_max_idle_per_host(10)
62        .build()
63        .expect("client to build");
64}
65
/// A [`Browser`] is created when chromiumoxide connects to a Chromium instance.
///
/// Instances are produced either by [`Browser::launch`], which spawns a new
/// process, or by [`Browser::connect`] / [`Browser::connect_with_config`],
/// which attach to an already running one. Each constructor also returns the
/// [`Handler`] that owns the receiving end of `sender`.
#[derive(Debug)]
pub struct Browser {
    /// The `Sender` to send messages to the connection handler that drives the
    /// websocket
    pub(crate) sender: Sender<HandlerMessage>,
    /// The configuration this browser was created with. `launch` stores the
    /// config it was given; `connect_with_config` stores a config assembled
    /// from the supplied `HandlerConfig`.
    config: Option<BrowserConfig>,
    /// The spawned chromium instance; `None` when the browser was attached to
    /// via `connect`/`connect_with_config` rather than launched.
    child: Option<Child>,
    /// The debug web socket url of the chromium instance
    debug_ws_url: String,
    /// The browser context currently in use. It is replaced by
    /// `start_incognito_context` and `send_new_context`.
    pub browser_context: BrowserContext,
}
81
/// Browser connection information.
///
/// `Browser::connect_with_config` deserializes the body returned by the
/// `json/version` endpoint into this type; only `web_socket_debugger_url` is
/// consumed there.
#[derive(serde::Deserialize, Debug, Default)]
pub struct BrowserConnection {
    #[serde(rename = "Browser")]
    /// The browser name (JSON key `Browser`)
    pub browser: String,
    #[serde(rename = "Protocol-Version")]
    /// The protocol version (JSON key `Protocol-Version`)
    pub protocol_version: String,
    #[serde(rename = "User-Agent")]
    /// User Agent used by default (JSON key `User-Agent`)
    pub user_agent: String,
    #[serde(rename = "V8-Version")]
    /// The v8 engine version (JSON key `V8-Version`)
    pub v8_version: String,
    #[serde(rename = "WebKit-Version")]
    /// Webkit version (JSON key `WebKit-Version`)
    pub webkit_version: String,
    #[serde(rename = "webSocketDebuggerUrl")]
    /// Remote debugging web socket address (JSON key `webSocketDebuggerUrl`)
    pub web_socket_debugger_url: String,
}
104
105impl Browser {
106    /// Connect to an already running chromium instance via the given URL.
107    ///
108    /// If the URL is a http(s) URL, it will first attempt to retrieve the Websocket URL from the `json/version` endpoint.
109    pub async fn connect(url: impl Into<String>) -> Result<(Self, Handler)> {
110        Self::connect_with_config(url, HandlerConfig::default()).await
111    }
112
113    // Connect to an already running chromium instance with a given `HandlerConfig`.
114    ///
115    /// If the URL is a http URL, it will first attempt to retrieve the Websocket URL from the `json/version` endpoint.
116    pub async fn connect_with_config(
117        url: impl Into<String>,
118        config: HandlerConfig,
119    ) -> Result<(Self, Handler)> {
120        let mut debug_ws_url = url.into();
121
122        if debug_ws_url.starts_with("http") {
123            match REQUEST_CLIENT
124                .get(
125                    if debug_ws_url.ends_with("/json/version")
126                        || debug_ws_url.ends_with("/json/version/")
127                    {
128                        debug_ws_url.to_owned()
129                    } else {
130                        format!(
131                            "{}{}json/version",
132                            &debug_ws_url,
133                            if debug_ws_url.ends_with('/') { "" } else { "/" }
134                        )
135                    },
136                )
137                .send()
138                .await
139            {
140                Ok(req) => {
141                    if let Ok(b) = req.bytes().await {
142                        if let Ok(connection) =
143                            crate::serde_json::from_slice::<Box<BrowserConnection>>(&b)
144                        {
145                            if !connection.web_socket_debugger_url.is_empty() {
146                                debug_ws_url = connection.web_socket_debugger_url;
147                            }
148                        }
149                    }
150                }
151                Err(_) => return Err(CdpError::NoResponse),
152            }
153        }
154
155        let conn = Connection::<CdpEventMessage>::connect(&debug_ws_url).await?;
156
157        let (tx, rx) = channel(1000);
158
159        let handler_config = BrowserConfig {
160            ignore_https_errors: config.ignore_https_errors,
161            viewport: config.viewport.clone(),
162            request_timeout: config.request_timeout,
163            request_intercept: config.request_intercept,
164            cache_enabled: config.cache_enabled,
165            ignore_visuals: config.ignore_visuals,
166            ignore_stylesheets: config.ignore_stylesheets,
167            ignore_javascript: config.ignore_javascript,
168            ignore_analytics: config.ignore_analytics,
169            ignore_ads: config.ignore_ads,
170            extra_headers: config.extra_headers.clone(),
171            only_html: config.only_html,
172            service_worker_enabled: config.service_worker_enabled,
173            intercept_manager: config.intercept_manager,
174            max_bytes_allowed: config.max_bytes_allowed,
175            whitelist_patterns: config.whitelist_patterns.clone(),
176            ..Default::default()
177        };
178
179        let fut = Handler::new(conn, rx, config);
180        let browser_context = fut.default_browser_context().clone();
181
182        let browser = Self {
183            sender: tx,
184            config: Some(handler_config),
185            child: None,
186            debug_ws_url,
187            browser_context,
188        };
189
190        Ok((browser, fut))
191    }
192
193    /// Launches a new instance of `chromium` in the background and attaches to
194    /// its debug web socket.
195    ///
196    /// This fails when no chromium executable could be detected.
197    ///
198    /// This fails if no web socket url could be detected from the child
199    /// processes stderr for more than the configured `launch_timeout`
200    /// (20 seconds by default).
201    pub async fn launch(mut config: BrowserConfig) -> Result<(Self, Handler)> {
202        // Canonalize paths to reduce issues with sandboxing
203        config.executable = utils::canonicalize_except_snap(config.executable).await?;
204
205        // Launch a new chromium instance
206        let mut child = config.launch()?;
207
208        /// Faillible initialization to run once the child process is created.
209        ///
210        /// All faillible calls must be executed inside this function. This ensures that all
211        /// errors are caught and that the child process is properly cleaned-up.
212        async fn with_child(
213            config: &BrowserConfig,
214            child: &mut Child,
215        ) -> Result<(String, Connection<CdpEventMessage>)> {
216            let dur = config.launch_timeout;
217            let timeout_fut = Box::pin(tokio::time::sleep(dur));
218
219            // extract the ws:
220            let debug_ws_url = ws_url_from_output(child, timeout_fut).await?;
221            let conn = Connection::<CdpEventMessage>::connect(&debug_ws_url).await?;
222            Ok((debug_ws_url, conn))
223        }
224
225        let (debug_ws_url, conn) = match with_child(&config, &mut child).await {
226            Ok(conn) => conn,
227            Err(e) => {
228                // An initialization error occurred, clean up the process
229                if let Ok(Some(_)) = child.try_wait() {
230                    // already exited, do nothing, may happen if the browser crashed
231                } else {
232                    // the process is still alive, kill it and wait for exit (avoid zombie processes)
233                    child.kill().await.expect("`Browser::launch` failed but could not clean-up the child process (`kill`)");
234                    child.wait().await.expect("`Browser::launch` failed but could not clean-up the child process (`wait`)");
235                }
236                return Err(e);
237            }
238        };
239
240        // Only infaillible calls are allowed after this point to avoid clean-up issues with the
241        // child process.
242
243        let (tx, rx) = channel(1000);
244
245        let handler_config = HandlerConfig {
246            ignore_https_errors: config.ignore_https_errors,
247            viewport: config.viewport.clone(),
248            context_ids: Vec::new(),
249            request_timeout: config.request_timeout,
250            request_intercept: config.request_intercept,
251            cache_enabled: config.cache_enabled,
252            ignore_visuals: config.ignore_visuals,
253            ignore_stylesheets: config.ignore_stylesheets,
254            ignore_javascript: config.ignore_javascript,
255            ignore_analytics: config.ignore_analytics,
256            ignore_ads: config.ignore_ads,
257            extra_headers: config.extra_headers.clone(),
258            only_html: config.only_html,
259            service_worker_enabled: config.service_worker_enabled,
260            created_first_target: false,
261            intercept_manager: config.intercept_manager,
262            max_bytes_allowed: config.max_bytes_allowed,
263            whitelist_patterns: config.whitelist_patterns.clone(),
264        };
265
266        let fut = Handler::new(conn, rx, handler_config);
267        let browser_context = fut.default_browser_context().clone();
268
269        let browser = Self {
270            sender: tx,
271            config: Some(config),
272            child: Some(child),
273            debug_ws_url,
274            browser_context,
275        };
276
277        Ok((browser, fut))
278    }
279
280    /// Request to fetch all existing browser targets.
281    ///
282    /// By default, only targets launched after the browser connection are tracked
283    /// when connecting to a existing browser instance with the devtools websocket url
284    /// This function fetches existing targets on the browser and adds them as pages internally
285    ///
286    /// The pages are not guaranteed to be ready as soon as the function returns
287    /// You should wait a few millis if you need to use a page
288    /// Returns [TargetInfo]
289    pub async fn fetch_targets(&mut self) -> Result<Vec<TargetInfo>> {
290        let (tx, rx) = oneshot_channel();
291
292        self.sender
293            .clone()
294            .send(HandlerMessage::FetchTargets(tx))
295            .await?;
296
297        rx.await?
298    }
299
300    /// Request for the browser to close completely.
301    ///
302    /// If the browser was spawned by [`Browser::launch`], it is recommended to wait for the
303    /// spawned instance exit, to avoid "zombie" processes ([`Browser::wait`],
304    /// [`Browser::wait_sync`], [`Browser::try_wait`]).
305    /// [`Browser::drop`] waits automatically if needed.
306    pub async fn close(&self) -> Result<CloseReturns> {
307        let (tx, rx) = oneshot_channel();
308
309        self.sender
310            .clone()
311            .send(HandlerMessage::CloseBrowser(tx))
312            .await?;
313
314        rx.await?
315    }
316
317    /// Asynchronously wait for the spawned chromium instance to exit completely.
318    ///
319    /// The instance is spawned by [`Browser::launch`]. `wait` is usually called after
320    /// [`Browser::close`]. You can call this explicitly to collect the process and avoid
321    /// "zombie" processes.
322    ///
323    /// This call has no effect if this [`Browser`] did not spawn any chromium instance (e.g.
324    /// connected to an existing browser through [`Browser::connect`])
325    pub async fn wait(&mut self) -> io::Result<Option<ExitStatus>> {
326        if let Some(child) = self.child.as_mut() {
327            Ok(Some(child.wait().await?))
328        } else {
329            Ok(None)
330        }
331    }
332
333    /// If the spawned chromium instance has completely exited, wait for it.
334    ///
335    /// The instance is spawned by [`Browser::launch`]. `try_wait` is usually called after
336    /// [`Browser::close`]. You can call this explicitly to collect the process and avoid
337    /// "zombie" processes.
338    ///
339    /// This call has no effect if this [`Browser`] did not spawn any chromium instance (e.g.
340    /// connected to an existing browser through [`Browser::connect`])
341    pub fn try_wait(&mut self) -> io::Result<Option<ExitStatus>> {
342        if let Some(child) = self.child.as_mut() {
343            child.try_wait()
344        } else {
345            Ok(None)
346        }
347    }
348
    /// Get mutable access to the spawned chromium instance
    ///
    /// The instance is spawned by [`Browser::launch`]. The result is a [`async_process::Child`]
    /// value. It acts as a compat wrapper for an `async-std` or `tokio` child process.
    ///
    /// You may use [`async_process::Child::as_mut_inner`] to retrieve the concrete implementation
    /// for the selected runtime.
    ///
    /// Returns `None` if this [`Browser`] did not spawn any chromium instance (e.g.
    /// connected to an existing browser through [`Browser::connect`])
    pub fn get_mut_child(&mut self) -> Option<&mut Child> {
        self.child.as_mut()
    }
362
    /// Whether this [`Browser`] holds a child process handle.
    ///
    /// [`Browser::launch`] stores the process it spawned, while
    /// [`Browser::connect_with_config`] stores none.
    pub fn has_child(&self) -> bool {
        self.child.is_some()
    }
367
368    /// Forcibly kill the spawned chromium instance
369    ///
370    /// The instance is spawned by [`Browser::launch`]. `kill` will automatically wait for the child
371    /// process to exit to avoid "zombie" processes.
372    ///
373    /// This method is provided to help if the browser does not close by itself. You should prefer
374    /// to use [`Browser::close`].
375    ///
376    /// This call has no effect if this [`Browser`] did not spawn any chromium instance (e.g.
377    /// connected to an existing browser through [`Browser::connect`])
378    pub async fn kill(&mut self) -> Option<io::Result<()>> {
379        match self.child.as_mut() {
380            Some(child) => Some(child.kill().await),
381            None => None,
382        }
383    }
384
385    /// If not launched as incognito this creates a new incognito browser
386    /// context. After that this browser exists within the incognito session.
387    /// New pages created while being in incognito mode will also run in the
388    /// incognito context. Incognito contexts won't share cookies/cache with
389    /// other browser contexts.
390    pub async fn start_incognito_context(&mut self) -> Result<&mut Self> {
391        if !self.is_incognito_configured() {
392            let browser_context_id = self
393                .create_browser_context(CreateBrowserContextParams::default())
394                .await?;
395            self.browser_context = BrowserContext::from(browser_context_id);
396            self.sender
397                .clone()
398                .send(HandlerMessage::InsertContext(self.browser_context.clone()))
399                .await?;
400        }
401
402        Ok(self)
403    }
404
405    /// If a incognito session was created with
406    /// `Browser::start_incognito_context` this disposes this context.
407    ///
408    /// # Note This will also dispose all pages that were running within the
409    /// incognito context.
410    pub async fn quit_incognito_context_base(
411        &self,
412        browser_context_id: BrowserContextId,
413    ) -> Result<&Self> {
414        self.dispose_browser_context(browser_context_id.clone())
415            .await?;
416        self.sender
417            .clone()
418            .send(HandlerMessage::DisposeContext(BrowserContext::from(
419                browser_context_id,
420            )))
421            .await?;
422        Ok(self)
423    }
424
425    /// If a incognito session was created with
426    /// `Browser::start_incognito_context` this disposes this context.
427    ///
428    /// # Note This will also dispose all pages that were running within the
429    /// incognito context.
430    pub async fn quit_incognito_context(&mut self) -> Result<&mut Self> {
431        if let Some(id) = self.browser_context.take() {
432            let _ = self.quit_incognito_context_base(id).await;
433        }
434        Ok(self)
435    }
436
437    /// Whether incognito mode was configured from the start
438    fn is_incognito_configured(&self) -> bool {
439        self.config
440            .as_ref()
441            .map(|c| c.incognito)
442            .unwrap_or_default()
443    }
444
    /// Returns the debug websocket URL this browser is attached to.
    pub fn websocket_address(&self) -> &String {
        &self.debug_ws_url
    }
449
    /// Whether the browser is in incognito mode: either incognito was
    /// requested in the stored config, or the current `BrowserContext`
    /// reports being incognito.
    pub fn is_incognito(&self) -> bool {
        self.is_incognito_configured() || self.browser_context.is_incognito()
    }
454
    /// The configuration stored for this browser, if any.
    ///
    /// [`Browser::launch`] stores the config it was given;
    /// [`Browser::connect_with_config`] stores a `BrowserConfig` assembled
    /// from the supplied `HandlerConfig`.
    pub fn config(&self) -> Option<&BrowserConfig> {
        self.config.as_ref()
    }
459
460    /// Create a new browser page
461    pub async fn new_page(&self, params: impl Into<CreateTargetParams>) -> Result<Page> {
462        let (tx, rx) = oneshot_channel();
463        let mut params = params.into();
464
465        if let Some(id) = self.browser_context.id() {
466            if params.browser_context_id.is_none() {
467                params.browser_context_id = Some(id.clone());
468            }
469        }
470
471        let _ = self
472            .sender
473            .clone()
474            .send(HandlerMessage::CreatePage(params, tx))
475            .await;
476
477        rx.await?
478    }
479
480    /// Version information about the browser
481    pub async fn version(&self) -> Result<GetVersionReturns> {
482        Ok(self.execute(GetVersionParams::default()).await?.result)
483    }
484
485    /// Returns the user agent of the browser
486    pub async fn user_agent(&self) -> Result<String> {
487        Ok(self.version().await?.user_agent)
488    }
489
490    /// Call a browser method.
491    pub async fn execute<T: Command>(&self, cmd: T) -> Result<CommandResponse<T::Response>> {
492        let (tx, rx) = oneshot_channel();
493        let method = cmd.identifier();
494        let msg = CommandMessage::new(cmd, tx)?;
495
496        self.sender
497            .clone()
498            .send(HandlerMessage::Command(msg))
499            .await?;
500        let resp = rx.await??;
501        to_command_response::<T>(resp, method)
502    }
503
504    /// Set permission settings for given embedding and embedded origins.
505    /// [PermissionDescriptor](https://chromedevtools.github.io/devtools-protocol/tot/Browser/#type-PermissionDescriptor)
506    /// [PermissionSetting](https://chromedevtools.github.io/devtools-protocol/tot/Browser/#type-PermissionSetting)
507    pub async fn set_permission(
508        &self,
509        permission: PermissionDescriptor,
510        setting: PermissionSetting,
511        origin: Option<impl Into<String>>,
512        embedded_origin: Option<impl Into<String>>,
513        browser_context_id: Option<BrowserContextId>,
514    ) -> Result<&Self> {
515        self.execute(SetPermissionParams {
516            permission,
517            setting,
518            origin: origin.map(Into::into),
519            embedded_origin: embedded_origin.map(Into::into),
520            browser_context_id: browser_context_id.or_else(|| self.browser_context.id.clone()),
521        })
522        .await?;
523        Ok(self)
524    }
525
526    /// Convenience: set a permission for a single origin using the current browser context.
527    pub async fn set_permission_for_origin(
528        &self,
529        origin: impl Into<String>,
530        embedded_origin: Option<impl Into<String>>,
531        permission: PermissionDescriptor,
532        setting: PermissionSetting,
533    ) -> Result<&Self> {
534        self.set_permission(permission, setting, Some(origin), embedded_origin, None)
535            .await
536    }
537
538    /// "Reset" a permission override by setting it back to Prompt.
539    pub async fn reset_permission_for_origin(
540        &self,
541        origin: impl Into<String>,
542        embedded_origin: Option<impl Into<String>>,
543        permission: PermissionDescriptor,
544    ) -> Result<&Self> {
545        self.set_permission_for_origin(
546            origin,
547            embedded_origin,
548            permission,
549            PermissionSetting::Prompt,
550        )
551        .await
552    }
553
554    /// "Grant" all permissions.
555    pub async fn grant_all_permission_for_origin(
556        &self,
557        origin: impl Into<String>,
558        embedded_origin: Option<impl Into<String>>,
559        permission: PermissionDescriptor,
560    ) -> Result<&Self> {
561        self.set_permission_for_origin(
562            origin,
563            embedded_origin,
564            permission,
565            PermissionSetting::Granted,
566        )
567        .await
568    }
569
570    /// "Deny" all permissions.
571    pub async fn deny_all_permission_for_origin(
572        &self,
573        origin: impl Into<String>,
574        embedded_origin: Option<impl Into<String>>,
575        permission: PermissionDescriptor,
576    ) -> Result<&Self> {
577        self.set_permission_for_origin(
578            origin,
579            embedded_origin,
580            permission,
581            PermissionSetting::Denied,
582        )
583        .await
584    }
585
586    /// Return all of the pages of the browser
587    pub async fn pages(&self) -> Result<Vec<Page>> {
588        let (tx, rx) = oneshot_channel();
589        self.sender
590            .clone()
591            .send(HandlerMessage::GetPages(tx))
592            .await?;
593        Ok(rx.await?)
594    }
595
596    /// Return page of given target_id
597    pub async fn get_page(&self, target_id: TargetId) -> Result<Page> {
598        let (tx, rx) = oneshot_channel();
599        self.sender
600            .clone()
601            .send(HandlerMessage::GetPage(target_id, tx))
602            .await?;
603        rx.await?.ok_or(CdpError::NotFound)
604    }
605
606    /// Set listener for browser event
607    pub async fn event_listener<T: IntoEventKind>(&self) -> Result<EventStream<T>> {
608        let (tx, rx) = unbounded();
609        self.sender
610            .clone()
611            .send(HandlerMessage::AddEventListener(
612                EventListenerRequest::new::<T>(tx),
613            ))
614            .await?;
615
616        Ok(EventStream::new(rx))
617    }
618
619    /// Creates a new empty browser context.
620    pub async fn create_browser_context(
621        &mut self,
622        params: CreateBrowserContextParams,
623    ) -> Result<BrowserContextId> {
624        let response = self.execute(params).await?;
625
626        Ok(response.result.browser_context_id)
627    }
628
629    /// Returns all browser contexts created with Target.createBrowserContext method.
630    pub async fn get_browser_contexts(
631        &mut self,
632        params: GetBrowserContextsParams,
633    ) -> Result<GetBrowserContextsReturns> {
634        let response = self.execute(params).await?;
635        Ok(response.result)
636    }
637
638    /// Send a new empty browser context.
639    pub async fn send_new_context(
640        &mut self,
641        browser_context_id: BrowserContextId,
642    ) -> Result<&Self> {
643        self.browser_context = BrowserContext::from(browser_context_id);
644        self.sender
645            .clone()
646            .send(HandlerMessage::InsertContext(self.browser_context.clone()))
647            .await?;
648        Ok(self)
649    }
650
651    /// Deletes a browser context.
652    pub async fn dispose_browser_context(
653        &self,
654        browser_context_id: impl Into<BrowserContextId>,
655    ) -> Result<&Self> {
656        self.execute(DisposeBrowserContextParams::new(browser_context_id))
657            .await?;
658
659        Ok(self)
660    }
661
662    /// Clears cookies.
663    pub async fn clear_cookies(&self) -> Result<&Self> {
664        self.execute(ClearCookiesParams::default()).await?;
665        Ok(self)
666    }
667
668    /// Returns all browser cookies.
669    pub async fn get_cookies(&self) -> Result<Vec<Cookie>> {
670        let mut cmd = GetCookiesParams::default();
671
672        cmd.browser_context_id = self.browser_context.id.clone();
673
674        Ok(self.execute(cmd).await?.result.cookies)
675    }
676
677    /// Sets given cookies.
678    pub async fn set_cookies(&self, mut cookies: Vec<CookieParam>) -> Result<&Self> {
679        for cookie in &mut cookies {
680            if let Some(url) = cookie.url.as_ref() {
681                crate::page::validate_cookie_url(url)?;
682            }
683        }
684
685        let mut cookies_param = SetCookiesParams::new(cookies);
686
687        cookies_param.browser_context_id = self.browser_context.id.clone();
688
689        self.execute(cookies_param).await?;
690        Ok(self)
691    }
692}
693
694impl Drop for Browser {
695    fn drop(&mut self) {
696        if let Some(child) = self.child.as_mut() {
697            if let Ok(Some(_)) = child.try_wait() {
698                // Already exited, do nothing. Usually occurs after using the method close or kill.
699            } else {
700                // We set the `kill_on_drop` property for the child process, so no need to explicitely
701                // kill it here. It can't really be done anyway since the method is async.
702                //
703                // On Unix, the process will be reaped in the background by the runtime automatically
704                // so it won't leave any resources locked. It is, however, a better practice for the user to
705                // do it himself since the runtime doesn't provide garantees as to when the reap occurs, so we
706                // warn him here.
707                tracing::warn!("Browser was not closed manually, it will be killed automatically in the background");
708            }
709        }
710    }
711}
712
/// Resolve devtools WebSocket URL from the provided browser process
///
/// The browser's stderr is read line by line until a line containing
/// `listening on ` is followed by a `ws…` address that contains
/// `devtools/browser`; that address, trimmed, is returned.
///
/// If an error occurs, it returns the browser's stderr output collected so far.
///
/// The URL resolution fails if:
/// - [`CdpError::LaunchTimeout`]: `timeout_fut` completes, this corresponds to a timeout
/// - [`CdpError::LaunchExit`]: the browser process exits (or is killed)
/// - [`CdpError::LaunchIo`]: an input/output error occurs when awaiting the process exit or
///   reading the browser's stderr: end of stream, invalid UTF-8, other
///
/// # Panics
///
/// Panics if `child_process.stderr` is `None`; the handle is taken out of the
/// child by this function.
async fn ws_url_from_output(
    child_process: &mut Child,
    timeout_fut: impl Future<Output = ()> + Unpin,
) -> Result<String> {
    use futures::{AsyncBufReadExt, FutureExt};
    let mut timeout_fut = timeout_fut.fuse();
    let stderr = child_process.stderr.take().expect("no stderror");
    // Accumulates everything read from stderr so that it can be attached to errors.
    let mut stderr_bytes = Vec::<u8>::new();
    let mut exit_status_fut = Box::pin(child_process.wait()).fuse();
    let mut buf = futures::io::BufReader::new(stderr);
    loop {
        // Race the timeout, the process exit and the next stderr line.
        select! {
            _ = timeout_fut => return Err(CdpError::LaunchTimeout(BrowserStderr::new(stderr_bytes))),
            exit_status = exit_status_fut => {
                // The process ended (or waiting on it failed) before a URL was found.
                return Err(match exit_status {
                    Err(e) => CdpError::LaunchIo(e, BrowserStderr::new(stderr_bytes)),
                    Ok(exit_status) => CdpError::LaunchExit(exit_status, BrowserStderr::new(stderr_bytes)),
                })
            },
            read_res = buf.read_until(b'\n', &mut stderr_bytes).fuse() => {
                match read_res {
                    Err(e) => return Err(CdpError::LaunchIo(e, BrowserStderr::new(stderr_bytes))),
                    Ok(byte_count) => {
                        // Zero bytes read means stderr was closed without announcing a URL.
                        if byte_count == 0 {
                            let e = io::Error::new(io::ErrorKind::UnexpectedEof, "unexpected end of stream");
                            return Err(CdpError::LaunchIo(e, BrowserStderr::new(stderr_bytes)));
                        }
                        // Only inspect the bytes appended by this read, i.e. the latest line.
                        let start_offset = stderr_bytes.len() - byte_count;
                        let new_bytes = &stderr_bytes[start_offset..];
                        match std::str::from_utf8(new_bytes) {
                            Err(_) => {
                                let e = io::Error::new(io::ErrorKind::InvalidData, "stream did not contain valid UTF-8");
                                return Err(CdpError::LaunchIo(e, BrowserStderr::new(stderr_bytes)));
                            }
                            Ok(line) => {
                                // Use the text after the last "listening on " marker of the line.
                                if let Some((_, ws)) = line.rsplit_once("listening on ") {
                                    if ws.starts_with("ws") && ws.contains("devtools/browser") {
                                        return Ok(ws.trim().to_string());
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }
}
770
/// The headless mode to run the browser in.
///
/// The default is [`HeadlessMode::True`].
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub enum HeadlessMode {
    /// The "headful" mode.
    False,
    /// The old headless mode.
    #[default]
    True,
    /// The new headless mode. See also: https://developer.chrome.com/docs/chromium/new-headless
    New,
}
781
/// Fully resolved settings used to launch and drive a browser instance.
///
/// Normally produced by `BrowserConfigBuilder::build`. The "default" values
/// mentioned on individual fields are the ones the builder starts with; the
/// derived `Default` of this struct instead yields each field type's own
/// default (`false`, an empty path, a zero `Duration`, ...).
#[derive(Debug, Clone, Default)]
pub struct BrowserConfig {
    /// Determines whether to run the headless version of the browser.
    /// Defaults to the old headless mode (`HeadlessMode::True`).
    headless: HeadlessMode,
    /// Determines whether to run the browser with a sandbox.
    sandbox: bool,
    /// Launch the browser with a specific window width and height.
    window_size: Option<(u32, u32)>,
    /// Launch the browser with a specific debugging port.
    port: u16,
    /// Path for Chrome or Chromium.
    ///
    /// If unspecified on the builder, the crate will try to automatically
    /// detect a suitable binary.
    executable: std::path::PathBuf,

    /// A list of Chrome extensions to load.
    ///
    /// An extension should be a path to a folder containing the extension code.
    /// CRX files cannot be used directly and must be first extracted.
    ///
    /// Note that Chrome does not support loading extensions in headless-mode.
    /// See https://bugs.chromium.org/p/chromium/issues/detail?id=706008#c5
    extensions: Vec<String>,

    /// Environment variables to set for the Chromium process.
    /// Passed through to the spawned command's `envs` in `launch`.
    pub process_envs: Option<HashMap<String, String>>,

    /// Directory for the browser's user data. When `None`, `launch` falls
    /// back to a `chromiumoxide-runner` directory inside the system temp dir.
    pub user_data_dir: Option<PathBuf>,

    /// Whether to launch the `Browser` in incognito mode.
    incognito: bool,

    /// Timeout duration for `Browser::launch`.
    launch_timeout: Duration,

    /// Ignore https errors, default is true.
    ignore_https_errors: bool,
    /// Optional viewport configuration; see `BrowserConfigBuilder::viewport`
    /// for the meaning of `None`.
    pub viewport: Option<Viewport>,
    /// The duration after which a request with no response should time out.
    request_timeout: Duration,

    /// Additional command line arguments to pass to the browser instance.
    args: Vec<String>,

    /// Whether to disable DEFAULT_ARGS or not, default is false.
    disable_default_args: bool,

    /// Whether to enable request interception.
    pub request_intercept: bool,

    /// Whether to enable cache.
    pub cache_enabled: bool,
    /// Whether to enable or disable Service Workers.
    /// Disabling may reduce background network activity and caching effects.
    pub service_worker_enabled: bool,
    /// Whether to ignore image/visual requests during interception.
    /// Can reduce bandwidth and speed up crawling when visuals are unnecessary.
    pub ignore_visuals: bool,
    /// Whether to ignore stylesheet (CSS) requests during interception.
    /// Useful for content-only crawls.
    pub ignore_stylesheets: bool,
    /// Whether to ignore JavaScript requests during interception.
    /// This still allows critical framework bundles to pass when applicable.
    pub ignore_javascript: bool,
    /// Whether to ignore analytics/telemetry requests during interception.
    pub ignore_analytics: bool,
    /// Whether to ignore ad network requests during interception.
    pub ignore_ads: bool,
    /// Extra headers.
    pub extra_headers: Option<std::collections::HashMap<String, String>>,
    /// Whether to restrict fetching to HTML only.
    pub only_html: bool,
    /// The network interception manager.
    pub intercept_manager: NetworkInterceptManager,
    /// The max bytes to receive.
    pub max_bytes_allowed: Option<u64>,
    /// Whitelist patterns to allow through the network.
    pub whitelist_patterns: Option<Vec<String>>,
}
865
/// Builder for [`BrowserConfig`].
///
/// Obtain one through `BrowserConfig::builder()` (or `Default`), adjust it
/// with the chained setter methods, and finish with `build()`, which resolves
/// the browser executable and produces the final configuration.
#[derive(Debug, Clone)]
pub struct BrowserConfigBuilder {
    /// Headless mode configuration for the browser.
    headless: HeadlessMode,
    /// Whether to run the browser with a sandbox.
    sandbox: bool,
    /// Optional initial browser window size `(width, height)`.
    window_size: Option<(u32, u32)>,
    /// DevTools debugging port to bind to.
    port: u16,
    /// Optional explicit path to the Chrome/Chromium executable.
    /// If `None`, `build()` runs auto-detection using `executation_detection`.
    executable: Option<PathBuf>,
    /// Controls auto-detection behavior for finding a Chrome/Chromium binary.
    executation_detection: DetectionOptions,
    /// List of unpacked extensions (directories) to load at startup.
    extensions: Vec<String>,
    /// Environment variables to set on the spawned Chromium process.
    process_envs: Option<HashMap<String, String>>,
    /// User data directory to persist browser state, or `None` to let
    /// `launch` pick its temp-dir fallback.
    user_data_dir: Option<PathBuf>,
    /// Whether to start the browser in incognito (off-the-record) mode.
    incognito: bool,
    /// Maximum time to wait for the browser to launch and become ready.
    launch_timeout: Duration,
    /// Whether to ignore HTTPS/TLS errors during navigation and requests.
    ignore_https_errors: bool,
    /// Default page viewport configuration applied on startup.
    viewport: Option<Viewport>,
    /// Timeout for individual network requests without response progress.
    request_timeout: Duration,
    /// Additional command-line flags passed directly to the browser process.
    args: Vec<String>,
    /// Disable the default argument set and use only the provided `args`.
    disable_default_args: bool,
    /// Enable Network.requestInterception for request filtering/handling.
    request_intercept: bool,
    /// Enable the browser cache for navigations and subresources.
    cache_enabled: bool,
    /// Enable/disable Service Workers.
    service_worker_enabled: bool,
    /// Drop image/visual requests when interception is enabled.
    ignore_visuals: bool,
    /// Drop ad network requests when interception is enabled.
    ignore_ads: bool,
    /// Drop JavaScript requests when interception is enabled.
    ignore_javascript: bool,
    /// Drop stylesheet (CSS) requests when interception is enabled.
    ignore_stylesheets: bool,
    /// Drop analytics/telemetry requests when interception is enabled.
    ignore_analytics: bool,
    /// If `true`, limit fetching to HTML documents.
    only_html: bool,
    /// Extra HTTP headers to include with every request.
    extra_headers: Option<std::collections::HashMap<String, String>>,
    /// Network interception manager used to configure filtering behavior.
    intercept_manager: NetworkInterceptManager,
    /// Optional upper bound on bytes that may be received (per session/run).
    max_bytes_allowed: Option<u64>,
    /// Whitelist patterns to allow through the network.
    whitelist_patterns: Option<Vec<String>>,
}
928
929impl BrowserConfig {
930    /// Browser builder default config.
931    pub fn builder() -> BrowserConfigBuilder {
932        BrowserConfigBuilder::default()
933    }
934
935    /// Launch with the executable path.
936    pub fn with_executable(path: impl AsRef<Path>) -> Self {
937        Self::builder()
938            .chrome_executable(path)
939            .build()
940            .expect("path to executable exist")
941    }
942}
943
944impl Default for BrowserConfigBuilder {
945    fn default() -> Self {
946        Self {
947            headless: HeadlessMode::True,
948            sandbox: true,
949            window_size: None,
950            port: 0,
951            executable: None,
952            executation_detection: DetectionOptions::default(),
953            extensions: Vec::new(),
954            process_envs: None,
955            user_data_dir: None,
956            incognito: false,
957            launch_timeout: Duration::from_millis(LAUNCH_TIMEOUT),
958            ignore_https_errors: true,
959            viewport: Some(Default::default()),
960            request_timeout: Duration::from_millis(REQUEST_TIMEOUT),
961            args: Vec::new(),
962            disable_default_args: false,
963            request_intercept: false,
964            cache_enabled: true,
965            ignore_visuals: false,
966            ignore_ads: false,
967            ignore_javascript: false,
968            ignore_analytics: false,
969            ignore_stylesheets: false,
970            only_html: false,
971            extra_headers: Default::default(),
972            service_worker_enabled: true,
973            intercept_manager: NetworkInterceptManager::Unknown,
974            max_bytes_allowed: None,
975            whitelist_patterns: None,
976        }
977    }
978}
979
980impl BrowserConfigBuilder {
981    /// Configure window size.
982    pub fn window_size(mut self, width: u32, height: u32) -> Self {
983        self.window_size = Some((width, height));
984        self
985    }
986    /// Configure sandboxing.
987    pub fn no_sandbox(mut self) -> Self {
988        self.sandbox = false;
989        self
990    }
991    /// Configure the launch to start non headless.
992    pub fn with_head(mut self) -> Self {
993        self.headless = HeadlessMode::False;
994        self
995    }
996    /// Configure the launch with the new headless mode.
997    pub fn new_headless_mode(mut self) -> Self {
998        self.headless = HeadlessMode::New;
999        self
1000    }
1001    /// Configure the launch with headless.
1002    pub fn headless_mode(mut self, mode: HeadlessMode) -> Self {
1003        self.headless = mode;
1004        self
1005    }
1006    /// Configure the launch in incognito.
1007    pub fn incognito(mut self) -> Self {
1008        self.incognito = true;
1009        self
1010    }
1011
1012    pub fn respect_https_errors(mut self) -> Self {
1013        self.ignore_https_errors = false;
1014        self
1015    }
1016
1017    pub fn port(mut self, port: u16) -> Self {
1018        self.port = port;
1019        self
1020    }
1021
1022    pub fn with_max_bytes_allowed(mut self, max_bytes_allowed: Option<u64>) -> Self {
1023        self.max_bytes_allowed = max_bytes_allowed;
1024        self
1025    }
1026
1027    pub fn launch_timeout(mut self, timeout: Duration) -> Self {
1028        self.launch_timeout = timeout;
1029        self
1030    }
1031
1032    pub fn request_timeout(mut self, timeout: Duration) -> Self {
1033        self.request_timeout = timeout;
1034        self
1035    }
1036
1037    /// Configures the viewport of the browser, which defaults to `800x600`.
1038    /// `None` disables viewport emulation (i.e., it uses the browsers default
1039    /// configuration, which fills the available space. This is similar to what
1040    /// Playwright does when you provide `null` as the value of its `viewport`
1041    /// option).
1042    pub fn viewport(mut self, viewport: impl Into<Option<Viewport>>) -> Self {
1043        self.viewport = viewport.into();
1044        self
1045    }
1046
1047    pub fn user_data_dir(mut self, data_dir: impl AsRef<Path>) -> Self {
1048        self.user_data_dir = Some(data_dir.as_ref().to_path_buf());
1049        self
1050    }
1051
1052    pub fn chrome_executable(mut self, path: impl AsRef<Path>) -> Self {
1053        self.executable = Some(path.as_ref().to_path_buf());
1054        self
1055    }
1056
1057    pub fn chrome_detection(mut self, options: DetectionOptions) -> Self {
1058        self.executation_detection = options;
1059        self
1060    }
1061
1062    pub fn extension(mut self, extension: impl Into<String>) -> Self {
1063        self.extensions.push(extension.into());
1064        self
1065    }
1066
1067    pub fn extensions<I, S>(mut self, extensions: I) -> Self
1068    where
1069        I: IntoIterator<Item = S>,
1070        S: Into<String>,
1071    {
1072        for ext in extensions {
1073            self.extensions.push(ext.into());
1074        }
1075        self
1076    }
1077
1078    pub fn env(mut self, key: impl Into<String>, val: impl Into<String>) -> Self {
1079        self.process_envs
1080            .get_or_insert(HashMap::new())
1081            .insert(key.into(), val.into());
1082        self
1083    }
1084
1085    pub fn envs<I, K, V>(mut self, envs: I) -> Self
1086    where
1087        I: IntoIterator<Item = (K, V)>,
1088        K: Into<String>,
1089        V: Into<String>,
1090    {
1091        self.process_envs
1092            .get_or_insert(HashMap::new())
1093            .extend(envs.into_iter().map(|(k, v)| (k.into(), v.into())));
1094        self
1095    }
1096
1097    pub fn arg(mut self, arg: impl Into<String>) -> Self {
1098        self.args.push(arg.into());
1099        self
1100    }
1101
1102    pub fn args<I, S>(mut self, args: I) -> Self
1103    where
1104        I: IntoIterator<Item = S>,
1105        S: Into<String>,
1106    {
1107        for arg in args {
1108            self.args.push(arg.into());
1109        }
1110        self
1111    }
1112
1113    pub fn disable_default_args(mut self) -> Self {
1114        self.disable_default_args = true;
1115        self
1116    }
1117
1118    pub fn enable_request_intercept(mut self) -> Self {
1119        self.request_intercept = true;
1120        self
1121    }
1122
1123    pub fn disable_request_intercept(mut self) -> Self {
1124        self.request_intercept = false;
1125        self
1126    }
1127
1128    pub fn enable_cache(mut self) -> Self {
1129        self.cache_enabled = true;
1130        self
1131    }
1132
1133    pub fn disable_cache(mut self) -> Self {
1134        self.cache_enabled = false;
1135        self
1136    }
1137
1138    /// Set service worker enabled.
1139    pub fn set_service_worker_enabled(mut self, bypass: bool) -> Self {
1140        self.service_worker_enabled = bypass;
1141        self
1142    }
1143
1144    /// Set extra request headers.
1145    pub fn set_extra_headers(
1146        mut self,
1147        headers: Option<std::collections::HashMap<String, String>>,
1148    ) -> Self {
1149        self.extra_headers = headers;
1150        self
1151    }
1152
1153    /// Set whitelist patterns to allow through network interception ignoring.
1154    pub fn set_whitelist_patterns(mut self, whitelist_patterns: Option<Vec<String>>) -> Self {
1155        self.whitelist_patterns = whitelist_patterns;
1156        self
1157    }
1158
1159    /// Build the browser.
1160    pub fn build(self) -> std::result::Result<BrowserConfig, String> {
1161        let executable = if let Some(e) = self.executable {
1162            e
1163        } else {
1164            detection::default_executable(self.executation_detection)?
1165        };
1166
1167        Ok(BrowserConfig {
1168            headless: self.headless,
1169            sandbox: self.sandbox,
1170            window_size: self.window_size,
1171            port: self.port,
1172            executable,
1173            extensions: self.extensions,
1174            process_envs: self.process_envs,
1175            user_data_dir: self.user_data_dir,
1176            incognito: self.incognito,
1177            launch_timeout: self.launch_timeout,
1178            ignore_https_errors: self.ignore_https_errors,
1179            viewport: self.viewport,
1180            request_timeout: self.request_timeout,
1181            args: self.args,
1182            disable_default_args: self.disable_default_args,
1183            request_intercept: self.request_intercept,
1184            cache_enabled: self.cache_enabled,
1185            ignore_visuals: self.ignore_visuals,
1186            ignore_ads: self.ignore_ads,
1187            ignore_javascript: self.ignore_javascript,
1188            ignore_analytics: self.ignore_analytics,
1189            ignore_stylesheets: self.ignore_stylesheets,
1190            extra_headers: self.extra_headers,
1191            only_html: self.only_html,
1192            intercept_manager: self.intercept_manager,
1193            service_worker_enabled: self.service_worker_enabled,
1194            max_bytes_allowed: self.max_bytes_allowed,
1195            whitelist_patterns: self.whitelist_patterns,
1196        })
1197    }
1198}
1199
1200impl BrowserConfig {
1201    pub fn launch(&self) -> io::Result<Child> {
1202        let mut cmd = async_process::Command::new(&self.executable);
1203
1204        if self.disable_default_args {
1205            cmd.args(&self.args);
1206        } else {
1207            cmd.args(DEFAULT_ARGS).args(&self.args);
1208        }
1209
1210        if !self
1211            .args
1212            .iter()
1213            .any(|arg| arg.contains("--remote-debugging-port="))
1214        {
1215            cmd.arg(format!("--remote-debugging-port={}", self.port));
1216        }
1217
1218        cmd.args(
1219            self.extensions
1220                .iter()
1221                .map(|e| format!("--load-extension={e}")),
1222        );
1223
1224        if let Some(ref user_data) = self.user_data_dir {
1225            cmd.arg(format!("--user-data-dir={}", user_data.display()));
1226        } else {
1227            // If the user did not specify a data directory, this would default to the systems default
1228            // data directory. In most cases, we would rather have a fresh instance of Chromium. Specify
1229            // a temp dir just for chromiumoxide instead.
1230            cmd.arg(format!(
1231                "--user-data-dir={}",
1232                std::env::temp_dir().join("chromiumoxide-runner").display()
1233            ));
1234        }
1235
1236        if let Some((width, height)) = self.window_size {
1237            cmd.arg(format!("--window-size={width},{height}"));
1238        }
1239
1240        if !self.sandbox {
1241            cmd.args(["--no-sandbox", "--disable-setuid-sandbox"]);
1242        }
1243
1244        match self.headless {
1245            HeadlessMode::False => (),
1246            HeadlessMode::True => {
1247                cmd.args(["--headless", "--hide-scrollbars", "--mute-audio"]);
1248            }
1249            HeadlessMode::New => {
1250                cmd.args(["--headless=new", "--hide-scrollbars", "--mute-audio"]);
1251            }
1252        }
1253
1254        if self.incognito {
1255            cmd.arg("--incognito");
1256        }
1257
1258        if let Some(ref envs) = self.process_envs {
1259            cmd.envs(envs);
1260        }
1261        cmd.stderr(Stdio::piped()).spawn()
1262    }
1263}
1264
1265/// Returns the path to Chrome's executable.
1266///
1267/// If the `CHROME` environment variable is set, `default_executable` will
1268/// use it as the default path. Otherwise, the filenames `google-chrome-stable`
1269/// `chromium`, `chromium-browser`, `chrome` and `chrome-browser` are
1270/// searched for in standard places. If that fails,
1271/// `/Applications/Google Chrome.app/...` (on MacOS) or the registry (on
1272/// Windows) is consulted. If all of the above fail, an error is returned.
1273#[deprecated(note = "Use detection::default_executable instead")]
1274pub fn default_executable() -> Result<std::path::PathBuf, String> {
1275    let options = DetectionOptions {
1276        msedge: false,
1277        unstable: false,
1278    };
1279    detection::default_executable(options)
1280}
1281
/// These are passed to the Chrome binary by default.
/// Via https://github.com/puppeteer/puppeteer/blob/4846b8723cf20d3551c0d755df394cc5e0c82a94/src/node/Launcher.ts#L157
///
/// `BrowserConfig::launch` places these before the user supplied arguments
/// and skips them entirely when `disable_default_args` is set. The array
/// length in the type must be kept in sync with the number of entries.
static DEFAULT_ARGS: [&str; 26] = [
    "--disable-background-networking",
    "--enable-features=NetworkService,NetworkServiceInProcess",
    "--disable-background-timer-throttling",
    "--disable-backgrounding-occluded-windows",
    "--disable-breakpad",
    "--disable-client-side-phishing-detection",
    "--disable-component-extensions-with-background-pages",
    "--disable-default-apps",
    "--disable-dev-shm-usage",
    // NOTE(review): `launch` may also add `--load-extension=...`; confirm how
    // Chrome resolves that combination with `--disable-extensions`.
    "--disable-extensions",
    "--disable-features=TranslateUI",
    "--disable-hang-monitor",
    "--disable-ipc-flooding-protection",
    "--disable-popup-blocking",
    "--disable-prompt-on-repost",
    "--disable-renderer-backgrounding",
    "--disable-sync",
    "--force-color-profile=srgb",
    "--metrics-recording-only",
    "--no-first-run",
    "--enable-automation",
    "--password-store=basic",
    "--use-mock-keychain",
    "--enable-blink-features=IdleDetection",
    // NOTE(review): Chromium locale codes are usually hyphenated (`en-US`);
    // confirm that the underscore form is intended.
    "--lang=en_US",
    "--disable-blink-features=AutomationControlled",
];