Skip to main content

chromiumoxide/
browser.rs

1use hashbrown::HashMap;
2use reqwest::header::{HeaderMap, HeaderValue, CONTENT_TYPE};
3use std::future::Future;
4use std::time::Duration;
5use std::{
6    io,
7    path::{Path, PathBuf},
8};
9
10use futures::channel::mpsc::{channel, unbounded, Sender};
11use futures::channel::oneshot::channel as oneshot_channel;
12use futures::select;
13use futures::SinkExt;
14
15use crate::async_process::{self, Child, ExitStatus, Stdio};
16use crate::cmd::{to_command_response, CommandMessage};
17use crate::conn::Connection;
18use crate::detection::{self, DetectionOptions};
19use crate::error::{BrowserStderr, CdpError, Result};
20use crate::handler::browser::BrowserContext;
21use crate::handler::viewport::Viewport;
22use crate::handler::{Handler, HandlerConfig, HandlerMessage, REQUEST_TIMEOUT};
23use crate::listeners::{EventListenerRequest, EventStream};
24use crate::page::Page;
25use crate::utils;
26use chromiumoxide_cdp::cdp::browser_protocol::browser::{
27    BrowserContextId, CloseReturns, GetVersionParams, GetVersionReturns,
28};
29use chromiumoxide_cdp::cdp::browser_protocol::browser::{
30    PermissionDescriptor, PermissionSetting, SetPermissionParams,
31};
32use chromiumoxide_cdp::cdp::browser_protocol::network::{Cookie, CookieParam};
33use chromiumoxide_cdp::cdp::browser_protocol::storage::{
34    ClearCookiesParams, GetCookiesParams, SetCookiesParams,
35};
36use chromiumoxide_cdp::cdp::browser_protocol::target::{
37    CreateBrowserContextParams, CreateTargetParams, DisposeBrowserContextParams,
38    GetBrowserContextsParams, GetBrowserContextsReturns, TargetId, TargetInfo,
39};
40
41use chromiumoxide_cdp::cdp::{CdpEventMessage, IntoEventKind};
42use chromiumoxide_types::*;
43use spider_network_blocker::intercept_manager::NetworkInterceptManager;
44
/// Default `Browser::launch` timeout, in milliseconds.
pub const LAUNCH_TIMEOUT: u64 = 20_000;
47
48lazy_static::lazy_static! {
49    /// The request client to get the web socket url.
50    static ref REQUEST_CLIENT: reqwest::Client = reqwest::Client::builder()
51        .timeout(Duration::from_secs(60))
52        .default_headers({
53            let mut m = HeaderMap::new();
54
55            m.insert(CONTENT_TYPE, HeaderValue::from_static("application/json"));
56
57            m
58        })
59        .tcp_keepalive(Some(Duration::from_secs(5)))
60        .pool_idle_timeout(Some(Duration::from_secs(60)))
61        .pool_max_idle_per_host(10)
62        .build()
63        .expect("client to build");
64}
65
/// A [`Browser`] is created when chromiumoxide connects to a Chromium instance.
///
/// It is returned together with a [`Handler`] by [`Browser::launch`] (which spawns a new
/// chromium process) and by [`Browser::connect`] / [`Browser::connect_with_config`] (which
/// attach to an already running instance).
#[derive(Debug)]
pub struct Browser {
    /// The `Sender` to send messages to the connection handler that drives the
    /// websocket
    pub(crate) sender: Sender<HandlerMessage>,
    /// The configuration this browser was created with: the launch configuration for
    /// [`Browser::launch`], or a [`BrowserConfig`] mirroring the given `HandlerConfig` for
    /// [`Browser::connect_with_config`]
    config: Option<BrowserConfig>,
    /// The spawned chromium instance; `None` when attached to an already running browser
    child: Option<Child>,
    /// The debug web socket url of the chromium instance
    debug_ws_url: String,
    /// The browser context that new pages and cookie operations of this browser use
    pub browser_context: BrowserContext,
}
81
/// Browser connection information, as deserialized from the response of the
/// `json/version` endpoint in [`Browser::connect_with_config`].
#[derive(serde::Deserialize, Debug, Default)]
pub struct BrowserConnection {
    #[serde(rename = "Browser")]
    /// The browser name (the `Browser` key of the response)
    pub browser: String,
    #[serde(rename = "Protocol-Version")]
    /// The protocol version (the `Protocol-Version` key of the response)
    pub protocol_version: String,
    #[serde(rename = "User-Agent")]
    /// User Agent used by default.
    pub user_agent: String,
    #[serde(rename = "V8-Version")]
    /// The v8 engine version
    pub v8_version: String,
    #[serde(rename = "WebKit-Version")]
    /// Webkit version
    pub webkit_version: String,
    #[serde(rename = "webSocketDebuggerUrl")]
    /// Remote debugging address: the web socket url used to attach to the browser
    pub web_socket_debugger_url: String,
}
104
105impl Browser {
106    /// Connect to an already running chromium instance via the given URL.
107    ///
108    /// If the URL is a http(s) URL, it will first attempt to retrieve the Websocket URL from the `json/version` endpoint.
109    pub async fn connect(url: impl Into<String>) -> Result<(Self, Handler)> {
110        Self::connect_with_config(url, HandlerConfig::default()).await
111    }
112
113    // Connect to an already running chromium instance with a given `HandlerConfig`.
114    ///
115    /// If the URL is a http URL, it will first attempt to retrieve the Websocket URL from the `json/version` endpoint.
116    pub async fn connect_with_config(
117        url: impl Into<String>,
118        config: HandlerConfig,
119    ) -> Result<(Self, Handler)> {
120        let mut debug_ws_url = url.into();
121
122        if debug_ws_url.starts_with("http") {
123            match REQUEST_CLIENT
124                .get(
125                    if debug_ws_url.ends_with("/json/version")
126                        || debug_ws_url.ends_with("/json/version/")
127                    {
128                        debug_ws_url.to_owned()
129                    } else {
130                        format!(
131                            "{}{}json/version",
132                            &debug_ws_url,
133                            if debug_ws_url.ends_with('/') { "" } else { "/" }
134                        )
135                    },
136                )
137                .send()
138                .await
139            {
140                Ok(req) => {
141                    if let Ok(b) = req.bytes().await {
142                        if let Ok(connection) =
143                            crate::serde_json::from_slice::<Box<BrowserConnection>>(&b)
144                        {
145                            if !connection.web_socket_debugger_url.is_empty() {
146                                debug_ws_url = connection.web_socket_debugger_url;
147                            }
148                        }
149                    }
150                }
151                Err(_) => return Err(CdpError::NoResponse),
152            }
153        }
154
155        let conn = Connection::<CdpEventMessage>::connect(&debug_ws_url).await?;
156
157        let (tx, rx) = channel(config.channel_capacity);
158
159        let handler_config = BrowserConfig {
160            ignore_https_errors: config.ignore_https_errors,
161            viewport: config.viewport.clone(),
162            request_timeout: config.request_timeout,
163            request_intercept: config.request_intercept,
164            cache_enabled: config.cache_enabled,
165            ignore_visuals: config.ignore_visuals,
166            ignore_stylesheets: config.ignore_stylesheets,
167            ignore_javascript: config.ignore_javascript,
168            ignore_analytics: config.ignore_analytics,
169            ignore_prefetch: config.ignore_prefetch,
170            ignore_ads: config.ignore_ads,
171            extra_headers: config.extra_headers.clone(),
172            only_html: config.only_html,
173            service_worker_enabled: config.service_worker_enabled,
174            intercept_manager: config.intercept_manager,
175            max_bytes_allowed: config.max_bytes_allowed,
176            whitelist_patterns: config.whitelist_patterns.clone(),
177            blacklist_patterns: config.blacklist_patterns.clone(),
178            ..Default::default()
179        };
180
181        let fut = Handler::new(conn, rx, config);
182        let browser_context = fut.default_browser_context().clone();
183
184        let browser = Self {
185            sender: tx,
186            config: Some(handler_config),
187            child: None,
188            debug_ws_url,
189            browser_context,
190        };
191
192        Ok((browser, fut))
193    }
194
195    /// Launches a new instance of `chromium` in the background and attaches to
196    /// its debug web socket.
197    ///
198    /// This fails when no chromium executable could be detected.
199    ///
200    /// This fails if no web socket url could be detected from the child
201    /// processes stderr for more than the configured `launch_timeout`
202    /// (20 seconds by default).
203    pub async fn launch(mut config: BrowserConfig) -> Result<(Self, Handler)> {
204        // Canonalize paths to reduce issues with sandboxing
205        config.executable = utils::canonicalize_except_snap(config.executable).await?;
206
207        // Launch a new chromium instance
208        let mut child = config.launch()?;
209
210        /// Faillible initialization to run once the child process is created.
211        ///
212        /// All faillible calls must be executed inside this function. This ensures that all
213        /// errors are caught and that the child process is properly cleaned-up.
214        async fn with_child(
215            config: &BrowserConfig,
216            child: &mut Child,
217        ) -> Result<(String, Connection<CdpEventMessage>)> {
218            let dur = config.launch_timeout;
219            let timeout_fut = Box::pin(tokio::time::sleep(dur));
220
221            // extract the ws:
222            let debug_ws_url = ws_url_from_output(child, timeout_fut).await?;
223            let conn = Connection::<CdpEventMessage>::connect(&debug_ws_url).await?;
224            Ok((debug_ws_url, conn))
225        }
226
227        let (debug_ws_url, conn) = match with_child(&config, &mut child).await {
228            Ok(conn) => conn,
229            Err(e) => {
230                // An initialization error occurred, clean up the process
231                if let Ok(Some(_)) = child.try_wait() {
232                    // already exited, do nothing, may happen if the browser crashed
233                } else {
234                    // the process is still alive, kill it and wait for exit (avoid zombie processes)
235                    child.kill().await.expect("`Browser::launch` failed but could not clean-up the child process (`kill`)");
236                    child.wait().await.expect("`Browser::launch` failed but could not clean-up the child process (`wait`)");
237                }
238                return Err(e);
239            }
240        };
241
242        // Only infaillible calls are allowed after this point to avoid clean-up issues with the
243        // child process.
244
245        let (tx, rx) = channel(config.channel_capacity);
246
247        let handler_config = HandlerConfig {
248            ignore_https_errors: config.ignore_https_errors,
249            viewport: config.viewport.clone(),
250            context_ids: Vec::new(),
251            request_timeout: config.request_timeout,
252            request_intercept: config.request_intercept,
253            cache_enabled: config.cache_enabled,
254            ignore_visuals: config.ignore_visuals,
255            ignore_stylesheets: config.ignore_stylesheets,
256            ignore_javascript: config.ignore_javascript,
257            ignore_analytics: config.ignore_analytics,
258            ignore_prefetch: config.ignore_prefetch,
259            ignore_ads: config.ignore_ads,
260            extra_headers: config.extra_headers.clone(),
261            only_html: config.only_html,
262            service_worker_enabled: config.service_worker_enabled,
263            created_first_target: false,
264            intercept_manager: config.intercept_manager,
265            max_bytes_allowed: config.max_bytes_allowed,
266            whitelist_patterns: config.whitelist_patterns.clone(),
267            blacklist_patterns: config.blacklist_patterns.clone(),
268            channel_capacity: config.channel_capacity,
269        };
270
271        let fut = Handler::new(conn, rx, handler_config);
272        let browser_context = fut.default_browser_context().clone();
273
274        let browser = Self {
275            sender: tx,
276            config: Some(config),
277            child: Some(child),
278            debug_ws_url,
279            browser_context,
280        };
281
282        Ok((browser, fut))
283    }
284
285    /// Request to fetch all existing browser targets.
286    ///
287    /// By default, only targets launched after the browser connection are tracked
288    /// when connecting to a existing browser instance with the devtools websocket url
289    /// This function fetches existing targets on the browser and adds them as pages internally
290    ///
291    /// The pages are not guaranteed to be ready as soon as the function returns
292    /// You should wait a few millis if you need to use a page
293    /// Returns [TargetInfo]
294    pub async fn fetch_targets(&mut self) -> Result<Vec<TargetInfo>> {
295        let (tx, rx) = oneshot_channel();
296
297        self.sender
298            .clone()
299            .send(HandlerMessage::FetchTargets(tx))
300            .await?;
301
302        rx.await?
303    }
304
305    /// Request for the browser to close completely.
306    ///
307    /// If the browser was spawned by [`Browser::launch`], it is recommended to wait for the
308    /// spawned instance exit, to avoid "zombie" processes ([`Browser::wait`],
309    /// [`Browser::wait_sync`], [`Browser::try_wait`]).
310    /// [`Browser::drop`] waits automatically if needed.
311    pub async fn close(&self) -> Result<CloseReturns> {
312        let (tx, rx) = oneshot_channel();
313
314        self.sender
315            .clone()
316            .send(HandlerMessage::CloseBrowser(tx))
317            .await?;
318
319        rx.await?
320    }
321
322    /// Asynchronously wait for the spawned chromium instance to exit completely.
323    ///
324    /// The instance is spawned by [`Browser::launch`]. `wait` is usually called after
325    /// [`Browser::close`]. You can call this explicitly to collect the process and avoid
326    /// "zombie" processes.
327    ///
328    /// This call has no effect if this [`Browser`] did not spawn any chromium instance (e.g.
329    /// connected to an existing browser through [`Browser::connect`])
330    pub async fn wait(&mut self) -> io::Result<Option<ExitStatus>> {
331        if let Some(child) = self.child.as_mut() {
332            Ok(Some(child.wait().await?))
333        } else {
334            Ok(None)
335        }
336    }
337
338    /// If the spawned chromium instance has completely exited, wait for it.
339    ///
340    /// The instance is spawned by [`Browser::launch`]. `try_wait` is usually called after
341    /// [`Browser::close`]. You can call this explicitly to collect the process and avoid
342    /// "zombie" processes.
343    ///
344    /// This call has no effect if this [`Browser`] did not spawn any chromium instance (e.g.
345    /// connected to an existing browser through [`Browser::connect`])
346    pub fn try_wait(&mut self) -> io::Result<Option<ExitStatus>> {
347        if let Some(child) = self.child.as_mut() {
348            child.try_wait()
349        } else {
350            Ok(None)
351        }
352    }
353
    /// Get a mutable handle to the spawned chromium instance.
    ///
    /// The instance is spawned by [`Browser::launch`]. The result is a [`async_process::Child`]
    /// value. It acts as a compat wrapper for an `async-std` or `tokio` child process.
    ///
    /// You may use [`async_process::Child::as_mut_inner`] to retrieve the concrete implementation
    /// for the selected runtime.
    ///
    /// Returns `None` if this [`Browser`] did not spawn any chromium instance (e.g.
    /// connected to an existing browser through [`Browser::connect`])
    pub fn get_mut_child(&mut self) -> Option<&mut Child> {
        self.child.as_mut()
    }
367
    /// Whether this [`Browser`] holds a chromium process that it spawned itself.
    ///
    /// This is `false` for a browser that was attached through [`Browser::connect`].
    pub fn has_child(&self) -> bool {
        self.child.is_some()
    }
372
373    /// Forcibly kill the spawned chromium instance
374    ///
375    /// The instance is spawned by [`Browser::launch`]. `kill` will automatically wait for the child
376    /// process to exit to avoid "zombie" processes.
377    ///
378    /// This method is provided to help if the browser does not close by itself. You should prefer
379    /// to use [`Browser::close`].
380    ///
381    /// This call has no effect if this [`Browser`] did not spawn any chromium instance (e.g.
382    /// connected to an existing browser through [`Browser::connect`])
383    pub async fn kill(&mut self) -> Option<io::Result<()>> {
384        match self.child.as_mut() {
385            Some(child) => Some(child.kill().await),
386            None => None,
387        }
388    }
389
390    /// If not launched as incognito this creates a new incognito browser
391    /// context. After that this browser exists within the incognito session.
392    /// New pages created while being in incognito mode will also run in the
393    /// incognito context. Incognito contexts won't share cookies/cache with
394    /// other browser contexts.
395    pub async fn start_incognito_context(&mut self) -> Result<&mut Self> {
396        if !self.is_incognito_configured() {
397            let browser_context_id = self
398                .create_browser_context(CreateBrowserContextParams::default())
399                .await?;
400            self.browser_context = BrowserContext::from(browser_context_id);
401            self.sender
402                .clone()
403                .send(HandlerMessage::InsertContext(self.browser_context.clone()))
404                .await?;
405        }
406
407        Ok(self)
408    }
409
410    /// If a incognito session was created with
411    /// `Browser::start_incognito_context` this disposes this context.
412    ///
413    /// # Note This will also dispose all pages that were running within the
414    /// incognito context.
415    pub async fn quit_incognito_context_base(
416        &self,
417        browser_context_id: BrowserContextId,
418    ) -> Result<&Self> {
419        self.dispose_browser_context(browser_context_id.clone())
420            .await?;
421        self.sender
422            .clone()
423            .send(HandlerMessage::DisposeContext(BrowserContext::from(
424                browser_context_id,
425            )))
426            .await?;
427        Ok(self)
428    }
429
430    /// If a incognito session was created with
431    /// `Browser::start_incognito_context` this disposes this context.
432    ///
433    /// # Note This will also dispose all pages that were running within the
434    /// incognito context.
435    pub async fn quit_incognito_context(&mut self) -> Result<&mut Self> {
436        if let Some(id) = self.browser_context.take() {
437            let _ = self.quit_incognito_context_base(id).await;
438        }
439        Ok(self)
440    }
441
442    /// Whether incognito mode was configured from the start
443    fn is_incognito_configured(&self) -> bool {
444        self.config
445            .as_ref()
446            .map(|c| c.incognito)
447            .unwrap_or_default()
448    }
449
    /// Returns the address of the debug websocket this browser is attached to.
    pub fn websocket_address(&self) -> &String {
        &self.debug_ws_url
    }
454
455    /// Whether the BrowserContext is incognito.
456    pub fn is_incognito(&self) -> bool {
457        self.is_incognito_configured() || self.browser_context.is_incognito()
458    }
459
    /// The config this browser was created with, if any.
    ///
    /// For [`Browser::launch`] this is the launch configuration of the spawned chromium
    /// instance; for [`Browser::connect_with_config`] it mirrors the given `HandlerConfig`.
    pub fn config(&self) -> Option<&BrowserConfig> {
        self.config.as_ref()
    }
464
465    /// Create a new browser page
466    pub async fn new_page(&self, params: impl Into<CreateTargetParams>) -> Result<Page> {
467        let (tx, rx) = oneshot_channel();
468        let mut params = params.into();
469
470        if let Some(id) = self.browser_context.id() {
471            if params.browser_context_id.is_none() {
472                params.browser_context_id = Some(id.clone());
473            }
474        }
475
476        let _ = self
477            .sender
478            .clone()
479            .send(HandlerMessage::CreatePage(params, tx))
480            .await;
481
482        rx.await?
483    }
484
485    /// Version information about the browser
486    pub async fn version(&self) -> Result<GetVersionReturns> {
487        Ok(self.execute(GetVersionParams::default()).await?.result)
488    }
489
490    /// Returns the user agent of the browser
491    pub async fn user_agent(&self) -> Result<String> {
492        Ok(self.version().await?.user_agent)
493    }
494
495    /// Call a browser method.
496    pub async fn execute<T: Command>(&self, cmd: T) -> Result<CommandResponse<T::Response>> {
497        let (tx, rx) = oneshot_channel();
498        let method = cmd.identifier();
499        let msg = CommandMessage::new(cmd, tx)?;
500
501        self.sender
502            .clone()
503            .send(HandlerMessage::Command(msg))
504            .await?;
505        let resp = rx.await??;
506        to_command_response::<T>(resp, method)
507    }
508
509    /// Set permission settings for given embedding and embedded origins.
510    /// [PermissionDescriptor](https://chromedevtools.github.io/devtools-protocol/tot/Browser/#type-PermissionDescriptor)
511    /// [PermissionSetting](https://chromedevtools.github.io/devtools-protocol/tot/Browser/#type-PermissionSetting)
512    pub async fn set_permission(
513        &self,
514        permission: PermissionDescriptor,
515        setting: PermissionSetting,
516        origin: Option<impl Into<String>>,
517        embedded_origin: Option<impl Into<String>>,
518        browser_context_id: Option<BrowserContextId>,
519    ) -> Result<&Self> {
520        self.execute(SetPermissionParams {
521            permission,
522            setting,
523            origin: origin.map(Into::into),
524            embedded_origin: embedded_origin.map(Into::into),
525            browser_context_id: browser_context_id.or_else(|| self.browser_context.id.clone()),
526        })
527        .await?;
528        Ok(self)
529    }
530
531    /// Convenience: set a permission for a single origin using the current browser context.
532    pub async fn set_permission_for_origin(
533        &self,
534        origin: impl Into<String>,
535        embedded_origin: Option<impl Into<String>>,
536        permission: PermissionDescriptor,
537        setting: PermissionSetting,
538    ) -> Result<&Self> {
539        self.set_permission(permission, setting, Some(origin), embedded_origin, None)
540            .await
541    }
542
543    /// "Reset" a permission override by setting it back to Prompt.
544    pub async fn reset_permission_for_origin(
545        &self,
546        origin: impl Into<String>,
547        embedded_origin: Option<impl Into<String>>,
548        permission: PermissionDescriptor,
549    ) -> Result<&Self> {
550        self.set_permission_for_origin(
551            origin,
552            embedded_origin,
553            permission,
554            PermissionSetting::Prompt,
555        )
556        .await
557    }
558
559    /// "Grant" all permissions.
560    pub async fn grant_all_permission_for_origin(
561        &self,
562        origin: impl Into<String>,
563        embedded_origin: Option<impl Into<String>>,
564        permission: PermissionDescriptor,
565    ) -> Result<&Self> {
566        self.set_permission_for_origin(
567            origin,
568            embedded_origin,
569            permission,
570            PermissionSetting::Granted,
571        )
572        .await
573    }
574
575    /// "Deny" all permissions.
576    pub async fn deny_all_permission_for_origin(
577        &self,
578        origin: impl Into<String>,
579        embedded_origin: Option<impl Into<String>>,
580        permission: PermissionDescriptor,
581    ) -> Result<&Self> {
582        self.set_permission_for_origin(
583            origin,
584            embedded_origin,
585            permission,
586            PermissionSetting::Denied,
587        )
588        .await
589    }
590
591    /// Return all of the pages of the browser
592    pub async fn pages(&self) -> Result<Vec<Page>> {
593        let (tx, rx) = oneshot_channel();
594        self.sender
595            .clone()
596            .send(HandlerMessage::GetPages(tx))
597            .await?;
598        Ok(rx.await?)
599    }
600
601    /// Return page of given target_id
602    pub async fn get_page(&self, target_id: TargetId) -> Result<Page> {
603        let (tx, rx) = oneshot_channel();
604        self.sender
605            .clone()
606            .send(HandlerMessage::GetPage(target_id, tx))
607            .await?;
608        rx.await?.ok_or(CdpError::NotFound)
609    }
610
611    /// Set listener for browser event
612    pub async fn event_listener<T: IntoEventKind>(&self) -> Result<EventStream<T>> {
613        let (tx, rx) = unbounded();
614        self.sender
615            .clone()
616            .send(HandlerMessage::AddEventListener(
617                EventListenerRequest::new::<T>(tx),
618            ))
619            .await?;
620
621        Ok(EventStream::new(rx))
622    }
623
624    /// Creates a new empty browser context.
625    pub async fn create_browser_context(
626        &mut self,
627        params: CreateBrowserContextParams,
628    ) -> Result<BrowserContextId> {
629        let response = self.execute(params).await?;
630
631        Ok(response.result.browser_context_id)
632    }
633
634    /// Returns all browser contexts created with Target.createBrowserContext method.
635    pub async fn get_browser_contexts(
636        &mut self,
637        params: GetBrowserContextsParams,
638    ) -> Result<GetBrowserContextsReturns> {
639        let response = self.execute(params).await?;
640        Ok(response.result)
641    }
642
643    /// Send a new empty browser context.
644    pub async fn send_new_context(
645        &mut self,
646        browser_context_id: BrowserContextId,
647    ) -> Result<&Self> {
648        self.browser_context = BrowserContext::from(browser_context_id);
649        self.sender
650            .clone()
651            .send(HandlerMessage::InsertContext(self.browser_context.clone()))
652            .await?;
653        Ok(self)
654    }
655
656    /// Deletes a browser context.
657    pub async fn dispose_browser_context(
658        &self,
659        browser_context_id: impl Into<BrowserContextId>,
660    ) -> Result<&Self> {
661        self.execute(DisposeBrowserContextParams::new(browser_context_id))
662            .await?;
663
664        Ok(self)
665    }
666
667    /// Clears cookies.
668    pub async fn clear_cookies(&self) -> Result<&Self> {
669        self.execute(ClearCookiesParams::default()).await?;
670        Ok(self)
671    }
672
673    /// Returns all browser cookies.
674    pub async fn get_cookies(&self) -> Result<Vec<Cookie>> {
675        let cmd = GetCookiesParams {
676            browser_context_id: self.browser_context.id.clone(),
677        };
678
679        Ok(self.execute(cmd).await?.result.cookies)
680    }
681
682    /// Sets given cookies.
683    pub async fn set_cookies(&self, mut cookies: Vec<CookieParam>) -> Result<&Self> {
684        for cookie in &mut cookies {
685            if let Some(url) = cookie.url.as_ref() {
686                crate::page::validate_cookie_url(url)?;
687            }
688        }
689
690        let mut cookies_param = SetCookiesParams::new(cookies);
691
692        cookies_param.browser_context_id = self.browser_context.id.clone();
693
694        self.execute(cookies_param).await?;
695        Ok(self)
696    }
697}
698
699impl Drop for Browser {
700    fn drop(&mut self) {
701        if let Some(child) = self.child.as_mut() {
702            if let Ok(Some(_)) = child.try_wait() {
703                // Already exited, do nothing. Usually occurs after using the method close or kill.
704            } else {
705                // We set the `kill_on_drop` property for the child process, so no need to explicitely
706                // kill it here. It can't really be done anyway since the method is async.
707                //
708                // On Unix, the process will be reaped in the background by the runtime automatically
709                // so it won't leave any resources locked. It is, however, a better practice for the user to
710                // do it himself since the runtime doesn't provide garantees as to when the reap occurs, so we
711                // warn him here.
712                tracing::warn!("Browser was not closed manually, it will be killed automatically in the background");
713            }
714        }
715    }
716}
717
/// Resolve devtools WebSocket URL from the provided browser process
///
/// The stderr of the process is read line by line until a line announces the URL, i.e.
/// contains `listening on ` followed by a text that starts with `ws` and contains
/// `devtools/browser`. The trimmed text is returned.
///
/// If an error occurs, it returns the browser's stderr output read so far.
///
/// The URL resolution fails if:
/// - [`CdpError::LaunchTimeout`]: `timeout_fut` completes, this corresponds to a timeout
/// - [`CdpError::LaunchExit`]: the browser process exits (or is killed)
/// - [`CdpError::LaunchIo`]: an input/output error occurs when awaiting the process exit or
///   reading the browser's stderr: end of stream, invalid UTF-8, other
///
/// # Panics
///
/// Panics if the stderr of `child_process` is not available (`stderr` is `None`).
async fn ws_url_from_output(
    child_process: &mut Child,
    timeout_fut: impl Future<Output = ()> + Unpin,
) -> Result<String> {
    use futures::{AsyncBufReadExt, FutureExt};
    let mut timeout_fut = timeout_fut.fuse();
    // The stderr handle is moved out of the child, it is not put back afterwards.
    let stderr = child_process.stderr.take().expect("no stderror");
    // Accumulates every byte read from stderr so that errors can report the whole output.
    let mut stderr_bytes = Vec::<u8>::new();
    let mut exit_status_fut = Box::pin(child_process.wait()).fuse();
    let mut buf = futures::io::BufReader::new(stderr);
    loop {
        // Race the timeout, the process exit and the next stderr line. Only the read branch
        // can keep the loop going: the two other branches always return.
        select! {
            _ = timeout_fut => return Err(CdpError::LaunchTimeout(BrowserStderr::new(stderr_bytes))),
            exit_status = exit_status_fut => {
                return Err(match exit_status {
                    Err(e) => CdpError::LaunchIo(e, BrowserStderr::new(stderr_bytes)),
                    Ok(exit_status) => CdpError::LaunchExit(exit_status, BrowserStderr::new(stderr_bytes)),
                })
            },
            read_res = buf.read_until(b'\n', &mut stderr_bytes).fuse() => {
                match read_res {
                    Err(e) => return Err(CdpError::LaunchIo(e, BrowserStderr::new(stderr_bytes))),
                    Ok(byte_count) => {
                        // Nothing was read: the stream ended before the URL was announced.
                        if byte_count == 0 {
                            let e = io::Error::new(io::ErrorKind::UnexpectedEof, "unexpected end of stream");
                            return Err(CdpError::LaunchIo(e, BrowserStderr::new(stderr_bytes)));
                        }
                        // The read appended to `stderr_bytes`: only inspect the new bytes.
                        let start_offset = stderr_bytes.len() - byte_count;
                        let new_bytes = &stderr_bytes[start_offset..];
                        match std::str::from_utf8(new_bytes) {
                            Err(_) => {
                                let e = io::Error::new(io::ErrorKind::InvalidData, "stream did not contain valid UTF-8");
                                return Err(CdpError::LaunchIo(e, BrowserStderr::new(stderr_bytes)));
                            }
                            Ok(line) => {
                                // Take what follows the last `listening on ` of the line.
                                if let Some((_, ws)) = line.rsplit_once("listening on ") {
                                    if ws.starts_with("ws") && ws.contains("devtools/browser") {
                                        return Ok(ws.trim().to_string());
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }
}
775
/// Whether, and in which headless mode, the browser runs.
///
/// The default is [`HeadlessMode::True`].
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub enum HeadlessMode {
    /// The "headful" mode.
    False,
    /// The old headless mode.
    #[default]
    True,
    /// The new headless mode. See also: https://developer.chrome.com/docs/chromium/new-headless
    New,
}
786
787#[derive(Debug, Clone, Default)]
788pub struct BrowserConfig {
789    /// Determines whether to run headless version of the browser. Defaults to
790    /// true.
791    headless: HeadlessMode,
792    /// Determines whether to run the browser with a sandbox.
793    sandbox: bool,
794    /// Launch the browser with a specific window width and height.
795    window_size: Option<(u32, u32)>,
796    /// Launch the browser with a specific debugging port.
797    port: u16,
798    /// Path for Chrome or Chromium.
799    ///
800    /// If unspecified, the create will try to automatically detect a suitable
801    /// binary.
802    executable: std::path::PathBuf,
803
804    /// A list of Chrome extensions to load.
805    ///
806    /// An extension should be a path to a folder containing the extension code.
807    /// CRX files cannot be used directly and must be first extracted.
808    ///
809    /// Note that Chrome does not support loading extensions in headless-mode.
810    /// See https://bugs.chromium.org/p/chromium/issues/detail?id=706008#c5
811    extensions: Vec<String>,
812
813    /// Environment variables to set for the Chromium process.
814    /// Passes value through to std::process::Command::envs.
815    pub process_envs: Option<HashMap<String, String>>,
816
817    /// Data dir for user data
818    pub user_data_dir: Option<PathBuf>,
819
820    /// Whether to launch the `Browser` in incognito mode.
821    incognito: bool,
822
823    /// Timeout duration for `Browser::launch`.
824    launch_timeout: Duration,
825
826    /// Ignore https errors, default is true.
827    ignore_https_errors: bool,
828    pub viewport: Option<Viewport>,
829    /// The duration after a request with no response should time out.
830    request_timeout: Duration,
831
832    /// Additional command line arguments to pass to the browser instance.
833    args: Vec<String>,
834
835    /// Whether to disable DEFAULT_ARGS or not, default is false.
836    disable_default_args: bool,
837
838    /// Whether to enable request interception.
839    pub request_intercept: bool,
840
841    /// Whether to enable cache.
842    pub cache_enabled: bool,
843    /// Whether to enable or disable Service Workers.
844    /// Disabling may reduce background network activity and caching effects.
845    pub service_worker_enabled: bool,
846    /// Whether to ignore image/visual requests during interception.
847    /// Can reduce bandwidth and speed up crawling when visuals are unnecessary.
848    pub ignore_visuals: bool,
849    /// Whether to ignore stylesheet (CSS) requests during interception.
850    /// Useful for content-only crawls.
851    pub ignore_stylesheets: bool,
852    /// Whether to ignore JavaScript requests during interception.
853    /// This still allows critical framework bundles to pass when applicable.
854    pub ignore_javascript: bool,
855    /// Whether to ignore analytics/telemetry requests during interception.
856    pub ignore_analytics: bool,
857    /// Ignore prefetch request.
858    pub ignore_prefetch: bool,
859    /// Whether to ignore ad network requests during interception.
860    pub ignore_ads: bool,
861    /// Extra headers.
862    pub extra_headers: Option<std::collections::HashMap<String, String>>,
863    /// Only html
864    pub only_html: bool,
865    /// The interception intercept manager.
866    pub intercept_manager: NetworkInterceptManager,
867    /// The max bytes to receive.
868    pub max_bytes_allowed: Option<u64>,
869    /// Whitelist patterns to allow through the network.
870    pub whitelist_patterns: Option<Vec<String>>,
871    /// Blacklist patterns to block through the network.
872    pub blacklist_patterns: Option<Vec<String>>,
873    /// Capacity of the channel between browser handle and handler.
874    /// Defaults to 1000.
875    pub channel_capacity: usize,
876}
877
878#[derive(Debug, Clone)]
879pub struct BrowserConfigBuilder {
880    /// Headless mode configuration for the browser.
881    headless: HeadlessMode,
882    /// Whether to run the browser with a sandbox.
883    sandbox: bool,
884    /// Optional initial browser window size `(width, height)`.
885    window_size: Option<(u32, u32)>,
886    /// DevTools debugging port to bind to.
887    port: u16,
888    /// Optional explicit path to the Chrome/Chromium executable.
889    /// If `None`, auto-detection may be attempted based on `executation_detection`.
890    executable: Option<PathBuf>,
891    /// Controls auto-detection behavior for finding a Chrome/Chromium binary.
892    executation_detection: DetectionOptions,
893    /// List of unpacked extensions (directories) to load at startup.
894    extensions: Vec<String>,
895    /// Environment variables to set on the spawned Chromium process.
896    process_envs: Option<HashMap<String, String>>,
897    /// User data directory to persist browser state, or `None` for ephemeral.
898    user_data_dir: Option<PathBuf>,
899    /// Whether to start the browser in incognito (off-the-record) mode.
900    incognito: bool,
901    /// Maximum time to wait for the browser to launch and become ready.
902    launch_timeout: Duration,
903    /// Whether to ignore HTTPS/TLS errors during navigation and requests.
904    ignore_https_errors: bool,
905    /// Default page viewport configuration applied on startup.
906    viewport: Option<Viewport>,
907    /// Timeout for individual network requests without response progress.
908    request_timeout: Duration,
909    /// Additional command-line flags passed directly to the browser process.
910    args: Vec<String>,
911    /// Disable the default argument set and use only the provided `args`.
912    disable_default_args: bool,
913    /// Enable Network.requestInterception for request filtering/handling.
914    request_intercept: bool,
915    /// Enable the browser cache for navigations and subresources.
916    cache_enabled: bool,
917    /// Enable/disable Service Workers.
918    service_worker_enabled: bool,
919    /// Drop image/visual requests when interception is enabled.
920    ignore_visuals: bool,
921    /// Drop ad network requests when interception is enabled.
922    ignore_ads: bool,
923    /// Drop JavaScript requests when interception is enabled.
924    ignore_javascript: bool,
925    /// Drop stylesheet (CSS) requests when interception is enabled.
926    ignore_stylesheets: bool,
927    /// Ignore prefetch domains.
928    ignore_prefetch: bool,
929    /// Drop analytics/telemetry requests when interception is enabled.
930    ignore_analytics: bool,
931    /// If `true`, limit fetching to HTML documents.
932    only_html: bool,
933    /// Extra HTTP headers to include with every request.
934    extra_headers: Option<std::collections::HashMap<String, String>>,
935    /// Network interception manager used to configure filtering behavior.
936    intercept_manager: NetworkInterceptManager,
937    /// Optional upper bound on bytes that may be received (per session/run).
938    max_bytes_allowed: Option<u64>,
939    /// Whitelist patterns to allow through the network.
940    whitelist_patterns: Option<Vec<String>>,
941    /// Blacklist patterns to block through the network.
942    blacklist_patterns: Option<Vec<String>>,
943    /// Capacity of the channel between browser handle and handler.
944    channel_capacity: usize,
945}
946
947impl BrowserConfig {
948    /// Browser builder default config.
949    pub fn builder() -> BrowserConfigBuilder {
950        BrowserConfigBuilder::default()
951    }
952
953    /// Launch with the executable path.
954    pub fn with_executable(path: impl AsRef<Path>) -> Self {
955        Self::builder()
956            .chrome_executable(path)
957            .build()
958            .expect("path to executable exist")
959    }
960}
961
962impl Default for BrowserConfigBuilder {
963    fn default() -> Self {
964        Self {
965            headless: HeadlessMode::True,
966            sandbox: true,
967            window_size: None,
968            port: 0,
969            executable: None,
970            executation_detection: DetectionOptions::default(),
971            extensions: Vec::new(),
972            process_envs: None,
973            user_data_dir: None,
974            incognito: false,
975            launch_timeout: Duration::from_millis(LAUNCH_TIMEOUT),
976            ignore_https_errors: true,
977            viewport: Some(Default::default()),
978            request_timeout: Duration::from_millis(REQUEST_TIMEOUT),
979            args: Vec::new(),
980            disable_default_args: false,
981            request_intercept: false,
982            cache_enabled: true,
983            ignore_visuals: false,
984            ignore_ads: false,
985            ignore_javascript: false,
986            ignore_analytics: false,
987            ignore_stylesheets: false,
988            ignore_prefetch: true,
989            only_html: false,
990            extra_headers: Default::default(),
991            service_worker_enabled: true,
992            intercept_manager: NetworkInterceptManager::Unknown,
993            max_bytes_allowed: None,
994            whitelist_patterns: None,
995            blacklist_patterns: None,
996            channel_capacity: 1000,
997        }
998    }
999}
1000
1001impl BrowserConfigBuilder {
1002    /// Configure window size.
1003    pub fn window_size(mut self, width: u32, height: u32) -> Self {
1004        self.window_size = Some((width, height));
1005        self
1006    }
1007    /// Configure sandboxing.
1008    pub fn no_sandbox(mut self) -> Self {
1009        self.sandbox = false;
1010        self
1011    }
1012    /// Configure the launch to start non headless.
1013    pub fn with_head(mut self) -> Self {
1014        self.headless = HeadlessMode::False;
1015        self
1016    }
1017    /// Configure the launch with the new headless mode.
1018    pub fn new_headless_mode(mut self) -> Self {
1019        self.headless = HeadlessMode::New;
1020        self
1021    }
1022    /// Configure the launch with headless.
1023    pub fn headless_mode(mut self, mode: HeadlessMode) -> Self {
1024        self.headless = mode;
1025        self
1026    }
1027    /// Configure the launch in incognito.
1028    pub fn incognito(mut self) -> Self {
1029        self.incognito = true;
1030        self
1031    }
1032
1033    pub fn respect_https_errors(mut self) -> Self {
1034        self.ignore_https_errors = false;
1035        self
1036    }
1037
1038    pub fn port(mut self, port: u16) -> Self {
1039        self.port = port;
1040        self
1041    }
1042
1043    pub fn with_max_bytes_allowed(mut self, max_bytes_allowed: Option<u64>) -> Self {
1044        self.max_bytes_allowed = max_bytes_allowed;
1045        self
1046    }
1047
1048    pub fn launch_timeout(mut self, timeout: Duration) -> Self {
1049        self.launch_timeout = timeout;
1050        self
1051    }
1052
1053    pub fn request_timeout(mut self, timeout: Duration) -> Self {
1054        self.request_timeout = timeout;
1055        self
1056    }
1057
1058    /// Configures the viewport of the browser, which defaults to `800x600`.
1059    /// `None` disables viewport emulation (i.e., it uses the browsers default
1060    /// configuration, which fills the available space. This is similar to what
1061    /// Playwright does when you provide `null` as the value of its `viewport`
1062    /// option).
1063    pub fn viewport(mut self, viewport: impl Into<Option<Viewport>>) -> Self {
1064        self.viewport = viewport.into();
1065        self
1066    }
1067
1068    pub fn user_data_dir(mut self, data_dir: impl AsRef<Path>) -> Self {
1069        self.user_data_dir = Some(data_dir.as_ref().to_path_buf());
1070        self
1071    }
1072
1073    pub fn chrome_executable(mut self, path: impl AsRef<Path>) -> Self {
1074        self.executable = Some(path.as_ref().to_path_buf());
1075        self
1076    }
1077
1078    pub fn chrome_detection(mut self, options: DetectionOptions) -> Self {
1079        self.executation_detection = options;
1080        self
1081    }
1082
1083    pub fn extension(mut self, extension: impl Into<String>) -> Self {
1084        self.extensions.push(extension.into());
1085        self
1086    }
1087
1088    pub fn extensions<I, S>(mut self, extensions: I) -> Self
1089    where
1090        I: IntoIterator<Item = S>,
1091        S: Into<String>,
1092    {
1093        for ext in extensions {
1094            self.extensions.push(ext.into());
1095        }
1096        self
1097    }
1098
1099    pub fn env(mut self, key: impl Into<String>, val: impl Into<String>) -> Self {
1100        self.process_envs
1101            .get_or_insert(HashMap::new())
1102            .insert(key.into(), val.into());
1103        self
1104    }
1105
1106    pub fn envs<I, K, V>(mut self, envs: I) -> Self
1107    where
1108        I: IntoIterator<Item = (K, V)>,
1109        K: Into<String>,
1110        V: Into<String>,
1111    {
1112        self.process_envs
1113            .get_or_insert(HashMap::new())
1114            .extend(envs.into_iter().map(|(k, v)| (k.into(), v.into())));
1115        self
1116    }
1117
1118    pub fn arg(mut self, arg: impl Into<String>) -> Self {
1119        self.args.push(arg.into());
1120        self
1121    }
1122
1123    pub fn args<I, S>(mut self, args: I) -> Self
1124    where
1125        I: IntoIterator<Item = S>,
1126        S: Into<String>,
1127    {
1128        for arg in args {
1129            self.args.push(arg.into());
1130        }
1131        self
1132    }
1133
1134    pub fn disable_default_args(mut self) -> Self {
1135        self.disable_default_args = true;
1136        self
1137    }
1138
1139    pub fn enable_request_intercept(mut self) -> Self {
1140        self.request_intercept = true;
1141        self
1142    }
1143
1144    pub fn disable_request_intercept(mut self) -> Self {
1145        self.request_intercept = false;
1146        self
1147    }
1148
1149    pub fn enable_cache(mut self) -> Self {
1150        self.cache_enabled = true;
1151        self
1152    }
1153
1154    pub fn disable_cache(mut self) -> Self {
1155        self.cache_enabled = false;
1156        self
1157    }
1158
1159    /// Set service worker enabled.
1160    pub fn set_service_worker_enabled(mut self, bypass: bool) -> Self {
1161        self.service_worker_enabled = bypass;
1162        self
1163    }
1164
1165    /// Set extra request headers.
1166    pub fn set_extra_headers(
1167        mut self,
1168        headers: Option<std::collections::HashMap<String, String>>,
1169    ) -> Self {
1170        self.extra_headers = headers;
1171        self
1172    }
1173
1174    /// Set whitelist patterns to allow through network interception allowing.
1175    pub fn set_whitelist_patterns(mut self, whitelist_patterns: Option<Vec<String>>) -> Self {
1176        self.whitelist_patterns = whitelist_patterns;
1177        self
1178    }
1179
1180    /// Set blacklist patterns to block through network interception.
1181    pub fn set_blacklist_patterns(mut self, blacklist_patterns: Option<Vec<String>>) -> Self {
1182        self.blacklist_patterns = blacklist_patterns;
1183        self
1184    }
1185
1186    /// Set the capacity of the channel between browser handle and handler.
1187    /// Defaults to 1000.
1188    pub fn channel_capacity(mut self, capacity: usize) -> Self {
1189        self.channel_capacity = capacity;
1190        self
1191    }
1192
1193    /// Build the browser.
1194    pub fn build(self) -> std::result::Result<BrowserConfig, String> {
1195        let executable = if let Some(e) = self.executable {
1196            e
1197        } else {
1198            detection::default_executable(self.executation_detection)?
1199        };
1200
1201        Ok(BrowserConfig {
1202            headless: self.headless,
1203            sandbox: self.sandbox,
1204            window_size: self.window_size,
1205            port: self.port,
1206            executable,
1207            extensions: self.extensions,
1208            process_envs: self.process_envs,
1209            user_data_dir: self.user_data_dir,
1210            incognito: self.incognito,
1211            launch_timeout: self.launch_timeout,
1212            ignore_https_errors: self.ignore_https_errors,
1213            viewport: self.viewport,
1214            request_timeout: self.request_timeout,
1215            args: self.args,
1216            disable_default_args: self.disable_default_args,
1217            request_intercept: self.request_intercept,
1218            cache_enabled: self.cache_enabled,
1219            ignore_visuals: self.ignore_visuals,
1220            ignore_ads: self.ignore_ads,
1221            ignore_javascript: self.ignore_javascript,
1222            ignore_analytics: self.ignore_analytics,
1223            ignore_stylesheets: self.ignore_stylesheets,
1224            ignore_prefetch: self.ignore_prefetch,
1225            extra_headers: self.extra_headers,
1226            only_html: self.only_html,
1227            intercept_manager: self.intercept_manager,
1228            service_worker_enabled: self.service_worker_enabled,
1229            max_bytes_allowed: self.max_bytes_allowed,
1230            whitelist_patterns: self.whitelist_patterns,
1231            blacklist_patterns: self.blacklist_patterns,
1232            channel_capacity: self.channel_capacity,
1233        })
1234    }
1235}
1236
1237impl BrowserConfig {
1238    pub fn launch(&self) -> io::Result<Child> {
1239        let mut cmd = async_process::Command::new(&self.executable);
1240
1241        if self.disable_default_args {
1242            cmd.args(&self.args);
1243        } else {
1244            cmd.args(DEFAULT_ARGS).args(&self.args);
1245        }
1246
1247        if !self
1248            .args
1249            .iter()
1250            .any(|arg| arg.contains("--remote-debugging-port="))
1251        {
1252            cmd.arg(format!("--remote-debugging-port={}", self.port));
1253        }
1254
1255        cmd.args(
1256            self.extensions
1257                .iter()
1258                .map(|e| format!("--load-extension={e}")),
1259        );
1260
1261        if let Some(ref user_data) = self.user_data_dir {
1262            cmd.arg(format!("--user-data-dir={}", user_data.display()));
1263        } else {
1264            // If the user did not specify a data directory, this would default to the systems default
1265            // data directory. In most cases, we would rather have a fresh instance of Chromium. Specify
1266            // a temp dir just for chromiumoxide instead.
1267            cmd.arg(format!(
1268                "--user-data-dir={}",
1269                std::env::temp_dir().join("chromiumoxide-runner").display()
1270            ));
1271        }
1272
1273        if let Some((width, height)) = self.window_size {
1274            cmd.arg(format!("--window-size={width},{height}"));
1275        }
1276
1277        if !self.sandbox {
1278            cmd.args(["--no-sandbox", "--disable-setuid-sandbox"]);
1279        }
1280
1281        match self.headless {
1282            HeadlessMode::False => (),
1283            HeadlessMode::True => {
1284                cmd.args(["--headless", "--hide-scrollbars", "--mute-audio"]);
1285            }
1286            HeadlessMode::New => {
1287                cmd.args(["--headless=new", "--hide-scrollbars", "--mute-audio"]);
1288            }
1289        }
1290
1291        if self.incognito {
1292            cmd.arg("--incognito");
1293        }
1294
1295        if let Some(ref envs) = self.process_envs {
1296            cmd.envs(envs);
1297        }
1298        cmd.stderr(Stdio::piped()).spawn()
1299    }
1300}
1301
1302/// Returns the path to Chrome's executable.
1303///
1304/// If the `CHROME` environment variable is set, `default_executable` will
1305/// use it as the default path. Otherwise, the filenames `google-chrome-stable`
1306/// `chromium`, `chromium-browser`, `chrome` and `chrome-browser` are
1307/// searched for in standard places. If that fails,
1308/// `/Applications/Google Chrome.app/...` (on MacOS) or the registry (on
1309/// Windows) is consulted. If all of the above fail, an error is returned.
1310#[deprecated(note = "Use detection::default_executable instead")]
1311pub fn default_executable() -> Result<std::path::PathBuf, String> {
1312    let options = DetectionOptions {
1313        msedge: false,
1314        unstable: false,
1315    };
1316    detection::default_executable(options)
1317}
1318
/// Command-line switches handed to the Chrome binary on every launch, unless
/// the configuration disables the default arguments.
///
/// Based on Puppeteer's launcher defaults:
/// <https://github.com/puppeteer/puppeteer/blob/4846b8723cf20d3551c0d755df394cc5e0c82a94/src/node/Launcher.ts#L157>
static DEFAULT_ARGS: [&str; 26] = [
    "--disable-background-networking",
    "--enable-features=NetworkService,NetworkServiceInProcess",
    "--disable-background-timer-throttling",
    "--disable-backgrounding-occluded-windows",
    "--disable-breakpad",
    "--disable-client-side-phishing-detection",
    "--disable-component-extensions-with-background-pages",
    "--disable-default-apps",
    "--disable-dev-shm-usage",
    "--disable-extensions",
    "--disable-features=TranslateUI",
    "--disable-hang-monitor",
    "--disable-ipc-flooding-protection",
    "--disable-popup-blocking",
    "--disable-prompt-on-repost",
    "--disable-renderer-backgrounding",
    "--disable-sync",
    "--force-color-profile=srgb",
    "--metrics-recording-only",
    "--no-first-run",
    "--enable-automation",
    "--password-store=basic",
    "--use-mock-keychain",
    "--enable-blink-features=IdleDetection",
    "--lang=en_US",
    "--disable-blink-features=AutomationControlled",
];