chromiumoxide/
browser.rs

1use hashbrown::HashMap;
2use reqwest::header::{HeaderMap, HeaderValue, CONTENT_TYPE};
3use std::future::Future;
4use std::time::Duration;
5use std::{
6    io,
7    path::{Path, PathBuf},
8};
9
10use futures::channel::mpsc::{channel, unbounded, Sender};
11use futures::channel::oneshot::channel as oneshot_channel;
12use futures::select;
13use futures::SinkExt;
14
15use crate::async_process::{self, Child, ExitStatus, Stdio};
16use crate::cmd::{to_command_response, CommandMessage};
17use crate::conn::Connection;
18use crate::detection::{self, DetectionOptions};
19use crate::error::{BrowserStderr, CdpError, Result};
20use crate::handler::browser::BrowserContext;
21use crate::handler::viewport::Viewport;
22use crate::handler::{Handler, HandlerConfig, HandlerMessage, REQUEST_TIMEOUT};
23use crate::listeners::{EventListenerRequest, EventStream};
24use crate::page::Page;
25use crate::utils;
26use chromiumoxide_cdp::cdp::browser_protocol::browser::{
27    BrowserContextId, CloseReturns, GetVersionParams, GetVersionReturns,
28};
29use chromiumoxide_cdp::cdp::browser_protocol::browser::{
30    PermissionDescriptor, PermissionSetting, SetPermissionParams,
31};
32use chromiumoxide_cdp::cdp::browser_protocol::network::{Cookie, CookieParam};
33use chromiumoxide_cdp::cdp::browser_protocol::storage::{
34    ClearCookiesParams, GetCookiesParams, SetCookiesParams,
35};
36use chromiumoxide_cdp::cdp::browser_protocol::target::{
37    CreateBrowserContextParams, CreateTargetParams, DisposeBrowserContextParams,
38    GetBrowserContextsParams, GetBrowserContextsReturns, TargetId, TargetInfo,
39};
40
41use chromiumoxide_cdp::cdp::{CdpEventMessage, IntoEventKind};
42use chromiumoxide_types::*;
43use spider_network_blocker::intercept_manager::NetworkInterceptManager;
44
/// Default `Browser::launch` timeout, expressed in milliseconds (20 seconds).
pub const LAUNCH_TIMEOUT: u64 = 20_000;
47
lazy_static::lazy_static! {
    /// Shared HTTP client used to query the `json/version` endpoint and
    /// discover the websocket debugger URL.
    ///
    /// Every request times out after 60 seconds and carries a
    /// `Content-Type: application/json` header by default. Idle pooled
    /// connections are kept for up to 60 seconds (at most 10 per host).
    static ref REQUEST_CLIENT: reqwest::Client = reqwest::Client::builder()
        .timeout(Duration::from_secs(60))
        .default_headers({
            let mut m = HeaderMap::new();

            m.insert(CONTENT_TYPE, HeaderValue::from_static("application/json"));

            m
        })
        .tcp_keepalive(Some(Duration::from_secs(5)))
        .pool_idle_timeout(Some(Duration::from_secs(60)))
        .pool_max_idle_per_host(10)
        .build()
        // Building the client is treated as an invariant: a failure here panics on first use.
        .expect("client to build");
}
65
/// A [`Browser`] is created when chromiumoxide connects to a Chromium instance.
///
/// It is obtained either by spawning a new process ([`Browser::launch`]) or by
/// attaching to a running one ([`Browser::connect`]). Both constructors also
/// return the [`Handler`] that receives the messages sent through `sender`.
#[derive(Debug)]
pub struct Browser {
    /// The `Sender` to send messages to the connection handler that drives the
    /// websocket
    pub(crate) sender: Sender<HandlerMessage>,
    /// The configuration in use. For a launched browser this is the launch
    /// configuration; for a connected browser it is a `BrowserConfig` built
    /// from the supplied `HandlerConfig`.
    config: Option<BrowserConfig>,
    /// The spawned chromium instance, `None` when attached to an existing browser
    child: Option<Child>,
    /// The debug web socket url of the chromium instance
    debug_ws_url: String,
    /// The browser context new pages are created in by default
    pub browser_context: BrowserContext,
}
81
/// Browser connection information.
///
/// Deserialized from the JSON document served by the browser's `json/version`
/// endpoint; each field maps to the JSON key named in its `serde(rename)`.
#[derive(serde::Deserialize, Debug, Default)]
pub struct BrowserConnection {
    #[serde(rename = "Browser")]
    /// The browser name (JSON key `Browser`)
    pub browser: String,
    #[serde(rename = "Protocol-Version")]
    /// The protocol version (JSON key `Protocol-Version`)
    pub protocol_version: String,
    #[serde(rename = "User-Agent")]
    /// User Agent used by default (JSON key `User-Agent`)
    pub user_agent: String,
    #[serde(rename = "V8-Version")]
    /// The v8 engine version (JSON key `V8-Version`)
    pub v8_version: String,
    #[serde(rename = "WebKit-Version")]
    /// Webkit version (JSON key `WebKit-Version`)
    pub webkit_version: String,
    #[serde(rename = "webSocketDebuggerUrl")]
    /// Remote debugging websocket address (JSON key `webSocketDebuggerUrl`)
    pub web_socket_debugger_url: String,
}
104
105impl Browser {
    /// Connect to an already running chromium instance via the given URL.
    ///
    /// If the URL is a http(s) URL, it will first attempt to retrieve the Websocket URL from the `json/version` endpoint.
    ///
    /// This is a shorthand for [`Browser::connect_with_config`] with
    /// `HandlerConfig::default()`.
    pub async fn connect(url: impl Into<String>) -> Result<(Self, Handler)> {
        Self::connect_with_config(url, HandlerConfig::default()).await
    }
112
    /// Connect to an already running chromium instance with a given `HandlerConfig`.
    ///
    /// If the URL starts with `http` (so both http and https), it will first attempt to
    /// retrieve the Websocket URL from the `json/version` endpoint. When the response body
    /// cannot be read or parsed, or carries an empty `webSocketDebuggerUrl`, the URL is left
    /// untouched and used as-is for the websocket connection.
    ///
    /// # Errors
    ///
    /// - [`CdpError::NoResponse`] if the HTTP request to the `json/version` endpoint fails.
    /// - Any error returned while establishing the websocket connection.
    pub async fn connect_with_config(
        url: impl Into<String>,
        config: HandlerConfig,
    ) -> Result<(Self, Handler)> {
        let mut debug_ws_url = url.into();

        if debug_ws_url.starts_with("http") {
            match REQUEST_CLIENT
                .get(
                    // Use the URL unchanged when it already targets `json/version`,
                    // otherwise append the endpoint (adding a `/` separator if needed).
                    if debug_ws_url.ends_with("/json/version")
                        || debug_ws_url.ends_with("/json/version/")
                    {
                        debug_ws_url.to_owned()
                    } else {
                        format!(
                            "{}{}json/version",
                            &debug_ws_url,
                            if debug_ws_url.ends_with('/') { "" } else { "/" }
                        )
                    },
                )
                .send()
                .await
            {
                Ok(req) => {
                    // Best effort: body read errors and JSON parse errors are ignored here.
                    if let Ok(b) = req.bytes().await {
                        if let Ok(connection) =
                            crate::serde_json::from_slice::<Box<BrowserConnection>>(&b)
                        {
                            if !connection.web_socket_debugger_url.is_empty() {
                                debug_ws_url = connection.web_socket_debugger_url;
                            }
                        }
                    }
                }
                Err(_) => return Err(CdpError::NoResponse),
            }
        }

        let conn = Connection::<CdpEventMessage>::connect(&debug_ws_url).await?;

        // Bounded channel carrying messages from this `Browser` to the `Handler`.
        let (tx, rx) = channel(1000);

        // Despite its name this is a `BrowserConfig`: it mirrors the handler settings so that
        // `Browser::config` has something to report. Fields not listed keep their defaults.
        let handler_config = BrowserConfig {
            ignore_https_errors: config.ignore_https_errors,
            viewport: config.viewport.clone(),
            request_timeout: config.request_timeout,
            request_intercept: config.request_intercept,
            cache_enabled: config.cache_enabled,
            ignore_visuals: config.ignore_visuals,
            ignore_stylesheets: config.ignore_stylesheets,
            ignore_javascript: config.ignore_javascript,
            ignore_analytics: config.ignore_analytics,
            ignore_ads: config.ignore_ads,
            extra_headers: config.extra_headers.clone(),
            only_html: config.only_html,
            service_worker_enabled: config.service_worker_enabled,
            intercept_manager: config.intercept_manager,
            max_bytes_allowed: config.max_bytes_allowed,
            ..Default::default()
        };

        let fut = Handler::new(conn, rx, config);
        let browser_context = fut.default_browser_context().clone();

        let browser = Self {
            sender: tx,
            config: Some(handler_config),
            // No process was spawned by us when attaching to an existing browser.
            child: None,
            debug_ws_url,
            browser_context,
        };

        Ok((browser, fut))
    }
191
192    /// Launches a new instance of `chromium` in the background and attaches to
193    /// its debug web socket.
194    ///
195    /// This fails when no chromium executable could be detected.
196    ///
197    /// This fails if no web socket url could be detected from the child
198    /// processes stderr for more than the configured `launch_timeout`
199    /// (20 seconds by default).
200    pub async fn launch(mut config: BrowserConfig) -> Result<(Self, Handler)> {
201        // Canonalize paths to reduce issues with sandboxing
202        config.executable = utils::canonicalize_except_snap(config.executable).await?;
203
204        // Launch a new chromium instance
205        let mut child = config.launch()?;
206
207        /// Faillible initialization to run once the child process is created.
208        ///
209        /// All faillible calls must be executed inside this function. This ensures that all
210        /// errors are caught and that the child process is properly cleaned-up.
211        async fn with_child(
212            config: &BrowserConfig,
213            child: &mut Child,
214        ) -> Result<(String, Connection<CdpEventMessage>)> {
215            let dur = config.launch_timeout;
216            let timeout_fut = Box::pin(tokio::time::sleep(dur));
217
218            // extract the ws:
219            let debug_ws_url = ws_url_from_output(child, timeout_fut).await?;
220            let conn = Connection::<CdpEventMessage>::connect(&debug_ws_url).await?;
221            Ok((debug_ws_url, conn))
222        }
223
224        let (debug_ws_url, conn) = match with_child(&config, &mut child).await {
225            Ok(conn) => conn,
226            Err(e) => {
227                // An initialization error occurred, clean up the process
228                if let Ok(Some(_)) = child.try_wait() {
229                    // already exited, do nothing, may happen if the browser crashed
230                } else {
231                    // the process is still alive, kill it and wait for exit (avoid zombie processes)
232                    child.kill().await.expect("`Browser::launch` failed but could not clean-up the child process (`kill`)");
233                    child.wait().await.expect("`Browser::launch` failed but could not clean-up the child process (`wait`)");
234                }
235                return Err(e);
236            }
237        };
238
239        // Only infaillible calls are allowed after this point to avoid clean-up issues with the
240        // child process.
241
242        let (tx, rx) = channel(1000);
243
244        let handler_config = HandlerConfig {
245            ignore_https_errors: config.ignore_https_errors,
246            viewport: config.viewport.clone(),
247            context_ids: Vec::new(),
248            request_timeout: config.request_timeout,
249            request_intercept: config.request_intercept,
250            cache_enabled: config.cache_enabled,
251            ignore_visuals: config.ignore_visuals,
252            ignore_stylesheets: config.ignore_stylesheets,
253            ignore_javascript: config.ignore_javascript,
254            ignore_analytics: config.ignore_analytics,
255            ignore_ads: config.ignore_ads,
256            extra_headers: config.extra_headers.clone(),
257            only_html: config.only_html,
258            service_worker_enabled: config.service_worker_enabled,
259            created_first_target: false,
260            intercept_manager: config.intercept_manager,
261            max_bytes_allowed: config.max_bytes_allowed,
262        };
263
264        let fut = Handler::new(conn, rx, handler_config);
265        let browser_context = fut.default_browser_context().clone();
266
267        let browser = Self {
268            sender: tx,
269            config: Some(config),
270            child: Some(child),
271            debug_ws_url,
272            browser_context,
273        };
274
275        Ok((browser, fut))
276    }
277
278    /// Request to fetch all existing browser targets.
279    ///
280    /// By default, only targets launched after the browser connection are tracked
281    /// when connecting to a existing browser instance with the devtools websocket url
282    /// This function fetches existing targets on the browser and adds them as pages internally
283    ///
284    /// The pages are not guaranteed to be ready as soon as the function returns
285    /// You should wait a few millis if you need to use a page
286    /// Returns [TargetInfo]
287    pub async fn fetch_targets(&mut self) -> Result<Vec<TargetInfo>> {
288        let (tx, rx) = oneshot_channel();
289
290        self.sender
291            .clone()
292            .send(HandlerMessage::FetchTargets(tx))
293            .await?;
294
295        rx.await?
296    }
297
298    /// Request for the browser to close completely.
299    ///
300    /// If the browser was spawned by [`Browser::launch`], it is recommended to wait for the
301    /// spawned instance exit, to avoid "zombie" processes ([`Browser::wait`],
302    /// [`Browser::wait_sync`], [`Browser::try_wait`]).
303    /// [`Browser::drop`] waits automatically if needed.
304    pub async fn close(&self) -> Result<CloseReturns> {
305        let (tx, rx) = oneshot_channel();
306
307        self.sender
308            .clone()
309            .send(HandlerMessage::CloseBrowser(tx))
310            .await?;
311
312        rx.await?
313    }
314
315    /// Asynchronously wait for the spawned chromium instance to exit completely.
316    ///
317    /// The instance is spawned by [`Browser::launch`]. `wait` is usually called after
318    /// [`Browser::close`]. You can call this explicitly to collect the process and avoid
319    /// "zombie" processes.
320    ///
321    /// This call has no effect if this [`Browser`] did not spawn any chromium instance (e.g.
322    /// connected to an existing browser through [`Browser::connect`])
323    pub async fn wait(&mut self) -> io::Result<Option<ExitStatus>> {
324        if let Some(child) = self.child.as_mut() {
325            Ok(Some(child.wait().await?))
326        } else {
327            Ok(None)
328        }
329    }
330
331    /// If the spawned chromium instance has completely exited, wait for it.
332    ///
333    /// The instance is spawned by [`Browser::launch`]. `try_wait` is usually called after
334    /// [`Browser::close`]. You can call this explicitly to collect the process and avoid
335    /// "zombie" processes.
336    ///
337    /// This call has no effect if this [`Browser`] did not spawn any chromium instance (e.g.
338    /// connected to an existing browser through [`Browser::connect`])
339    pub fn try_wait(&mut self) -> io::Result<Option<ExitStatus>> {
340        if let Some(child) = self.child.as_mut() {
341            child.try_wait()
342        } else {
343            Ok(None)
344        }
345    }
346
    /// Get the spawned chromium instance
    ///
    /// The instance is spawned by [`Browser::launch`]. The result is a [`async_process::Child`]
    /// value. It acts as a compat wrapper for an `async-std` or `tokio` child process.
    ///
    /// You may use [`async_process::Child::as_mut_inner`] to retrieve the concrete implementation
    /// for the selected runtime.
    ///
    /// Returns `None` if this [`Browser`] did not spawn any chromium instance (e.g.
    /// connected to an existing browser through [`Browser::connect`])
    pub fn get_mut_child(&mut self) -> Option<&mut Child> {
        self.child.as_mut()
    }
360
    /// Whether this [`Browser`] holds a chromium process that it spawned itself.
    ///
    /// Returns `false` when no child process is stored, e.g. after attaching through
    /// [`Browser::connect`].
    pub fn has_child(&self) -> bool {
        self.child.is_some()
    }
365
366    /// Forcibly kill the spawned chromium instance
367    ///
368    /// The instance is spawned by [`Browser::launch`]. `kill` will automatically wait for the child
369    /// process to exit to avoid "zombie" processes.
370    ///
371    /// This method is provided to help if the browser does not close by itself. You should prefer
372    /// to use [`Browser::close`].
373    ///
374    /// This call has no effect if this [`Browser`] did not spawn any chromium instance (e.g.
375    /// connected to an existing browser through [`Browser::connect`])
376    pub async fn kill(&mut self) -> Option<io::Result<()>> {
377        match self.child.as_mut() {
378            Some(child) => Some(child.kill().await),
379            None => None,
380        }
381    }
382
383    /// If not launched as incognito this creates a new incognito browser
384    /// context. After that this browser exists within the incognito session.
385    /// New pages created while being in incognito mode will also run in the
386    /// incognito context. Incognito contexts won't share cookies/cache with
387    /// other browser contexts.
388    pub async fn start_incognito_context(&mut self) -> Result<&mut Self> {
389        if !self.is_incognito_configured() {
390            let browser_context_id = self
391                .create_browser_context(CreateBrowserContextParams::default())
392                .await?;
393            self.browser_context = BrowserContext::from(browser_context_id);
394            self.sender
395                .clone()
396                .send(HandlerMessage::InsertContext(self.browser_context.clone()))
397                .await?;
398        }
399
400        Ok(self)
401    }
402
403    /// If a incognito session was created with
404    /// `Browser::start_incognito_context` this disposes this context.
405    ///
406    /// # Note This will also dispose all pages that were running within the
407    /// incognito context.
408    pub async fn quit_incognito_context_base(
409        &self,
410        browser_context_id: BrowserContextId,
411    ) -> Result<&Self> {
412        self.dispose_browser_context(browser_context_id.clone())
413            .await?;
414        self.sender
415            .clone()
416            .send(HandlerMessage::DisposeContext(BrowserContext::from(
417                browser_context_id,
418            )))
419            .await?;
420        Ok(self)
421    }
422
423    /// If a incognito session was created with
424    /// `Browser::start_incognito_context` this disposes this context.
425    ///
426    /// # Note This will also dispose all pages that were running within the
427    /// incognito context.
428    pub async fn quit_incognito_context(&mut self) -> Result<&mut Self> {
429        if let Some(id) = self.browser_context.take() {
430            let _ = self.quit_incognito_context_base(id).await;
431        }
432        Ok(self)
433    }
434
435    /// Whether incognito mode was configured from the start
436    fn is_incognito_configured(&self) -> bool {
437        self.config
438            .as_ref()
439            .map(|c| c.incognito)
440            .unwrap_or_default()
441    }
442
    /// Returns the address of the websocket this browser is attached to
    ///
    /// This is the debug websocket URL stored when the [`Browser`] was created.
    pub fn websocket_address(&self) -> &String {
        &self.debug_ws_url
    }
447
    /// Whether the BrowserContext is incognito.
    ///
    /// True when incognito was requested in the configuration or when the current
    /// `browser_context` reports itself as incognito.
    pub fn is_incognito(&self) -> bool {
        self.is_incognito_configured() || self.browser_context.is_incognito()
    }
452
    /// The configuration stored for this browser, if any.
    ///
    /// Both [`Browser::launch`] and [`Browser::connect_with_config`] store a configuration.
    pub fn config(&self) -> Option<&BrowserConfig> {
        self.config.as_ref()
    }
457
458    /// Create a new browser page
459    pub async fn new_page(&self, params: impl Into<CreateTargetParams>) -> Result<Page> {
460        let (tx, rx) = oneshot_channel();
461        let mut params = params.into();
462
463        if let Some(id) = self.browser_context.id() {
464            if params.browser_context_id.is_none() {
465                params.browser_context_id = Some(id.clone());
466            }
467        }
468
469        let _ = self
470            .sender
471            .clone()
472            .send(HandlerMessage::CreatePage(params, tx))
473            .await;
474
475        rx.await?
476    }
477
478    /// Version information about the browser
479    pub async fn version(&self) -> Result<GetVersionReturns> {
480        Ok(self.execute(GetVersionParams::default()).await?.result)
481    }
482
483    /// Returns the user agent of the browser
484    pub async fn user_agent(&self) -> Result<String> {
485        Ok(self.version().await?.user_agent)
486    }
487
488    /// Call a browser method.
489    pub async fn execute<T: Command>(&self, cmd: T) -> Result<CommandResponse<T::Response>> {
490        let (tx, rx) = oneshot_channel();
491        let method = cmd.identifier();
492        let msg = CommandMessage::new(cmd, tx)?;
493
494        self.sender
495            .clone()
496            .send(HandlerMessage::Command(msg))
497            .await?;
498        let resp = rx.await??;
499        to_command_response::<T>(resp, method)
500    }
501
502    /// Set permission settings for given embedding and embedded origins.
503    /// [PermissionDescriptor](https://chromedevtools.github.io/devtools-protocol/tot/Browser/#type-PermissionDescriptor)
504    /// [PermissionSetting](https://chromedevtools.github.io/devtools-protocol/tot/Browser/#type-PermissionSetting)
505    pub async fn set_permission(
506        &self,
507        permission: PermissionDescriptor,
508        setting: PermissionSetting,
509        origin: Option<impl Into<String>>,
510        embedded_origin: Option<impl Into<String>>,
511        browser_context_id: Option<BrowserContextId>,
512    ) -> Result<&Self> {
513        self.execute(SetPermissionParams {
514            permission,
515            setting,
516            origin: origin.map(Into::into),
517            embedded_origin: embedded_origin.map(Into::into),
518            browser_context_id: browser_context_id.or_else(|| self.browser_context.id.clone()),
519        })
520        .await?;
521        Ok(self)
522    }
523
524    /// Convenience: set a permission for a single origin using the current browser context.
525    pub async fn set_permission_for_origin(
526        &self,
527        origin: impl Into<String>,
528        embedded_origin: Option<impl Into<String>>,
529        permission: PermissionDescriptor,
530        setting: PermissionSetting,
531    ) -> Result<&Self> {
532        self.set_permission(permission, setting, Some(origin), embedded_origin, None)
533            .await
534    }
535
536    /// "Reset" a permission override by setting it back to Prompt.
537    pub async fn reset_permission_for_origin(
538        &self,
539        origin: impl Into<String>,
540        embedded_origin: Option<impl Into<String>>,
541        permission: PermissionDescriptor,
542    ) -> Result<&Self> {
543        self.set_permission_for_origin(
544            origin,
545            embedded_origin,
546            permission,
547            PermissionSetting::Prompt,
548        )
549        .await
550    }
551
552    /// "Grant" all permissions.
553    pub async fn grant_all_permission_for_origin(
554        &self,
555        origin: impl Into<String>,
556        embedded_origin: Option<impl Into<String>>,
557        permission: PermissionDescriptor,
558    ) -> Result<&Self> {
559        self.set_permission_for_origin(
560            origin,
561            embedded_origin,
562            permission,
563            PermissionSetting::Granted,
564        )
565        .await
566    }
567
568    /// "Deny" all permissions.
569    pub async fn deny_all_permission_for_origin(
570        &self,
571        origin: impl Into<String>,
572        embedded_origin: Option<impl Into<String>>,
573        permission: PermissionDescriptor,
574    ) -> Result<&Self> {
575        self.set_permission_for_origin(
576            origin,
577            embedded_origin,
578            permission,
579            PermissionSetting::Denied,
580        )
581        .await
582    }
583
584    /// Return all of the pages of the browser
585    pub async fn pages(&self) -> Result<Vec<Page>> {
586        let (tx, rx) = oneshot_channel();
587        self.sender
588            .clone()
589            .send(HandlerMessage::GetPages(tx))
590            .await?;
591        Ok(rx.await?)
592    }
593
594    /// Return page of given target_id
595    pub async fn get_page(&self, target_id: TargetId) -> Result<Page> {
596        let (tx, rx) = oneshot_channel();
597        self.sender
598            .clone()
599            .send(HandlerMessage::GetPage(target_id, tx))
600            .await?;
601        rx.await?.ok_or(CdpError::NotFound)
602    }
603
604    /// Set listener for browser event
605    pub async fn event_listener<T: IntoEventKind>(&self) -> Result<EventStream<T>> {
606        let (tx, rx) = unbounded();
607        self.sender
608            .clone()
609            .send(HandlerMessage::AddEventListener(
610                EventListenerRequest::new::<T>(tx),
611            ))
612            .await?;
613
614        Ok(EventStream::new(rx))
615    }
616
617    /// Creates a new empty browser context.
618    pub async fn create_browser_context(
619        &mut self,
620        params: CreateBrowserContextParams,
621    ) -> Result<BrowserContextId> {
622        let response = self.execute(params).await?;
623
624        Ok(response.result.browser_context_id)
625    }
626
627    /// Returns all browser contexts created with Target.createBrowserContext method.
628    pub async fn get_browser_contexts(
629        &mut self,
630        params: GetBrowserContextsParams,
631    ) -> Result<GetBrowserContextsReturns> {
632        let response = self.execute(params).await?;
633        Ok(response.result)
634    }
635
636    /// Send a new empty browser context.
637    pub async fn send_new_context(
638        &mut self,
639        browser_context_id: BrowserContextId,
640    ) -> Result<&Self> {
641        self.browser_context = BrowserContext::from(browser_context_id);
642        self.sender
643            .clone()
644            .send(HandlerMessage::InsertContext(self.browser_context.clone()))
645            .await?;
646        Ok(self)
647    }
648
649    /// Deletes a browser context.
650    pub async fn dispose_browser_context(
651        &self,
652        browser_context_id: impl Into<BrowserContextId>,
653    ) -> Result<&Self> {
654        self.execute(DisposeBrowserContextParams::new(browser_context_id))
655            .await?;
656
657        Ok(self)
658    }
659
660    /// Clears cookies.
661    pub async fn clear_cookies(&self) -> Result<&Self> {
662        self.execute(ClearCookiesParams::default()).await?;
663        Ok(self)
664    }
665
666    /// Returns all browser cookies.
667    pub async fn get_cookies(&self) -> Result<Vec<Cookie>> {
668        let mut cmd = GetCookiesParams::default();
669
670        cmd.browser_context_id = self.browser_context.id.clone();
671
672        Ok(self.execute(cmd).await?.result.cookies)
673    }
674
675    /// Sets given cookies.
676    pub async fn set_cookies(&self, mut cookies: Vec<CookieParam>) -> Result<&Self> {
677        for cookie in &mut cookies {
678            if let Some(url) = cookie.url.as_ref() {
679                crate::page::validate_cookie_url(url)?;
680            }
681        }
682
683        let mut cookies_param = SetCookiesParams::new(cookies);
684
685        cookies_param.browser_context_id = self.browser_context.id.clone();
686
687        self.execute(cookies_param).await?;
688        Ok(self)
689    }
690}
691
692impl Drop for Browser {
693    fn drop(&mut self) {
694        if let Some(child) = self.child.as_mut() {
695            if let Ok(Some(_)) = child.try_wait() {
696                // Already exited, do nothing. Usually occurs after using the method close or kill.
697            } else {
698                // We set the `kill_on_drop` property for the child process, so no need to explicitely
699                // kill it here. It can't really be done anyway since the method is async.
700                //
701                // On Unix, the process will be reaped in the background by the runtime automatically
702                // so it won't leave any resources locked. It is, however, a better practice for the user to
703                // do it himself since the runtime doesn't provide garantees as to when the reap occurs, so we
704                // warn him here.
705                tracing::warn!("Browser was not closed manually, it will be killed automatically in the background");
706            }
707        }
708    }
709}
710
/// Resolve devtools WebSocket URL from the provided browser process
///
/// The child's stderr is read line by line until a line contains
/// `listening on ` followed by a `ws…` URL containing `devtools/browser`.
///
/// If an error occurs, it returns the browser's stderr output collected so far.
///
/// The URL resolution fails if:
/// - [`CdpError::LaunchTimeout`]: `timeout_fut` completes, this corresponds to a timeout
/// - [`CdpError::LaunchExit`]: the browser process exits (or is killed)
/// - [`CdpError::LaunchIo`]: an input/output error occurs when awaiting the process exit or reading
///   the browser's stderr: end of stream, invalid UTF-8, other
///
/// # Panics
///
/// Panics if the child's stderr handle is not available (`child_process.stderr` is `None`).
async fn ws_url_from_output(
    child_process: &mut Child,
    timeout_fut: impl Future<Output = ()> + Unpin,
) -> Result<String> {
    use futures::{AsyncBufReadExt, FutureExt};
    let mut timeout_fut = timeout_fut.fuse();
    let stderr = child_process.stderr.take().expect("no stderror");
    // Accumulates everything read so far; handed to the error on failure.
    let mut stderr_bytes = Vec::<u8>::new();
    let mut exit_status_fut = Box::pin(child_process.wait()).fuse();
    let mut buf = futures::io::BufReader::new(stderr);
    loop {
        // Race the timeout, the process exit and the next stderr line.
        select! {
            _ = timeout_fut => return Err(CdpError::LaunchTimeout(BrowserStderr::new(stderr_bytes))),
            exit_status = exit_status_fut => {
                // The process ended before a URL was found: always an error.
                return Err(match exit_status {
                    Err(e) => CdpError::LaunchIo(e, BrowserStderr::new(stderr_bytes)),
                    Ok(exit_status) => CdpError::LaunchExit(exit_status, BrowserStderr::new(stderr_bytes)),
                })
            },
            read_res = buf.read_until(b'\n', &mut stderr_bytes).fuse() => {
                match read_res {
                    Err(e) => return Err(CdpError::LaunchIo(e, BrowserStderr::new(stderr_bytes))),
                    Ok(byte_count) => {
                        // Zero bytes means stderr was closed without announcing a URL.
                        if byte_count == 0 {
                            let e = io::Error::new(io::ErrorKind::UnexpectedEof, "unexpected end of stream");
                            return Err(CdpError::LaunchIo(e, BrowserStderr::new(stderr_bytes)));
                        }
                        // Only inspect the bytes appended by this read, i.e. the latest line.
                        let start_offset = stderr_bytes.len() - byte_count;
                        let new_bytes = &stderr_bytes[start_offset..];
                        match std::str::from_utf8(new_bytes) {
                            Err(_) => {
                                let e = io::Error::new(io::ErrorKind::InvalidData, "stream did not contain valid UTF-8");
                                return Err(CdpError::LaunchIo(e, BrowserStderr::new(stderr_bytes)));
                            }
                            Ok(line) => {
                                // Take the text after the last `listening on ` marker and accept
                                // it only if it looks like a devtools browser websocket URL.
                                if let Some((_, ws)) = line.rsplit_once("listening on ") {
                                    if ws.starts_with("ws") && ws.contains("devtools/browser") {
                                        return Ok(ws.trim().to_string());
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }
}
768
/// The headless mode the browser is run in.
///
/// The default variant is [`HeadlessMode::True`].
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub enum HeadlessMode {
    /// The "headful" mode.
    False,
    /// The old headless mode.
    #[default]
    True,
    /// The new headless mode. See also: https://developer.chrome.com/docs/chromium/new-headless
    New,
}
779
780#[derive(Debug, Clone, Default)]
781pub struct BrowserConfig {
782    /// Determines whether to run headless version of the browser. Defaults to
783    /// true.
784    headless: HeadlessMode,
785    /// Determines whether to run the browser with a sandbox.
786    sandbox: bool,
787    /// Launch the browser with a specific window width and height.
788    window_size: Option<(u32, u32)>,
789    /// Launch the browser with a specific debugging port.
790    port: u16,
791    /// Path for Chrome or Chromium.
792    ///
793    /// If unspecified, the create will try to automatically detect a suitable
794    /// binary.
795    executable: std::path::PathBuf,
796
797    /// A list of Chrome extensions to load.
798    ///
799    /// An extension should be a path to a folder containing the extension code.
800    /// CRX files cannot be used directly and must be first extracted.
801    ///
802    /// Note that Chrome does not support loading extensions in headless-mode.
803    /// See https://bugs.chromium.org/p/chromium/issues/detail?id=706008#c5
804    extensions: Vec<String>,
805
806    /// Environment variables to set for the Chromium process.
807    /// Passes value through to std::process::Command::envs.
808    pub process_envs: Option<HashMap<String, String>>,
809
810    /// Data dir for user data
811    pub user_data_dir: Option<PathBuf>,
812
813    /// Whether to launch the `Browser` in incognito mode.
814    incognito: bool,
815
816    /// Timeout duration for `Browser::launch`.
817    launch_timeout: Duration,
818
819    /// Ignore https errors, default is true.
820    ignore_https_errors: bool,
821    pub viewport: Option<Viewport>,
822    /// The duration after a request with no response should time out.
823    request_timeout: Duration,
824
825    /// Additional command line arguments to pass to the browser instance.
826    args: Vec<String>,
827
828    /// Whether to disable DEFAULT_ARGS or not, default is false.
829    disable_default_args: bool,
830
831    /// Whether to enable request interception.
832    pub request_intercept: bool,
833
834    /// Whether to enable cache.
835    pub cache_enabled: bool,
836    /// Whether to enable or disable Service Workers.
837    /// Disabling may reduce background network activity and caching effects.
838    pub service_worker_enabled: bool,
839    /// Whether to ignore image/visual requests during interception.
840    /// Can reduce bandwidth and speed up crawling when visuals are unnecessary.
841    pub ignore_visuals: bool,
842    /// Whether to ignore stylesheet (CSS) requests during interception.
843    /// Useful for content-only crawls.
844    pub ignore_stylesheets: bool,
845    /// Whether to ignore JavaScript requests during interception.
846    /// This still allows critical framework bundles to pass when applicable.
847    pub ignore_javascript: bool,
848    /// Whether to ignore analytics/telemetry requests during interception.
849    pub ignore_analytics: bool,
850    /// Whether to ignore ad network requests during interception.
851    pub ignore_ads: bool,
852    /// Extra headers.
853    pub extra_headers: Option<std::collections::HashMap<String, String>>,
854    /// Only html
855    pub only_html: bool,
856    /// The interception intercept manager.
857    pub intercept_manager: NetworkInterceptManager,
858    /// The max bytes to receive.
859    pub max_bytes_allowed: Option<u64>,
860}
861
862#[derive(Debug, Clone)]
863pub struct BrowserConfigBuilder {
864    /// Headless mode configuration for the browser.
865    headless: HeadlessMode,
866    /// Whether to run the browser with a sandbox.
867    sandbox: bool,
868    /// Optional initial browser window size `(width, height)`.
869    window_size: Option<(u32, u32)>,
870    /// DevTools debugging port to bind to.
871    port: u16,
872    /// Optional explicit path to the Chrome/Chromium executable.
873    /// If `None`, auto-detection may be attempted based on `executation_detection`.
874    executable: Option<PathBuf>,
875    /// Controls auto-detection behavior for finding a Chrome/Chromium binary.
876    executation_detection: DetectionOptions,
877    /// List of unpacked extensions (directories) to load at startup.
878    extensions: Vec<String>,
879    /// Environment variables to set on the spawned Chromium process.
880    process_envs: Option<HashMap<String, String>>,
881    /// User data directory to persist browser state, or `None` for ephemeral.
882    user_data_dir: Option<PathBuf>,
883    /// Whether to start the browser in incognito (off-the-record) mode.
884    incognito: bool,
885    /// Maximum time to wait for the browser to launch and become ready.
886    launch_timeout: Duration,
887    /// Whether to ignore HTTPS/TLS errors during navigation and requests.
888    ignore_https_errors: bool,
889    /// Default page viewport configuration applied on startup.
890    viewport: Option<Viewport>,
891    /// Timeout for individual network requests without response progress.
892    request_timeout: Duration,
893    /// Additional command-line flags passed directly to the browser process.
894    args: Vec<String>,
895    /// Disable the default argument set and use only the provided `args`.
896    disable_default_args: bool,
897    /// Enable Network.requestInterception for request filtering/handling.
898    request_intercept: bool,
899    /// Enable the browser cache for navigations and subresources.
900    cache_enabled: bool,
901    /// Enable/disable Service Workers.
902    service_worker_enabled: bool,
903    /// Drop image/visual requests when interception is enabled.
904    ignore_visuals: bool,
905    /// Drop ad network requests when interception is enabled.
906    ignore_ads: bool,
907    /// Drop JavaScript requests when interception is enabled.
908    ignore_javascript: bool,
909    /// Drop stylesheet (CSS) requests when interception is enabled.
910    ignore_stylesheets: bool,
911    /// Drop analytics/telemetry requests when interception is enabled.
912    ignore_analytics: bool,
913    /// If `true`, limit fetching to HTML documents.
914    only_html: bool,
915    /// Extra HTTP headers to include with every request.
916    extra_headers: Option<std::collections::HashMap<String, String>>,
917    /// Network interception manager used to configure filtering behavior.
918    intercept_manager: NetworkInterceptManager,
919    /// Optional upper bound on bytes that may be received (per session/run).
920    max_bytes_allowed: Option<u64>,
921}
922
923impl BrowserConfig {
924    /// Browser builder default config.
925    pub fn builder() -> BrowserConfigBuilder {
926        BrowserConfigBuilder::default()
927    }
928
929    /// Launch with the executable path.
930    pub fn with_executable(path: impl AsRef<Path>) -> Self {
931        Self::builder()
932            .chrome_executable(path)
933            .build()
934            .expect("path to executable exist")
935    }
936}
937
938impl Default for BrowserConfigBuilder {
939    fn default() -> Self {
940        Self {
941            headless: HeadlessMode::True,
942            sandbox: true,
943            window_size: None,
944            port: 0,
945            executable: None,
946            executation_detection: DetectionOptions::default(),
947            extensions: Vec::new(),
948            process_envs: None,
949            user_data_dir: None,
950            incognito: false,
951            launch_timeout: Duration::from_millis(LAUNCH_TIMEOUT),
952            ignore_https_errors: true,
953            viewport: Some(Default::default()),
954            request_timeout: Duration::from_millis(REQUEST_TIMEOUT),
955            args: Vec::new(),
956            disable_default_args: false,
957            request_intercept: false,
958            cache_enabled: true,
959            ignore_visuals: false,
960            ignore_ads: false,
961            ignore_javascript: false,
962            ignore_analytics: false,
963            ignore_stylesheets: false,
964            only_html: false,
965            extra_headers: Default::default(),
966            service_worker_enabled: true,
967            intercept_manager: NetworkInterceptManager::Unknown,
968            max_bytes_allowed: None,
969        }
970    }
971}
972
973impl BrowserConfigBuilder {
974    /// Configure window size.
975    pub fn window_size(mut self, width: u32, height: u32) -> Self {
976        self.window_size = Some((width, height));
977        self
978    }
979    /// Configure sandboxing.
980    pub fn no_sandbox(mut self) -> Self {
981        self.sandbox = false;
982        self
983    }
984    /// Configure the launch to start non headless.
985    pub fn with_head(mut self) -> Self {
986        self.headless = HeadlessMode::False;
987        self
988    }
989    /// Configure the launch with the new headless mode.
990    pub fn new_headless_mode(mut self) -> Self {
991        self.headless = HeadlessMode::New;
992        self
993    }
994    /// Configure the launch with headless.
995    pub fn headless_mode(mut self, mode: HeadlessMode) -> Self {
996        self.headless = mode;
997        self
998    }
999    /// Configure the launch in incognito.
1000    pub fn incognito(mut self) -> Self {
1001        self.incognito = true;
1002        self
1003    }
1004
1005    pub fn respect_https_errors(mut self) -> Self {
1006        self.ignore_https_errors = false;
1007        self
1008    }
1009
1010    pub fn port(mut self, port: u16) -> Self {
1011        self.port = port;
1012        self
1013    }
1014
1015    pub fn with_max_bytes_allowed(mut self, max_bytes_allowed: Option<u64>) -> Self {
1016        self.max_bytes_allowed = max_bytes_allowed;
1017        self
1018    }
1019
1020    pub fn launch_timeout(mut self, timeout: Duration) -> Self {
1021        self.launch_timeout = timeout;
1022        self
1023    }
1024
1025    pub fn request_timeout(mut self, timeout: Duration) -> Self {
1026        self.request_timeout = timeout;
1027        self
1028    }
1029
1030    /// Configures the viewport of the browser, which defaults to `800x600`.
1031    /// `None` disables viewport emulation (i.e., it uses the browsers default
1032    /// configuration, which fills the available space. This is similar to what
1033    /// Playwright does when you provide `null` as the value of its `viewport`
1034    /// option).
1035    pub fn viewport(mut self, viewport: impl Into<Option<Viewport>>) -> Self {
1036        self.viewport = viewport.into();
1037        self
1038    }
1039
1040    pub fn user_data_dir(mut self, data_dir: impl AsRef<Path>) -> Self {
1041        self.user_data_dir = Some(data_dir.as_ref().to_path_buf());
1042        self
1043    }
1044
1045    pub fn chrome_executable(mut self, path: impl AsRef<Path>) -> Self {
1046        self.executable = Some(path.as_ref().to_path_buf());
1047        self
1048    }
1049
1050    pub fn chrome_detection(mut self, options: DetectionOptions) -> Self {
1051        self.executation_detection = options;
1052        self
1053    }
1054
1055    pub fn extension(mut self, extension: impl Into<String>) -> Self {
1056        self.extensions.push(extension.into());
1057        self
1058    }
1059
1060    pub fn extensions<I, S>(mut self, extensions: I) -> Self
1061    where
1062        I: IntoIterator<Item = S>,
1063        S: Into<String>,
1064    {
1065        for ext in extensions {
1066            self.extensions.push(ext.into());
1067        }
1068        self
1069    }
1070
1071    pub fn env(mut self, key: impl Into<String>, val: impl Into<String>) -> Self {
1072        self.process_envs
1073            .get_or_insert(HashMap::new())
1074            .insert(key.into(), val.into());
1075        self
1076    }
1077
1078    pub fn envs<I, K, V>(mut self, envs: I) -> Self
1079    where
1080        I: IntoIterator<Item = (K, V)>,
1081        K: Into<String>,
1082        V: Into<String>,
1083    {
1084        self.process_envs
1085            .get_or_insert(HashMap::new())
1086            .extend(envs.into_iter().map(|(k, v)| (k.into(), v.into())));
1087        self
1088    }
1089
1090    pub fn arg(mut self, arg: impl Into<String>) -> Self {
1091        self.args.push(arg.into());
1092        self
1093    }
1094
1095    pub fn args<I, S>(mut self, args: I) -> Self
1096    where
1097        I: IntoIterator<Item = S>,
1098        S: Into<String>,
1099    {
1100        for arg in args {
1101            self.args.push(arg.into());
1102        }
1103        self
1104    }
1105
1106    pub fn disable_default_args(mut self) -> Self {
1107        self.disable_default_args = true;
1108        self
1109    }
1110
1111    pub fn enable_request_intercept(mut self) -> Self {
1112        self.request_intercept = true;
1113        self
1114    }
1115
1116    pub fn disable_request_intercept(mut self) -> Self {
1117        self.request_intercept = false;
1118        self
1119    }
1120
1121    pub fn enable_cache(mut self) -> Self {
1122        self.cache_enabled = true;
1123        self
1124    }
1125
1126    pub fn disable_cache(mut self) -> Self {
1127        self.cache_enabled = false;
1128        self
1129    }
1130
1131    pub fn set_service_worker_enabled(mut self, bypass: bool) -> Self {
1132        self.service_worker_enabled = bypass;
1133        self
1134    }
1135
1136    pub fn set_extra_headers(
1137        mut self,
1138        headers: Option<std::collections::HashMap<String, String>>,
1139    ) -> Self {
1140        self.extra_headers = headers;
1141        self
1142    }
1143
1144    pub fn build(self) -> std::result::Result<BrowserConfig, String> {
1145        let executable = if let Some(e) = self.executable {
1146            e
1147        } else {
1148            detection::default_executable(self.executation_detection)?
1149        };
1150
1151        Ok(BrowserConfig {
1152            headless: self.headless,
1153            sandbox: self.sandbox,
1154            window_size: self.window_size,
1155            port: self.port,
1156            executable,
1157            extensions: self.extensions,
1158            process_envs: self.process_envs,
1159            user_data_dir: self.user_data_dir,
1160            incognito: self.incognito,
1161            launch_timeout: self.launch_timeout,
1162            ignore_https_errors: self.ignore_https_errors,
1163            viewport: self.viewport,
1164            request_timeout: self.request_timeout,
1165            args: self.args,
1166            disable_default_args: self.disable_default_args,
1167            request_intercept: self.request_intercept,
1168            cache_enabled: self.cache_enabled,
1169            ignore_visuals: self.ignore_visuals,
1170            ignore_ads: self.ignore_ads,
1171            ignore_javascript: self.ignore_javascript,
1172            ignore_analytics: self.ignore_analytics,
1173            ignore_stylesheets: self.ignore_stylesheets,
1174            extra_headers: self.extra_headers,
1175            only_html: self.only_html,
1176            intercept_manager: self.intercept_manager,
1177            service_worker_enabled: self.service_worker_enabled,
1178            max_bytes_allowed: self.max_bytes_allowed,
1179        })
1180    }
1181}
1182
1183impl BrowserConfig {
1184    pub fn launch(&self) -> io::Result<Child> {
1185        let mut cmd = async_process::Command::new(&self.executable);
1186
1187        if self.disable_default_args {
1188            cmd.args(&self.args);
1189        } else {
1190            cmd.args(DEFAULT_ARGS).args(&self.args);
1191        }
1192
1193        if !self
1194            .args
1195            .iter()
1196            .any(|arg| arg.contains("--remote-debugging-port="))
1197        {
1198            cmd.arg(format!("--remote-debugging-port={}", self.port));
1199        }
1200
1201        cmd.args(
1202            self.extensions
1203                .iter()
1204                .map(|e| format!("--load-extension={e}")),
1205        );
1206
1207        if let Some(ref user_data) = self.user_data_dir {
1208            cmd.arg(format!("--user-data-dir={}", user_data.display()));
1209        } else {
1210            // If the user did not specify a data directory, this would default to the systems default
1211            // data directory. In most cases, we would rather have a fresh instance of Chromium. Specify
1212            // a temp dir just for chromiumoxide instead.
1213            cmd.arg(format!(
1214                "--user-data-dir={}",
1215                std::env::temp_dir().join("chromiumoxide-runner").display()
1216            ));
1217        }
1218
1219        if let Some((width, height)) = self.window_size {
1220            cmd.arg(format!("--window-size={width},{height}"));
1221        }
1222
1223        if !self.sandbox {
1224            cmd.args(["--no-sandbox", "--disable-setuid-sandbox"]);
1225        }
1226
1227        match self.headless {
1228            HeadlessMode::False => (),
1229            HeadlessMode::True => {
1230                cmd.args(["--headless", "--hide-scrollbars", "--mute-audio"]);
1231            }
1232            HeadlessMode::New => {
1233                cmd.args(["--headless=new", "--hide-scrollbars", "--mute-audio"]);
1234            }
1235        }
1236
1237        if self.incognito {
1238            cmd.arg("--incognito");
1239        }
1240
1241        if let Some(ref envs) = self.process_envs {
1242            cmd.envs(envs);
1243        }
1244        cmd.stderr(Stdio::piped()).spawn()
1245    }
1246}
1247
1248/// Returns the path to Chrome's executable.
1249///
1250/// If the `CHROME` environment variable is set, `default_executable` will
1251/// use it as the default path. Otherwise, the filenames `google-chrome-stable`
1252/// `chromium`, `chromium-browser`, `chrome` and `chrome-browser` are
1253/// searched for in standard places. If that fails,
1254/// `/Applications/Google Chrome.app/...` (on MacOS) or the registry (on
1255/// Windows) is consulted. If all of the above fail, an error is returned.
1256#[deprecated(note = "Use detection::default_executable instead")]
1257pub fn default_executable() -> Result<std::path::PathBuf, String> {
1258    let options = DetectionOptions {
1259        msedge: false,
1260        unstable: false,
1261    };
1262    detection::default_executable(options)
1263}
1264
/// Command-line arguments handed to the Chrome binary by default.
///
/// `BrowserConfig::launch` passes them ahead of the user-supplied arguments
/// unless the default arguments were disabled in the configuration.
///
/// Via <https://github.com/puppeteer/puppeteer/blob/4846b8723cf20d3551c0d755df394cc5e0c82a94/src/node/Launcher.ts#L157>
static DEFAULT_ARGS: [&str; 26] = [
    "--disable-background-networking",
    "--enable-features=NetworkService,NetworkServiceInProcess",
    "--disable-background-timer-throttling",
    "--disable-backgrounding-occluded-windows",
    "--disable-breakpad",
    "--disable-client-side-phishing-detection",
    "--disable-component-extensions-with-background-pages",
    "--disable-default-apps",
    "--disable-dev-shm-usage",
    "--disable-extensions",
    "--disable-features=TranslateUI",
    "--disable-hang-monitor",
    "--disable-ipc-flooding-protection",
    "--disable-popup-blocking",
    "--disable-prompt-on-repost",
    "--disable-renderer-backgrounding",
    "--disable-sync",
    "--force-color-profile=srgb",
    "--metrics-recording-only",
    "--no-first-run",
    "--enable-automation",
    "--password-store=basic",
    "--use-mock-keychain",
    "--enable-blink-features=IdleDetection",
    "--lang=en_US",
    "--disable-blink-features=AutomationControlled",
];