mdbook-pdf-headless_chrome 0.1.2

A fork of headless_chrome for publishing mdbook-pdf.
use std::sync::mpsc;
use std::sync::mpsc::{RecvTimeoutError, TryRecvError};
use std::sync::Arc;
use std::sync::Mutex;
use std::time::Duration;

use anyhow::Result;
use log::{debug, error, info, trace};

use process::Process;
pub use process::{LaunchOptions, LaunchOptionsBuilder, DEFAULT_ARGS};
pub use tab::Tab;
pub use transport::ConnectionClosed;
use transport::Transport;
use websocket::url::Url;
use which::which;

use crate::protocol::cdp::{types::Event, types::Method, Browser as B, Target, CSS, DOM};

use crate::browser::context::Context;
use crate::util;
use Target::{CreateTarget, SetDiscoverTargets};
use B::GetVersion;
pub use B::GetVersionReturnObject;

#[cfg(feature = "fetch")]
pub use fetcher::FetcherOptions;

pub mod context;
#[cfg(feature = "fetch")]
mod fetcher;
mod process;
pub mod tab;
pub mod transport;

/// A handle to an instance of Chrome / Chromium, which wraps a WebSocket connection to its
/// debugging port.
///
/// Most of your actual "driving" (e.g. clicking, typing, navigating) will be done via instances
/// of [`Tab`], which are accessible via methods such as [`Browser::get_tabs`].
///
/// A `Browser` can either manage its own Chrome process (see [`Browser::new`]) or connect to a
/// remote one (see [`Browser::connect`]). The struct is `Clone`; every clone shares the same
/// [`BrowserInner`] through an `Arc`.
///
/// `Browser::default()` will return a headless instance of whatever browser can be found using
/// [`default_executable`], which uses the path specified in the `CHROME` env var when it exists
/// and otherwise searches your PATH for relevant binaries.
///
/// You can use [`LaunchOptions`] to automatically download a revision of Chromium that has a
/// compatible API into your `$XDG_DATA_DIR` (NOTE(review): presumably only with the `fetch`
/// feature enabled — confirm against the `fetcher` module). Alternatively, you can specify your
/// own path to a binary, or make use of the [`default_executable`] function to use your
/// already-installed copy of Chrome.
///
/// Option 1: Managing a Chrome process
/// ```rust
/// # use anyhow::Result;
/// # fn main() -> Result<()> {
/// #
/// use headless_chrome::Browser;
/// let browser = Browser::default()?;
/// let first_tab = browser.wait_for_initial_tab()?;
/// assert_eq!("about:blank", first_tab.get_url());
/// #
/// # Ok(())
/// # }
/// ```
///
/// Option 2: Connecting to a remote Chrome service
/// - see /examples/print_to_pdf.rs for a working example
///
/// The Chrome DevTools Protocol (CDP) also defines methods in a
/// ["Browser" domain](https://chromedevtools.github.io/devtools-protocol/tot/Browser)
/// (such as for resizing the window in non-headless mode). Of those, this type only exposes
/// version information, via [`Browser::get_version`].
#[derive(Clone)]
pub struct Browser {
    inner: Arc<BrowserInner>,
}

/// State shared by every clone of a [`Browser`].
///
/// It is held in an `Arc`, so it is dropped only once the last `Browser` clone goes away; its
/// `Drop` implementation then signals the event loop to stop and shuts the transport down.
pub struct BrowserInner {
    /// The Chrome process launched by [`Browser::new`]; `None` when the browser was reached via
    /// [`Browser::connect`] / [`Browser::connect_with_timeout`].
    process: Option<Process>,
    /// Connection used to call methods on the browser and to receive its events.
    transport: Arc<Transport>,
    /// Handles to the known "page" targets; updated by the browser event loop thread.
    tabs: Arc<Mutex<Vec<Arc<Tab>>>>,
    /// Sending on this channel asks the browser event loop thread to exit.
    loop_shutdown_tx: mpsc::SyncSender<()>,
}

impl Browser {
    /// Launches a new Chrome process and connects to its debugging WebSocket.
    ///
    /// The returned `Browser` owns the launched [`Process`]; it is released together with the
    /// last clone of the `Browser`. NOTE(review): the previous documentation stated that the
    /// profile directory is a temporary one and that the process is killed on drop — both are
    /// implemented in `Process`, so confirm there.
    ///
    /// # Errors
    ///
    /// Fails if the process cannot be started, if the transport cannot be established, or if
    /// the initial browser setup (target discovery, first tab) fails.
    pub fn new(launch_options: LaunchOptions) -> Result<Self> {
        // Read the timeout before `launch_options` is moved into the process.
        let timeout = launch_options.idle_browser_timeout;
        let chrome = Process::new(launch_options)?;

        let pid = chrome.get_id();
        let ws_url = chrome.debug_ws_url.clone();
        let transport = Transport::new(ws_url, Some(pid), timeout)?;

        Self::create_browser(Some(chrome), Arc::new(transport), timeout)
    }

    /// Calls [`new`] with options to launch a headless browser using whatever Chrome / Chromium
    /// binary can be found on the system.
    pub fn default() -> Result<Self> {
        let launch_options = LaunchOptions::default_builder()
            .path(Some(default_executable().unwrap()))
            .build()
            .unwrap();
        Ok(Self::new(launch_options).unwrap())
    }

    /// Drives an externally-launched Chrome process instead of launching one via
    /// [`Browser::new`].
    ///
    /// Equivalent to [`Browser::connect_with_timeout`] with an idle timeout of 30 seconds.
    pub fn connect(debug_ws_url: String) -> Result<Self> {
        let default_idle_timeout = Duration::from_secs(30);
        Self::connect_with_timeout(debug_ws_url, default_idle_timeout)
    }

    /// Drives an externally-launched Chrome process instead of launching one via
    /// [`Browser::new`].
    ///
    /// `idle_browser_timeout` is handed to the transport and to the browser event loop; the loop
    /// stops listening once no browser event has arrived for that long.
    ///
    /// # Errors
    ///
    /// Fails if `debug_ws_url` is not a valid URL, if the transport cannot be created, or if the
    /// initial browser setup fails.
    pub fn connect_with_timeout(
        debug_ws_url: String,
        idle_browser_timeout: Duration,
    ) -> Result<Self> {
        let ws_url = Url::parse(&debug_ws_url)?;

        // No process id: the browser on the other end was not launched by us.
        let transport = Transport::new(ws_url, None, idle_browser_timeout)?;
        let transport = Arc::new(transport);
        trace!("created transport");

        Self::create_browser(None, transport, idle_browser_timeout)
    }

    /// Shared tail of every constructor: wraps `process` and `transport` in a `Browser`, starts
    /// the browser-level event loop, turns on target discovery and waits for the first tab.
    ///
    /// `DOM` and `CSS` are enabled on that first tab before the browser is returned.
    fn create_browser(
        process: Option<Process>,
        transport: Arc<Transport>,
        idle_browser_timeout: Duration,
    ) -> Result<Self> {
        let (loop_shutdown_tx, shutdown_rx) = mpsc::sync_channel(100);

        let inner = BrowserInner {
            process,
            tabs: Arc::new(Mutex::new(Vec::new())),
            transport,
            loop_shutdown_tx,
        };
        let browser = Browser {
            inner: Arc::new(inner),
        };

        // The listener must be running before discovery is switched on below, otherwise the
        // 'targetCreated' events for already-open tabs would have nobody to handle them.
        let events_rx = browser.inner.transport.listen_to_browser_events();
        let process_id = browser.get_process_id();
        browser.handle_browser_level_events(
            events_rx,
            process_id,
            shutdown_rx,
            idle_browser_timeout,
        );
        trace!("created browser event listener");

        // so we get events like 'targetCreated' and 'targetDestroyed'
        trace!("Calling set discover");
        browser.call_method(SetDiscoverTargets { discover: true })?;

        let initial_tab = browser.wait_for_initial_tab()?;
        initial_tab.call_method(DOM::Enable(None))?;
        initial_tab.call_method(CSS::Enable(None))?;

        Ok(browser)
    }

    pub fn get_process_id(&self) -> Option<u32> {
        self.inner.process.as_ref().map(process::Process::get_id)
    }

    /// Returns the shared list of tabs currently known to this browser.
    ///
    /// The tabs are behind an `Arc` and `Mutex` because they're accessible from multiple threads
    /// (including the one that handles incoming protocol events about new or changed tabs).
    /// That event thread takes the same lock to record created, changed and destroyed targets,
    /// so hold the guard only briefly.
    pub fn get_tabs(&self) -> &Arc<Mutex<Vec<Arc<Tab>>>> {
        &self.inner.tabs
    }

    /// Waits until the browser knows about at least one tab and returns the first one.
    ///
    /// Chrome always launches with at least one tab, but information about that tab isn't
    /// available *immediately* after starting the process, hence the wait. Tabs are behind
    /// `Arc`s because they each have their own thread which handles events and method responses
    /// directed to them.
    ///
    /// Wait timeout: 600 secs (10 minutes)
    ///
    /// # Errors
    ///
    /// Returns the wait's timeout error if no tab shows up in time.
    pub fn wait_for_initial_tab(&self) -> Result<Arc<Tab>> {
        let first_known_tab = || {
            let tabs = self.inner.tabs.lock().unwrap();
            tabs.first().cloned()
        };

        let waiter = util::Wait::with_timeout(Duration::from_secs(600));
        waiter.until(first_known_tab).map_err(Into::into)
    }

    /// Opens a new tab at `about:blank` and returns a handle to it.
    ///
    /// All optional creation parameters are left unset. If you want to specify its starting
    /// options, see `new_tab_with_options`.
    ///
    /// ```rust
    /// # use anyhow::Result;
    /// # fn main() -> Result<()> {
    /// #
    /// # use headless_chrome::Browser;
    /// # let browser = Browser::default()?;
    /// let first_tab = browser.wait_for_initial_tab()?;
    /// let new_tab = browser.new_tab()?;
    /// let num_tabs = browser.get_tabs().lock().unwrap().len();
    /// assert_eq!(2, num_tabs);
    /// #
    /// # Ok(())
    /// # }
    /// ```
    pub fn new_tab(&self) -> Result<Arc<Tab>> {
        self.new_tab_with_options(CreateTarget {
            url: String::from("about:blank"),
            width: None,
            height: None,
            browser_context_id: None,
            enable_begin_frame_control: None,
            new_window: None,
            background: None,
        })
    }

    /// Creates a new tab from explicit `CreateTarget` parameters (starting url, height / width,
    /// context ID, 'frame control', ...) and returns a handle to it.
    ///
    /// After the browser acknowledges the request, this waits up to 20 seconds for the event
    /// loop to register a tab with the returned target id.
    ///
    /// ```rust
    /// # use anyhow::Result;
    /// # fn main() -> Result<()> {
    /// #
    /// # use headless_chrome::{Browser, protocol::cdp::Target::CreateTarget};
    /// # let browser = Browser::default()?;
    /// let new_tab = browser.new_tab_with_options(CreateTarget {
    ///     url: "chrome://version".to_string(),
    ///     width: Some(1024),
    ///     height: Some(800),
    ///     browser_context_id: None,
    ///     enable_begin_frame_control: None,
    ///     new_window: None,
    ///     background: None,
    /// })?;
    /// #
    /// # Ok(())
    /// # }
    /// ```
    ///
    /// # Errors
    ///
    /// Fails if the `CreateTarget` call fails or if the tab does not appear before the timeout.
    pub fn new_tab_with_options(&self, create_target_params: CreateTarget) -> Result<Arc<Tab>> {
        let created = self.call_method(create_target_params)?;
        let target_id = created.target_id;

        // The tab handle itself is registered by the event loop thread, so poll for it.
        let matching_tab = || {
            let tabs = self.inner.tabs.lock().unwrap();
            tabs.iter()
                .find(|tab| *tab.get_target_id() == target_id)
                .cloned()
        };

        util::Wait::with_timeout(Duration::from_secs(20))
            .until(matching_tab)
            .map_err(Into::into)
    }

    /// Creates a new browser context — the equivalent of a new incognito window — and returns a
    /// [`Context`] bound to this browser.
    ///
    /// Every optional `CreateBrowserContext` parameter is left unset.
    pub fn new_context(&self) -> Result<context::Context> {
        debug!("Creating new browser context");

        let request = Target::CreateBrowserContext {
            dispose_on_detach: None,
            proxy_server: None,
            proxy_bypass_list: None,
            origins_with_universal_network_access: None,
        };
        let context_id = self.call_method(request)?.browser_context_id;

        debug!("Created new browser context: {:?}", context_id);
        Ok(Context::new(self, context_id))
    }

    /// Asks the browser for its version information (the `Browser` domain's `GetVersion`
    /// method).
    ///
    /// ```rust
    /// # use anyhow::Result;
    /// # fn main() -> Result<()> {
    /// #
    /// # use headless_chrome::Browser;
    /// # let browser = Browser::default()?;
    /// let version_info = browser.get_version()?;
    /// println!("User-Agent is `{}`", version_info.user_agent);
    /// #
    /// # Ok(())
    /// # }
    /// ```
    pub fn get_version(&self) -> Result<GetVersionReturnObject> {
        let request = GetVersion(None);
        self.call_method(request)
    }

    fn handle_browser_level_events(
        &self,
        events_rx: mpsc::Receiver<Event>,
        process_id: Option<u32>,
        shutdown_rx: mpsc::Receiver<()>,
        idle_browser_timeout: Duration,
    ) {
        let tabs = Arc::clone(&self.inner.tabs);
        let transport = Arc::clone(&self.inner.transport);

        std::thread::spawn(move || {
            trace!("Starting browser's event handling loop");
            loop {
                match shutdown_rx.try_recv() {
                    Ok(_) | Err(TryRecvError::Disconnected) => {
                        info!("Browser event loop received shutdown message");
                        break;
                    }
                    Err(TryRecvError::Empty) => {}
                }

                match events_rx.recv_timeout(idle_browser_timeout) {
                    Err(recv_timeout_error) => {
                        match recv_timeout_error {
                            RecvTimeoutError::Timeout => {
                                error!(
                                    "Got a timeout while listening for browser events (Chrome #{:?})",
                                    process_id
                                );
                            }
                            RecvTimeoutError::Disconnected => {
                                debug!(
                                    "Browser event sender disconnected while loop was waiting (Chrome #{:?})",
                                    process_id
                                );
                            }
                        }
                        break;
                    }
                    Ok(event) => {
                        match event {
                            Event::TargetCreated(ev) => {
                                let target_info = ev.params.target_info;
                                trace!("Creating target: {:?}", target_info);
                                if target_info.Type == "page" {
                                    match Tab::new(target_info, Arc::clone(&transport)) {
                                        Ok(new_tab) => {
                                            tabs.lock().unwrap().push(Arc::new(new_tab));
                                        }
                                        Err(_tab_creation_err) => {
                                            info!("Failed to create a handle to new tab");
                                            break;
                                        }
                                    }
                                }
                            }
                            Event::TargetInfoChanged(ev) => {
                                let target_info = ev.params.target_info;
                                trace!("Target info changed: {:?}", target_info);
                                if target_info.Type == "page" {
                                    let locked_tabs = tabs.lock().unwrap();
                                    let updated_tab = locked_tabs
                                        .iter()
                                        .find(|tab| *tab.get_target_id() == target_info.target_id)
                                        .expect("got TargetInfoChanged event about a tab not in our list");
                                    updated_tab.update_target_info(target_info);
                                }
                            }
                            Event::TargetDestroyed(ev) => {
                                trace!("Target destroyed: {:?}", ev.params.target_id);
                                let mut locked_tabs = tabs.lock().unwrap();
                                let pos = locked_tabs
                                    .iter()
                                    .position(|tab| *tab.get_target_id() == ev.params.target_id);

                                if let Some(idx) = pos {
                                    locked_tabs.remove(idx);
                                }
                            }
                            _ => {
                                let mut raw_event = format!("{:?}", event);
                                raw_event.truncate(50);
                                trace!("Unhandled event: {}", raw_event);
                            }
                        }
                    }
                }
            }
            info!("Finished browser's event handling loop");
        });
    }

    /// Sends `method` to the browser itself (as opposed to a specific tab) and returns its
    /// typed response.
    ///
    /// See the `cdp` module documentation for available methods.
    fn call_method<C>(&self, method: C) -> Result<C::ReturnObject>
    where
        C: Method + serde::Serialize,
    {
        let transport = &self.inner.transport;
        transport.call_method_on_browser(method)
    }

    /// Test-only accessor for the Chrome process owned by this browser.
    ///
    /// Returns `None` when the browser does not manage a process.
    #[allow(dead_code)]
    #[cfg(test)]
    pub(crate) fn process(&self) -> Option<&Process> {
        #[allow(clippy::used_underscore_binding)]
        self.inner.process.as_ref()
    }
}

/// Tears the browser down once the last [`Browser`] handle is gone.
///
/// `BrowserInner` lives inside the `Arc` held by every [`Browser`], so it is dropped only when
/// there are no more references to it.
impl Drop for BrowserInner {
    fn drop(&mut self) {
        info!("Dropping browser");
        // Ask the event loop thread to stop. The result is deliberately ignored: a send error
        // only means the receiving end (and therefore the loop) is already gone.
        let _ = self.loop_shutdown_tx.send(());
        self.transport.shutdown();
    }
}

/// Returns the path to a Chrome-compatible browser executable.
///
/// The lookup proceeds in this order and returns the first hit:
///
/// 1. The path in the `CHROME` environment variable, if it is set and the path exists.
/// 2. The first of these names that can be resolved on the PATH: `google-chrome-stable`,
///    `google-chrome-beta`, `google-chrome-dev`, `google-chrome-unstable`, `chromium`,
///    `chromium-browser`, `microsoft-edge-stable`, `microsoft-edge-beta`, `microsoft-edge-dev`,
///    `chrome`, `chrome-browser`, `msedge` and `microsoft-edge`.
/// 3. On macOS, the well-known `/Applications/...` bundles of Chrome, Chromium and Edge.
/// 4. On Windows, the Chrome path stored in the registry, followed by the default Edge
///    install location.
///
/// # Errors
///
/// Returns a descriptive message if none of the above yields an existing executable.
pub fn default_executable() -> Result<std::path::PathBuf, String> {
    // `var_os` rather than `var`, so that a path which is not valid Unicode is still honoured.
    if let Some(path) = std::env::var_os("CHROME").map(std::path::PathBuf::from) {
        if path.exists() {
            return Ok(path);
        }
    }

    for app in &[
        "google-chrome-stable",
        "google-chrome-beta",
        "google-chrome-dev",
        "google-chrome-unstable",
        "chromium",
        "chromium-browser",
        "microsoft-edge-stable",
        "microsoft-edge-beta",
        "microsoft-edge-dev",
        "chrome",
        "chrome-browser",
        "msedge",
        "microsoft-edge",
    ] {
        if let Ok(path) = which(app) {
            return Ok(path);
        }
    }

    #[cfg(target_os = "macos")]
    {
        for path in &[
            "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
            "/Applications/Google Chrome Beta.app/Contents/MacOS/Google Chrome Beta",
            "/Applications/Google Chrome Dev.app/Contents/MacOS/Google Chrome Dev",
            "/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary",
            "/Applications/Chromium.app/Contents/MacOS/Chromium",
            "/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge",
            "/Applications/Microsoft Edge Beta.app/Contents/MacOS/Microsoft Edge Beta",
            "/Applications/Microsoft Edge Dev.app/Contents/MacOS/Microsoft Edge Dev",
            "/Applications/Microsoft Edge Canary.app/Contents/MacOS/Microsoft Edge Canary",
        ][..]
        {
            if std::path::Path::new(path).exists() {
                return Ok(path.into());
            }
        }
    }

    #[cfg(windows)]
    {
        use crate::browser::process::get_chrome_path_from_registry;

        if let Some(path) = get_chrome_path_from_registry() {
            if path.exists() {
                return Ok(path);
            }
        }

        // Tried whether or not the registry knew about Chrome: a machine with only Edge
        // installed has no Chrome registry entry at all.
        for path in &[r"C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe"][..] {
            if std::path::Path::new(path).exists() {
                return Ok(path.into());
            }
        }
    }

    Err("Could not auto detect a chrome executable".to_string())
}

#[cfg(test)]
mod test {
    use super::Browser;

    /// Compile-time probe: instantiating this only type-checks when `T` is `Sync`.
    fn is_sync<T: Sync>() {}

    /// `Browser` handles are shared across threads, so the type must stay `Sync`.
    #[test]
    fn test_if_browser_is_sync() {
        is_sync::<Browser>();
    }
}