parsoid 0.9.1

Wrapper around Parsoid HTML that provides convenient accessors for processing and manipulation
Documentation
/*
Copyright (C) 2020-2021 Kunal Mehta <legoktm@debian.org>

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

use crate::immutable::ImmutableWikicode;
use crate::private::Sealed;
use crate::{Error, Result, Wikicode};
use lazy_static::lazy_static;
use reqwest::header::HeaderMap;
use reqwest::{header, Client as HttpClient, Response};
use std::fmt::Write as _;
use std::sync::Arc;
use tokio::sync::Semaphore;
use tracing::debug;
use urlencoding::encode;

/// Version of library embedded in user-agent
const VERSION: &str = env!("CARGO_PKG_VERSION");
/// `Accept` header for [content negotiation](https://www.mediawiki.org/wiki/Parsoid/API#Content_Negotiation)
const ACCEPT_2_8_0: &str = "text/html; charset=utf-8; profile=\"https://www.mediawiki.org/wiki/Specs/HTML/2.8.0\"";

/// Common accessor interface that lets `transform_to_wikitext` accept both
/// `Wikicode` and `ImmutableWikicode`.
///
/// NOTE(review): the `Sealed` supertrait comes from `crate::private`;
/// presumably it prevents implementations outside this crate -- confirm
/// against that module.
pub trait APICode: Sealed {
    /// The HTML to be converted, as an owned `String`.
    fn html(&self) -> String;
    /// The page title, if one is known.
    fn title(&self) -> Option<String>;
    /// The ETag associated with the HTML, if any.
    fn etag(&self) -> Option<&str>;
    /// The revision ID associated with the HTML, if any.
    fn revid(&self) -> Option<u64>;
}

// Adapts `ImmutableWikicode` to the `APICode` accessors.
//
// NOTE(review): the `self.html()`, `self.etag()` and `self.revision_id()`
// calls below are expected to resolve to accessors defined on
// `ImmutableWikicode` itself (not visible in this file); if they resolved to
// these trait methods instead they would recurse -- confirm.
impl APICode for ImmutableWikicode {
    fn html(&self) -> String {
        // Copy the HTML into an owned `String`, as the trait requires.
        self.html().to_string()
    }

    fn title(&self) -> Option<String> {
        // Clone of the `title` field.
        self.title.clone()
    }

    fn etag(&self) -> Option<&str> {
        self.etag()
    }

    fn revid(&self) -> Option<u64> {
        self.revision_id()
    }
}

// Adapts `Wikicode` to the `APICode` accessors.
impl APICode for Wikicode {
    fn html(&self) -> String {
        // NOTE(review): presumably `to_string()` serializes the document
        // back to HTML -- confirm against the `Wikicode` definition.
        self.to_string()
    }

    fn title(&self) -> Option<String> {
        // NOTE(review): expected to resolve to a `title()` accessor defined
        // elsewhere for `Wikicode`, not to this trait method -- confirm.
        self.title()
    }

    fn etag(&self) -> Option<&str> {
        // Borrow the `etag` field as a `&str` without cloning it.
        self.etag.as_deref()
    }

    fn revid(&self) -> Option<u64> {
        self.revision_id()
    }
}

/// HTTP client to get Parsoid HTML from MediaWiki's REST APIs
///
/// Cloning a `Client` is cheap with respect to the semaphore: it is held in
/// an `Arc`, so all clones share the same concurrency limit.
///
/// Note: This requires that the `http` feature be enabled (it is by default).
#[derive(Clone, Debug)]
pub struct Client {
    // Underlying `reqwest` client used to build and execute every request.
    http: HttpClient,
    // Endpoint prefix that every request URL is appended to.
    base_url: String,
    // A permit is acquired from this semaphore before each request is
    // executed, bounding the number of requests in flight at once.
    semaphore: Arc<Semaphore>,
}

impl Client {
    /// Build a `Client` that talks to the REST endpoint at `base_url`.
    ///
    /// `base_url` must point either at `rest.php` or at Restbase. For
    /// Wikimedia projects it looks something like
    /// `https://en.wikipedia.org/api/rest_v1`; for other wikis it might be
    /// `https://wiki.example.org/rest.php/wiki.example.org/v3`.
    /// Neither endpoint style takes a trailing slash.
    ///
    /// `user_agent` is appended to a `parsoid-rs/<version>` prefix. On
    /// `wasm32` targets it is sent as the `Api-User-Agent` header; on all
    /// other targets it is set as the client's user agent.
    ///
    /// # Errors
    ///
    /// Fails if the underlying HTTP client cannot be built or, on `wasm32`,
    /// if the composed user agent is not a valid header value.
    pub fn new(base_url: &str, user_agent: &str) -> Result<Self> {
        let ua = format!("parsoid-rs/{VERSION} {user_agent}");
        let builder = HttpClient::builder();

        #[cfg(target_arch = "wasm32")]
        let builder = {
            let mut headers = header::HeaderMap::new();
            headers
                .insert("Api-User-Agent", header::HeaderValue::from_str(&ua)?);
            builder.default_headers(headers)
        };

        #[cfg(not(target_arch = "wasm32"))]
        let builder = builder.user_agent(ua);

        let http = builder.build()?;

        // rest.php is treated like api.php: one request at a time.
        // Anything else is most likely Restbase, which allows for 200
        // requests/second. Requests per second are not measured here, so
        // this lowballs to 100 concurrent requests and assumes that every
        // request takes longer than 0.5 seconds.
        let concurrency = if base_url.contains("rest.php") {
            1
        } else {
            100
        };

        Ok(Client {
            http,
            base_url: base_url.to_string(),
            semaphore: Arc::new(Semaphore::new(concurrency)),
        })
    }

    /// Build a `Client` on top of an already-configured `reqwest::Client`.
    ///
    /// See `new()` for what `base_url` should look like. This constructor is
    /// mainly useful when you already make calls to the wiki and want to
    /// share connection pools and cookie state.
    ///
    /// Unlike `new()`, the concurrency limit does not depend on `base_url`:
    /// it is always 10 simultaneous requests.
    pub fn new_with_client(base_url: &str, http: HttpClient) -> Self {
        let semaphore = Arc::new(Semaphore::new(10));
        let base_url = base_url.to_owned();
        Client {
            http,
            base_url,
            semaphore,
        }
    }

    /// Headers attached to every request: currently only the `Accept`
    /// header used for content negotiation (`ACCEPT_2_8_0`).
    ///
    /// The map is built once on first use and a fresh clone is handed out on
    /// each call so that callers may add request-specific headers to it.
    fn default_headers(&self) -> HeaderMap {
        lazy_static! {
            static ref HEADERMAP: HeaderMap = {
                let accept = ACCEPT_2_8_0
                    .parse::<header::HeaderValue>()
                    .expect("Unable to parse Accept header");
                let mut headers = header::HeaderMap::new();
                headers.insert(header::ACCEPT, accept);
                headers
            };
        }

        let shared: &HeaderMap = &HEADERMAP;
        shared.clone()
    }

    /// Helper to get a page's HTML
    async fn page_html(
        &self,
        page: &str,
        revid: Option<u64>,
    ) -> Result<Response> {
        let url_part = format!("{}/page/html/{}", self.base_url, encode(page));
        let url = if let Some(revid) = revid {
            format!("{url_part}/{revid}")
        } else {
            url_part
        };
        let url = format!("{url}?redirect=false");
        let req = self
            .http
            .get(&url)
            .headers(self.default_headers())
            .build()?;
        let _lock = self.semaphore.acquire().await?;
        debug!(?req);
        // TODO: improve error handling
        let resp = self.http.execute(req).await?;
        debug!(?resp);
        drop(_lock);
        if resp.status() == 404 {
            Err(Error::PageDoesNotExist(page.to_string()))
        } else {
            Ok(resp.error_for_status()?)
        }
    }

    /// Get a `Wikicode` instance for the specified page
    pub async fn get(&self, page: &str) -> Result<ImmutableWikicode> {
        let resp = self.page_html(page, None).await?;
        let etag = match &resp.headers().get("etag") {
            Some(etag) => match etag.to_str() {
                Ok(etag) => etag.to_string(),
                Err(_) => return Err(Error::InvalidEtag),
            },
            None => return Err(Error::InvalidEtag),
        };
        // We go through Wikicode -> ImmutableWikicode to parse the revid out
        // of the HTML. TODO: there's probably a better way.
        let code = {
            let mut code = Wikicode::new(&resp.text().await?);
            code.etag = Some(etag);
            code.title = Some(page.to_string());
            code.into_immutable()
        };
        Ok(code)
    }

    /// Get a `Wikicode` instance for the specified page at the specified revision
    pub async fn get_revision(
        &self,
        page: &str,
        revid: u64,
    ) -> Result<ImmutableWikicode> {
        let resp = self.page_html(page, Some(revid)).await?;
        let etag = match &resp.headers().get("etag") {
            Some(etag) => match etag.to_str() {
                Ok(etag) => etag.to_string(),
                Err(_) => return Err(Error::InvalidEtag),
            },
            None => return Err(Error::InvalidEtag),
        };
        Ok(ImmutableWikicode {
            html: resp.text().await?,
            title: Some(page.to_string()),
            etag: Some(etag),
            revid: Some(revid),
        })
    }

    /// Fetch the Parsoid HTML for the specified page as a plain `String`.
    pub async fn get_raw(&self, page: &str) -> Result<String> {
        let resp = self.page_html(page, None).await?;
        let html = resp.text().await?;
        Ok(html)
    }

    /// Fetch the Parsoid HTML for the specified page at the specified
    /// revision as a plain `String`.
    pub async fn get_revision_raw(
        &self,
        page: &str,
        revid: u64,
    ) -> Result<String> {
        let resp = self.page_html(page, Some(revid)).await?;
        let html = resp.text().await?;
        Ok(html)
    }

    /// Convert the specified wikitext to Parsoid HTML and wrap the result in
    /// an `ImmutableWikicode`.
    pub async fn transform_to_html(
        &self,
        wikitext: &str,
    ) -> Result<ImmutableWikicode> {
        self.transform_to_html_raw(wikitext)
            .await
            .map(|html| ImmutableWikicode::new(&html))
    }

    /// Convert the specified wikitext to Parsoid HTML, returned as a plain
    /// `String`.
    ///
    /// The wikitext is posted as the `wikitext` form field to the
    /// `transform/wikitext/to/html` endpoint.
    pub async fn transform_to_html_raw(
        &self,
        wikitext: &str,
    ) -> Result<String> {
        let endpoint =
            format!("{}/transform/wikitext/to/html", self.base_url);
        let req = self
            .http
            .post(&endpoint)
            .headers(self.default_headers())
            .form(&[("wikitext", wikitext)])
            .build()?;

        let resp = {
            // The permit is held only while the request is executed and is
            // released when this scope ends.
            let _permit = self.semaphore.acquire().await?;
            debug!(?req);
            let resp = self.http.execute(req).await?;
            debug!(?resp);
            resp
        };

        let resp = resp.error_for_status()?;
        Ok(resp.text().await?)
    }

    /// Convert a `Wikicode` or `ImmutableWikicode` back to wikitext.
    ///
    /// The HTML, title, revision ID and ETag are read from `code` and
    /// forwarded to `transform_to_wikitext_raw`.
    pub async fn transform_to_wikitext<C: APICode>(
        &self,
        code: &C,
    ) -> Result<String> {
        let html = code.html();
        let title = code.title();
        self.transform_to_wikitext_raw(
            &html,
            title.as_deref(),
            code.revid(),
            code.etag(),
        )
        .await
    }

    /// Get the wikitext for the specified Parsoid HTML
    pub async fn transform_to_wikitext_raw(
        &self,
        html: &str,
        title: Option<&str>,
        revid: Option<u64>,
        etag: Option<&str>,
    ) -> Result<String> {
        let mut url = format!("{}/transform/html/to/wikitext", self.base_url);
        if let Some(title) = title {
            let _ = write!(url, "/{}", encode(title));
            if let Some(revid) = revid {
                let _ = write!(url, "/{revid}");
            }
        }
        let mut header_map = self.default_headers();
        if let Some(etag) = etag {
            header_map.insert(header::IF_MATCH, etag.parse().unwrap());
        }

        let req = self
            .http
            .post(&url)
            .form(&[("html", html)])
            .headers(header_map)
            .build()?;
        let _lock = self.semaphore.acquire().await?;
        debug!(?req);
        let resp = self.http.execute(req).await?;
        debug!(?resp);
        drop(_lock);
        let wikitext = resp.error_for_status()?.text().await?;
        Ok(wikitext)
    }
}