//! # mwtitle
//!
//! MediaWiki title validation and formatting.
/*
Copyright (C) Tim Starling
Copyright (C) Daniel Kinzler
Copyright (C) 2021 Kunal Mehta <legoktm@debian.org>
Copyright (C) 2021 Erutuon

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
use crate::ip::sanitize_ip;
#[cfg(feature = "utils")]
#[cfg_attr(docs, doc(cfg(feature = "utils")))]
use crate::SiteInfoResponse;
use crate::{
    php, Error, Interwiki, InterwikiSet, NamespaceAlias, NamespaceInfo,
    NamespaceMap, Result, SiteInfo, Title, NS_MAIN,
};
#[cfg(feature = "utils")]
#[cfg_attr(docs, doc(cfg(feature = "utils")))]
use flate2::read::GzDecoder;
use regex::bytes::Regex;
#[cfg(feature = "utils")]
#[cfg_attr(docs, doc(cfg(feature = "utils")))]
use std::{fs::File, io::Read, path::Path, sync::Arc};

// Well-known MediaWiki namespace IDs, named for readability.
// `NS_MAIN` (0) is imported from the crate root.
const NS_SPECIAL: i32 = -1; // Special pages; exempt from the 255-byte length limit
const NS_TALK: i32 = 1; // Talk namespace; "Talk:Ns:X" titles are rejected
const NS_USER: i32 = 2; // User pages; dbkeys get IP-address sanitization
const NS_USER_TALK: i32 = 3; // User talk pages; dbkeys get IP-address sanitization

/// The `TitleCodec` is responsible for parsing, normalizing and formatting
/// `Title`s. See the crate-level documentation for an example of how to
/// construct one.
#[cfg_attr(docs, doc(cfg(feature = "parsing")))]
#[derive(Clone, Debug)]
pub struct TitleCodec {
    // Maps namespace names/aliases to IDs and back.
    namespace_map: NamespaceMap,
    // Every recognized interwiki prefix.
    interwiki_set: InterwikiSet,
    // The subset of interwiki prefixes that point back at this wiki itself.
    local_interwiki_set: InterwikiSet,
    // Title of the wiki's main page; target of empty local-interwiki links.
    main_page: String,
    // Content-language code, used to pick capitalization rules.
    lang: String,
    // Matches any byte sequence that is forbidden in a title.
    illegal_patterns: Regex,
}

#[test]
fn title_codec_is_send_and_sync() {
    // Compile-time check: a `TitleCodec` can be shared across threads.
    fn require_thread_safe<T: Send + Sync>() {}
    require_thread_safe::<TitleCodec>();
}

impl TitleCodec {
    /// Create a new title by parsing the provided input.
    ///
    /// Input with no namespace part is parsed into the main namespace
    /// (`NS_MAIN`); use [`Self::new_title_with_namespace`] to supply a
    /// different default.
    pub fn new_title(&self, input: &str) -> Result<Title> {
        self.secure_and_split(input, NS_MAIN)
    }

    /// Create a new title by parsing the provided input. If the title has no
    /// namespace part, then the namespace specified by `default_namespace` is
    /// used instead.
    pub fn new_title_with_namespace(
        &self,
        input: &str,
        default_namespace: i32,
    ) -> Result<Title> {
        self.secure_and_split(input, default_namespace)
    }

    /// Get the title with namespace in pretty aka text form (spaces).
    ///
    /// Fragments will not be included.
    ///
    /// # Panics
    ///
    /// This will panic if the `Title` is in a namespace that this `TitleCodec`
    /// is unaware of.
    pub fn to_pretty(&self, title: &Title) -> String {
        self.namespace_map
            .to_pretty(title)
            .expect("unknown namespace")
    }

    /// Get the title with namespace in underscore aka dbkey form. This is
    /// potentially useful when you want to make a database query.
    ///
    /// Fragments will not be included.
    ///
    /// # Panics
    ///
    /// This will panic if the `Title` is in a namespace that this `TitleCodec`
    /// is unaware of.
    pub fn to_underscores(&self, title: &Title) -> String {
        self.namespace_map
            .to_underscores(title)
            .expect("unknown namespace")
    }

    /// Get the title with namespace in pretty aka text form (spaces), with the
    /// fragment, if one exists, appended.
    ///
    /// # Panics
    ///
    /// This will panic if the `Title` is in a namespace that this `TitleCodec`
    /// is unaware of.
    pub fn to_pretty_with_fragment(&self, title: &Title) -> String {
        self.namespace_map
            .to_pretty_with_fragment(title)
            .expect("unknown namespace")
    }

    /// Construct a new `TitleCodec` using the given fields.
    ///
    /// In most cases it is easier to do so from one of the siteinfo methods.
    ///
    /// # Errors
    ///
    /// Fails if `legal_title_chars` yields an invalid regular expression.
    pub fn new(
        namespace_map: NamespaceMap,
        interwiki_set: InterwikiSet,
        local_interwiki_set: InterwikiSet,
        main_page: String,
        lang: String,
        legal_title_chars: String,
    ) -> Result<Self> {
        // Copied from `MediaWikiTitleCodec::getTitleInvalidRegex()`.
        // The `legal_title_chars` portion has to be changed when this lands:
        // https://phabricator.wikimedia.org/T297340
        // Matching titles will be held as illegal.
        let illegal_patterns = Regex::new(&format!(
            r"(?x-u)
                # x: ignore whitespace and allow comments;
                # -u: disable code point matching
                # so that \x80-\xff match bytes 0x80-0xFF
                # (corresponding to all non-ASCII code points, U+0080-U+10FFFF)
                # rather than code points U+0080-U+00FF.
                    # Any character not allowed is forbidden...
                    [^{legal_title_chars}]

                    # URL percent encoding sequences interfere with the ability
                    # to round-trip titles -- you can't link to them consistently.
                    | %[0-9A-Fa-f]{{2}}

                    # XML/HTML character references produce similar issues.
                    | &[A-Za-z0-9\x80-\xff]+;
                ",
            // / does not need to be escaped as \/ in Rust regex.
            legal_title_chars = legal_title_chars.replace(r"\/", "/")
        ))?;

        Ok(Self {
            namespace_map,
            interwiki_set,
            local_interwiki_set,

            illegal_patterns,
            main_page,
            lang,
        })
    }

    /// Create a new `TitleCodec` getting namespaces, namespace aliases, and interwikis from iterators.
    ///
    /// # Errors
    ///
    /// Propagates failures from building the namespace map and from
    /// [`Self::new`].
    pub fn new_from_iters<
        N: IntoIterator<Item = NamespaceInfo>,
        A: IntoIterator<Item = NamespaceAlias>,
        I: IntoIterator<Item = Interwiki>,
    >(
        namespaces: N,
        namespace_aliases: A,
        interwikis: I,
        main_page: String,
        lang: String,
        legal_title_chars: String,
    ) -> Result<Self> {
        let (interwiki_set, local_interwiki_set) =
            InterwikiSet::all_and_local_from_iter(interwikis);
        let namespace_map =
            NamespaceMap::from_namespaces_and_namespace_aliases(
                namespaces,
                namespace_aliases,
            )?;
        Self::new(
            namespace_map,
            interwiki_set,
            local_interwiki_set,
            main_page,
            lang,
            legal_title_chars,
        )
    }

    /// Creates a `TitleCodec` by parsing the contents of a JSON or GZipped JSON file.
    ///
    /// Will accept the `siteinfo-namespaces.json.gz` file from the Wikimedia dumps.
    /// If the file extension is `gz`, decompresses from the GZip format before deserializing the JSON;
    /// otherwise attempts to deserialize the file contents directly.
    #[cfg(feature = "utils")]
    #[cfg_attr(docs, doc(cfg(feature = "utils")))]
    pub fn from_path(path: &Path) -> Result<Self> {
        let json = if path.extension() == Some("gz".as_ref()) {
            let gz = File::open(path)
                .map_err(|source| Error::from_io("open file", source, path))?;
            let mut decoder = GzDecoder::new(gz);
            let mut decoded = String::new();
            decoder
                .read_to_string(&mut decoded)
                .map_err(|source| Error::from_io("parse GZip", source, path))?;
            decoded
        } else {
            std::fs::read_to_string(path).map_err(|source| {
                Error::from_io("read file to string", source, path)
            })?
        };
        // Pass the path along so JSON errors can mention the offending file.
        Self::from_json_with_path(&json, Some(path))
    }

    /// Creates a `TitleCodec` by parsing the contents of a `Read` type that contains the JSON
    /// representation of a [`SiteInfoResponse`].
    #[cfg(feature = "utils")]
    #[cfg_attr(docs, doc(cfg(feature = "utils")))]
    pub fn from_reader<R: Read>(reader: R) -> Result<Self> {
        Self::from_site_info(
            serde_json::from_reader::<R, SiteInfoResponse>(reader)
                .map_err(|source| Error::Json {
                    source: Arc::new(source),
                })?
                .query,
        )
    }

    /// Creates a `TitleCodec` by parsing the JSON representation of a [`SiteInfoResponse`].
    #[cfg(feature = "utils")]
    #[cfg_attr(docs, doc(cfg(feature = "utils")))]
    pub fn from_json<S: AsRef<str>>(json: S) -> Result<Self> {
        Self::from_json_with_path(json.as_ref(), None)
    }

    /// Creates a `TitleCodec` by parsing the JSON representation of a [`SiteInfoResponse`].
    ///
    /// # Errors
    ///
    /// If this fails and `path` is `Some(_)`, gives an error message
    /// that mentions `path`.
    #[cfg(feature = "utils")]
    #[cfg_attr(docs, doc(cfg(feature = "utils")))]
    fn from_json_with_path(json: &str, path: Option<&Path>) -> Result<Self> {
        Self::from_site_info(
            serde_json::from_str::<SiteInfoResponse>(json)
                .map_err(|source| {
                    let source = Arc::new(source);
                    if let Some(path) = path {
                        Error::JsonFile {
                            source,
                            path: path.into(),
                        }
                    } else {
                        Error::Json { source }
                    }
                })?
                .query,
        )
    }

    /// Create a new `TitleCodec` using the provided [`SiteInfo`].
    ///
    /// The `SiteInfo` must include a non-empty `interwiki_map` field
    /// to enable the resulting `TitleCodec`
    /// to correctly parse titles with interwikis,
    /// but an empty `interwiki_map` is not an error.
    pub fn from_site_info(site_info: SiteInfo) -> Result<Self> {
        Self::new_from_iters(
            site_info.namespaces.into_values(),
            site_info.namespace_aliases,
            site_info.interwiki_map,
            site_info.general.main_page,
            site_info.general.lang,
            site_info.general.legal_title_chars,
        )
    }

    /// Equivalent of `MediaWikiTitleCodec::splitTitleString()`.
    ///
    /// Splits `input` into namespace, interwiki, dbkey and fragment parts,
    /// applying MediaWiki's normalization and validation rules along the way.
    ///
    /// Most comments are direct copies to make it easier to compare with
    /// the MediaWiki implementation.
    fn secure_and_split(
        &self,
        input: &str,
        default_namespace: i32,
    ) -> Result<Title> {
        let mut namespace = default_namespace;
        // Strip Unicode bidi override characters.
        // Clean up whitespace.
        let mut dbkey = normalize_title_chars(input);
        let mut fragment = None;
        let mut interwiki = None;
        let mut local_interwiki = false;

        // U+FFFD is the replacement character
        if dbkey.contains('\u{FFFD}') {
            // Contained illegal UTF-8 sequences or forbidden Unicode chars.
            return Err(Error::IllegalUtf8(input.to_string()));
        }
        // Unlike MediaWiki, no separate invalid-UTF-8 check is needed, because
        // all Rust strings are valid UTF-8; the check above only rejects input
        // that already contains the replacement character.

        // Initial colon indicates main namespace rather than specified default
        // but should not create invalid {ns,title} pairs such as {0,Project:Foo}
        if dbkey.get(0..1) == Some(":") {
            namespace = NS_MAIN;
            // remove the colon but continue processing
            dbkey.drain(..1);
            // remove any subsequent whitespace
            trim_title_whitespace(&mut dbkey);
        }
        if dbkey.is_empty() {
            return Err(Error::Empty(input.to_string()));
        }

        /// Returns `s[..range_to.end]` with trailing underscores trimmed,
        /// or `None` if that slice is empty or the range does not fall on a
        /// `char` boundary.
        fn get_nonempty_trimmed(
            s: &str,
            range_to: std::ops::RangeTo<usize>,
        ) -> Option<&str> {
            s.get(range_to)
                .filter(|p| !p.is_empty())
                .map(|s| s.trim_end_matches('_'))
        }

        // Namespace or interwiki prefix
        // `MediaWikiTitleCodec` uses a regex here, but we're going to use string
        // parsing instead.
        // The loop body runs once; the unconditional `break` at the bottom is
        // only skipped by the `continue` used to re-split after stripping a
        // local interwiki prefix.
        loop {
            if let Some(colon_pos) = dbkey.find(':') {
                if let Some(prefix) = get_nonempty_trimmed(&dbkey, ..colon_pos)
                {
                    if let Some(ns) = self.namespace_map.get_id(prefix) {
                        // Ordinary namespace
                        namespace = ns;
                        dbkey.drain(..colon_pos + 1);
                        trim_title_whitespace(&mut dbkey);
                        // For Talk:X pages, check if X has a "namespace" prefix
                        if ns == NS_TALK {
                            if let Some(colon_pos) = dbkey.find(':') {
                                // Disallow Talk:File:x or Talk:Interwiki:x type titles ...
                                if let Some(prefix) =
                                    get_nonempty_trimmed(&dbkey, ..colon_pos)
                                {
                                    if self
                                        .namespace_map
                                        .get_id(prefix)
                                        .is_some()
                                        || self.interwiki_set.contains(prefix)
                                    {
                                        return Err(Error::TalkNamespace(
                                            input.to_string(),
                                        ));
                                    }
                                }
                            }
                        }
                    } else if self.interwiki_set.contains(prefix) {
                        // Check this using prefix before we mutably borrow dbkey
                        let is_local_interwiki =
                            self.local_interwiki_set.contains(prefix);
                        interwiki = Some(prefix.to_lowercase());
                        dbkey.drain(..colon_pos + 1);
                        trim_title_whitespace(&mut dbkey);

                        if is_local_interwiki {
                            if dbkey.is_empty() {
                                // Empty self-links should point to the Main Page, to ensure
                                // compatibility with cross-wiki transclusions and the like.
                                return Ok(self
                                    .new_title(&self.main_page)
                                    .map(|mut title| {
                                        title.local_interwiki = true;
                                        title
                                    })
                                    .unwrap_or_else(|_| {
                                        // Fallback to hardcoded "Main Page" if the configured main page
                                        // value is unparseable
                                        Title {
                                            namespace: NS_MAIN,
                                            dbkey: "Main_Page".to_string(),
                                            fragment: None,
                                            interwiki: None,
                                            local_interwiki: true,
                                        }
                                    }));
                            }
                            interwiki = None;
                            // local interwikis should behave like initial-colon links
                            local_interwiki = true;

                            // Do another namespace split...
                            continue;
                        }

                        // If there's an initial colon after the interwiki, that also
                        // resets the default namespace
                        if dbkey.starts_with(':') {
                            namespace = NS_MAIN;
                            dbkey.drain(..1);
                            trim_title_whitespace(&mut dbkey);
                        }
                    }
                }
            }
            // If there's no recognized interwiki or namespace,
            // then let the colon expression be part of the title.
            break;
        }

        // Split off the fragment ("#..."), if any; underscores in the fragment
        // are displayed as spaces.
        if let Some((key, f)) = dbkey.split_once('#') {
            fragment = Some(f.replace('_', " "));
            let key_len = key.len(); // to satisfy borrow checker
            dbkey.truncate(key_len);
            // remove whitespace again: prevents "Foo_bar_#"
            // becoming "Foo_bar_"
            trim_title_whitespace(&mut dbkey);
        }

        // Reject illegal characters.
        if self.illegal_patterns.is_match(dbkey.as_bytes()) {
            return Err(Error::Characters(input.to_string()));
        }

        // Pages with "/./" or "/../" appearing in the URLs will often be un-
        // reachable due to the way web browsers deal with 'relative' URLs.
        // Also, they conflict with subpage syntax.  Forbid them explicitly.
        if dbkey == "."
            || dbkey == ".."
            || dbkey.starts_with("./")
            || dbkey.starts_with("../")
            || dbkey.contains("/./")
            || dbkey.contains("/../")
            || dbkey.ends_with("/.")
            || dbkey.ends_with("/..")
        {
            return Err(Error::Relative(input.to_string()));
        }

        // Magic tilde sequences? Nu-uh!
        if dbkey.contains("~~~") {
            return Err(Error::MagicTildes(input.to_string()));
        }

        // Limit the size of titles to 255 bytes. This is typically the size of the
        // underlying database field. We make an exception for special pages, which
        // don't need to be stored in the database, and may edge over 255 bytes due
        // to subpage syntax for long titles, e.g. [[Special:Block/Long name]]
        let max_length = if namespace == NS_SPECIAL { 512 } else { 255 };
        if dbkey.len() > max_length {
            return Err(Error::TooLong(input.to_string()));
        }

        // Normally, all wiki links are forced to have an initial capital letter so [[foo]]
        // and [[Foo]] point to the same place.  Don't force it for interwikis, since the
        // other site might be case-sensitive.
        if interwiki.is_none()
            && self
                .namespace_map
                .is_capitalized(namespace)
                .unwrap_or(false)
        {
            uppercase_first(&self.lang, &mut dbkey);
        }

        // Can't make a link to a namespace alone... "empty" local links can only be
        // self-links with a fragment identifier.
        // MediaWiki allows for links with just a fragment, but we won't.
        if dbkey.is_empty() && interwiki.is_none() && namespace != NS_MAIN {
            return Err(Error::Empty(input.to_string()));
        }

        if namespace == NS_USER || namespace == NS_USER_TALK {
            sanitize_ip(&mut dbkey);
        }

        // Any remaining initial :s are illegal.
        if dbkey.starts_with(':') {
            return Err(Error::LeadingColon(input.to_string()));
        }

        Ok(Title {
            namespace,
            dbkey,
            fragment,
            interwiki,
            local_interwiki,
        })
    }
}

/// Indicates whether a code point is considered whitespace when it is found in a title.
///
/// Includes all code points with the White_Space property
/// (see [PropList.txt](https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt)),
/// but excludes the control characters
/// U+009-U+00D (tab, newline, vertical tab, form feed, carriage return)
/// and U+0085 (next line), and adds U+180E (MONGOLIAN VOWEL SEPARATOR),
/// a format character (General Category: Cf).
/// The control characters U+009-U+00D are rejected
/// by the `illegal_patterns` regex;
/// U+0085 is accepted as a valid character.
#[rustfmt::skip]
fn is_title_whitespace(c: char) -> bool {
    match c {
        // U+0020 SPACE, U+005F LOW LINE
        ' ' | '_' => true,
        // U+00A0 NO-BREAK SPACE, U+1680 OGHAM SPACE MARK,
        // U+180E MONGOLIAN VOWEL SEPARATOR
        '\u{A0}' | '\u{1680}' | '\u{180E}' => true,
        // U+2000-U+200A: EN QUAD, EM QUAD, EN SPACE, EM SPACE,
        // THREE-PER-EM SPACE, FOUR-PER-EM SPACE, SIX-PER-EM SPACE,
        // FIGURE SPACE, PUNCTUATION SPACE, THIN SPACE, HAIR SPACE
        '\u{2000}'..='\u{200A}' => true,
        // U+2028 LINE SEPARATOR, U+2029 PARAGRAPH SEPARATOR,
        // U+202F NARROW NO-BREAK SPACE, U+205F MEDIUM MATHEMATICAL SPACE,
        // U+3000 IDEOGRAPHIC SPACE
        '\u{2028}' | '\u{2029}' | '\u{202F}' | '\u{205F}' | '\u{3000}' => true,
        _ => false,
    }
}

/// Indicates that a character is a directional formatting character
/// that should be removed from titles.
///
/// MediaWiki strips some [directional formatting characters](https://www.unicode.org/reports/tr9/#Directional_Formatting_Characters) from titles:
/// U+200E and U+200F (LEFT-TO-RIGHT MARK, RIGHT-TO-LEFT MARK)
/// and U+202A–U+202E (LEFT-TO-RIGHT EMBEDDING, RIGHT-TO-LEFT EMBEDDING,
/// POP DIRECTIONAL FORMATTING, LEFT-TO-RIGHT OVERRIDE, RIGHT-TO-LEFT OVERRIDE).
/// All of these were introduced in Unicode 1.1 and are referred to as
/// bidi override characters in the source code
/// of `MediaWikiTitleCodec::splitTitleString()`.
///
/// The following directional formatting characters were introduced
/// in [Unicode 6.3](https://www.unicode.org/versions/Unicode6.3.0/) (2013)
/// and are intentionally NOT stripped:
/// U+061C (ARABIC LETTER MARK)
/// and U+2066–U+2069 (LEFT‑TO‑RIGHT ISOLATE, RIGHT‑TO‑LEFT ISOLATE, FIRST STRONG ISOLATE, POP DIRECTIONAL ISOLATE).
fn is_bidirectional_override(c: char) -> bool {
    c == '\u{200E}' || c == '\u{200F}' || ('\u{202A}'..='\u{202E}').contains(&c)
}

/**
 * Normalizes characters in a title.
 *
 * Removes the banned directional formatting characters (see [`is_bidirectional_override`]),
 * strips title whitespace characters (see [`is_title_whitespace`])
 * from the beginning and end of the title,
 * and replaces sequences of one or more title whitespace characters with a single underscore.
 */
fn normalize_title_chars(title: &str) -> String {
    // This gets the minimum possible length of the normalized title.
    // It will be longer than this if there is any untrimmed whitespace.
    let mut out = String::with_capacity(
        title
            .chars()
            .filter(|c| {
                !(is_title_whitespace(*c) || is_bidirectional_override(*c))
            })
            .count(),
    );
    let mut prev_whitespace = false;
    for c in title.chars() {
        // Directional formatting characters are stripped entirely and must
        // not interrupt a whitespace run: conceptually they are removed
        // *before* whitespace normalization, so "a \u{200E}b" collapses to
        // "a_b", not "ab". (The previous implementation reset the whitespace
        // state here, dropping the separating underscore.)
        if is_bidirectional_override(c) {
            continue;
        }
        if is_title_whitespace(c) {
            prev_whitespace = true;
        } else {
            // `!out.is_empty()` suppresses an underscore for leading
            // whitespace; trailing whitespace never emits one because no
            // further non-whitespace character follows it.
            if prev_whitespace && !out.is_empty() {
                out.push('_');
            }
            out.push(c);
            prev_whitespace = false;
        }
    }
    out
}

#[test]
fn normalize_title_chars_strips_and_collapses_title_whitespace() {
    // Leading/trailing whitespace is trimmed; interior runs collapse to "_".
    let cases =
        [(" a b", "a_b"), ("a b ", "a_b"), ("a  b", "a_b"), ("a__b", "a_b")];
    for (input, expected) in cases {
        assert_eq!(normalize_title_chars(input), expected);
    }
}

#[test]
fn normalize_title_chars_removes_directional_control_characters() {
    // U+200E (LEFT-TO-RIGHT MARK) must vanish wherever it appears.
    let cases = [
        ("\u{200E}_a_b", "a_b"),
        ("a\u{200E}_b ", "a_b"),
        ("a_b\u{200E}", "a_b"),
        ("a_\u{200E}_b", "a_b"),
    ];
    for (input, expected) in cases {
        assert_eq!(normalize_title_chars(input), expected);
    }
}

/// Removes leading and trailing underscores from `s` in place.
///
/// A string consisting entirely of underscores is cleared; the previous
/// implementation left it untouched because both `position` calls returned
/// `None` and fell back to a trim count of `0`.
fn trim_title_whitespace(s: &mut String) {
    // Byte positions found here are always valid `char` boundaries, because
    // every byte skipped over is the ASCII underscore.
    match s.bytes().position(|b| b != b'_') {
        Some(start) => {
            let trailing =
                s.bytes().rev().position(|b| b != b'_').unwrap_or(0);
            // Trim the end first so `start` still indexes into `s`.
            s.truncate(s.len() - trailing);
            s.drain(..start);
        }
        // Empty or all underscores: nothing survives trimming.
        None => s.clear(),
    }
}

#[test]
fn trim_title_whitespace_trims_underscores() {
    // Bug fix: this test previously called `normalize_title_chars` and never
    // exercised `trim_title_whitespace` at all.
    for (input, expected) in [("_a_b", "a_b"), ("a_b_", "a_b"), ("_a_b_", "a_b")]
    {
        let mut s = input.to_string();
        trim_title_whitespace(&mut s);
        assert_eq!(s, expected);
    }
}

/// Language codes in which a lowercase `i` capitalizes to `İ` (U+0130,
/// LATIN CAPITAL LETTER I WITH DOT ABOVE) rather than to plain `I`.
const UPPERCASE_DOTTED_I_LANGUAGES: [&str; 4] = ["az", "kaa", "kk", "tr"];

/// Functional equivalent of `Language::ucfirst()`: uppercases the first
/// character of `input` in place.
///
/// This is probably not going to be identical because of different Unicode
/// versions in use, but hopefully those cases are so rare we don't hit them.
///
/// Or we could just hardcode a special mapping like MediaWiki does for
/// client-side JavaScript.
fn uppercase_first(lang: &str, input: &mut String) {
    if let Some(first) = input.chars().next() {
        // `Language::ucfirst()` has special handling for the `i` character
        // in some languages
        if first == 'i' && UPPERCASE_DOTTED_I_LANGUAGES.contains(&lang) {
            // Replace the 1-byte 'i' with the 2-byte 'İ'.
            input.replace_range(..1, "İ");
        } else if php::ALREADY_UPPERCASE.contains(&first) {
            // Skip, do nothing
        } else if let Some(replace) = php::to_uppercase(first) {
            // PHP-compatible single-character mapping.
            let mut buf = [0u8; 4];
            input.replace_range(
                ..first.len_utf8(),
                replace.encode_utf8(&mut buf),
            );
        } else if !first.is_uppercase() {
            // `char::to_uppercase` may expand to several characters
            // (e.g. 'ﬁ' → "FI"). Collect them in order; the previous
            // implementation inserted each at index 0, which reversed
            // multi-character expansions.
            let upper: String = first.to_uppercase().collect();
            input.replace_range(..first.len_utf8(), &upper);
        }
    }
}

#[test]
fn uppercase_first_respects_dotted_i_langs() {
    // Helper: run `uppercase_first` on an owned copy and compare.
    fn check(lang: &str, input: &str, expected: &str) {
        let mut actual = input.to_string();
        uppercase_first(lang, &mut actual);
        assert_eq!(actual, expected);
    }
    check("en", "abc", "Abc");
    check("en", "istanbul", "Istanbul");
    check("tr", "istanbul", "İstanbul");
}