mwtitle 0.2.0-alpha.1

MediaWiki title validation and formatting
Documentation
/*
Copyright (C) 2021 Erutuon

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

//! A case-insensitive map from namespace ID, name, or alias to [`NamespaceInfo`].

use crate::display::WhitespaceDisplayer;
use crate::site_info::{NamespaceAlias, NamespaceInfo, SiteInfo};
use crate::{Error, Result, Title, NS_MAIN};
use bytemuck::TransparentWrapper;
#[cfg(feature = "utils")]
#[cfg_attr(docs, doc(cfg(feature = "utils")))]
use flate2::read::GzDecoder;
use std::{collections::HashMap, iter::FusedIterator, sync::Arc};
#[cfg(feature = "utils")]
#[cfg_attr(docs, doc(cfg(feature = "utils")))]
use std::{io::Read, path::Path};

#[cfg(feature = "utils")]
#[cfg_attr(docs, doc(cfg(feature = "utils")))]
use crate::SiteInfoResponse;

/// Identifies a namespace either by its numeric ID or by one of its names.
///
/// Values can be built directly or through the `From<i32>` and `From<&str>`
/// conversions, which is what the lookup methods of `NamespaceMap` accept.
#[derive(Clone, Copy, Debug)]
pub enum Namespace<'a> {
    /// A numeric namespace ID.
    Id(i32),
    /// A namespace name or alias, borrowed from the caller.
    NameOrAlias(&'a str),
}

impl<'a> From<&'a str> for Namespace<'a> {
    fn from(name_or_alias: &'a str) -> Self {
        Namespace::NameOrAlias(name_or_alias)
    }
}

impl From<i32> for Namespace<'_> {
    fn from(id: i32) -> Self {
        Self::Id(id)
    }
}

/**
A string wrapper for internal use in [`NamespaceMap`]. Treated as the owned version of [`NamespaceStringBorrowed`],
which provides `NamespaceString` with [`Hash`](core::hash::Hash) and [`Eq`](core::cmp::Eq) implementations
where letter case is ignored and `' '` is treated as equal to `'_'`.
This allows potential namespace names or aliases to be looked up directly in the `HashMap`
without allocating a new string containing the normalized version.
The wrapped string itself is stored as given; normalization happens only while comparing and hashing.
[`PartialOrd`] and [`Ord`] are also provided, but not used,
in case we want to switch the fields of `NamespaceMap` from `HashMap` to [`BTreeMap`](std::collections::BTreeMap)s,
which has advantages in debugging.
 */
#[derive(Clone, Debug)]
#[repr(transparent)]
pub(crate) struct NamespaceString(pub(crate) String);

/// Safe because `NamespaceString` is `#[repr(transparent)]` over a single `String` field,
/// so it has the same in-memory representation as `String`, and there are no additional contracts to uphold,
/// because we aren't requiring the contents to be normalized.
unsafe impl TransparentWrapper<String> for NamespaceString {}

impl NamespaceString {
    /// Borrows the wrapped string as a [`NamespaceStringBorrowed`],
    /// the type that carries the normalizing comparison and hashing logic.
    fn as_namespace_str(&self) -> &NamespaceStringBorrowed {
        let inner: &str = &self.0;
        NamespaceStringBorrowed::from_str(inner)
    }
}

/// Equality is delegated to [`NamespaceStringBorrowed`], so two strings are equal
/// when they match after lowercasing and treating `' '` and `'_'` as the same character.
impl PartialEq for NamespaceString {
    fn eq(&self, other: &Self) -> bool {
        self.as_namespace_str() == other.as_namespace_str()
    }
}

impl Eq for NamespaceString {}

// Test upper and lowercase in a few different scripts.
// Each inner slice is a group of spellings that differ only in letter case
// or in using `' '` versus `'_'`; the tests below expect every pair taken
// from the same group to compare equal and hash identically.
#[cfg(test)]
const NAMESPACE_STRING_TESTS: [&[&str]; 5] = [
    &[
        "User talk",
        "User_talk",
        "user talk",
        "user_talk",
        "User Talk",
        "User_Talk",
        "USER TALK",
        "USER_TALK",
    ],
    &["Catégorie", "CATÉGORIE"],
    &["Συζήτηση χρήστη", "συζήτηση χρήστη", "ΣΥΖΉΤΗΣΗ ΧΡΉΣΤΗ"],
    &[
        "Обсуждение Викисловаря",
        "обсуждение викисловаря",
        "ОБСУЖДЕНИЕ ВИКИСЛОВАРЯ",
    ],
    &[
        "Մասնակցի քննարկում",
        "մասնակցի քննարկում",
        "ՄԱՍՆԱԿՑԻ ՔՆՆԱՐԿՈՒՄ",
    ],
];

/// Calls `f` with every ordered pair of spellings drawn from the same group
/// of `NAMESPACE_STRING_TESTS` (including each spelling paired with itself).
#[cfg(test)]
fn for_each_namespace_string_combination(f: impl Fn(&str, &str)) {
    NAMESPACE_STRING_TESTS.iter().for_each(|group| {
        group.iter().for_each(|&first| {
            group.iter().for_each(|&second| f(first, second));
        });
    });
}

/// Every pair of equivalent spellings must be equal and hash to the same
/// value when wrapped in the owned `NamespaceString`.
#[test]
fn hash_and_eq_for_namespace_string_are_case_and_whitespace_insensitive() {
    for_each_namespace_string_combination(|left, right| {
        let left = NamespaceString(left.to_owned());
        let right = NamespaceString(right.to_owned());
        assert_eq!(left, right);
        assert_eq!(hash(left), hash(right));
    });
}

impl std::borrow::Borrow<NamespaceStringBorrowed> for NamespaceString {
    fn borrow(&self) -> &NamespaceStringBorrowed {
        self.as_namespace_str()
    }
}

/// The ordering is total, so this simply wraps [`Ord::cmp`].
impl PartialOrd for NamespaceString {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

/// Orders by the borrowed form, i.e. by the normalized characters
/// (lowercased, with `' '` and `'_'` treated alike).
impl Ord for NamespaceString {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        // Call `Ord::cmp` on the borrowed form directly; no `Option` to unwrap.
        self.as_namespace_str().cmp(other.as_namespace_str())
    }
}

/// Hashes exactly like the borrowed form, as required for the
/// `Borrow<NamespaceStringBorrowed>` implementation to work for `HashMap` lookups.
impl std::hash::Hash for NamespaceString {
    fn hash<H>(&self, hasher: &mut H)
    where
        H: std::hash::Hasher,
    {
        self.as_namespace_str().hash(hasher);
    }
}

/// Test helper: feeds `v` into a fresh `DefaultHasher` and returns the
/// resulting 64-bit hash.
#[cfg(test)]
fn hash(v: impl std::hash::Hash) -> u64 {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::Hasher;

    let mut state = DefaultHasher::new();
    v.hash(&mut state);
    state.finish()
}

impl std::convert::From<&str> for NamespaceString {
    fn from(s: &str) -> Self {
        NamespaceString(s.into())
    }
}

/// The borrowed version of [`NamespaceString`].
///
/// A transparent wrapper around `str` whose comparison and hashing
/// implementations ignore letter case and treat `' '` and `'_'` as equal.
/// The wrapped text is not itself normalized.
#[derive(Debug)]
#[repr(transparent)]
pub(crate) struct NamespaceStringBorrowed(str);

/// Safe because `NamespaceStringBorrowed` is `#[repr(transparent)]` over a single `str` field,
/// so it has the same in-memory representation as `str`, and there are no additional contracts to uphold,
/// because we aren't requiring the contents to be normalized.
unsafe impl TransparentWrapper<str> for NamespaceStringBorrowed {}

impl NamespaceStringBorrowed {
    pub fn from_str(s: &str) -> &Self {
        Self::wrap_ref(s)
    }

    fn chars_normalized(
        &self,
    ) -> impl Iterator<Item = char> + std::iter::FusedIterator + '_ {
        enum Iter {
            One(Option<char>),
            Many(std::char::ToLowercase),
        }
        impl Iterator for Iter {
            type Item = char;

            fn next(&mut self) -> Option<Self::Item> {
                match self {
                    Iter::One(char) => char.take(),
                    Iter::Many(chars) => chars.next(),
                }
            }
        }
        impl FusedIterator for Iter {}
        self.0.chars().flat_map(|c| {
            if c == '_' || c == ' ' {
                Iter::One(Some('_'))
            } else {
                Iter::Many(c.to_lowercase())
            }
        })
    }
}

/// Two values are equal when their normalized character sequences are equal.
impl PartialEq for NamespaceStringBorrowed {
    fn eq(&self, other: &Self) -> bool {
        Iterator::eq(self.chars_normalized(), other.chars_normalized())
    }
}

impl Eq for NamespaceStringBorrowed {}

/// Every pair of equivalent spellings must be equal and hash to the same
/// value when viewed as `NamespaceStringBorrowed`.
#[test]
fn hash_and_eq_for_namespace_string_borrowed_are_case_and_whitespace_insensitive(
) {
    for_each_namespace_string_combination(|left, right| {
        let left = NamespaceStringBorrowed::from_str(left);
        let right = NamespaceStringBorrowed::from_str(right);
        assert_eq!(left, right);
        assert_eq!(hash(&left), hash(&right));
    });
}

/// The ordering is total, so this simply wraps [`Ord::cmp`].
impl PartialOrd for NamespaceStringBorrowed {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

/// Orders lexicographically by the normalized character sequences,
/// consistent with the `PartialEq` and `Hash` implementations.
impl Ord for NamespaceStringBorrowed {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        // The comparison lives here and `partial_cmp` defers to it,
        // so there is no `Option` to unwrap.
        self.chars_normalized().cmp(other.chars_normalized())
    }
}

/// Hashes the normalized characters one by one, so that values that compare
/// equal also hash identically.
impl std::hash::Hash for NamespaceStringBorrowed {
    fn hash<H>(&self, hasher: &mut H)
    where
        H: std::hash::Hasher,
    {
        self.chars_normalized()
            .for_each(|normalized| normalized.hash(&mut *hasher));
    }
}

/// Views a string slice as a `NamespaceStringBorrowed` with the same lifetime.
impl<'a> std::convert::From<&'a str> for &'a NamespaceStringBorrowed {
    fn from(text: &'a str) -> Self {
        let wrapped: &'a NamespaceStringBorrowed =
            NamespaceStringBorrowed::from_str(text);
        wrapped
    }
}

/// A lookup table of namespaces, keyed both by numeric ID and by name or alias.
///
/// Lookups by name or alias ignore letter case and treat `' '` and `'_'` as equal,
/// because the keys of that map are [`NamespaceString`]s.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct NamespaceMap {
    /// Namespace attributes keyed by namespace ID.
    namespaces_by_id: HashMap<i32, Arc<NamespaceInfo>>,
    /// Namespace attributes keyed by local name, canonical name, and alias.
    /// Values are shared with `namespaces_by_id` through the `Arc`.
    namespaces_by_name_or_alias: HashMap<NamespaceString, Arc<NamespaceInfo>>,
}

impl NamespaceMap {
    /// Creates a `NamespaceMap` from a [`SiteInfo`].
    pub fn from_site_info(site_info: SiteInfo) -> Result<Self> {
        Self::from_namespaces_and_namespace_aliases(
            site_info.namespaces.into_values(),
            site_info.namespace_aliases.into_iter(),
        )
    }

    /// Creates a `NamespaceMap` from a JSON file, optionally GZip-compressed,
    /// such as `siteinfo-namespaces.json.gz` or `siteinfo-namespaces.json` in the Wikimedia dumps.
    /// A file whose extension is `gz` is decompressed before the JSON is decoded;
    /// any other file is read as plain text.
    ///
    /// # Errors
    ///
    /// Returns an error if the file cannot be opened, read, or decompressed,
    /// or if its contents cannot be turned into a `NamespaceMap`.
    #[cfg(feature = "utils")]
    #[cfg_attr(docs, doc(cfg(feature = "utils")))]
    pub fn from_path(path: &Path) -> Result<Self> {
        let is_gzipped = path.extension().map_or(false, |ext| ext == "gz");
        let json = if is_gzipped {
            let file = std::fs::File::open(path)
                .map_err(|source| Error::from_io("open file", source, path))?;
            let mut text = String::new();
            GzDecoder::new(file)
                .read_to_string(&mut text)
                .map_err(|source| Error::from_io("parse GZip", source, path))?;
            text
        } else {
            std::fs::read_to_string(path).map_err(|source| {
                Error::from_io("read file to string", source, path)
            })?
        };
        Self::from_json_with_path(&json, Some(path))
    }

    /// Constructs a `NamespaceMap` from an iterator yielding `NamespaceInfo`s
    /// and an iterator yielding `NamespaceAlias`es.
    ///
    /// # Errors
    ///
    /// If the `namespacealiases` field contains any `id`s that are not found in the `namespaces` field of the `SiteInfo`,
    /// fails and returns `Err(Error::UnknownAliases(unrecognized_ids))`.
    pub fn from_namespaces_and_namespace_aliases<
        NS: IntoIterator<Item = NamespaceInfo>,
        AL: IntoIterator<Item = NamespaceAlias>,
    >(
        namespaces: NS,
        namespace_aliases: AL,
    ) -> Result<Self> {
        let mut namespaces_by_id = HashMap::new();
        let mut namespaces_by_name_or_alias = HashMap::new();
        for namespace in namespaces {
            let namespace = Arc::new(namespace);
            namespaces_by_id.insert(namespace.id, namespace.clone());
            namespaces_by_name_or_alias.insert(
                NamespaceString(namespace.name.clone()),
                namespace.clone(),
            );
            if let Some(canonical) = namespace.canonical.as_deref() {
                namespaces_by_name_or_alias.insert(
                    NamespaceString(canonical.to_string()),
                    namespace.clone(),
                );
            }
        }
        let mut aliases_not_found = Vec::new();
        for alias in namespace_aliases {
            if let Some(namespace_info) = namespaces_by_id.get(&alias.id) {
                namespaces_by_name_or_alias.insert(
                    NamespaceString(alias.alias),
                    namespace_info.clone(),
                );
            } else {
                aliases_not_found.push(alias);
            }
        }
        if aliases_not_found.is_empty() {
            Ok(Self {
                namespaces_by_id,
                namespaces_by_name_or_alias,
            })
        } else {
            Err(Error::UnknownAliases(aliases_not_found))
        }
    }

    /// Creates a `NamespaceMap` from two iterators. Each item of the first is itself
    /// an iterator of `(key, value): (String, String)` tuples holding the fields
    /// of one `NamespaceInfo`. The second yields `(alias, id): (String, i32)` tuples
    /// that each describe one `NamespaceAlias`.
    ///
    /// # Errors
    ///
    /// Fails if any set of fields cannot be converted into a `NamespaceInfo`, or under
    /// the same conditions as
    /// [`from_namespaces_and_namespace_aliases`](Self::from_namespaces_and_namespace_aliases).
    pub fn from_iters<
        NS: IntoIterator<Item = NI>,
        NI: IntoIterator<Item = (String, String)>,
        AL: IntoIterator<Item = (String, i32)>,
    >(
        namespaces: NS,
        namespace_aliases: AL,
    ) -> Result<Self> {
        // The namespaces are converted up front, stopping at the first failure,
        // because `Self::from_namespaces_and_namespace_aliases` expects plain
        // `NamespaceInfo` items rather than `Result`s.
        let mut infos = Vec::new();
        for fields in namespaces {
            infos.push(NamespaceInfo::try_from_iter(fields)?);
        }
        let aliases = namespace_aliases
            .into_iter()
            .map(|(alias, id)| NamespaceAlias { id, alias });
        Self::from_namespaces_and_namespace_aliases(infos, aliases)
    }

    /// Creates a `NamespaceMap` by reading the JSON representation of a
    /// [`SiteInfoResponse`] from a `Read` type.
    ///
    /// # Errors
    ///
    /// Returns `Error::Json` if the JSON cannot be decoded, or whatever
    /// [`from_site_info`](Self::from_site_info) returns on failure.
    #[cfg(feature = "utils")]
    #[cfg_attr(docs, doc(cfg(feature = "utils")))]
    pub fn from_reader<R: Read>(reader: R) -> Result<Self> {
        let response: SiteInfoResponse = serde_json::from_reader(reader)
            .map_err(|source| Error::Json {
                source: Arc::new(source),
            })?;
        Self::from_site_info(response.query)
    }

    /// Creates a `NamespaceMap` from a string containing the JSON representation
    /// of a [`SiteInfoResponse`]. Errors do not mention any file path.
    #[cfg(feature = "utils")]
    #[cfg_attr(docs, doc(cfg(feature = "utils")))]
    pub fn from_json<S: AsRef<str>>(json: S) -> Result<Self> {
        let text: &str = json.as_ref();
        Self::from_json_with_path(text, None)
    }

    /// Creates a `NamespaceMap` from the JSON representation of a [`SiteInfoResponse`].
    ///
    /// # Errors
    ///
    /// If decoding the JSON fails, returns `Error::JsonFile` carrying `path`
    /// when `path` is `Some(_)`, and `Error::Json` otherwise.
    /// Errors from [`from_site_info`](Self::from_site_info) are passed through.
    #[cfg(feature = "utils")]
    #[cfg_attr(docs, doc(cfg(feature = "utils")))]
    fn from_json_with_path(json: &str, path: Option<&Path>) -> Result<Self> {
        let response = match serde_json::from_str::<SiteInfoResponse>(json) {
            Ok(response) => response,
            Err(source) => {
                let source = Arc::new(source);
                return Err(match path {
                    Some(path) => Error::JsonFile {
                        source,
                        path: path.into(),
                    },
                    None => Error::Json { source },
                });
            }
        };
        Self::from_site_info(response.query)
    }

    /// Looks up a namespace by its ID, returning its attributes,
    /// or `None` if the ID is not in the map.
    pub fn get_by_id(&self, id: i32) -> Option<&NamespaceInfo> {
        let shared = self.namespaces_by_id.get(&id)?;
        Some(&**shared)
    }

    /// Looks up a namespace by name or alias, returning its attributes,
    /// or `None` if there is no such entry.
    /// Private because it exposes [`NamespaceString`], an internal implementation detail.
    fn get_by_name_or_alias<S>(
        &self,
        name_or_alias: &S,
    ) -> Option<&NamespaceInfo>
    where
        S: ?Sized,
        NamespaceString: std::borrow::Borrow<S>,
        S: std::hash::Hash + Eq,
    {
        let shared = self.namespaces_by_name_or_alias.get(name_or_alias)?;
        Some(&**shared)
    }

    /// Returns the attributes of the namespace when given a valid namespace ID or name or alias.
    pub fn get_info<'a, 'b, N: Into<Namespace<'b>>>(
        &'a self,
        namespace: N,
    ) -> Option<&NamespaceInfo> {
        match namespace.into() {
            Namespace::Id(id) => self.get_by_id(id),
            Namespace::NameOrAlias(name_or_alias) => self.get_by_name_or_alias(
                NamespaceStringBorrowed::from_str(name_or_alias),
            ),
        }
    }

    /// Returns the ID of the namespace identified by an ID, name, or alias,
    /// or `None` if it is not in the map.
    /// When given an ID, this still performs a lookup, so the result is `None`
    /// for IDs the map does not contain.
    ///
    /// Equivalent of `Language::getNsIndex()`.
    pub fn get_id<'a, 'b, N: Into<Namespace<'b>>>(
        &'a self,
        namespace: N,
    ) -> Option<i32> {
        let info = self.get_info(namespace)?;
        Some(info.id)
    }

    /// Returns the local name of the namespace identified by an ID, name, or alias,
    /// or `None` if it is not in the map.
    pub fn get_name<'a, 'b, N: Into<Namespace<'b>>>(
        &'a self,
        namespace: N,
    ) -> Option<&'a str> {
        let info = self.get_info(namespace)?;
        Some(info.name.as_str())
    }

    /// Returns the case-sensitivity setting for the first letter of titles in the namespace
    /// identified by an ID, name, or alias, or `None` if it is not in the map.
    pub fn get_case<'a, 'b, N: Into<Namespace<'b>>>(
        &'a self,
        namespace: N,
    ) -> Option<&'a str> {
        let info = self.get_info(namespace)?;
        Some(&*info.case)
    }

    /// Returns the canonical name of the namespace identified by an ID, name, or alias.
    /// Returns `None` if the namespace is not in the map or has no canonical name.
    pub fn get_canonical_name<'a, 'b, N: Into<Namespace<'b>>>(
        &'a self,
        namespace: N,
    ) -> Option<&'a str> {
        let info = self.get_info(namespace)?;
        info.canonical.as_deref()
    }

    /// Whether the first letter of titles in the namespace
    /// is always capitalized, i.e. whether its `case` is `"first-letter"`.
    /// Returns `None` if the namespace is not in the map.
    ///
    /// Equivalent of `NamespaceInfo::isCapitalized()`.
    pub fn is_capitalized<'a, 'b, N: Into<Namespace<'b>>>(
        &'a self,
        namespace: N,
    ) -> Option<bool> {
        let info = self.get_info(namespace)?;
        let case = &*info.case;
        Some(case == "first-letter")
    }

    /// Formats the title, with its namespace prefix, in pretty aka text form (spaces).
    ///
    /// The fragment is left out.
    ///
    /// Returns `None` if the title's namespace is not in the map.
    pub fn to_pretty(&self, title: &Title) -> Option<String> {
        let include_fragment = false;
        self.prefixed::<' '>(title, include_fragment)
    }

    /// Formats the title, with its namespace prefix, in underscore aka dbkey form.
    /// This is potentially useful when you want to make a database query.
    ///
    /// The fragment is left out.
    ///
    /// Returns `None` if the title's namespace is not in the map.
    pub fn to_underscores(&self, title: &Title) -> Option<String> {
        let include_fragment = false;
        self.prefixed::<'_'>(title, include_fragment)
    }

    /// Formats the title, with its namespace prefix, in pretty aka text form (spaces),
    /// followed by `#` and the fragment if the title has one.
    ///
    /// Returns `None` if the title's namespace is not in the map.
    pub fn to_pretty_with_fragment(&self, title: &Title) -> Option<String> {
        let include_fragment = true;
        self.prefixed::<' '>(title, include_fragment)
    }

    /// Assembles `interwiki:Namespace:dbkey#fragment`, leaving out each part
    /// (and its separator) that is absent. The interwiki prefix, namespace name,
    /// and dbkey are written through `WhitespaceDisplayer::<C>`; the fragment is
    /// written as is, and only when `include_fragment` is `true`.
    ///
    /// Returns `None` if the title is outside the main namespace and its
    /// namespace is not in the map.
    fn prefixed<const C: char>(
        &self,
        title: &Title,
        include_fragment: bool,
    ) -> Option<String> {
        // TODO: API doesn't expose gender distinction
        let (interwiki, iw_colon) =
            title.interwiki().map_or(("", ""), |iw| (iw, ":"));

        // The main namespace has no prefix and needs no lookup.
        let ns = title.namespace();
        let prefix = if ns != NS_MAIN {
            self.get_name(ns)?
        } else {
            ""
        };
        let colon = if prefix.is_empty() { "" } else { ":" };

        let (hash, fragment) = match title.fragment.as_deref() {
            Some(fragment) if include_fragment => ("#", fragment),
            _ => ("", ""),
        };

        Some(format!(
            "{}{}{}{}{}{}{}",
            WhitespaceDisplayer::<C>(interwiki),
            iw_colon,
            WhitespaceDisplayer::<C>(prefix),
            colon,
            WhitespaceDisplayer::<C>(title.dbkey()),
            hash,
            fragment
        ))
    }
}

// End-to-end test of `NamespaceMap::from_namespaces_and_namespace_aliases`.
// Each test case consists of:
// - the input: namespaces as `(id, name, canonical, case)` and aliases as `(alias, id)`,
// - the expected contents of the two internal maps,
// - a closure with further lookups to run against the constructed map.
#[test]
fn siteinfo_can_be_converted_to_namespace_map_and_lookup_is_case_insensitive() {
    use std::collections::HashMap;
    for (
        (namespaces, aliases),
        (expected_id_map, expected_name_map),
        run_tests,
    ) in [(
        // Input namespaces and aliases.
        (
            [
                (0, "", None, "first-letter"),
                (1, "Talk", Some("Talk"), "first-letter"),
                (10, "Template", Some("Template"), "first-letter"),
                (14, "Category", Some("Category"), "first-letter"),
                (15, "Category talk", Some("Category talk"), "first-letter"),
            ],
            [("CAT", 14)],
        ),
        // Expected `namespaces_by_id` and `namespaces_by_name_or_alias` entries.
        (
            [
                (0, ("", None, "first-letter")),
                (1, ("Talk", Some("Talk"), "first-letter")),
                (10, ("Template", Some("Template"), "first-letter")),
                (14, ("Category", Some("Category"), "first-letter")),
                (15, ("Category talk", Some("Category talk"), "first-letter")),
            ],
            [
                ("", (0, "", None, "first-letter")),
                ("Talk", (1, "Talk", Some("Talk"), "first-letter")),
                (
                    "Template",
                    (10, "Template", Some("Template"), "first-letter"),
                ),
                (
                    "Category",
                    (14, "Category", Some("Category"), "first-letter"),
                ),
                // The alias maps to the same attributes as the namespace it points to.
                ("CAT", (14, "Category", Some("Category"), "first-letter")),
                (
                    "Category talk",
                    (
                        15,
                        "Category talk",
                        Some("Category talk"),
                        "first-letter",
                    ),
                ),
            ],
        ),
        // Lookups by ID, and by name in varying letter case and with
        // either spaces or underscores.
        |namespace_map: NamespaceMap| {
            assert_eq!(namespace_map.get_name(1), Some("Talk"));
            assert_eq!(namespace_map.get_name(14), Some("Category"));
            for (names, expected) in [
                (&["Talk", "talk", "TALK"][..], 1),
                (
                    &[
                        "Category talk",
                        "Category_talk",
                        "CATEGORY TALK",
                        "CATEGORY_TALK",
                    ],
                    15,
                ),
            ] {
                for name in names {
                    assert_eq!(
                        namespace_map.get_id(*name),
                        Some(expected),
                        "\n{}",
                        name
                    );
                }
            }
        },
    )] {
        // Turn the input tuples into the types the constructor accepts.
        let namespaces =
            Vec::from_iter(namespaces.map(|(id, name, canonical, case)| {
                NamespaceInfo {
                    id,
                    name: name.into(),
                    canonical: canonical.map(String::from),
                    case: case.into(),
                }
            }));
        let namespacealiases =
            Vec::from(aliases.map(|(alias, id)| NamespaceAlias {
                alias: alias.into(),
                id,
            }));
        // Build the expected map directly from its fields.
        let expected = Ok(NamespaceMap {
            namespaces_by_id: HashMap::from_iter(expected_id_map.map(
                |(id, (name, canonical, case))| {
                    (
                        id,
                        Arc::new(NamespaceInfo {
                            id,
                            name: name.into(),
                            canonical: canonical.map(String::from),
                            case: case.into(),
                        }),
                    )
                },
            )),
            namespaces_by_name_or_alias: HashMap::from_iter(
                expected_name_map.map(
                    |(name_or_alias, (id, name, canonical, case))| {
                        (
                            name_or_alias.into(),
                            Arc::new(NamespaceInfo {
                                id,
                                name: name.into(),
                                canonical: canonical.map(String::from),
                                case: case.into(),
                            }),
                        )
                    },
                ),
            ),
        });
        // The map_err is to allow using assert_eq, because Error doesn't implement PartialEq.
        // The error variant is always Error::UnknownAliases.
        let namespace_map =
            NamespaceMap::from_namespaces_and_namespace_aliases(
                namespaces.clone(),
                namespacealiases.clone(),
            )
            .map_err(|e| {
                if let Error::UnknownAliases(aliases) = e {
                    Some(aliases)
                } else {
                    None
                }
            });
        assert_eq!(
            namespace_map, expected,
            "\nconverting {:?}\n{:?}",
            &namespaces, &namespacealiases
        );
        run_tests(namespace_map.unwrap());
    }
}