mwtitle 0.2.1

MediaWiki title validation and formatting
Documentation
/*
Copyright (C) 2021 Kunal Mehta <legoktm@debian.org>
Copyright (C) 2021 Erutuon

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#![cfg(feature = "parsing")]

use mwtitle::{Error, SiteInfoResponse, TitleCodec};
use once_cell::sync::Lazy;
use std::{collections::HashMap, sync::Mutex};

/// Returns the [`TitleCodec`] for the wiki at `domain`, building it from the
/// wiki's siteinfo API response on first use and caching it for later calls.
///
/// The cache lock is only taken for the lookup and for the insertion, never
/// across an `.await`. Keeping a `std::sync::Mutex` guard alive over the HTTP
/// request would block every other caller for the duration of the request,
/// and a panic during the request would poison the cache for all later
/// callers.
///
/// # Panics
///
/// Panics if the HTTP request fails, if the response cannot be deserialized
/// into a [`SiteInfoResponse`], or if [`TitleCodec::from_site_info`] rejects
/// the site info.
async fn codec(domain: &str) -> TitleCodec {
    static CODECS: Lazy<Mutex<HashMap<String, TitleCodec>>> =
        Lazy::new(|| Mutex::new(HashMap::new()));
    // The guard is a temporary that is dropped at the end of this statement,
    // so it is released before any `.await` below.
    let cached = CODECS.lock().unwrap().get(domain).cloned();
    if let Some(codec) = cached {
        return codec;
    }
    let url = format!(
        "https://{}/w/api.php?action=query&meta=siteinfo&siprop=general|namespaces|namespacealiases|interwikimap&formatversion=2&format=json",
        domain
    );
    let resp: SiteInfoResponse =
        reqwest::get(url).await.unwrap().json().await.unwrap();
    // Build the codec before re-locking so that a failure here cannot poison
    // the cache.
    let codec = TitleCodec::from_site_info(resp.query).expect(
        "API doesn't return namespacealiases with invalid namespace IDs",
    );
    // Another caller may have inserted a codec for this domain while the
    // request was in flight; `or_insert` keeps whichever entry came first.
    let mut codecs = CODECS.lock().unwrap();
    codecs.entry(domain.into()).or_insert(codec).clone()
}

/// Asserts that every input in `tests` fails to parse on the wiki at `domain`
/// and that the failure reports the paired error code.
///
/// Each entry is `(input, expected)`; `expected` is compared against the value
/// returned by `mw_title_codec_error_code` on the resulting error.
async fn test_failure<const N: usize>(domain: &str, tests: [(&str, &str); N]) {
    let title_codec = codec(domain).await;
    for (input, expected) in tests {
        let parsed = title_codec.new_title(input);
        assert!(parsed.is_err(), "\n{:?}\n{:?}", parsed, expected);
        let error_code = match &parsed {
            Err(error) => error.mw_title_codec_error_code(),
            Ok(_) => None,
        };
        assert_eq!(error_code, Some(expected), "\n{:?}\n{:?}", parsed, expected);
    }
}

// Test cases initially copied from mediawiki-title (npm package)

/// Subject of NS_TALK does not roundtrip to NS_MAIN: each title below is
/// expected to fail with `title-invalid-talk-namespace`.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn titles_that_cannot_roundtrip_from_talk_to_main_and_back_are_rejected()
{
    let cases = [
        ("Talk:File:Example.svg", "title-invalid-talk-namespace"),
        ("Talk:_File_:Example.svg", "title-invalid-talk-namespace"),
        ("Talk:wikt:Example.svg", "title-invalid-talk-namespace"),
        ("Talk: wikt :Example.svg", "title-invalid-talk-namespace"),
    ];
    test_failure("en.wikipedia.org", cases).await;
}

/// Unicode characters that are both control characters and whitespace are
/// forbidden regardless of `wgLegalTitleChars`; every title below is expected
/// to fail with `title-invalid-characters`.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn c0_control_character_whitespace_is_rejected() {
    let cases = [
        ("A\t", "title-invalid-characters"),
        ("A\n", "title-invalid-characters"),
        ("A\r", "title-invalid-characters"),
        ("A\tB", "title-invalid-characters"),
        ("Talk:A\t", "title-invalid-characters"),
        ("Talk:\tA", "title-invalid-characters"),
        ("Talk\t:A", "title-invalid-characters"),
        ("Talk:A\t/B", "title-invalid-characters"),
        ("Talk:A/\tB", "title-invalid-characters"),
        ("Talk:A/B\t/C", "title-invalid-characters"),
    ];
    test_failure("en.wikipedia.org", cases).await;
}

/// U+0085 is considered whitespace by Unicode but not by `TitleCodec`, so it
/// is expected to remain in the title while the trailing space is removed.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn non_mediawiki_whitespace_is_not_collapsed_or_trimmed() {
    let cases = [("A \u{85} ", "A_\u{85}")];
    test_success("en.wikipedia.org", cases).await;
}

/// Punctuation characters forbidden regardless of `wgLegalTitleChars`: each
/// title below is expected to fail with `title-invalid-characters`.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn punctuation_characters_used_in_wikitext_or_html_are_rejected() {
    let cases = [
        ("A [ B", "title-invalid-characters"),
        ("A ] B", "title-invalid-characters"),
        ("A { B", "title-invalid-characters"),
        ("A } B", "title-invalid-characters"),
        ("A < B", "title-invalid-characters"),
        ("A > B", "title-invalid-characters"),
        ("A | B", "title-invalid-characters"),
    ];
    test_failure("en.wikipedia.org", cases).await;
}

/// Titles containing percent-encoded sequences are expected to fail with
/// `title-invalid-characters`.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn url_percent_encoding_is_rejected() {
    let cases = [
        ("A%20B", "title-invalid-characters"),
        ("A%23B", "title-invalid-characters"),
        ("A%2523B", "title-invalid-characters"),
    ];
    test_failure("en.wikipedia.org", cases).await;
}

/// Directory navigation: titles with `.` or `..` path segments are expected
/// to fail with `title-invalid-relative`.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn relative_directory_syntax_is_rejected() {
    let cases = [
        (".", "title-invalid-relative"),
        ("..", "title-invalid-relative"),
        ("./Sandbox", "title-invalid-relative"),
        ("../Sandbox", "title-invalid-relative"),
        ("Foo/./Sandbox", "title-invalid-relative"),
        ("Foo/../Sandbox", "title-invalid-relative"),
        ("Sandbox/.", "title-invalid-relative"),
        ("Sandbox/..", "title-invalid-relative"),
    ];
    test_failure("en.wikipedia.org", cases).await;
}

/// Namespace prefix without actual title, or no title text at all: each input
/// below is expected to fail with `title-invalid-empty`.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn empty_database_page_titles_are_rejected() {
    let cases = [
        ("", "title-invalid-empty"),
        (":", "title-invalid-empty"),
        ("__  __", "title-invalid-empty"),
        ("  __  ", "title-invalid-empty"),
        ("Talk:", "title-invalid-empty"),
        ("Talk: _", "title-invalid-empty"),
        ("Talk:#", "title-invalid-empty"),
        ("Category: ", "title-invalid-empty"),
        ("Category: #bar", "title-invalid-empty"),
    ];
    test_failure("en.wikipedia.org", cases).await;
}

/// Checks that a title consisting of U+FFFD REPLACEMENT CHARACTER is rejected
/// with `title-invalid-utf8`.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn replacement_character_is_rejected() {
    // The character is written as an escape so it cannot be silently lost by
    // an editor or re-encoding step; an empty input would instead be rejected
    // as `title-invalid-empty`.
    test_failure("en.wikipedia.org", [("\u{FFFD}", "title-invalid-utf8")])
        .await;
}

/// Titles containing three or more consecutive tildes are expected to fail
/// with `title-invalid-magic-tilde`.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn magic_tilde_is_rejected() {
    let cases = [
        ("A ~~~ Name", "title-invalid-magic-tilde"),
        ("A ~~~~ Signature", "title-invalid-magic-tilde"),
        ("A ~~~~~ Timestamp", "title-invalid-magic-tilde"),
    ];
    test_failure("en.wikipedia.org", cases).await;
}

/// Titles whose page part still starts with a colon are expected to fail with
/// `title-invalid-leading-colon`.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn leading_colon_is_rejected() {
    let cases = [
        ("::1", "title-invalid-leading-colon"),
        ("Category::Test", "title-invalid-leading-colon"),
    ];
    test_failure("en.wikipedia.org", cases).await;
}

#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn title_length_constraints_apply_to_database_page_title() {
    let codec = codec("en.wikipedia.org").await;
    // Test cases initially copied from mediawiki-title (npm package)
    let test_good = [
        // Length is 256 total, but only title part matters
        format!("Category:{}", "x".repeat(247)),
        // Special pages can have longer titles
        format!("Special:{}", "x".repeat(499)),
        "x".repeat(251),
        "x".repeat(255),
        // Rust doesn't allow surrogates in strings.
        // repeat("\u{d83c}\u{df40}", 63),
    ];
    let test_bad = [
        "x".repeat(257),
        format!("Special:{}", "x".repeat(513)),
        // Rust doesn't allow surrogates in strings.
        // repeat("\u{d83c}\u{df40}", 64)
    ];
    for input in test_good {
        let title_result = codec.new_title(&input);
        assert!(title_result.is_ok(), "\n{:?}", title_result);
    }
    for input in test_bad {
        let title_result = codec.new_title(&input);
        assert!(
            matches!(title_result, Err(Error::TooLong(_))),
            "\n{:?}",
            title_result
        );
    }
}

/// Asserts that each input in `tests` parses to the same result with and
/// without a leading space and a trailing underscore added.
///
/// Both results are converted with `.ok()` before comparison, so two failed
/// parses also compare equal.
async fn test_no_change_with_whitespace_added<const N: usize>(
    domain: &str,
    tests: [&str; N],
) {
    let title_codec = codec(domain).await;
    for input in tests {
        let padded_input = format!(" {}_", input);
        let plain = title_codec.new_title(input).ok();
        let padded = title_codec.new_title(&padded_input).ok();
        assert_eq!(plain, padded);
    }
}

/// Runs the whitespace-padding comparison on titles with a `...` path
/// segment.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn more_than_two_dots_are_accepted() {
    let inputs = ["Foo/.../Sandbox", "Sandbox/..."];
    test_no_change_with_whitespace_added("en.wikipedia.org", inputs).await;
}

/// Runs the whitespace-padding comparison on titles with one or two tildes.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn one_or_two_tildes_are_accepted() {
    let inputs = ["~", "A~~"];
    test_no_change_with_whitespace_added("en.wikipedia.org", inputs).await;
}

/// Runs the whitespace-padding comparison on a mix of plain, quoted,
/// namespaced and fragment-bearing titles.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn title_parses_identically_after_leading_and_trailing_whitespace_is_added(
) {
    let inputs = [
        "Sandbox",
        "A \"B\"",
        "A 'B'",
        ".com",
        // We are not supporting standalone fragments
        // "#",
        "Test#Abc",
        "\"",
        "'",
        "Talk:Sandbox",
        "Talk:Foo:Sandbox",
        "File:Example.svg",
        "File_talk:Example.svg",
        ":A",
        "-",
        // NOTE(review): this entry reads as an empty string here — confirm
        // no character was lost from it.
        "",
        "Foo & bar",
        "\"Believing_Women\"_in_Islam._Unreading_Patriarchal_Interpretations_of_the_Qur\\\'ān",
    ];
    test_no_change_with_whitespace_added("en.wikipedia.org", inputs).await;
}

/// Asserts that every input in `tests` parses on the wiki at `domain` and
/// that `to_underscores` on the parsed title yields the paired string.
///
/// Each entry is `(input, expected)`.
async fn test_success<const N: usize>(domain: &str, tests: [(&str, &str); N]) {
    let title_codec = codec(domain).await;
    for (input, expected) in tests {
        let parsed = title_codec.new_title(input);
        assert!(parsed.is_ok(), "{:?}\t{:?}", parsed, expected);
        let underscored = match parsed {
            Ok(title) => Some(title_codec.to_underscores(&title)),
            Err(_) => None,
        };
        assert_eq!(underscored.as_deref(), Some(expected));
    }
}

/// Checks normalization of successfully parsed titles: each input is paired
/// with the underscore form it is expected to produce.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn whitespace_is_trimmed_and_leading_colons_are_removed() {
    let cases = [
        ("Test", "Test"),
        (":Test", "Test"),
        (": Test", "Test"),
        (":_Test_", "Test"),
        ("Test 123  456   789", "Test_123_456_789"),
        ("💩", "💩"),
        ("Talk: foo", "Talk:Foo"),
        ("X-Men (film series) #Gambit", "X-Men_(film_series)"),
        ("Foo _ bar", "Foo_bar"),
        (
            "Foo \u{00A0}\u{1680}\u{180E}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000} bar",
            "Foo_bar",
        ),
        (
            "Foo\u{200E}\u{200F}\u{202A}\u{202B}\u{202C}\u{202D}\u{202E}bar",
            "Foobar",
        ),
        (
            "list of Neighbours characters (2016)#Tom Quill",
            "List_of_Neighbours_characters_(2016)",
        ),
    ];
    test_success("en.wikipedia.org", cases).await;
}

/// Checks that text after a colon is left as-is when the prefix is expected
/// not to be treated as a namespace.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn titles_with_fake_namespaces_arent_trimmed_or_capitalized_after_colon()
{
    let cases = [
        ("Foo:bar", "Foo:bar"),
        ("Foo: bar", "Foo:_bar"),
        ("int:eger", "Int:eger"),
    ];
    test_success("en.wikipedia.org", cases).await;
}

/// Checks that the `WP:` prefix is expected to format as `Wikipedia:`.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn namespace_aliases_map_to_local_namespace_names() {
    let cases = [("WP:eger", "Wikipedia:Eger")];
    test_success("en.wikipedia.org", cases).await;
}

/// Expands a bracketed list of `(input, expected)` string-literal pairs into
/// an array expression with two entries per pair: the pair itself, followed
/// by a copy with `"/subpage"` appended to both literals via `concat!`.
///
/// A trailing comma after the last pair is accepted.
macro_rules! add_subpage {
    (
        [
            $(($input:literal, $expected:literal)),*
            $(,)?
        ]
    ) => {{
        [
            $(
                // Original pair, then the same pair with a subpage suffix.
                ($input, $expected),
                (concat!($input, "/subpage"), concat!($expected, "/subpage"))
            ),*
        ]
    }};
}

/// Checks that IPv4 addresses in `User:` titles are expected to lose leading
/// zeros in each octet, with and without a `/subpage` suffix.
///
/// Test cases initially copied from mediawiki-title (npm package).
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn ipv4_address_is_sanitized_in_user_namespaces() {
    let cases = add_subpage!([
        ("User:127.000.000.001", "User:127.0.0.1"),
        ("User:0.0.0.0", "User:0.0.0.0"),
        ("User:00.00.00.00", "User:0.0.0.0"),
        ("User:000.000.000.000", "User:0.0.0.0"),
        ("User:141.000.011.253", "User:141.0.11.253"),
        ("User: 1.2.4.5", "User:1.2.4.5"),
        ("User:01.02.04.05", "User:1.2.4.5"),
        ("User:001.002.004.005", "User:1.2.4.5"),
        ("User:010.0.000.1", "User:10.0.0.1"),
        ("User:080.072.250.04", "User:80.72.250.4"),
    ]);
    test_success("en.wikipedia.org", cases).await;
}

/// Checks that IPv6 addresses in `User:` and `User talk:` titles are expected
/// to be expanded and uppercased, with and without a `/subpage` suffix.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn ipv6_address_is_sanitized_in_user_namespaces() {
    let cases = add_subpage!([
        ("User:::1", "User:0:0:0:0:0:0:0:1"),
        ("User:0:0:0:0:0:0:0:1", "User:0:0:0:0:0:0:0:1"),
        ("User:cebc:2004:f::", "User:CEBC:2004:F:0:0:0:0:0"),
        ("User:::", "User:0:0:0:0:0:0:0:0"),
        ("User:0:0:0:1::", "User:0:0:0:1:0:0:0:0"),
        ("User:3f:535::e:fbb", "User:3F:535:0:0:0:0:E:FBB"),
        ("User Talk:::1", "User_talk:0:0:0:0:0:0:0:1"),
        ("User_Talk:::1", "User_talk:0:0:0:0:0:0:0:1"),
        ("User_talk:::1", "User_talk:0:0:0:0:0:0:0:1"),
        ("User_talk:::1/24", "User_talk:0:0:0:0:0:0:0:1/24"),
    ]);
    test_success("en.wikipedia.org", cases).await;
}

/// Checks that `User:` titles that only resemble IP addresses are expected to
/// keep their digits unchanged.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn non_ip_addresses_are_not_sanitized() {
    let cases = add_subpage!([
        ("User:Bar.01", "User:Bar.01"),
        ("User:Bar.010", "User:Bar.010"),
        ("User:00.00.00. 00", "User:00.00.00._00"),
        // No sanitization if there's a space before the slash.
        ("User:00.00.00.00 / subpage", "User:00.00.00.00_/_subpage"),
    ]);
    test_success("en.wikipedia.org", cases).await;
}

/// Checks that IP-address-like titles without a `User:` or `User talk:`
/// prefix are expected to keep their digits and colons unchanged.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn ip_addresses_outside_of_user_namespaces_are_not_sanitized() {
    let cases = add_subpage!([
        ("0:0:0:0:0:0:0:1", "0:0:0:0:0:0:0:1"),
        ("127.000.000.001", "127.000.000.001"),
        ("0.0.0.0", "0.0.0.0"),
        ("00.00.00.00", "00.00.00.00"),
        ("000.000.000.000", "000.000.000.000"),
        ("141.000.011.253", "141.000.011.253"),
        (" 1.2.4.5", "1.2.4.5"),
        ("01.02.04.05", "01.02.04.05"),
        ("001.002.004.005", "001.002.004.005"),
        ("010.0.000.1", "010.0.000.1"),
        ("080.072.250.04", "080.072.250.04"),
        ("Foo.1000.00", "Foo.1000.00"),
        ("Bar.01", "Bar.01"),
        ("Bar.010", "Bar.010"),
        ("cebc:2004:f::", "Cebc:2004:f::"),
        ("0:0:0:1::", "0:0:0:1::"),
        ("3f:535::e:fbb", "3f:535::e:fbb"),
    ]);
    test_success("en.wikipedia.org", cases).await;
}

/// Checks that each listed input is expected to come back unchanged from
/// `test_success`, i.e. its first character is not altered by capitalization.
///
/// NOTE(review): many pairs below read as `("", "")`. An empty title is
/// expected to be rejected with `title-invalid-empty` elsewhere in this file,
/// so these entries presumably lost their (non-ASCII) characters — verify the
/// file's encoding against the upstream source before relying on this test.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn title_capitalization_follows_php_rules() {
    test_success(
        "en.wikipedia.org",
        [
            ("ß", "ß"),
            ("ʼn", "ʼn"),
            ("ǰ", "ǰ"),
            ("ΐ", "ΐ"),
            ("ΰ", "ΰ"),
            ("և", "և"),
            ("", ""),
            ("", ""),
            ("", ""),
            ("", ""),
            ("", ""),
            ("", ""),
            ("", ""),
            ("", ""),
            ("", ""),
            ("", ""),
            ("", ""),
            ("", ""),
            ("", ""),
            ("", ""),
            ("", ""),
            ("", ""),
            ("", ""),
            ("", ""),
            ("", ""),
            ("", ""),
            ("", ""),
            ("", ""),
            ("", ""),
            ("", ""),
            ("", ""),
            ("", ""),
            ("", ""),
            ("", ""),
            ("", ""),
            ("", ""),
            ("", ""),
            ("", ""),
            ("", ""),
            ("", ""),
            ("", ""),
            ("", ""),
            ("", ""),
            ("", ""),
            ("", ""),
            ("", ""),
            ("", ""),
            ("", ""),
            ("", ""),
            ("", ""),
            ("", ""),
            ("", ""),
        ],
    )
    .await;
}

/// Special handling for `i` first character: for each `(domain, input,
/// expected)` entry, the pretty form of the parsed title must equal
/// `expected`.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn dotted_i_is_uppercased_to_dotted_capital_i_according_to_language() {
    let cases = [
        ("tr.wikipedia.org", "iTestTest", "İTestTest"),
        ("az.wikipedia.org", "iTestTest", "İTestTest"),
        ("kk.wikipedia.org", "iTestTest", "İTestTest"),
        ("kaa.wikipedia.org", "iTestTest", "İTestTest"),
        ("en.wikipedia.org", "iTestTest", "ITestTest"),
    ];
    for (domain, input, expected) in cases {
        let title_codec = codec(domain).await;
        // A parse failure becomes `None`, which then fails the comparison.
        let pretty = match title_codec.new_title(input) {
            Ok(title) => Some(title_codec.to_pretty(&title)),
            Err(_) => None,
        };
        assert_eq!(pretty.as_deref(), Some(expected));
    }
}

/// Checks that the `meta:` prefix is expected to come out lowercase and the
/// text after it unchanged.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn interwiki_prefix_is_normalized() {
    let cases = [("meta:foobar", "meta:foobar"), ("Meta:foo", "meta:foo")];
    test_success("en.wikipedia.org", cases).await;
}

/// Checks that titles prefixed with `w:` and/or `en:` are expected to format
/// as ordinary local titles.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn title_with_local_interwiki_is_capitalized_according_to_namespace() {
    let cases = [
        ("w:talk:foo", "Talk:Foo"),
        ("w:spaces in page", "Spaces_in_page"),
        ("en:w:Sandbox_", "Sandbox"),
        ("en:", "Main_Page"),
    ];
    test_success("en.wikipedia.org", cases).await;
}

/// Checks that `w:`/`en:` prefixes with no title text are expected to resolve
/// to the main page, in both underscore and pretty form.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn local_interwiki_with_mainspace_and_empty_title_points_to_main_page() {
    let underscore_cases = [("en:", "Main_Page"), ("w:", "Main_Page")];
    test_success("en.wikipedia.org", underscore_cases).await;

    let title_codec = codec("en.wikipedia.org").await;
    for empty_title in ["w:", "w:en:", "en:w:"] {
        let pretty = title_codec
            .new_title(empty_title)
            .ok()
            .map(|title| title_codec.to_pretty(&title));
        assert_eq!(pretty.as_deref(), Some("Main Page"));
    }
}

/// Checks that `w: Talk:` is expected to fail with `title-invalid-empty`.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn local_interwiki_with_namespace_and_empty_dbkey_is_rejected() {
    let cases = [("w: Talk:", "title-invalid-empty")];
    test_failure("en.wikipedia.org", cases).await;
}

/// Checks that `new_title_with_namespace` with namespace ID 10 is expected to
/// yield `Template:Lang` for every listed spelling.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn default_namespace_is_assigned_to_title_without_namespace_prefix() {
    const NS_TEMPLATE: i32 = 10;
    let title_codec = codec("en.wikipedia.org").await;
    let spellings = [
        "lang",
        "Lang",
        "Template:lang",
        "Template:Lang",
        "w:template:lang",
        "w:en:template:lang",
        "en:w:template:lang",
    ];
    for template_title in spellings {
        let pretty = title_codec
            .new_title_with_namespace(template_title, NS_TEMPLATE)
            .ok()
            .map(|title| title_codec.to_pretty(&title));
        assert_eq!(pretty.as_deref(), Some("Template:Lang"));
    }
}

/// Checks that `to_pretty_with_fragment` is expected to replace underscores
/// with spaces in both the title and the fragment.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn test_to_pretty_with_fragment() {
    let title_codec = codec("en.wikipedia.org").await;
    let pretty = title_codec
        .new_title("Main_Page#Did_you_know_...")
        .ok()
        .map(|title| title_codec.to_pretty_with_fragment(&title));
    assert_eq!(pretty.as_deref(), Some("Main Page#Did you know ..."));
}

/// Checks that a siteinfo response requested with `formatversion=1` can be
/// deserialized and turned into a `TitleCodec`.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn formatversion_one() {
    const URL: &str =
        "https://en.wikipedia.org/w/api.php?action=query&meta=siteinfo&siprop=general|namespaces|namespacealiases|interwikimap&formatversion=1&format=json";
    let http_response = reqwest::get(URL).await.unwrap();
    let resp: SiteInfoResponse = http_response.json().await.unwrap();
    TitleCodec::from_site_info(resp.query).unwrap();
}