#![cfg(feature = "parsing")]
use mwtitle::{Error, SiteInfoResponse, TitleCodec};
use once_cell::sync::Lazy;
use std::{collections::HashMap, sync::Mutex};
/// Returns the [`TitleCodec`] for `domain`, building it from the wiki's live
/// `siteinfo` API response on first use and serving a clone of the cached
/// value afterwards.
///
/// # Panics
///
/// Panics if the HTTP request fails, if the response cannot be decoded into a
/// [`SiteInfoResponse`], or if [`TitleCodec::from_site_info`] rejects it.
// The guard is kept alive across the awaits below, so concurrent callers block
// until the fetch has finished and then hit the cache instead of issuing their
// own request.
#[allow(clippy::await_holding_lock)]
async fn codec(domain: &str) -> TitleCodec {
    static CODECS: Lazy<Mutex<HashMap<String, TitleCodec>>> =
        Lazy::new(|| Mutex::new(HashMap::new()));
    // A panic further down (e.g. a failed request) poisons the mutex, but the
    // map is only mutated by the final `insert`, so its contents are still
    // valid. Recover the guard so one failing test does not make every other
    // test die with an unrelated `PoisonError`.
    let mut codecs = CODECS
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner);
    if let Some(cached) = codecs.get(domain) {
        return cached.clone();
    }
    let url = format!(
        "https://{}/w/api.php?action=query&meta=siteinfo&siprop=general|namespaces|namespacealiases|interwikimap&formatversion=2&format=json",
        domain
    );
    let resp: SiteInfoResponse = reqwest::get(url)
        .await
        .unwrap_or_else(|error| {
            panic!("siteinfo request to {} failed: {}", domain, error)
        })
        .json()
        .await
        .unwrap_or_else(|error| {
            panic!(
                "siteinfo response from {} could not be decoded: {}",
                domain, error
            )
        });
    let fresh = TitleCodec::from_site_info(resp.query).expect(
        "API doesn't return namespacealiases with invalid namespace IDs",
    );
    // The key is known to be absent (checked above under the same guard), so a
    // plain insert is enough.
    codecs.insert(domain.to_owned(), fresh.clone());
    fresh
}
/// Parses every input with the codec for `domain` and asserts that parsing
/// fails with the paired MediaWiki error code.
///
/// Each element of `tests` is `(input, expected_error_code)`.
async fn test_failure<const N: usize>(domain: &str, tests: [(&str, &str); N]) {
    let title_codec = codec(domain).await;
    for (raw, expected) in tests {
        let title = title_codec.new_title(raw);
        assert!(title.is_err(), "\n{:?}\n{:?}", title, expected);
        // Only an `Err` can carry an error code; a successful parse yields `None`.
        let error_code = match &title {
            Err(error) => error.mw_title_codec_error_code(),
            Ok(_) => None,
        };
        assert_eq!(error_code, Some(expected), "\n{:?}\n{:?}", title, expected);
    }
}
/// Inputs such as `Talk:File:…` and `Talk:wikt:…` must be rejected with
/// `title-invalid-talk-namespace`.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn titles_that_cannot_roundtrip_from_talk_to_main_and_back_are_rejected()
{
    let cases = [
        ("Talk:File:Example.svg", "title-invalid-talk-namespace"),
        ("Talk:_File_:Example.svg", "title-invalid-talk-namespace"),
        ("Talk:wikt:Example.svg", "title-invalid-talk-namespace"),
        ("Talk: wikt :Example.svg", "title-invalid-talk-namespace"),
    ];
    test_failure("en.wikipedia.org", cases).await;
}
/// Tab, line feed and carriage return anywhere in the input must produce
/// `title-invalid-characters`.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn c0_control_character_whitespace_is_rejected() {
    let cases = [
        ("A\t", "title-invalid-characters"),
        ("A\n", "title-invalid-characters"),
        ("A\r", "title-invalid-characters"),
        ("A\tB", "title-invalid-characters"),
        ("Talk:A\t", "title-invalid-characters"),
        ("Talk:\tA", "title-invalid-characters"),
        ("Talk\t:A", "title-invalid-characters"),
        ("Talk:A\t/B", "title-invalid-characters"),
        ("Talk:A/\tB", "title-invalid-characters"),
        ("Talk:A/B\t/C", "title-invalid-characters"),
    ];
    test_failure("en.wikipedia.org", cases).await;
}
/// U+0085 is kept in the title: only the ASCII spaces around it are
/// converted or trimmed.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn non_mediawiki_whitespace_is_not_collapsed_or_trimmed() {
    let cases = [("A \u{85} ", "A_\u{85}")];
    test_success("en.wikipedia.org", cases).await;
}
/// Square brackets, braces, angle brackets and the pipe must each produce
/// `title-invalid-characters`.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn punctuation_characters_used_in_wikitext_or_html_are_rejected() {
    let cases = [
        ("A [ B", "title-invalid-characters"),
        ("A ] B", "title-invalid-characters"),
        ("A { B", "title-invalid-characters"),
        ("A } B", "title-invalid-characters"),
        ("A < B", "title-invalid-characters"),
        ("A > B", "title-invalid-characters"),
        ("A | B", "title-invalid-characters"),
    ];
    test_failure("en.wikipedia.org", cases).await;
}
/// Percent-encoded sequences, including a doubly encoded one, must produce
/// `title-invalid-characters`.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn url_percent_encoding_is_rejected() {
    let cases = [
        ("A%20B", "title-invalid-characters"),
        ("A%23B", "title-invalid-characters"),
        ("A%2523B", "title-invalid-characters"),
    ];
    test_failure("en.wikipedia.org", cases).await;
}
/// `.` and `..` as a whole title or as a path segment must produce
/// `title-invalid-relative`.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn relative_directory_syntax_is_rejected() {
    let cases = [
        (".", "title-invalid-relative"),
        ("..", "title-invalid-relative"),
        ("./Sandbox", "title-invalid-relative"),
        ("../Sandbox", "title-invalid-relative"),
        ("Foo/./Sandbox", "title-invalid-relative"),
        ("Foo/../Sandbox", "title-invalid-relative"),
        ("Sandbox/.", "title-invalid-relative"),
        ("Sandbox/..", "title-invalid-relative"),
    ];
    test_failure("en.wikipedia.org", cases).await;
}
/// Inputs that leave nothing for the page name (empty, only whitespace or
/// underscores, only a namespace, or only a fragment) must produce
/// `title-invalid-empty`.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn empty_database_page_titles_are_rejected() {
    let cases = [
        ("", "title-invalid-empty"),
        (":", "title-invalid-empty"),
        ("__ __", "title-invalid-empty"),
        (" __ ", "title-invalid-empty"),
        ("Talk:", "title-invalid-empty"),
        ("Talk: _", "title-invalid-empty"),
        ("Talk:#", "title-invalid-empty"),
        ("Category: ", "title-invalid-empty"),
        ("Category: #bar", "title-invalid-empty"),
    ];
    test_failure("en.wikipedia.org", cases).await;
}
/// A title consisting of the Unicode replacement character must produce
/// `title-invalid-utf8`.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn replacement_character_is_rejected() {
    let cases = [("�", "title-invalid-utf8")];
    test_failure("en.wikipedia.org", cases).await;
}
/// Runs of three, four or five tildes must produce
/// `title-invalid-magic-tilde`.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn magic_tilde_is_rejected() {
    let cases = [
        ("A ~~~ Name", "title-invalid-magic-tilde"),
        ("A ~~~~ Signature", "title-invalid-magic-tilde"),
        ("A ~~~~~ Timestamp", "title-invalid-magic-tilde"),
    ];
    test_failure("en.wikipedia.org", cases).await;
}
/// A second colon at the start of the title, or directly after a namespace
/// prefix, must produce `title-invalid-leading-colon`.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn leading_colon_is_rejected() {
    let cases = [
        ("::1", "title-invalid-leading-colon"),
        ("Category::Test", "title-invalid-leading-colon"),
    ];
    test_failure("en.wikipedia.org", cases).await;
}
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn title_length_constraints_apply_to_database_page_title() {
let codec = codec("en.wikipedia.org").await;
let test_good = [
format!("Category:{}", "x".repeat(247)),
format!("Special:{}", "x".repeat(499)),
"x".repeat(251),
"x".repeat(255),
];
let test_bad = [
"x".repeat(257),
format!("Special:{}", "x".repeat(513)),
];
for input in test_good {
let title_result = codec.new_title(&input);
assert!(title_result.is_ok(), "\n{:?}", title_result);
}
for input in test_bad {
let title_result = codec.new_title(&input);
assert!(
matches!(title_result, Err(Error::TooLong(_))),
"\n{:?}",
title_result
);
}
}
/// Asserts that every input parses successfully and that the parsed title is
/// unchanged when the input is wrapped as `" {input}_"` (a leading space and
/// a trailing underscore).
async fn test_no_change_with_whitespace_added<const N: usize>(
    domain: &str,
    tests: [&str; N],
) {
    let title_codec = codec(domain).await;
    for input in tests {
        let plain = title_codec.new_title(input);
        // Both results are compared as `Option`s below, so two rejected parses
        // would compare equal (`None == None`). Require the unpadded input to
        // parse so the comparison cannot pass vacuously.
        assert!(
            plain.is_ok(),
            "{:?} was expected to parse but was rejected: {:?}",
            input,
            plain
        );
        let padded = title_codec.new_title(&format!(" {}_", input));
        assert_eq!(plain.ok(), padded.ok(), "input: {:?}", input);
    }
}
/// Path segments of three dots are run through the whitespace-invariance
/// helper.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn more_than_two_dots_are_accepted() {
    let inputs = ["Foo/.../Sandbox", "Sandbox/..."];
    test_no_change_with_whitespace_added("en.wikipedia.org", inputs).await;
}
/// Titles with one or two tildes are run through the whitespace-invariance
/// helper.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn one_or_two_tildes_are_accepted() {
    let inputs = ["~", "A~~"];
    test_no_change_with_whitespace_added("en.wikipedia.org", inputs).await;
}
/// Runs a list of assorted titles (quotes, namespaces, fragments, non-ASCII
/// text, a backslash) through the whitespace-invariance helper.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn title_parses_identically_after_leading_and_trailing_whitespace_is_added(
) {
    let inputs = [
        "Sandbox",
        "A \"B\"",
        "A 'B'",
        ".com",
        "Test#Abc",
        "\"",
        "'",
        "Talk:Sandbox",
        "Talk:Foo:Sandbox",
        "File:Example.svg",
        "File_talk:Example.svg",
        ":A",
        "-",
        "aũ",
        "Foo & bar",
        "\"Believing_Women\"_in_Islam._Unreading_Patriarchal_Interpretations_of_the_Qur\\\'ān",
    ];
    test_no_change_with_whitespace_added("en.wikipedia.org", inputs).await;
}
/// Parses every input with the codec for `domain` and asserts that it
/// succeeds and that its underscore form equals the paired expected string.
///
/// Each element of `tests` is `(input, expected_underscore_form)`.
async fn test_success<const N: usize>(domain: &str, tests: [(&str, &str); N]) {
    let title_codec = codec(domain).await;
    for (raw, expected) in tests {
        let parsed = title_codec.new_title(raw);
        assert!(parsed.is_ok(), "{:?}\t{:?}", parsed, expected);
        let underscored = match parsed {
            Ok(title) => Some(title_codec.to_underscores(&title)),
            Err(_) => None,
        };
        assert_eq!(underscored.as_deref(), Some(expected));
    }
}
/// Checks the underscore form produced for inputs with leading colons,
/// surrounding or repeated whitespace, Unicode space characters,
/// directional formatting characters and fragments.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn whitespace_is_trimmed_and_leading_colons_are_removed() {
    let cases = [
        ("Test", "Test"),
        (":Test", "Test"),
        (": Test", "Test"),
        (":_Test_", "Test"),
        ("Test 123 456 789", "Test_123_456_789"),
        ("💩", "💩"),
        ("Talk: foo", "Talk:Foo"),
        ("X-Men (film series) #Gambit", "X-Men_(film_series)"),
        ("Foo _ bar", "Foo_bar"),
        (
            "Foo \u{00A0}\u{1680}\u{180E}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000} bar",
            "Foo_bar",
        ),
        (
            "Foo\u{200E}\u{200F}\u{202A}\u{202B}\u{202C}\u{202D}\u{202E}bar",
            "Foobar",
        ),
        (
            "list of Neighbours characters (2016)#Tom Quill",
            "List_of_Neighbours_characters_(2016)",
        ),
    ];
    test_success("en.wikipedia.org", cases).await;
}
/// When the text before the colon is not a recognised prefix, the text after
/// the colon keeps its case and its leading space (as an underscore).
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn titles_with_fake_namespaces_arent_trimmed_or_capitalized_after_colon()
{
    let cases = [
        ("Foo:bar", "Foo:bar"),
        ("Foo: bar", "Foo:_bar"),
        ("int:eger", "Int:eger"),
    ];
    test_success("en.wikipedia.org", cases).await;
}
/// The `WP:` prefix must come out as `Wikipedia:` in the underscore form.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn namespace_aliases_map_to_local_namespace_names() {
    let cases = [("WP:eger", "Wikipedia:Eger")];
    test_success("en.wikipedia.org", cases).await;
}
/// Expands a bracketed list of `(input, expected)` string-literal pairs into
/// an array in which every pair is followed by a copy with `"/subpage"`
/// appended to both halves.
macro_rules! add_subpage {
    ([ $(($title:literal, $normalized:literal)),* $(,)? ]) => {
        [
            $(
                ($title, $normalized),
                (
                    concat!($title, "/subpage"),
                    concat!($normalized, "/subpage")
                )
            ),*
        ]
    };
}
/// IPv4 addresses under `User:` must lose the leading zeros of each octet,
/// with and without a trailing `/subpage`.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn ipv4_address_is_sanitized_in_user_namespaces() {
    let cases = add_subpage!([
        ("User:127.000.000.001", "User:127.0.0.1"),
        ("User:0.0.0.0", "User:0.0.0.0"),
        ("User:00.00.00.00", "User:0.0.0.0"),
        ("User:000.000.000.000", "User:0.0.0.0"),
        ("User:141.000.011.253", "User:141.0.11.253"),
        ("User: 1.2.4.5", "User:1.2.4.5"),
        ("User:01.02.04.05", "User:1.2.4.5"),
        ("User:001.002.004.005", "User:1.2.4.5"),
        ("User:010.0.000.1", "User:10.0.0.1"),
        ("User:080.072.250.04", "User:80.72.250.4"),
    ]);
    test_success("en.wikipedia.org", cases).await;
}
/// IPv6 addresses under `User:` and `User talk:` must come out expanded to
/// eight groups in upper case, with and without a trailing `/subpage`.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn ipv6_address_is_sanitized_in_user_namespaces() {
    let cases = add_subpage!([
        ("User:::1", "User:0:0:0:0:0:0:0:1"),
        ("User:0:0:0:0:0:0:0:1", "User:0:0:0:0:0:0:0:1"),
        ("User:cebc:2004:f::", "User:CEBC:2004:F:0:0:0:0:0"),
        ("User:::", "User:0:0:0:0:0:0:0:0"),
        ("User:0:0:0:1::", "User:0:0:0:1:0:0:0:0"),
        ("User:3f:535::e:fbb", "User:3F:535:0:0:0:0:E:FBB"),
        ("User Talk:::1", "User_talk:0:0:0:0:0:0:0:1"),
        ("User_Talk:::1", "User_talk:0:0:0:0:0:0:0:1"),
        ("User_talk:::1", "User_talk:0:0:0:0:0:0:0:1"),
        ("User_talk:::1/24", "User_talk:0:0:0:0:0:0:0:1/24"),
    ]);
    test_success("en.wikipedia.org", cases).await;
}
/// `User:` titles that only resemble IP addresses must keep their digits
/// unchanged.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn non_ip_addresses_are_not_sanitized() {
    let cases = add_subpage!([
        ("User:Bar.01", "User:Bar.01"),
        ("User:Bar.010", "User:Bar.010"),
        ("User:00.00.00. 00", "User:00.00.00._00"),
        ("User:00.00.00.00 / subpage", "User:00.00.00.00_/_subpage"),
    ]);
    test_success("en.wikipedia.org", cases).await;
}
/// IP-like titles without a `User:` prefix must keep their digits and
/// colons; only first-letter capitalisation and trimming apply.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn ip_addresses_outside_of_user_namespaces_are_not_sanitized() {
    let cases = add_subpage!([
        ("0:0:0:0:0:0:0:1", "0:0:0:0:0:0:0:1"),
        ("127.000.000.001", "127.000.000.001"),
        ("0.0.0.0", "0.0.0.0"),
        ("00.00.00.00", "00.00.00.00"),
        ("000.000.000.000", "000.000.000.000"),
        ("141.000.011.253", "141.000.011.253"),
        (" 1.2.4.5", "1.2.4.5"),
        ("01.02.04.05", "01.02.04.05"),
        ("001.002.004.005", "001.002.004.005"),
        ("010.0.000.1", "010.0.000.1"),
        ("080.072.250.04", "080.072.250.04"),
        ("Foo.1000.00", "Foo.1000.00"),
        ("Bar.01", "Bar.01"),
        ("Bar.010", "Bar.010"),
        ("cebc:2004:f::", "Cebc:2004:f::"),
        ("0:0:0:1::", "0:0:0:1::"),
        ("3f:535::e:fbb", "3f:535::e:fbb"),
    ]);
    test_success("en.wikipedia.org", cases).await;
}
/// Checks first-letter capitalisation for characters whose expected result
/// is either themselves or a single-character title-case form.
///
/// U+0149 and the Latin presentation-form ligatures U+FB00–U+FB06 are
/// written as `\u{…}` escapes on purpose. If an editor or text normaliser
/// expands them to their ASCII spellings (`"ff"`, `"fi"`, `"st"`, …), the
/// entries become ordinary lowercase words that are capitalised (`"Ff"`),
/// and the expectations below no longer hold.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn title_capitalization_follows_php_rules() {
    let cases = [
        ("ß", "ß"),
        // LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
        ("\u{0149}", "\u{0149}"),
        ("ǰ", "ǰ"),
        ("ΐ", "ΐ"),
        ("ΰ", "ΰ"),
        ("և", "և"),
        ("ẖ", "ẖ"),
        ("ẗ", "ẗ"),
        ("ẘ", "ẘ"),
        ("ẙ", "ẙ"),
        ("ẚ", "ẚ"),
        ("ὐ", "ὐ"),
        ("ὒ", "ὒ"),
        ("ὔ", "ὔ"),
        ("ὖ", "ὖ"),
        ("ᾀ", "ᾈ"),
        ("ᾁ", "ᾉ"),
        ("ᾂ", "ᾊ"),
        ("ᾃ", "ᾋ"),
        ("ᾄ", "ᾌ"),
        ("ᾅ", "ᾍ"),
        ("ᾆ", "ᾎ"),
        ("ᾇ", "ᾏ"),
        ("ᾐ", "ᾘ"),
        ("ᾑ", "ᾙ"),
        ("ᾒ", "ᾚ"),
        ("ᾓ", "ᾛ"),
        ("ᾔ", "ᾜ"),
        ("ᾕ", "ᾝ"),
        ("ᾖ", "ᾞ"),
        ("ᾗ", "ᾟ"),
        ("ᾠ", "ᾨ"),
        ("ᾡ", "ᾩ"),
        ("ᾢ", "ᾪ"),
        ("ᾣ", "ᾫ"),
        ("ᾤ", "ᾬ"),
        ("ᾥ", "ᾭ"),
        ("ᾦ", "ᾮ"),
        ("ᾧ", "ᾯ"),
        // LATIN SMALL LIGATURE FF, FI, FL, FFI, FFL, LONG S T, ST
        ("\u{FB00}", "\u{FB00}"),
        ("\u{FB01}", "\u{FB01}"),
        ("\u{FB02}", "\u{FB02}"),
        ("\u{FB03}", "\u{FB03}"),
        ("\u{FB04}", "\u{FB04}"),
        ("\u{FB05}", "\u{FB05}"),
        ("\u{FB06}", "\u{FB06}"),
        ("ﬓ", "ﬓ"),
        ("ﬔ", "ﬔ"),
        ("ﬕ", "ﬕ"),
        ("ﬖ", "ﬖ"),
        ("ﬗ", "ﬗ"),
        ("ⓝ", "ⓝ"),
    ];
    test_success("en.wikipedia.org", cases).await;
}
/// A leading `i` must become `İ` on the tr, az, kk and kaa wikis and a plain
/// `I` on the English wiki, as seen in the pretty form of the title.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn dotted_i_is_uppercased_to_dotted_capital_i_according_to_language() {
    let cases = [
        ("tr.wikipedia.org", "iTestTest", "İTestTest"),
        ("az.wikipedia.org", "iTestTest", "İTestTest"),
        ("kk.wikipedia.org", "iTestTest", "İTestTest"),
        ("kaa.wikipedia.org", "iTestTest", "İTestTest"),
        ("en.wikipedia.org", "iTestTest", "ITestTest"),
    ];
    for (domain, input, expected) in cases {
        let title_codec = codec(domain).await;
        let pretty = match title_codec.new_title(input) {
            Ok(title) => Some(title_codec.to_pretty(&title)),
            Err(_) => None,
        };
        assert_eq!(pretty.as_deref(), Some(expected));
    }
}
/// The `meta:` prefix must come out in lower case whatever its input case,
/// and the text after it must be left untouched.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn interwiki_prefix_is_normalized() {
    let cases = [("meta:foobar", "meta:foobar"), ("Meta:foo", "meta:foo")];
    test_success("en.wikipedia.org", cases).await;
}
/// With a `w:` or `en:` prefix on the English wiki, the prefix is dropped
/// and the rest is normalised like a local title.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn title_with_local_interwiki_is_capitalized_according_to_namespace() {
    let cases = [
        ("w:talk:foo", "Talk:Foo"),
        ("w:spaces in page", "Spaces_in_page"),
        ("en:w:Sandbox_", "Sandbox"),
        ("en:", "Main_Page"),
    ];
    test_success("en.wikipedia.org", cases).await;
}
/// Inputs made up only of `w:` / `en:` prefixes must resolve to the main
/// page, in both the underscore form and the pretty form.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn local_interwiki_with_mainspace_and_empty_title_points_to_main_page() {
    let underscore_cases = [("en:", "Main_Page"), ("w:", "Main_Page")];
    test_success("en.wikipedia.org", underscore_cases).await;

    let title_codec = codec("en.wikipedia.org").await;
    for empty_title in ["w:", "w:en:", "en:w:"] {
        let pretty = match title_codec.new_title(empty_title) {
            Ok(title) => Some(title_codec.to_pretty(&title)),
            Err(_) => None,
        };
        assert_eq!(pretty.as_deref(), Some("Main Page"),);
    }
}
/// `w: Talk:` has a namespace but no page name and must produce
/// `title-invalid-empty`.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn local_interwiki_with_namespace_and_empty_dbkey_is_rejected() {
    let cases = [("w: Talk:", "title-invalid-empty")];
    test_failure("en.wikipedia.org", cases).await;
}
/// With namespace 10 passed as the default, every listed spelling of the
/// title (bare, prefixed, behind `w:` / `en:`) must have the pretty form
/// `Template:Lang`.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn default_namespace_is_assigned_to_title_without_namespace_prefix() {
    const NS_TEMPLATE: i32 = 10;
    let title_codec = codec("en.wikipedia.org").await;
    let spellings = [
        "lang",
        "Lang",
        "Template:lang",
        "Template:Lang",
        "w:template:lang",
        "w:en:template:lang",
        "en:w:template:lang",
    ];
    for template_title in spellings {
        let parsed =
            title_codec.new_title_with_namespace(template_title, NS_TEMPLATE);
        let pretty = match parsed {
            Ok(title) => Some(title_codec.to_pretty(&title)),
            Err(_) => None,
        };
        assert_eq!(pretty.as_deref(), Some("Template:Lang"),);
    }
}
/// `to_pretty_with_fragment` must render underscores as spaces in both the
/// page name and the fragment.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn test_to_pretty_with_fragment() {
    let title_codec = codec("en.wikipedia.org").await;
    let pretty = match title_codec.new_title("Main_Page#Did_you_know_...") {
        Ok(title) => Some(title_codec.to_pretty_with_fragment(&title)),
        Err(_) => None,
    };
    assert_eq!(pretty.as_deref(), Some("Main Page#Did you know ..."),);
}
/// A `formatversion=1` siteinfo response must deserialise into
/// [`SiteInfoResponse`] and be accepted by [`TitleCodec::from_site_info`].
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn formatversion_one() {
    let url =
        "https://en.wikipedia.org/w/api.php?action=query&meta=siteinfo&siprop=general|namespaces|namespacealiases|interwikimap&formatversion=1&format=json";
    let http_response = reqwest::get(url).await.unwrap();
    let site_info: SiteInfoResponse = http_response.json().await.unwrap();
    TitleCodec::from_site_info(site_info.query).unwrap();
}