/*
Copyright (C) Tim Starling
Copyright (C) Daniel Kinzler
Copyright (C) 2021 Kunal Mehta <legoktm@debian.org>
Copyright (C) 2021 Erutuon
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
use crate::ip::sanitize_ip;
#[cfg(feature = "utils")]
#[cfg_attr(docs, doc(cfg(feature = "utils")))]
use crate::SiteInfoResponse;
use crate::{
php, Error, Interwiki, InterwikiSet, NamespaceAlias, NamespaceInfo,
NamespaceMap, Result, SiteInfo, Title, NS_MAIN,
};
#[cfg(feature = "utils")]
#[cfg_attr(docs, doc(cfg(feature = "utils")))]
use flate2::read::GzDecoder;
use regex::bytes::Regex;
#[cfg(feature = "utils")]
#[cfg_attr(docs, doc(cfg(feature = "utils")))]
use std::{fs::File, io::Read, path::Path, sync::Arc};
// Namespace ID constants for readability; used by `secure_and_split` for the
// special-page length exception, the Talk: prefix check, and IP-username
// sanitization. The values match the well-known MediaWiki namespace IDs.
const NS_SPECIAL: i32 = -1;
const NS_TALK: i32 = 1;
const NS_USER: i32 = 2;
const NS_USER_TALK: i32 = 3;
/// The `TitleCodec` is responsible for parsing, normalizing and formatting
/// `Title`s. See the crate-level documentation for an example of how to
/// construct one.
#[cfg_attr(docs, doc(cfg(feature = "parsing")))]
#[derive(Clone, Debug)]
pub struct TitleCodec {
    // Maps namespace names and aliases to IDs and back; also consulted for
    // per-namespace capitalization via `is_capitalized`.
    namespace_map: NamespaceMap,
    // All recognized interwiki prefixes.
    interwiki_set: InterwikiSet,
    // Interwiki prefixes that point back at this wiki itself; links using
    // them behave like initial-colon local links.
    local_interwiki_set: InterwikiSet,
    // Title of the wiki's main page; target for empty local-interwiki links.
    main_page: String,
    // Content language code, used for language-specific capitalization
    // (e.g. Turkish dotted I) in `uppercase_first`.
    lang: String,
    // Compiled regex of characters/sequences forbidden in titles; built in
    // `TitleCodec::new()` from the site's `legal_title_chars`.
    illegal_patterns: Regex,
}
#[test]
fn title_codec_is_send_and_sync() {
    // Compile-time assertion: this stops building if `TitleCodec` ever
    // loses its ability to be shared or sent across threads.
    fn require_thread_safe<T: Send + Sync>() {}
    require_thread_safe::<TitleCodec>();
}
impl TitleCodec {
    /// Create a new title by parsing the provided input.
    pub fn new_title(&self, input: &str) -> Result<Title> {
        self.secure_and_split(input, NS_MAIN)
    }

    /// Create a new title by parsing the provided input. If the title has no
    /// namespace part, then the namespace specified by `default_namespace` is
    /// used instead.
    pub fn new_title_with_namespace(
        &self,
        input: &str,
        default_namespace: i32,
    ) -> Result<Title> {
        self.secure_and_split(input, default_namespace)
    }

    /// Get the title with namespace in pretty aka text form (spaces).
    ///
    /// Fragments will not be included.
    ///
    /// # Panics
    ///
    /// This will panic if the `Title` is in a namespace that this `TitleCodec`
    /// is unaware of.
    pub fn to_pretty(&self, title: &Title) -> String {
        self.namespace_map
            .to_pretty(title)
            .expect("unknown namespace")
    }

    /// Get the title with namespace in underscore aka dbkey form. This is
    /// potentially useful when you want to make a database query.
    ///
    /// Fragments will not be included.
    ///
    /// # Panics
    ///
    /// This will panic if the `Title` is in a namespace that this `TitleCodec`
    /// is unaware of.
    pub fn to_underscores(&self, title: &Title) -> String {
        self.namespace_map
            .to_underscores(title)
            .expect("unknown namespace")
    }

    /// Get the title with namespace in pretty aka text form (spaces), with the
    /// fragment, if one exists, appended.
    ///
    /// # Panics
    ///
    /// This will panic if the `Title` is in a namespace that this `TitleCodec`
    /// is unaware of.
    pub fn to_pretty_with_fragment(&self, title: &Title) -> String {
        self.namespace_map
            .to_pretty_with_fragment(title)
            .expect("unknown namespace")
    }

    /// Construct a new `TitleCodec` using the given fields.
    ///
    /// In most cases it is easier to do so from one of the siteinfo methods.
    ///
    /// # Errors
    ///
    /// Returns an error if `legal_title_chars` cannot be compiled into the
    /// illegal-characters regex.
    pub fn new(
        namespace_map: NamespaceMap,
        interwiki_set: InterwikiSet,
        local_interwiki_set: InterwikiSet,
        main_page: String,
        lang: String,
        legal_title_chars: String,
    ) -> Result<Self> {
        // Copied from `MediaWikiTitleCodec::getTitleInvalidRegex()`.
        // The `legal_title_chars` portion has to be changed when this lands:
        // https://phabricator.wikimedia.org/T297340
        // Matching titles will be held as illegal.
        let illegal_patterns = Regex::new(&format!(
            r"(?x-u)
            # x: ignore whitespace and allow comments;
            # -u: disable code point matching
            # so that \x80-\xff match bytes 0x80-0xFF
            # (corresponding to all non-ASCII code points, U+0080-U+10FFFF)
            # rather than code points U+0080-U+00FF.
            # Any character not allowed is forbidden...
            [^{legal_title_chars}]
            # URL percent encoding sequences interfere with the ability
            # to round-trip titles -- you can't link to them consistently.
            | %[0-9A-Fa-f]{{2}}
            # XML/HTML character references produce similar issues.
            | &[A-Za-z0-9\x80-\xff]+;
            ",
            // / does not need to be escaped as \/ in Rust regex.
            legal_title_chars = legal_title_chars.replace(r"\/", "/")
        ))?;
        Ok(Self {
            namespace_map,
            interwiki_set,
            local_interwiki_set,
            illegal_patterns,
            main_page,
            lang,
        })
    }

    /// Create a new `TitleCodec` getting namespaces, namespace aliases, and interwikis from iterators.
    pub fn new_from_iters<
        N: IntoIterator<Item = NamespaceInfo>,
        A: IntoIterator<Item = NamespaceAlias>,
        I: IntoIterator<Item = Interwiki>,
    >(
        namespaces: N,
        namespace_aliases: A,
        interwikis: I,
        main_page: String,
        lang: String,
        legal_title_chars: String,
    ) -> Result<Self> {
        // Split the interwikis into (all, local-only) sets up front.
        let (interwiki_set, local_interwiki_set) =
            InterwikiSet::all_and_local_from_iter(interwikis);
        let namespace_map =
            NamespaceMap::from_namespaces_and_namespace_aliases(
                namespaces,
                namespace_aliases,
            )?;
        Self::new(
            namespace_map,
            interwiki_set,
            local_interwiki_set,
            main_page,
            lang,
            legal_title_chars,
        )
    }

    /// Creates a `TitleCodec` by parsing the contents of a JSON or GZipped JSON file.
    ///
    /// Will accept the `siteinfo-namespaces.json.gz` file from in the Wikimedia dumps.
    /// If the file extension is `gz`, decompresses from the GZip format before deserializing the JSON;
    /// otherwise attempts to deserialize the file contents directly.
    #[cfg(feature = "utils")]
    #[cfg_attr(docs, doc(cfg(feature = "utils")))]
    pub fn from_path(path: &Path) -> Result<Self> {
        // NOTE: the extension check is case-sensitive, so `.GZ` files are
        // read as plain JSON.
        let json = if path.extension() == Some("gz".as_ref()) {
            let gz = File::open(path)
                .map_err(|source| Error::from_io("open file", source, path))?;
            let mut decoder = GzDecoder::new(gz);
            let mut decoded = String::new();
            decoder
                .read_to_string(&mut decoded)
                .map_err(|source| Error::from_io("parse GZip", source, path))?;
            decoded
        } else {
            std::fs::read_to_string(path).map_err(|source| {
                Error::from_io("read file to string", source, path)
            })?
        };
        Self::from_json_with_path(&json, Some(path))
    }

    /// Creates a `TitleCodec` by parsing the contents of a `Read` type that contains the JSON
    /// representation of a [`SiteInfoResponse`].
    #[cfg(feature = "utils")]
    #[cfg_attr(docs, doc(cfg(feature = "utils")))]
    pub fn from_reader<R: Read>(reader: R) -> Result<Self> {
        Self::from_site_info(
            serde_json::from_reader::<R, SiteInfoResponse>(reader)
                .map_err(|source| Error::Json {
                    source: Arc::new(source),
                })?
                .query,
        )
    }

    /// Creates a `TitleCodec` by parsing the JSON representation of a [`SiteInfoResponse`].
    #[cfg(feature = "utils")]
    #[cfg_attr(docs, doc(cfg(feature = "utils")))]
    pub fn from_json<S: AsRef<str>>(json: S) -> Result<Self> {
        Self::from_json_with_path(json.as_ref(), None)
    }

    /// Creates a `TitleCodec` by parsing the JSON representation of a [`SiteInfoResponse`].
    ///
    /// # Errors
    ///
    /// If this fails and `path` is `Some(_)`, gives an error message
    /// that mentions `path`.
    #[cfg(feature = "utils")]
    #[cfg_attr(docs, doc(cfg(feature = "utils")))]
    fn from_json_with_path(json: &str, path: Option<&Path>) -> Result<Self> {
        Self::from_site_info(
            serde_json::from_str::<SiteInfoResponse>(json)
                .map_err(|source| {
                    let source = Arc::new(source);
                    if let Some(path) = path {
                        Error::JsonFile {
                            source,
                            path: path.into(),
                        }
                    } else {
                        Error::Json { source }
                    }
                })?
                .query,
        )
    }

    /// Create a new `TitleCodec` using the provided [`SiteInfo`].
    ///
    /// The `SiteInfo` must include a non-empty `interwiki_map` field
    /// to enable the resulting `TitleCodec`
    /// to correctly parse titles with interwikis,
    /// but an empty `interwiki_map` is not an error.
    pub fn from_site_info(site_info: SiteInfo) -> Result<Self> {
        Self::new_from_iters(
            site_info.namespaces.into_values(),
            site_info.namespace_aliases,
            site_info.interwiki_map,
            site_info.general.main_page,
            site_info.general.lang,
            site_info.general.legal_title_chars,
        )
    }

    /// Equivalent of `MediaWikiTitleCodec::splitTitleString()`.
    ///
    /// Most comments are direct copies to make it easier to compare with
    /// the MediaWiki implementation.
    ///
    /// Parses `input` into a `Title`: normalizes whitespace, splits off an
    /// optional interwiki and/or namespace prefix and an optional `#fragment`,
    /// validates the remaining dbkey, and capitalizes it where configured.
    fn secure_and_split(
        &self,
        input: &str,
        default_namespace: i32,
    ) -> Result<Title> {
        let mut namespace = default_namespace;
        // Strip Unicode bidi override characters.
        // Clean up whitespace.
        let mut dbkey = normalize_title_chars(input);
        let mut fragment = None;
        let mut interwiki = None;
        let mut local_interwiki = false;
        // U+FFFD is the replacement character
        if dbkey.contains('\u{FFFD}') {
            // Contained illegal UTF-8 sequences or forbidden Unicode chars.
            return Err(Error::IllegalUtf8(input.to_string()));
        }
        // Skip "Contained illegal UTF-8 sequences or forbidden Unicode chars.",
        // because all Rust strings are valid UTF-8.
        // Initial colon indicates main namespace rather than specified default
        // but should not create invalid {ns,title} pairs such as {0,Project:Foo}
        if dbkey.get(0..1) == Some(":") {
            namespace = NS_MAIN;
            // remove the colon but continue processing
            dbkey.drain(..1);
            // remove any subsequent whitespace
            trim_title_whitespace(&mut dbkey);
        }
        if dbkey.is_empty() {
            return Err(Error::Empty(input.to_string()));
        }
        // Returns the prefix of `s` covered by `range_to` with trailing
        // underscores trimmed off; `None` when the slice is empty before
        // trimming or `range_to.end` is not a char boundary.
        fn get_nonempty_trimmed(
            s: &str,
            range_to: std::ops::RangeTo<usize>,
        ) -> Option<&str> {
            s.get(range_to)
                .filter(|p| !p.is_empty())
                .map(|s| s.trim_end_matches('_'))
        }
        // Namespace or interwiki prefix
        // `MediaWikiTitleCodec` uses a regex here, but we're going to use string
        // parsing instead.
        // The loop only repeats for local interwikis (via `continue` below),
        // which need a second namespace split.
        loop {
            if let Some(colon_pos) = dbkey.find(':') {
                if let Some(prefix) = get_nonempty_trimmed(&dbkey, ..colon_pos)
                {
                    if let Some(ns) = self.namespace_map.get_id(prefix) {
                        // Ordinary namespace
                        namespace = ns;
                        dbkey.drain(..colon_pos + 1);
                        trim_title_whitespace(&mut dbkey);
                        // For Talk:X pages, check if X has a "namespace" prefix
                        if ns == NS_TALK {
                            if let Some(colon_pos) = dbkey.find(':') {
                                // Disallow Talk:File:x or Talk:Interwiki:x type titles ...
                                if let Some(prefix) =
                                    get_nonempty_trimmed(&dbkey, ..colon_pos)
                                {
                                    if self
                                        .namespace_map
                                        .get_id(prefix)
                                        .is_some()
                                        || self.interwiki_set.contains(prefix)
                                    {
                                        return Err(Error::TalkNamespace(
                                            input.to_string(),
                                        ));
                                    }
                                }
                            }
                        }
                    } else if self.interwiki_set.contains(prefix) {
                        // Check this using prefix before we mutably borrow dbkey
                        let is_local_interwiki =
                            self.local_interwiki_set.contains(prefix);
                        interwiki = Some(prefix.to_lowercase());
                        dbkey.drain(..colon_pos + 1);
                        trim_title_whitespace(&mut dbkey);
                        if is_local_interwiki {
                            if dbkey.is_empty() {
                                // Empty self-links should point to the Main Page, to ensure
                                // compatibility with cross-wiki transclusions and the like.
                                return Ok(self
                                    .new_title(&self.main_page)
                                    .map(|mut title| {
                                        title.local_interwiki = true;
                                        title
                                    })
                                    .unwrap_or_else(|_| {
                                        // Fallback to hardcoded "Main Page" if the configured main page
                                        // value is unparseable
                                        Title {
                                            namespace: NS_MAIN,
                                            dbkey: "Main_Page".to_string(),
                                            fragment: None,
                                            interwiki: None,
                                            local_interwiki: true,
                                        }
                                    }));
                            }
                            interwiki = None;
                            // local interwikis should behave like initial-colon links
                            local_interwiki = true;
                            // Do another namespace split...
                            continue;
                        }
                        // If there's an initial colon after the interwiki, that also
                        // resets the default namespace
                        if dbkey.starts_with(':') {
                            namespace = NS_MAIN;
                            dbkey.drain(..1);
                            trim_title_whitespace(&mut dbkey);
                        }
                    }
                }
            }
            // If there's no recognized interwiki or namespace,
            // then let the colon expression be part of the title.
            break;
        }
        // Split off the fragment; underscores in the fragment display as
        // spaces.
        if let Some((key, f)) = dbkey.split_once('#') {
            fragment = Some(f.replace('_', " "));
            let key_len = key.len(); // to satisfy borrow checker
            dbkey.truncate(key_len);
            // remove whitespace again: prevents "Foo_bar_#"
            // becoming "Foo_bar_"
            trim_title_whitespace(&mut dbkey);
        }
        // Reject illegal characters.
        if self.illegal_patterns.is_match(dbkey.as_bytes()) {
            return Err(Error::Characters(input.to_string()));
        }
        // Pages with "/./" or "/../" appearing in the URLs will often be un-
        // reachable due to the way web browsers deal with 'relative' URLs.
        // Also, they conflict with subpage syntax. Forbid them explicitly.
        if dbkey == "."
            || dbkey == ".."
            || dbkey.starts_with("./")
            || dbkey.starts_with("../")
            || dbkey.contains("/./")
            || dbkey.contains("/../")
            || dbkey.ends_with("/.")
            || dbkey.ends_with("/..")
        {
            return Err(Error::Relative(input.to_string()));
        }
        // Magic tilde sequences? Nu-uh!
        if dbkey.contains("~~~") {
            return Err(Error::MagicTildes(input.to_string()));
        }
        // Limit the size of titles to 255 bytes. This is typically the size of the
        // underlying database field. We make an exception for special pages, which
        // don't need to be stored in the database, and may edge over 255 bytes due
        // to subpage syntax for long titles, e.g. [[Special:Block/Long name]]
        let max_length = if namespace == NS_SPECIAL { 512 } else { 255 };
        if dbkey.len() > max_length {
            return Err(Error::TooLong(input.to_string()));
        }
        // Normally, all wiki links are forced to have an initial capital letter so [[foo]]
        // and [[Foo]] point to the same place. Don't force it for interwikis, since the
        // other site might be case-sensitive.
        if interwiki.is_none()
            && self
                .namespace_map
                .is_capitalized(namespace)
                .unwrap_or(false)
        {
            uppercase_first(&self.lang, &mut dbkey);
        }
        // Can't make a link to a namespace alone... "empty" local links can only be
        // self-links with a fragment identifier.
        // MediaWiki allows for links with just a fragment, but we won't.
        if dbkey.is_empty() && interwiki.is_none() && namespace != NS_MAIN {
            return Err(Error::Empty(input.to_string()));
        }
        if namespace == NS_USER || namespace == NS_USER_TALK {
            sanitize_ip(&mut dbkey);
        }
        // Any remaining initial :s are illegal.
        if dbkey.starts_with(':') {
            return Err(Error::LeadingColon(input.to_string()));
        }
        Ok(Title {
            namespace,
            dbkey,
            fragment,
            interwiki,
            local_interwiki,
        })
    }
}
/// Indicates whether a code point counts as whitespace when found in a title.
///
/// Covers the code points with the White_Space property
/// (see [PropList.txt](https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt)),
/// excluding the control characters U+0009-U+000D (tab, newline, vertical tab,
/// form feed, carriage return) and U+0085 (next line), and adding
/// U+180E MONGOLIAN VOWEL SEPARATOR, a format character (General Category: Cf).
/// The control characters U+0009-U+000D are rejected by the
/// `illegal_patterns` regex; U+0085 is accepted as a valid character.
fn is_title_whitespace(c: char) -> bool {
    match c {
        // U+0020 SPACE and U+005F LOW LINE (the title word separator)
        ' ' | '_' => true,
        // U+00A0 NO-BREAK SPACE, U+1680 OGHAM SPACE MARK,
        // U+180E MONGOLIAN VOWEL SEPARATOR
        '\u{A0}' | '\u{1680}' | '\u{180E}' => true,
        // U+2000-U+200A: EN QUAD, EM QUAD, EN SPACE, EM SPACE,
        // THREE-PER-EM SPACE, FOUR-PER-EM SPACE, SIX-PER-EM SPACE,
        // FIGURE SPACE, PUNCTUATION SPACE, THIN SPACE, HAIR SPACE
        '\u{2000}'..='\u{200A}' => true,
        // U+2028 LINE SEPARATOR, U+2029 PARAGRAPH SEPARATOR,
        // U+202F NARROW NO-BREAK SPACE, U+205F MEDIUM MATHEMATICAL SPACE,
        // U+3000 IDEOGRAPHIC SPACE
        '\u{2028}' | '\u{2029}' | '\u{202F}' | '\u{205F}' | '\u{3000}' => true,
        _ => false,
    }
}
/// Indicates that a character is a directional formatting character
/// that should be removed from titles.
///
/// MediaWiki strips some
/// [directional formatting characters](https://www.unicode.org/reports/tr9/#Directional_Formatting_Characters)
/// from titles: U+200E and U+200F (LEFT-TO-RIGHT MARK, RIGHT-TO-LEFT MARK)
/// and U+202A-U+202E (LEFT-TO-RIGHT EMBEDDING, RIGHT-TO-LEFT EMBEDDING,
/// POP DIRECTIONAL FORMATTING, LEFT-TO-RIGHT OVERRIDE, RIGHT-TO-LEFT OVERRIDE).
/// All of these were introduced in Unicode 1.1 and are referred to as bidi
/// override characters in the source code of
/// `MediaWikiTitleCodec::splitTitleString()`.
///
/// The directional formatting characters introduced in
/// [Unicode 6.3](https://www.unicode.org/versions/Unicode6.3.0/) (2013)
/// are *not* stripped: U+061C (ARABIC LETTER MARK) and U+2066-U+2069
/// (LEFT-TO-RIGHT ISOLATE, RIGHT-TO-LEFT ISOLATE, FIRST STRONG ISOLATE,
/// POP DIRECTIONAL ISOLATE).
fn is_bidirectional_override(c: char) -> bool {
    c == '\u{200E}'
        || c == '\u{200F}'
        || ('\u{202A}'..='\u{202E}').contains(&c)
}
/**
 * Normalizes characters in a title.
 *
 * Removes the banned directional formatting characters (see [`is_bidirectional_override`]),
 * strips title whitespace characters (see [`is_title_whitespace`])
 * from the beginning and end of the title,
 * and replaces sequences of one or more title whitespace characters with a single underscore.
 */
fn normalize_title_chars(title: &str) -> String {
    // This gets the minimum possible length of the normalized title.
    // It will be longer than this if there is any untrimmed whitespace.
    let mut out = String::with_capacity(
        title
            .chars()
            .filter(|c| {
                !(is_title_whitespace(*c) || is_bidirectional_override(*c))
            })
            .count(),
    );
    // True while we are inside (or just past) a run of title whitespace that
    // has not yet been emitted as a separator.
    let mut pending_separator = false;
    for c in title.chars() {
        // Directional formatting characters are removed entirely. They are
        // invisible, so they must not interrupt a whitespace run: MediaWiki
        // strips them *before* collapsing whitespace, and the previous
        // implementation here reset the whitespace flag on them, turning
        // e.g. "a \u{200E}b" into "ab" instead of "a_b".
        if is_bidirectional_override(c) {
            continue;
        }
        if is_title_whitespace(c) {
            pending_separator = true;
        } else {
            // Collapse the preceding whitespace run into one underscore,
            // unless it was leading whitespace (`out` still empty).
            if pending_separator && !out.is_empty() {
                out.push('_');
            }
            out.push(c);
            pending_separator = false;
        }
    }
    // Trailing whitespace never emits an underscore: separators are only
    // written immediately before a kept character.
    out
}
#[test]
fn normalize_title_chars_strips_and_collapses_title_whitespace() {
    for (input, expected) in [
        (" a b", "a_b"),  // leading whitespace is stripped
        ("a b ", "a_b"),  // trailing whitespace is stripped
        ("a b", "a_b"),   // an interior space becomes an underscore
        ("a__b", "a_b"),  // runs of separators collapse to a single one
    ] {
        assert_eq!(normalize_title_chars(input), expected);
    }
}
#[test]
fn normalize_title_chars_removes_directional_control_characters() {
    // U+200E (LEFT-TO-RIGHT MARK) must vanish wherever it appears.
    for (input, expected) in [
        ("\u{200E}_a_b", "a_b"),
        ("a\u{200E}_b ", "a_b"),
        ("a_b\u{200E}", "a_b"),
        ("a_\u{200E}_b", "a_b"),
    ] {
        assert_eq!(normalize_title_chars(input), expected);
    }
}
/// Trims leading and trailing underscores from `s` in place.
///
/// Underscores are the only title whitespace that can remain at this point:
/// `normalize_title_chars` has already folded every whitespace run into a
/// single `_`, so this is what "trim whitespace" means for a dbkey.
fn trim_title_whitespace(s: &mut String) {
    // Index of the first byte that is not '_'. If there is none, the string
    // is entirely underscores and trims to nothing. (The previous
    // implementation used `unwrap_or(0)` here, leaving all-underscore
    // strings untouched.)
    let title_start = match s.bytes().position(|b| b != b'_') {
        Some(i) => i,
        None => {
            s.clear();
            return;
        }
    };
    // Number of trailing '_' bytes. `title_start` being `Some` guarantees a
    // non-underscore byte exists, so this is always `Some` too.
    let trailing_whitespace_count =
        s.bytes().rev().position(|b| b != b'_').unwrap_or(0);
    // '_' is ASCII, so both indices fall on `char` boundaries and neither
    // `truncate` nor `drain` can panic. Truncate first so the trailing count
    // (computed against the original length) stays valid.
    s.truncate(s.len() - trailing_whitespace_count);
    s.drain(..title_start);
}
#[test]
fn trim_title_whitespace_trims_underscores() {
    // Exercise `trim_title_whitespace` itself. This test previously called
    // `normalize_title_chars` by mistake, leaving the trim helper untested.
    for (input, expected) in [("_a_b", "a_b"), ("a_b_", "a_b"), ("_a_b_", "a_b")]
    {
        let mut s = String::from(input);
        trim_title_whitespace(&mut s);
        assert_eq!(s, expected);
    }
}
// Language codes in which a lowercase `i` capitalizes to the dotted `İ`
// (Azerbaijani, Karakalpak, Kazakh, Turkish); used by `uppercase_first`.
const UPPERCASE_DOTTED_I_LANGUAGES: [&str; 4] = ["az", "kaa", "kk", "tr"];
/// Functional equivalent of `Language::ucfirst()`: uppercases the first
/// character of `input` in place, leaving the rest untouched.
///
/// This is probably not going to be identical because of different Unicode
/// versions in use, but hopefully those cases are so rare we don't hit them.
///
/// Or we could just hardcode a special mapping like MediaWiki does for
/// client-side JavaScript.
fn uppercase_first(lang: &str, input: &mut String) {
    if let Some(first) = input.chars().next() {
        // `Language::ucfirst()` has special handling for the `i` character
        // in some languages, where it uppercases to dotted `İ` rather than `I`
        if first == 'i' && UPPERCASE_DOTTED_I_LANGUAGES.contains(&lang) {
            // 'i' has len_utf8() of 1; 'İ' has len_utf8() of 2
            input.replace_range(..1, "İ");
        } else if php::ALREADY_UPPERCASE.contains(&first) {
            // Skip, do nothing
        } else if let Some(replace) = php::to_uppercase(first) {
            // PHP-compatible single-character uppercase mapping.
            let mut buf = [0_u8; 4];
            input.replace_range(
                ..first.len_utf8(),
                replace.encode_utf8(&mut buf),
            );
        } else if !first.is_uppercase() {
            // `char::to_uppercase` can expand to several characters (e.g.
            // 'ﬁ' uppercases to "FI"). Collect them in order: the previous
            // implementation inserted each char at index 0, which reversed
            // multi-character expansions.
            let upper: String = first.to_uppercase().collect();
            input.replace_range(..first.len_utf8(), &upper);
        }
    }
}
#[test]
fn uppercase_first_respects_dotted_i_langs() {
    // Plain Latin capitalization for English...
    let mut title = String::from("abc");
    uppercase_first("en", &mut title);
    assert_eq!(title, "Abc");

    let mut title = String::from("istanbul");
    uppercase_first("en", &mut title);
    assert_eq!(title, "Istanbul");

    // ...but Turkish capitalizes `i` to the dotted `İ`.
    let mut title = String::from("istanbul");
    uppercase_first("tr", &mut title);
    assert_eq!(title, "İstanbul");
}