use crate::ip::sanitize_ip;
use crate::namespace::{NS_SPECIAL, NS_TALK, NS_USER, NS_USER_TALK};
#[cfg(feature = "utils")]
#[cfg_attr(docsrs, doc(cfg(feature = "utils")))]
use crate::SiteInfoResponse;
use crate::{
php, Error, Interwiki, InterwikiSet, NamespaceAlias, NamespaceInfo,
NamespaceMap, Result, SiteInfo, Title, NS_MAIN,
};
#[cfg(feature = "utils")]
#[cfg_attr(docsrs, doc(cfg(feature = "utils")))]
use flate2::read::GzDecoder;
use regex::bytes::Regex;
#[cfg(feature = "utils")]
#[cfg_attr(docsrs, doc(cfg(feature = "utils")))]
use std::{fs::File, io::Read, path::Path, sync::Arc};
#[cfg_attr(docsrs, doc(cfg(feature = "parsing")))]
#[derive(Clone, Debug)]
/// Parses and normalizes MediaWiki page titles according to a wiki's
/// site configuration: its namespaces, interwiki prefixes, content
/// language, and the set of characters legal in titles.
pub struct TitleCodec {
    /// Maps namespace names/aliases to IDs and back.
    namespace_map: NamespaceMap,
    /// All known interwiki prefixes.
    interwiki_set: InterwikiSet,
    /// Interwiki prefixes that point back at this wiki.
    local_interwiki_set: InterwikiSet,
    /// Title of the wiki's main page (target of a bare local-interwiki link).
    main_page: String,
    /// Site content language code (affects first-letter capitalization).
    lang: String,
    /// Byte-level regex matching characters/sequences forbidden in titles.
    illegal_patterns: Regex,
}
/// Compile-time guarantee that a `TitleCodec` can be shared and moved
/// across threads (e.g. stored in an `Arc` or a lazily-initialized static).
#[test]
fn title_codec_is_send_and_sync() {
    fn require_thread_safe<T>()
    where
        T: Send + Sync,
    {
    }
    require_thread_safe::<TitleCodec>();
}
impl TitleCodec {
/// Parses `input` into a [`Title`], defaulting to the main namespace
/// when the input carries no namespace prefix.
pub fn new_title(&self, input: &str) -> Result<Title> {
    self.secure_and_split(input, NS_MAIN)
}
/// Parses `input` into a [`Title`], using `default_namespace` when the
/// input carries no explicit namespace prefix.
pub fn new_title_with_namespace(
    &self,
    input: &str,
    default_namespace: i32,
) -> Result<Title> {
    self.secure_and_split(input, default_namespace)
}
pub fn new_title_from_database(
&self,
namespace: i32,
dbkey: &str,
) -> Result<Title> {
match self.namespace_map.get_name(namespace) {
Some(name) => {
if name.is_empty() {
self.new_title(dbkey)
} else {
self.new_title(&format!("{name}:{dbkey}"))
}
}
None => Err(Error::UnknownNamespace(namespace)),
}
}
/// Returns the codec's namespace map.
pub fn namespace_map(&self) -> &NamespaceMap {
    &self.namespace_map
}
/// Formats `title` in human-readable ("pretty") form.
///
/// # Panics
/// Panics if the title's namespace is unknown to this codec.
pub fn to_pretty(&self, title: &Title) -> String {
    self.namespace_map
        .to_pretty(title)
        .expect("unknown namespace")
}
/// Formats `title` in underscore (dbkey-style) form.
///
/// # Panics
/// Panics if the title's namespace is unknown to this codec.
pub fn to_underscores(&self, title: &Title) -> String {
    self.namespace_map
        .to_underscores(title)
        .expect("unknown namespace")
}
/// Formats `title` in human-readable form, including its fragment
/// (if any).
///
/// # Panics
/// Panics if the title's namespace is unknown to this codec.
pub fn to_pretty_with_fragment(&self, title: &Title) -> String {
    self.namespace_map
        .to_pretty_with_fragment(title)
        .expect("unknown namespace")
}
/// Creates a codec from already-built components.
///
/// `legal_title_chars` is the body of a character class (in the style of
/// MediaWiki's `$wgLegalTitleChars`) listing the characters allowed in
/// titles; it is compiled into a byte-oriented regex that matches
/// anything *forbidden* in a title.
///
/// # Errors
/// Returns an error if the generated regex fails to compile.
pub fn new(
    namespace_map: NamespaceMap,
    interwiki_set: InterwikiSet,
    local_interwiki_set: InterwikiSet,
    main_page: String,
    lang: String,
    legal_title_chars: String,
) -> Result<Self> {
    let illegal_patterns = Regex::new(&format!(
        r"(?x-u)
# x: ignore whitespace and allow comments;
# -u: disable code point matching
# so that \x80-\xff match bytes 0x80-0xFF
# (corresponding to all non-ASCII code points, U+0080-U+10FFFF)
# rather than code points U+0080-U+00FF.
# Any character not allowed is forbidden...
[^{legal_title_chars}]
# URL percent encoding sequences interfere with the ability
# to round-trip titles -- you can't link to them consistently.
| %[0-9A-Fa-f]{{2}}
# XML/HTML character references produce similar issues.
| &[A-Za-z0-9\x80-\xff]+;
",
        // `\/` is an escaped slash in the PHP-style character class;
        // the Rust regex syntax wants a plain `/`.
        legal_title_chars = legal_title_chars.replace(r"\/", "/")
    ))?;
    Ok(Self {
        namespace_map,
        interwiki_set,
        local_interwiki_set,
        illegal_patterns,
        main_page,
        lang,
    })
}
/// Creates a codec from raw site-info pieces: namespace definitions,
/// namespace aliases, and the interwiki map.
///
/// # Errors
/// Returns an error if the namespace map or the illegal-character
/// regex cannot be built.
pub fn new_from_iters<
    N: IntoIterator<Item = NamespaceInfo>,
    A: IntoIterator<Item = NamespaceAlias>,
    I: IntoIterator<Item = Interwiki>,
>(
    namespaces: N,
    namespace_aliases: A,
    interwikis: I,
    main_page: String,
    lang: String,
    legal_title_chars: String,
) -> Result<Self> {
    let namespace_map =
        NamespaceMap::from_namespaces_and_namespace_aliases(
            namespaces,
            namespace_aliases,
        )?;
    let (all_interwikis, local_interwikis) =
        InterwikiSet::all_and_local_from_iter(interwikis);
    Self::new(
        namespace_map,
        all_interwikis,
        local_interwikis,
        main_page,
        lang,
        legal_title_chars,
    )
}
#[cfg(feature = "utils")]
#[cfg_attr(docsrs, doc(cfg(feature = "utils")))]
/// Loads a codec from a siteinfo JSON file on disk. Files whose
/// extension is `gz` are transparently gunzipped first.
///
/// # Errors
/// Returns an error if the file cannot be opened or read, the gzip
/// stream is invalid, or the JSON cannot be parsed.
pub fn from_path(path: &Path) -> Result<Self> {
    let json = if path.extension() == Some("gz".as_ref()) {
        let gz = File::open(path)
            .map_err(|source| Error::from_io("open file", source, path))?;
        let mut decoder = GzDecoder::new(gz);
        let mut decoded = String::new();
        decoder
            .read_to_string(&mut decoded)
            .map_err(|source| Error::from_io("parse GZip", source, path))?;
        decoded
    } else {
        std::fs::read_to_string(path).map_err(|source| {
            Error::from_io("read file to string", source, path)
        })?
    };
    // Carry the path along so JSON errors can report which file failed.
    Self::from_json_with_path(&json, Some(path))
}
#[cfg(feature = "utils")]
#[cfg_attr(docsrs, doc(cfg(feature = "utils")))]
/// Loads a codec from a reader yielding siteinfo JSON.
///
/// # Errors
/// Returns [`Error::Json`] if deserialization fails.
pub fn from_reader<R: Read>(reader: R) -> Result<Self> {
    Self::from_site_info(
        serde_json::from_reader::<R, SiteInfoResponse>(reader)
            .map_err(|source| Error::Json {
                source: Arc::new(source),
            })?
            .query,
    )
}
#[cfg(feature = "utils")]
#[cfg_attr(docsrs, doc(cfg(feature = "utils")))]
/// Loads a codec from a siteinfo JSON string.
///
/// # Errors
/// Returns [`Error::Json`] if deserialization fails.
pub fn from_json<S: AsRef<str>>(json: S) -> Result<Self> {
    Self::from_json_with_path(json.as_ref(), None)
}
#[cfg(feature = "utils")]
#[cfg_attr(docsrs, doc(cfg(feature = "utils")))]
/// Shared JSON-parsing path: deserializes a siteinfo response and,
/// when a `path` is known, attaches it to any parse error.
fn from_json_with_path(json: &str, path: Option<&Path>) -> Result<Self> {
    let response: SiteInfoResponse =
        serde_json::from_str(json).map_err(|source| {
            let source = Arc::new(source);
            match path {
                Some(path) => Error::JsonFile {
                    source,
                    path: path.into(),
                },
                None => Error::Json { source },
            }
        })?;
    Self::from_site_info(response.query)
}
/// Builds a codec from a parsed [`SiteInfo`] response.
///
/// # Errors
/// Returns an error if the namespace map or the illegal-character
/// regex cannot be built from the site info.
pub fn from_site_info(site_info: SiteInfo) -> Result<Self> {
    Self::new_from_iters(
        site_info.namespaces.into_values(),
        site_info.namespace_aliases,
        site_info.interwiki_map,
        site_info.general.main_page,
        site_info.general.lang,
        site_info.general.legal_title_chars,
    )
}
/// Core title parser: normalizes `input` and splits it into namespace,
/// interwiki prefix, page name (dbkey), and fragment — mirroring
/// MediaWiki's `MediaWikiTitleCodec::splitTitleString`.
///
/// # Errors
/// Returns a specific [`Error`] variant for each way a title can be
/// invalid (empty, illegal characters, relative path pieces, too long,
/// magic tildes, bad talk-namespace nesting, leading colon, bad UTF-8).
fn secure_and_split(
    &self,
    input: &str,
    default_namespace: i32,
) -> Result<Title> {
    let mut namespace = default_namespace;
    // Strip directional controls and collapse title whitespace to '_'.
    let mut dbkey = normalize_title_chars(input);
    let mut fragment = None;
    let mut interwiki = None;
    let mut local_interwiki = false;
    // U+FFFD is the replacement character produced by lossy UTF-8
    // decoding upstream; treat its presence as invalid input.
    if dbkey.contains('\u{FFFD}') {
        return Err(Error::IllegalUtf8(input.to_string()));
    }
    // A leading ':' ("[[:Foo]]") forces the main namespace.
    if dbkey.get(0..1) == Some(":") {
        namespace = NS_MAIN;
        dbkey.drain(..1);
        trim_title_whitespace(&mut dbkey);
    }
    if dbkey.is_empty() {
        return Err(Error::Empty(input.to_string()));
    }
    // Helper: the prefix before `range_to`, with trailing '_' trimmed;
    // None when the slice is empty or not on a char boundary.
    fn get_nonempty_trimmed(
        s: &str,
        range_to: std::ops::RangeTo<usize>,
    ) -> Option<&str> {
        s.get(range_to)
            .filter(|p| !p.is_empty())
            .map(|s| s.trim_end_matches('_'))
    }
    // Peel off at most one namespace or interwiki prefix. The loop only
    // repeats (via `continue`) after consuming a *local* interwiki
    // prefix, whose remainder is re-parsed as a plain local title.
    loop {
        if let Some(colon_pos) = dbkey.find(':') {
            if let Some(prefix) = get_nonempty_trimmed(&dbkey, ..colon_pos)
            {
                if let Some(ns) = self.namespace_map.get_id(prefix) {
                    namespace = ns;
                    dbkey.drain(..colon_pos + 1);
                    trim_title_whitespace(&mut dbkey);
                    // "Talk:Ns:..." / "Talk:interwiki:..." is invalid:
                    // a talk page cannot wrap another prefixed title.
                    if ns == NS_TALK {
                        if let Some(colon_pos) = dbkey.find(':') {
                            if let Some(prefix) =
                                get_nonempty_trimmed(&dbkey, ..colon_pos)
                            {
                                if self
                                    .namespace_map
                                    .get_id(prefix)
                                    .is_some()
                                    || self.interwiki_set.contains(prefix)
                                {
                                    return Err(Error::TalkNamespace(
                                        input.to_string(),
                                    ));
                                }
                            }
                        }
                    }
                } else if self.interwiki_set.contains(prefix) {
                    let is_local_interwiki =
                        self.local_interwiki_set.contains(prefix);
                    // Interwiki prefixes are case-insensitive; store
                    // the canonical lowercase form.
                    interwiki = Some(prefix.to_lowercase());
                    dbkey.drain(..colon_pos + 1);
                    trim_title_whitespace(&mut dbkey);
                    if is_local_interwiki {
                        // A bare local-interwiki prefix ("localwiki:")
                        // resolves to this wiki's main page.
                        if dbkey.is_empty() {
                            return Ok(self
                                .new_title(&self.main_page)
                                .map(|mut title| {
                                    title.local_interwiki = true;
                                    title
                                })
                                // Fall back to the canonical default if
                                // the configured main page does not
                                // itself parse.
                                .unwrap_or_else(|_| {
                                    Title {
                                        namespace: NS_MAIN,
                                        dbkey: "Main_Page".to_string(),
                                        fragment: None,
                                        interwiki: None,
                                        local_interwiki: true,
                                    }
                                }));
                        }
                        // Local prefix consumed: clear the interwiki and
                        // re-parse the remainder as a local title.
                        interwiki = None;
                        local_interwiki = true;
                        continue;
                    }
                    // "iw::Foo" — a colon right after a (non-local)
                    // interwiki prefix forces the main namespace.
                    if dbkey.starts_with(':') {
                        namespace = NS_MAIN;
                        dbkey.drain(..1);
                        trim_title_whitespace(&mut dbkey);
                    }
                }
            }
        }
        break;
    }
    // Split off the fragment ("#section"); underscores in fragments are
    // shown as spaces.
    if let Some((key, f)) = dbkey.split_once('#') {
        fragment = Some(f.replace('_', " "));
        let key_len = key.len(); dbkey.truncate(key_len);
        trim_title_whitespace(&mut dbkey);
    }
    if self.illegal_patterns.is_match(dbkey.as_bytes()) {
        return Err(Error::Characters(input.to_string()));
    }
    // Reject relative path pieces ("." / "..") that would break URLs.
    if dbkey == "."
        || dbkey == ".."
        || dbkey.starts_with("./")
        || dbkey.starts_with("../")
        || dbkey.contains("/./")
        || dbkey.contains("/../")
        || dbkey.ends_with("/.")
        || dbkey.ends_with("/..")
    {
        return Err(Error::Relative(input.to_string()));
    }
    // "~~~" would be expanded as a signature by the wikitext parser.
    if dbkey.contains("~~~") {
        return Err(Error::MagicTildes(input.to_string()));
    }
    // Byte-length limits: 512 for Special pages, 255 for everything else.
    let max_length = if namespace == NS_SPECIAL { 512 } else { 255 };
    if dbkey.len() > max_length {
        return Err(Error::TooLong(input.to_string()));
    }
    // Capitalize the first letter for namespaces configured that way,
    // but never for interwiki titles (the remote wiki decides its own
    // capitalization).
    if interwiki.is_none()
        && self
            .namespace_map
            .is_capitalized(namespace)
            .unwrap_or(false)
    {
        uppercase_first(&self.lang, &mut dbkey);
    }
    // "Namespace:" with nothing after it is empty; an empty dbkey is
    // tolerated only for interwiki links and main-namespace titles
    // (e.g. a pure "#fragment" link).
    if dbkey.is_empty() && interwiki.is_none() && namespace != NS_MAIN {
        return Err(Error::Empty(input.to_string()));
    }
    // Canonicalize IP addresses used as user page names.
    if namespace == NS_USER || namespace == NS_USER_TALK {
        sanitize_ip(&mut dbkey);
    }
    if dbkey.starts_with(':') {
        return Err(Error::LeadingColon(input.to_string()));
    }
    Ok(Title {
        namespace,
        dbkey,
        fragment,
        interwiki,
        local_interwiki,
    })
}
}
#[rustfmt::skip]
/// Returns true for the characters MediaWiki treats as title whitespace
/// (all collapsed to '_' in dbkeys).
fn is_title_whitespace(c: char) -> bool {
    matches!(
        c,
        ' ' | '_' | '\u{A0}' | '\u{1680}' | '\u{180E}' | '\u{2000}'..='\u{200A}'
        | '\u{2028}' | '\u{2029}' | '\u{202F}' | '\u{205F}' | '\u{3000}' )
}
/// Returns true for Unicode directional formatting characters, which
/// are stripped from titles entirely.
fn is_bidirectional_override(c: char) -> bool {
    matches!(c, '\u{200E}' | '\u{200F}' | '\u{202A}'..='\u{202E}')
}
/// Strips directional formatting characters, trims leading/trailing
/// title whitespace, and collapses interior whitespace runs to a
/// single underscore.
fn normalize_title_chars(title: &str) -> String {
    // `title.len()` is a cheap upper bound on the output's byte length.
    let mut out = String::with_capacity(title.len());
    let mut prev_whitespace = false;
    for c in title.chars() {
        // Directional characters are removed *before* whitespace
        // handling (as MediaWiki does), so they must not interrupt a
        // whitespace run: previously "a \u{200E}b" collapsed to "ab"
        // because the skipped character reset the whitespace flag;
        // it now correctly becomes "a_b".
        if is_bidirectional_override(c) {
            continue;
        }
        if is_title_whitespace(c) {
            prev_whitespace = true;
        } else {
            // Emit a single '_' for any preceding run of whitespace,
            // except at the very start (leading whitespace is trimmed).
            if prev_whitespace && !out.is_empty() {
                out.push('_');
            }
            out.push(c);
            prev_whitespace = false;
        }
    }
    // Trailing whitespace never emits a '_' because the separator is
    // only written when a non-whitespace character follows.
    out
}
/// Whitespace is trimmed at the edges and collapsed to '_' internally.
#[test]
fn normalize_title_chars_strips_and_collapses_title_whitespace() {
    for (input, expected) in
        [(" a b", "a_b"), ("a b ", "a_b"), ("a b", "a_b"), ("a__b", "a_b")]
    {
        assert_eq!(normalize_title_chars(input), expected);
    }
}
/// Directional formatting characters are removed wherever they appear.
#[test]
fn normalize_title_chars_removes_directional_control_characters() {
    for input in ["\u{200E}_a_b", "a\u{200E}_b ", "a_b\u{200E}", "a_\u{200E}_b"] {
        assert_eq!(normalize_title_chars(input), "a_b");
    }
}
/// Trims leading and trailing underscores (normalized title whitespace)
/// from `s` in place.
fn trim_title_whitespace(s: &mut String) {
    match s.bytes().position(|b| b != b'_') {
        Some(start) => {
            // Trim the end first so `start` stays a valid index.
            let trailing = s.bytes().rev().position(|b| b != b'_').unwrap_or(0);
            s.truncate(s.len() - trailing);
            s.drain(..start);
        }
        // All-underscore (or empty) input is pure whitespace. The old
        // `unwrap_or(0)` left e.g. "___" untouched instead of emptying it.
        None => s.clear(),
    }
}
/// `trim_title_whitespace` strips surrounding underscores in place.
/// (The previous version of this test called `normalize_title_chars`
/// instead, so the function it is named for was never exercised.)
#[test]
fn trim_title_whitespace_trims_underscores() {
    for (input, expected) in [("_a_b", "a_b"), ("a_b_", "a_b"), ("_a_b_", "a_b")] {
        let mut s = input.to_string();
        trim_title_whitespace(&mut s);
        assert_eq!(s, expected, "input: {input:?}");
    }
}
/// Languages whose lowercase 'i' uppercases to dotted 'İ' (U+0130).
const UPPERCASE_DOTTED_I_LANGUAGES: [&str; 4] = ["az", "kaa", "kk", "tr"];
/// Uppercases the first character of `input` in place, preferring PHP's
/// case mapping (via the `php` tables) over Rust's where they differ,
/// with a special case for Turkic dotted-I languages.
fn uppercase_first(lang: &str, input: &mut String) {
    if let Some(first) = input.chars().next() {
        if first == 'i' && UPPERCASE_DOTTED_I_LANGUAGES.contains(&lang) {
            // Turkic languages: 'i' uppercases to 'İ', not 'I'.
            // ('i' is a single byte, so the `..1` range is valid.)
            input.replace_range(..1, "İ");
        } else if php::ALREADY_UPPERCASE.contains(&first) {
            // Already uppercase per PHP's tables: leave untouched.
        } else if let Some(replace) = php::to_uppercase(first) {
            // PHP's mapping overrides Rust's for this character.
            let mut buf = [0u8; 4];
            input.replace_range(..first.len_utf8(), replace.encode_utf8(&mut buf));
        } else if !first.is_uppercase() {
            // Fall back to Rust's mapping. `char::to_uppercase` may
            // expand to several characters (e.g. 'ﬁ' -> "FI"); collect
            // them in order — the previous code inserted each char at
            // index 0, which reversed multi-character expansions.
            let upper: String = first.to_uppercase().collect();
            input.replace_range(..first.len_utf8(), &upper);
        }
    }
}
/// The Turkic dotted-I special case applies only to the configured
/// languages; other languages keep the plain mapping.
#[test]
fn uppercase_first_respects_dotted_i_langs() {
    let cases = [
        ("en", "abc", "Abc"),
        ("en", "istanbul", "Istanbul"),
        ("tr", "istanbul", "İstanbul"),
    ];
    for (lang, input, expected) in cases {
        let mut actual = input.to_string();
        uppercase_first(lang, &mut actual);
        assert_eq!(actual, expected);
    }
}