mwtitle/codec.rs
1/*
2Copyright (C) Tim Starling
3Copyright (C) Daniel Kinzler
4Copyright (C) 2021 Kunal Mehta <legoktm@debian.org>
5Copyright (C) 2021 Erutuon
6
7This program is free software: you can redistribute it and/or modify
8it under the terms of the GNU General Public License as published by
9the Free Software Foundation, either version 3 of the License, or
10(at your option) any later version.
11
12This program is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
16
17You should have received a copy of the GNU General Public License
18along with this program. If not, see <http://www.gnu.org/licenses/>.
19 */
20use crate::ip::sanitize_ip;
21use crate::namespace::{NS_SPECIAL, NS_TALK, NS_USER, NS_USER_TALK};
22#[cfg(feature = "utils")]
23#[cfg_attr(docsrs, doc(cfg(feature = "utils")))]
24use crate::SiteInfoResponse;
25use crate::{
26 php, Error, Interwiki, InterwikiSet, NamespaceAlias, NamespaceInfo,
27 NamespaceMap, Result, SiteInfo, Title, NS_MAIN,
28};
29#[cfg(feature = "utils")]
30#[cfg_attr(docsrs, doc(cfg(feature = "utils")))]
31use flate2::read::GzDecoder;
32use regex::bytes::Regex;
33#[cfg(feature = "utils")]
34#[cfg_attr(docsrs, doc(cfg(feature = "utils")))]
35use std::{fs::File, io::Read, path::Path, sync::Arc};
36
/// The `TitleCodec` is responsible for parsing, normalizing and formatting
/// `Title`s. See the crate-level documentation for an example of how to
/// construct one.
#[cfg_attr(docsrs, doc(cfg(feature = "parsing")))]
#[derive(Clone, Debug)]
pub struct TitleCodec {
    // Maps namespace names and aliases to numeric IDs and back.
    namespace_map: NamespaceMap,
    // Every known interwiki prefix.
    interwiki_set: InterwikiSet,
    // Interwiki prefixes that point back at this wiki; these behave like
    // initial-colon links rather than true interwikis during parsing.
    local_interwiki_set: InterwikiSet,
    // Title of the wiki's main page; empty local-interwiki links resolve here.
    main_page: String,
    // Site language code, consulted for first-letter capitalization rules.
    lang: String,
    // Byte-oriented regex; any match means the title contains forbidden
    // characters, percent-encoding, or an HTML/XML character reference.
    illegal_patterns: Regex,
}
50
#[test]
fn title_codec_is_send_and_sync() {
    // Compile-time check: `TitleCodec` may be shared across threads.
    fn require_thread_safe<T: Send + Sync>() {}

    require_thread_safe::<TitleCodec>();
}
57
58impl TitleCodec {
59 /// Create a new title by parsing the provided input.
60 pub fn new_title(&self, input: &str) -> Result<Title> {
61 self.secure_and_split(input, NS_MAIN)
62 }
63
64 /// Create a new title by parsing the provided input. If the title has no
65 /// namespace part, then the namespace specified by `default_namespace` is
66 /// used instead.
67 pub fn new_title_with_namespace(
68 &self,
69 input: &str,
70 default_namespace: i32,
71 ) -> Result<Title> {
72 self.secure_and_split(input, default_namespace)
73 }
74
75 /// Create a new title from the numerical database ID and title portion,
76 /// usually obtained directly from the database.
77 pub fn new_title_from_database(
78 &self,
79 namespace: i32,
80 dbkey: &str,
81 ) -> Result<Title> {
82 match self.namespace_map.get_name(namespace) {
83 Some(name) => {
84 if name.is_empty() {
85 // No prefixing needed
86 self.new_title(dbkey)
87 } else {
88 self.new_title(&format!("{name}:{dbkey}"))
89 }
90 }
91 None => Err(Error::UnknownNamespace(namespace)),
92 }
93 }
94
95 /// Get a reference to the underlying `NamespaceMap`
96 /// to get information about namespaces.
97 pub fn namespace_map(&self) -> &NamespaceMap {
98 &self.namespace_map
99 }
100
101 /// Get the title with namespace in pretty aka text form (spaces).
102 ///
103 /// Fragments will not be included.
104 ///
105 /// # Panics
106 ///
107 /// This will panic if the `Title` is in a namespace that this `TitleCodec`
108 /// is unaware of.
109 pub fn to_pretty(&self, title: &Title) -> String {
110 self.namespace_map
111 .to_pretty(title)
112 .expect("unknown namespace")
113 }
114
115 /// Get the title with namespace in underscore aka dbkey form. This is
116 /// potentially useful when you want to make a database query.
117 ///
118 /// Fragments will not be included.
119 ///
120 /// # Panics
121 ///
122 /// This will panic if the `Title` is in a namespace that this `TitleCodec`
123 /// is unaware of.
124 pub fn to_underscores(&self, title: &Title) -> String {
125 self.namespace_map
126 .to_underscores(title)
127 .expect("unknown namespace")
128 }
129
130 /// Get the title with namespace in pretty aka text form (spaces), with the
131 /// fragment, if one exists, appended.
132 ///
133 /// # Panics
134 ///
135 /// This will panic if the `Title` is in a namespace that this `TitleCodec`
136 /// is unaware of.
137 pub fn to_pretty_with_fragment(&self, title: &Title) -> String {
138 self.namespace_map
139 .to_pretty_with_fragment(title)
140 .expect("unknown namespace")
141 }
142
143 /// Construct a new `TitleCodec` using the given fields.
144 ///
145 /// In most cases it is easier to do so from one of the siteinfo methods.
146 pub fn new(
147 namespace_map: NamespaceMap,
148 interwiki_set: InterwikiSet,
149 local_interwiki_set: InterwikiSet,
150 main_page: String,
151 lang: String,
152 legal_title_chars: String,
153 ) -> Result<Self> {
154 // Copied from `MediaWikiTitleCodec::getTitleInvalidRegex()`.
155 // The `legal_title_chars` portion has to be changed when this lands:
156 // https://phabricator.wikimedia.org/T297340
157 // Matching titles will be held as illegal.
158 let illegal_patterns = Regex::new(&format!(
159 r"(?x-u)
160 # x: ignore whitespace and allow comments;
161 # -u: disable code point matching
162 # so that \x80-\xff match bytes 0x80-0xFF
163 # (corresponding to all non-ASCII code points, U+0080-U+10FFFF)
164 # rather than code points U+0080-U+00FF.
165 # Any character not allowed is forbidden...
166 [^{legal_title_chars}]
167
168 # URL percent encoding sequences interfere with the ability
169 # to round-trip titles -- you can't link to them consistently.
170 | %[0-9A-Fa-f]{{2}}
171
172 # XML/HTML character references produce similar issues.
173 | &[A-Za-z0-9\x80-\xff]+;
174 ",
175 // / does not need to be escaped as \/ in Rust regex.
176 legal_title_chars = legal_title_chars.replace(r"\/", "/")
177 ))?;
178
179 Ok(Self {
180 namespace_map,
181 interwiki_set,
182 local_interwiki_set,
183
184 illegal_patterns,
185 main_page,
186 lang,
187 })
188 }
189
190 /// Create a new `TitleCodec` getting namespaces, namespace aliases, and interwikis from iterators.
191 pub fn new_from_iters<
192 N: IntoIterator<Item = NamespaceInfo>,
193 A: IntoIterator<Item = NamespaceAlias>,
194 I: IntoIterator<Item = Interwiki>,
195 >(
196 namespaces: N,
197 namespace_aliases: A,
198 interwikis: I,
199 main_page: String,
200 lang: String,
201 legal_title_chars: String,
202 ) -> Result<Self> {
203 let (interwiki_set, local_interwiki_set) =
204 InterwikiSet::all_and_local_from_iter(interwikis);
205 let namespace_map =
206 NamespaceMap::from_namespaces_and_namespace_aliases(
207 namespaces,
208 namespace_aliases,
209 )?;
210 Self::new(
211 namespace_map,
212 interwiki_set,
213 local_interwiki_set,
214 main_page,
215 lang,
216 legal_title_chars,
217 )
218 }
219
220 /// Creates a `TitleCodec` by parsing the contents of a JSON or GZipped JSON file.
221 ///
222 /// Will accept the `siteinfo-namespaces.json.gz` file from in the Wikimedia dumps.
223 /// If the file extension is `gz`, decompresses from the GZip format before deserializing the JSON;
224 /// otherwise attempts to deserialize the file contents directly.
225 #[cfg(feature = "utils")]
226 #[cfg_attr(docsrs, doc(cfg(feature = "utils")))]
227 pub fn from_path(path: &Path) -> Result<Self> {
228 let json = if path.extension() == Some("gz".as_ref()) {
229 let gz = File::open(path)
230 .map_err(|source| Error::from_io("open file", source, path))?;
231 let mut decoder = GzDecoder::new(gz);
232 let mut decoded = String::new();
233 decoder
234 .read_to_string(&mut decoded)
235 .map_err(|source| Error::from_io("parse GZip", source, path))?;
236 decoded
237 } else {
238 std::fs::read_to_string(path).map_err(|source| {
239 Error::from_io("read file to string", source, path)
240 })?
241 };
242 Self::from_json_with_path(&json, Some(path))
243 }
244
245 /// Creates a `TitleCodec` by parsing the contents of a `Read` type that contains the JSON
246 /// representation of a [`SiteInfoResponse`].
247 #[cfg(feature = "utils")]
248 #[cfg_attr(docsrs, doc(cfg(feature = "utils")))]
249 pub fn from_reader<R: Read>(reader: R) -> Result<Self> {
250 Self::from_site_info(
251 serde_json::from_reader::<R, SiteInfoResponse>(reader)
252 .map_err(|source| Error::Json {
253 source: Arc::new(source),
254 })?
255 .query,
256 )
257 }
258
259 /// Creates a `TitleCodec` by parsing the JSON representation of a [`SiteInfoResponse`].
260 #[cfg(feature = "utils")]
261 #[cfg_attr(docsrs, doc(cfg(feature = "utils")))]
262 pub fn from_json<S: AsRef<str>>(json: S) -> Result<Self> {
263 Self::from_json_with_path(json.as_ref(), None)
264 }
265
266 /// Creates a `TitleCodec` by parsing the JSON representation of a [`SiteInfoResponse`].
267 ///
268 /// # Errors
269 ///
270 /// If this fails and `path` is `Some(_)`, gives an error message
271 /// that mentions `path`.
272 #[cfg(feature = "utils")]
273 #[cfg_attr(docsrs, doc(cfg(feature = "utils")))]
274 fn from_json_with_path(json: &str, path: Option<&Path>) -> Result<Self> {
275 Self::from_site_info(
276 serde_json::from_str::<SiteInfoResponse>(json)
277 .map_err(|source| {
278 let source = Arc::new(source);
279 if let Some(path) = path {
280 Error::JsonFile {
281 source,
282 path: path.into(),
283 }
284 } else {
285 Error::Json { source }
286 }
287 })?
288 .query,
289 )
290 }
291
292 /// Create a new `TitleCodec` using the provided [`SiteInfo`].
293 ///
294 /// The `SiteInfo` must include a non-empty `interwiki_map` field
295 /// to enable the resulting `TitleCodec`
296 /// to correctly parse titles with interwikis,
297 /// but an empty `interwiki_map` is not an error.
298 pub fn from_site_info(site_info: SiteInfo) -> Result<Self> {
299 Self::new_from_iters(
300 site_info.namespaces.into_values(),
301 site_info.namespace_aliases,
302 site_info.interwiki_map,
303 site_info.general.main_page,
304 site_info.general.lang,
305 site_info.general.legal_title_chars,
306 )
307 }
308
    /// Equivalent of `MediaWikiTitleCodec::splitTitleString()`.
    ///
    /// Most comments are direct copies to make it easier to compare with
    /// the MediaWiki implementation.
    ///
    /// Splits `input` into namespace, interwiki prefix, dbkey, and fragment,
    /// applying MediaWiki's validation rules along the way.
    /// `default_namespace` is used when the title has no recognized
    /// namespace prefix.
    ///
    /// # Errors
    ///
    /// Returns the first validation failure encountered: replacement
    /// characters (`IllegalUtf8`), empty titles (`Empty`), forbidden
    /// `Talk:Ns:x` combinations (`TalkNamespace`), illegal characters
    /// (`Characters`), relative-path components (`Relative`), magic tilde
    /// sequences (`MagicTildes`), over-long titles (`TooLong`), or a
    /// leftover leading colon (`LeadingColon`).
    fn secure_and_split(
        &self,
        input: &str,
        default_namespace: i32,
    ) -> Result<Title> {
        let mut namespace = default_namespace;
        // Strip Unicode bidi override characters.
        // Clean up whitespace.
        let mut dbkey = normalize_title_chars(input);
        let mut fragment = None;
        let mut interwiki = None;
        let mut local_interwiki = false;

        // U+FFFD is the replacement character
        if dbkey.contains('\u{FFFD}') {
            // Contained illegal UTF-8 sequences or forbidden Unicode chars.
            return Err(Error::IllegalUtf8(input.to_string()));
        }
        // Skip "Contained illegal UTF-8 sequences or forbidden Unicode chars.",
        // because all Rust strings are valid UTF-8.

        // Initial colon indicates main namespace rather than specified default
        // but should not create invalid {ns,title} pairs such as {0,Project:Foo}
        if dbkey.get(0..1) == Some(":") {
            namespace = NS_MAIN;
            // remove the colon but continue processing
            dbkey.drain(..1);
            // remove any subsequent whitespace
            trim_title_whitespace(&mut dbkey);
        }
        if dbkey.is_empty() {
            return Err(Error::Empty(input.to_string()));
        }

        // Helper: the candidate prefix before a colon, with trailing
        // underscores removed, or `None` when it is empty/out of bounds.
        fn get_nonempty_trimmed(
            s: &str,
            range_to: std::ops::RangeTo<usize>,
        ) -> Option<&str> {
            s.get(range_to)
                .filter(|p| !p.is_empty())
                .map(|s| s.trim_end_matches('_'))
        }

        // Namespace or interwiki prefix
        // `MediaWikiTitleCodec` uses a regex here, but we're going to use string
        // parsing instead.
        // The loop only repeats (via `continue`) when a local interwiki is
        // consumed and the remainder must be re-scanned for a prefix.
        loop {
            if let Some(colon_pos) = dbkey.find(':') {
                if let Some(prefix) = get_nonempty_trimmed(&dbkey, ..colon_pos)
                {
                    if let Some(ns) = self.namespace_map.get_id(prefix) {
                        // Ordinary namespace
                        namespace = ns;
                        dbkey.drain(..colon_pos + 1);
                        trim_title_whitespace(&mut dbkey);
                        // For Talk:X pages, check if X has a "namespace" prefix
                        if ns == NS_TALK {
                            if let Some(colon_pos) = dbkey.find(':') {
                                // Disallow Talk:File:x or Talk:Interwiki:x type titles ...
                                if let Some(prefix) =
                                    get_nonempty_trimmed(&dbkey, ..colon_pos)
                                {
                                    if self
                                        .namespace_map
                                        .get_id(prefix)
                                        .is_some()
                                        || self.interwiki_set.contains(prefix)
                                    {
                                        return Err(Error::TalkNamespace(
                                            input.to_string(),
                                        ));
                                    }
                                }
                            }
                        }
                    } else if self.interwiki_set.contains(prefix) {
                        // Check this using prefix before we mutably borrow dbkey
                        let is_local_interwiki =
                            self.local_interwiki_set.contains(prefix);
                        interwiki = Some(prefix.to_lowercase());
                        dbkey.drain(..colon_pos + 1);
                        trim_title_whitespace(&mut dbkey);

                        if is_local_interwiki {
                            if dbkey.is_empty() {
                                // Empty self-links should point to the Main Page, to ensure
                                // compatibility with cross-wiki transclusions and the like.
                                return Ok(self
                                    .new_title(&self.main_page)
                                    .map(|mut title| {
                                        title.local_interwiki = true;
                                        title
                                    })
                                    .unwrap_or_else(|_| {
                                        // Fallback to hardcoded "Main Page" if the configured main page
                                        // value is unparseable
                                        Title {
                                            namespace: NS_MAIN,
                                            dbkey: "Main_Page".to_string(),
                                            fragment: None,
                                            interwiki: None,
                                            local_interwiki: true,
                                        }
                                    }));
                            }
                            interwiki = None;
                            // local interwikis should behave like initial-colon links
                            local_interwiki = true;

                            // Do another namespace split...
                            continue;
                        }

                        // If there's an initial colon after the interwiki, that also
                        // resets the default namespace
                        if dbkey.starts_with(':') {
                            namespace = NS_MAIN;
                            dbkey.drain(..1);
                            trim_title_whitespace(&mut dbkey);
                        }
                    }
                }
            }
            // If there's no recognized interwiki or namespace,
            // then let the colon expression be part of the title.
            break;
        }

        // Split off a fragment; underscores in it become spaces.
        if let Some((key, f)) = dbkey.split_once('#') {
            fragment = Some(f.replace('_', " "));
            let key_len = key.len(); // to satisfy borrow checker
            dbkey.truncate(key_len);
            // remove whitespace again: prevents "Foo_bar_#"
            // becoming "Foo_bar_"
            trim_title_whitespace(&mut dbkey);
        }

        // Reject illegal characters.
        if self.illegal_patterns.is_match(dbkey.as_bytes()) {
            return Err(Error::Characters(input.to_string()));
        }

        // Pages with "/./" or "/../" appearing in the URLs will often be un-
        // reachable due to the way web browsers deal with 'relative' URLs.
        // Also, they conflict with subpage syntax. Forbid them explicitly.
        if dbkey == "."
            || dbkey == ".."
            || dbkey.starts_with("./")
            || dbkey.starts_with("../")
            || dbkey.contains("/./")
            || dbkey.contains("/../")
            || dbkey.ends_with("/.")
            || dbkey.ends_with("/..")
        {
            return Err(Error::Relative(input.to_string()));
        }

        // Magic tilde sequences? Nu-uh!
        if dbkey.contains("~~~") {
            return Err(Error::MagicTildes(input.to_string()));
        }

        // Limit the size of titles to 255 bytes. This is typically the size of the
        // underlying database field. We make an exception for special pages, which
        // don't need to be stored in the database, and may edge over 255 bytes due
        // to subpage syntax for long titles, e.g. [[Special:Block/Long name]]
        let max_length = if namespace == NS_SPECIAL { 512 } else { 255 };
        if dbkey.len() > max_length {
            return Err(Error::TooLong(input.to_string()));
        }

        // Normally, all wiki links are forced to have an initial capital letter so [[foo]]
        // and [[Foo]] point to the same place. Don't force it for interwikis, since the
        // other site might be case-sensitive.
        if interwiki.is_none()
            && self
                .namespace_map
                .is_capitalized(namespace)
                .unwrap_or(false)
        {
            uppercase_first(&self.lang, &mut dbkey);
        }

        // Can't make a link to a namespace alone... "empty" local links can only be
        // self-links with a fragment identifier.
        // MediaWiki allows for links with just a fragment, but we won't.
        if dbkey.is_empty() && interwiki.is_none() && namespace != NS_MAIN {
            return Err(Error::Empty(input.to_string()));
        }

        // User pages may be named after IP addresses; normalize those via
        // `crate::ip::sanitize_ip`.
        if namespace == NS_USER || namespace == NS_USER_TALK {
            sanitize_ip(&mut dbkey);
        }

        // Any remaining initial :s are illegal.
        if dbkey.starts_with(':') {
            return Err(Error::LeadingColon(input.to_string()));
        }

        Ok(Title {
            namespace,
            dbkey,
            fragment,
            interwiki,
            local_interwiki,
        })
    }
520}
521
/// Reports whether `c` counts as whitespace when found in a title.
///
/// Covers the code points with the `White_Space` property
/// (see [PropList.txt](https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt)),
/// minus the control characters U+0009-U+000D (tab, newline, vertical tab,
/// form feed, carriage return) and U+0085 (next line), plus U+180E
/// (MONGOLIAN VOWEL SEPARATOR), a format character (General Category: Cf).
/// The control characters U+0009-U+000D are instead rejected by the
/// `illegal_patterns` regex; U+0085 is accepted as a valid character.
fn is_title_whitespace(c: char) -> bool {
    match c {
        // U+0020 SPACE and U+005F LOW LINE
        ' ' | '_' => true,
        // U+00A0 NO-BREAK SPACE
        '\u{A0}' => true,
        // U+1680 OGHAM SPACE MARK
        '\u{1680}' => true,
        // U+180E MONGOLIAN VOWEL SEPARATOR
        '\u{180E}' => true,
        // U+2000-U+200A: EN QUAD, EM QUAD, EN SPACE, EM SPACE,
        // THREE-PER-EM SPACE, FOUR-PER-EM SPACE, SIX-PER-EM SPACE,
        // FIGURE SPACE, PUNCTUATION SPACE, THIN SPACE, HAIR SPACE
        '\u{2000}'..='\u{200A}' => true,
        // U+2028 LINE SEPARATOR and U+2029 PARAGRAPH SEPARATOR
        '\u{2028}' | '\u{2029}' => true,
        // U+202F NARROW NO-BREAK SPACE
        '\u{202F}' => true,
        // U+205F MEDIUM MATHEMATICAL SPACE
        '\u{205F}' => true,
        // U+3000 IDEOGRAPHIC SPACE
        '\u{3000}' => true,
        _ => false,
    }
}
552
553/**
554 * Indicates that a character is a directional formatting character
555 * that should be removed from titles.
556 *
557 * MediaWiki strips some [directional formatting characters](https://www.unicode.org/reports/tr9/#Directional_Formatting_Characters) from titles:
558 * U+200E and U+200F (LEFT-TO-RIGHT MARK, RIGHT-TO-LEFT MARK)
559 * and U+202A–U+202E (LEFT-TO-RIGHT EMBEDDING, RIGHT-TO-LEFT EMBEDDING,
560 * POP DIRECTIONAL FORMATTING, LEFT-TO-RIGHT OVERRIDE, RIGHT-TO-LEFT OVERRIDE).
561 * All of these were introduced in Unicode 1.1 and are referred to as
562 * bidi override characters in the source code
563 * of `MediaWikiTitleCodec::splitTitleString()`.
564 *
565 * The following directional formatting characters were introduced
566 * in [Unicode 6.3](https://www.unicode.org/versions/Unicode6.3.0/) (2013)
567 * and are not stripped:
568 * U+061C (ARABIC LETTER MARK)
569 * and U+2066–U+2069 (LEFT‑TO‑RIGHT ISOLATE, RIGHT‑TO‑LEFT ISOLATE, FIRST STRONG ISOLATE, POP DIRECTIONAL ISOLATE).
570 */
571fn is_bidirectional_override(c: char) -> bool {
572 matches!(c, '\u{200E}' | '\u{200F}' | '\u{202A}'..='\u{202E}')
573}
574
575/**
576 * Normalizes characters in a title.
577 *
578 * Removes the banned directional formatting characters (see [`is_bidirectional_override`]),
579 * strips title whitespace characters (see [`is_title_whitespace`])
580 * from the beginning and end of the title,
581 * and replaces sequences of one or more title whitespace characters with a single underscore.
582 */
583fn normalize_title_chars(title: &str) -> String {
584 // This gets the minimum possible length of the normalized title.
585 // It will be longer than this if there is any untrimmed whitespace.
586 let mut out = String::with_capacity(
587 title
588 .chars()
589 .filter(|c| {
590 !(is_title_whitespace(*c) || is_bidirectional_override(*c))
591 })
592 .count(),
593 );
594 let mut prev_whitespace = false;
595 for c in title.chars() {
596 let cur_whitespace = is_title_whitespace(c);
597 if !(cur_whitespace || is_bidirectional_override(c)) {
598 if prev_whitespace && !out.is_empty() {
599 out.push('_');
600 }
601 out.push(c);
602 }
603 prev_whitespace = cur_whitespace;
604 }
605 out
606}
607
#[test]
fn normalize_title_chars_strips_and_collapses_title_whitespace() {
    for (input, expected) in [
        (" a b", "a_b"),
        ("a b ", "a_b"),
        ("a b", "a_b"),
        ("a__b", "a_b"),
    ] {
        assert_eq!(normalize_title_chars(input), expected);
    }
}
615
#[test]
fn normalize_title_chars_removes_directional_control_characters() {
    for (input, expected) in [
        ("\u{200E}_a_b", "a_b"),
        ("a\u{200E}_b ", "a_b"),
        ("a_b\u{200E}", "a_b"),
        ("a_\u{200E}_b", "a_b"),
    ] {
        assert_eq!(normalize_title_chars(input), expected);
    }
}
623
/// Removes leading and trailing underscores (normalized title whitespace)
/// from `s` in place.
///
/// Intended for strings that already went through
/// [`normalize_title_chars`], which folds all title whitespace into `_`.
fn trim_title_whitespace(s: &mut String) {
    // Index of the first non-underscore byte. If the string is entirely
    // underscores this is `s.len()`, so everything gets removed.
    // (The previous `unwrap_or(0)` left an all-underscore string untouched.)
    let start = s.bytes().position(|b| b != b'_').unwrap_or(s.len());
    // `_` is ASCII, so `start` is always a valid `char` boundary.
    s.drain(..start);
    // Count trailing underscores on the already-front-trimmed string;
    // `unwrap_or(s.len())` handles the now-possibly-empty string.
    let trailing = s.bytes().rev().position(|b| b != b'_').unwrap_or(s.len());
    // `s.len() - trailing` is a valid `char` boundary for the same reason.
    s.truncate(s.len() - trailing);
}
633
#[test]
fn trim_title_whitespace_trims_underscores() {
    // Bug fix: this test previously asserted `normalize_title_chars`,
    // never exercising `trim_title_whitespace` despite its name.
    for (input, expected) in [("_a_b", "a_b"), ("a_b_", "a_b"), ("_a_b_", "a_b")]
    {
        let mut s = input.to_string();
        trim_title_whitespace(&mut s);
        assert_eq!(s, expected);
    }
}
640
641const UPPERCASE_DOTTED_I_LANGUAGES: [&str; 4] = ["az", "kaa", "kk", "tr"];
642
643/// Functional equivalent of `Language::ucfirst()`.
644///
645/// This is probably not going to be identical because of different Unicode
646/// versions in use, but hopefully those cases are so rare we don't hit them.
647///
648/// Or we could just hardcode a special mapping like MediaWiki does for
649/// client-side JavaScript.
650fn uppercase_first(lang: &str, input: &mut String) {
651 if let Some(first) = input.chars().next() {
652 // `Language::ucfirst()` has special handling for the `i` character
653 // in some languages
654 if first == 'i' && UPPERCASE_DOTTED_I_LANGUAGES.contains(&lang) {
655 // i has len_utf8() of 1
656 input.drain(..1);
657 // İ has len_utf8() of 2
658 input.reserve(2);
659 input.insert(0, 'İ');
660 } else if php::ALREADY_UPPERCASE.contains(&first) {
661 // Skip, do nothing
662 } else if let Some(replace) = php::to_uppercase(first) {
663 input.drain(..first.len_utf8());
664 input.reserve(replace.len_utf8());
665 input.insert(0, replace);
666 } else if !first.is_uppercase() {
667 input.drain(..first.len_utf8());
668 input.reserve(first.to_uppercase().map(|c| c.len_utf8()).sum());
669 for c in first.to_uppercase() {
670 input.insert(0, c);
671 }
672 }
673 }
674}
675
#[test]
fn uppercase_first_respects_dotted_i_langs() {
    let cases = [
        (("en", "abc"), "Abc"),
        (("en", "istanbul"), "Istanbul"),
        (("tr", "istanbul"), "İstanbul"),
    ];
    for ((lang, input), expected) in cases {
        let mut capitalized = input.to_string();
        uppercase_first(lang, &mut capitalized);
        assert_eq!(capitalized, expected);
    }
}
687}