mwtitle/
codec.rs

1/*
2Copyright (C) Tim Starling
3Copyright (C) Daniel Kinzler
4Copyright (C) 2021 Kunal Mehta <legoktm@debian.org>
5Copyright (C) 2021 Erutuon
6
7This program is free software: you can redistribute it and/or modify
8it under the terms of the GNU General Public License as published by
9the Free Software Foundation, either version 3 of the License, or
10(at your option) any later version.
11
12This program is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15GNU General Public License for more details.
16
17You should have received a copy of the GNU General Public License
18along with this program.  If not, see <http://www.gnu.org/licenses/>.
19 */
20use crate::ip::sanitize_ip;
21use crate::namespace::{NS_SPECIAL, NS_TALK, NS_USER, NS_USER_TALK};
22#[cfg(feature = "utils")]
23#[cfg_attr(docsrs, doc(cfg(feature = "utils")))]
24use crate::SiteInfoResponse;
25use crate::{
26    php, Error, Interwiki, InterwikiSet, NamespaceAlias, NamespaceInfo,
27    NamespaceMap, Result, SiteInfo, Title, NS_MAIN,
28};
29#[cfg(feature = "utils")]
30#[cfg_attr(docsrs, doc(cfg(feature = "utils")))]
31use flate2::read::GzDecoder;
32use regex::bytes::Regex;
33#[cfg(feature = "utils")]
34#[cfg_attr(docsrs, doc(cfg(feature = "utils")))]
35use std::{fs::File, io::Read, path::Path, sync::Arc};
36
/// The `TitleCodec` is responsible for parsing, normalizing and formatting
/// `Title`s. See the crate-level documentation for an example of how to
/// construct one.
#[cfg_attr(docsrs, doc(cfg(feature = "parsing")))]
#[derive(Clone, Debug)]
pub struct TitleCodec {
    /// Namespace lookup used to resolve namespace prefixes to IDs
    /// and to format titles.
    namespace_map: NamespaceMap,
    /// All interwiki prefixes known to this codec.
    interwiki_set: InterwikiSet,
    /// The interwiki prefixes that are treated as pointing back to the
    /// local wiki when parsing.
    local_interwiki_set: InterwikiSet,
    /// Title of the main page; parsed as the target of an empty link
    /// through a local interwiki prefix.
    main_page: String,
    /// Language code handed to `uppercase_first` when capitalizing
    /// the first letter of a title.
    lang: String,
    /// Compiled regex; a title matching it is rejected as containing
    /// illegal characters.
    illegal_patterns: Regex,
}
50
#[test]
fn title_codec_is_send_and_sync() {
    // Compile-time check only: this stops building if `TitleCodec`
    // ever loses its `Send` or `Sync` implementation.
    fn require_send_sync<T>()
    where
        T: Send + Sync,
    {
    }

    require_send_sync::<TitleCodec>();
}
57
58impl TitleCodec {
    /// Create a new title by parsing the provided input.
    ///
    /// If the input carries no namespace prefix, the title is placed in
    /// the main namespace (`NS_MAIN`).
    ///
    /// # Errors
    ///
    /// Returns an error if the input cannot be parsed into a valid title.
    pub fn new_title(&self, input: &str) -> Result<Title> {
        self.secure_and_split(input, NS_MAIN)
    }
63
    /// Create a new title by parsing the provided input. If the title has no
    /// namespace part, then the namespace specified by `default_namespace` is
    /// used instead.
    ///
    /// # Errors
    ///
    /// Returns an error if the input cannot be parsed into a valid title.
    pub fn new_title_with_namespace(
        &self,
        input: &str,
        default_namespace: i32,
    ) -> Result<Title> {
        self.secure_and_split(input, default_namespace)
    }
74
75    /// Create a new title from the numerical database ID and title portion,
76    /// usually obtained directly from the database.
77    pub fn new_title_from_database(
78        &self,
79        namespace: i32,
80        dbkey: &str,
81    ) -> Result<Title> {
82        match self.namespace_map.get_name(namespace) {
83            Some(name) => {
84                if name.is_empty() {
85                    // No prefixing needed
86                    self.new_title(dbkey)
87                } else {
88                    self.new_title(&format!("{name}:{dbkey}"))
89                }
90            }
91            None => Err(Error::UnknownNamespace(namespace)),
92        }
93    }
94
    /// Get a reference to the underlying `NamespaceMap`
    /// to get information about namespaces.
    ///
    /// The map is borrowed, not cloned.
    pub fn namespace_map(&self) -> &NamespaceMap {
        &self.namespace_map
    }
100
    /// Get the title with namespace in pretty aka text form (spaces).
    ///
    /// Fragments will not be included.
    ///
    /// # Panics
    ///
    /// This will panic if the `Title` is in a namespace that this `TitleCodec`
    /// is unaware of.
    pub fn to_pretty(&self, title: &Title) -> String {
        // Formatting is delegated to the namespace map; a failed lookup
        // is turned into a panic here.
        self.namespace_map
            .to_pretty(title)
            .expect("unknown namespace")
    }
114
    /// Get the title with namespace in underscore aka dbkey form. This is
    /// potentially useful when you want to make a database query.
    ///
    /// Fragments will not be included.
    ///
    /// # Panics
    ///
    /// This will panic if the `Title` is in a namespace that this `TitleCodec`
    /// is unaware of.
    pub fn to_underscores(&self, title: &Title) -> String {
        // Formatting is delegated to the namespace map; a failed lookup
        // is turned into a panic here.
        self.namespace_map
            .to_underscores(title)
            .expect("unknown namespace")
    }
129
    /// Get the title with namespace in pretty aka text form (spaces), with the
    /// fragment, if one exists, appended.
    ///
    /// # Panics
    ///
    /// This will panic if the `Title` is in a namespace that this `TitleCodec`
    /// is unaware of.
    pub fn to_pretty_with_fragment(&self, title: &Title) -> String {
        // Formatting is delegated to the namespace map; a failed lookup
        // is turned into a panic here.
        self.namespace_map
            .to_pretty_with_fragment(title)
            .expect("unknown namespace")
    }
142
143    /// Construct a new `TitleCodec` using the given fields.
144    ///
145    /// In most cases it is easier to do so from one of the siteinfo methods.
146    pub fn new(
147        namespace_map: NamespaceMap,
148        interwiki_set: InterwikiSet,
149        local_interwiki_set: InterwikiSet,
150        main_page: String,
151        lang: String,
152        legal_title_chars: String,
153    ) -> Result<Self> {
154        // Copied from `MediaWikiTitleCodec::getTitleInvalidRegex()`.
155        // The `legal_title_chars` portion has to be changed when this lands:
156        // https://phabricator.wikimedia.org/T297340
157        // Matching titles will be held as illegal.
158        let illegal_patterns = Regex::new(&format!(
159            r"(?x-u)
160                # x: ignore whitespace and allow comments;
161                # -u: disable code point matching
162                # so that \x80-\xff match bytes 0x80-0xFF
163                # (corresponding to all non-ASCII code points, U+0080-U+10FFFF)
164                # rather than code points U+0080-U+00FF.
165                    # Any character not allowed is forbidden...
166                    [^{legal_title_chars}]
167
168                    # URL percent encoding sequences interfere with the ability
169                    # to round-trip titles -- you can't link to them consistently.
170                    | %[0-9A-Fa-f]{{2}}
171
172                    # XML/HTML character references produce similar issues.
173                    | &[A-Za-z0-9\x80-\xff]+;
174                ",
175            // / does not need to be escaped as \/ in Rust regex.
176            legal_title_chars = legal_title_chars.replace(r"\/", "/")
177        ))?;
178
179        Ok(Self {
180            namespace_map,
181            interwiki_set,
182            local_interwiki_set,
183
184            illegal_patterns,
185            main_page,
186            lang,
187        })
188    }
189
190    /// Create a new `TitleCodec` getting namespaces, namespace aliases, and interwikis from iterators.
191    pub fn new_from_iters<
192        N: IntoIterator<Item = NamespaceInfo>,
193        A: IntoIterator<Item = NamespaceAlias>,
194        I: IntoIterator<Item = Interwiki>,
195    >(
196        namespaces: N,
197        namespace_aliases: A,
198        interwikis: I,
199        main_page: String,
200        lang: String,
201        legal_title_chars: String,
202    ) -> Result<Self> {
203        let (interwiki_set, local_interwiki_set) =
204            InterwikiSet::all_and_local_from_iter(interwikis);
205        let namespace_map =
206            NamespaceMap::from_namespaces_and_namespace_aliases(
207                namespaces,
208                namespace_aliases,
209            )?;
210        Self::new(
211            namespace_map,
212            interwiki_set,
213            local_interwiki_set,
214            main_page,
215            lang,
216            legal_title_chars,
217        )
218    }
219
220    /// Creates a `TitleCodec` by parsing the contents of a JSON or GZipped JSON file.
221    ///
222    /// Will accept the `siteinfo-namespaces.json.gz` file from in the Wikimedia dumps.
223    /// If the file extension is `gz`, decompresses from the GZip format before deserializing the JSON;
224    /// otherwise attempts to deserialize the file contents directly.
225    #[cfg(feature = "utils")]
226    #[cfg_attr(docsrs, doc(cfg(feature = "utils")))]
227    pub fn from_path(path: &Path) -> Result<Self> {
228        let json = if path.extension() == Some("gz".as_ref()) {
229            let gz = File::open(path)
230                .map_err(|source| Error::from_io("open file", source, path))?;
231            let mut decoder = GzDecoder::new(gz);
232            let mut decoded = String::new();
233            decoder
234                .read_to_string(&mut decoded)
235                .map_err(|source| Error::from_io("parse GZip", source, path))?;
236            decoded
237        } else {
238            std::fs::read_to_string(path).map_err(|source| {
239                Error::from_io("read file to string", source, path)
240            })?
241        };
242        Self::from_json_with_path(&json, Some(path))
243    }
244
245    /// Creates a `TitleCodec` by parsing the contents of a `Read` type that contains the JSON
246    /// representation of a [`SiteInfoResponse`].
247    #[cfg(feature = "utils")]
248    #[cfg_attr(docsrs, doc(cfg(feature = "utils")))]
249    pub fn from_reader<R: Read>(reader: R) -> Result<Self> {
250        Self::from_site_info(
251            serde_json::from_reader::<R, SiteInfoResponse>(reader)
252                .map_err(|source| Error::Json {
253                    source: Arc::new(source),
254                })?
255                .query,
256        )
257    }
258
    /// Creates a `TitleCodec` by parsing the JSON representation of a [`SiteInfoResponse`].
    ///
    /// # Errors
    ///
    /// Returns an error if the JSON cannot be deserialized or a codec
    /// cannot be built from the resulting site info.
    #[cfg(feature = "utils")]
    #[cfg_attr(docsrs, doc(cfg(feature = "utils")))]
    pub fn from_json<S: AsRef<str>>(json: S) -> Result<Self> {
        // No file is involved, so no path is attached to a parse error.
        Self::from_json_with_path(json.as_ref(), None)
    }
265
266    /// Creates a `TitleCodec` by parsing the JSON representation of a [`SiteInfoResponse`].
267    ///
268    /// # Errors
269    ///
270    /// If this fails and `path` is `Some(_)`, gives an error message
271    /// that mentions `path`.
272    #[cfg(feature = "utils")]
273    #[cfg_attr(docsrs, doc(cfg(feature = "utils")))]
274    fn from_json_with_path(json: &str, path: Option<&Path>) -> Result<Self> {
275        Self::from_site_info(
276            serde_json::from_str::<SiteInfoResponse>(json)
277                .map_err(|source| {
278                    let source = Arc::new(source);
279                    if let Some(path) = path {
280                        Error::JsonFile {
281                            source,
282                            path: path.into(),
283                        }
284                    } else {
285                        Error::Json { source }
286                    }
287                })?
288                .query,
289        )
290    }
291
292    /// Create a new `TitleCodec` using the provided [`SiteInfo`].
293    ///
294    /// The `SiteInfo` must include a non-empty `interwiki_map` field
295    /// to enable the resulting `TitleCodec`
296    /// to correctly parse titles with interwikis,
297    /// but an empty `interwiki_map` is not an error.
298    pub fn from_site_info(site_info: SiteInfo) -> Result<Self> {
299        Self::new_from_iters(
300            site_info.namespaces.into_values(),
301            site_info.namespace_aliases,
302            site_info.interwiki_map,
303            site_info.general.main_page,
304            site_info.general.lang,
305            site_info.general.legal_title_chars,
306        )
307    }
308
    /// Equivalent of `MediaWikiTitleCodec::splitTitleString()`.
    ///
    /// Most comments are direct copies to make it easier to compare with
    /// the MediaWiki implementation.
    ///
    /// Normalizes `input`, peels off namespace and interwiki prefixes,
    /// splits off the fragment after the first `#`, validates what is left
    /// and returns the assembled [`Title`]. `default_namespace` is used when
    /// the input has no recognized namespace prefix; a leading colon resets
    /// it to `NS_MAIN`.
    ///
    /// # Errors
    ///
    /// Each error carries a copy of the original `input`:
    ///
    /// * [`Error::IllegalUtf8`]: the title contains U+FFFD.
    /// * [`Error::Empty`]: nothing is left after normalization, or only a
    ///   namespace prefix (other than the main namespace) was given.
    /// * [`Error::TalkNamespace`]: a `Talk:` title whose remainder starts
    ///   with another namespace or interwiki prefix.
    /// * [`Error::Characters`]: the title matches the illegal-pattern regex.
    /// * [`Error::Relative`]: the title contains `.` or `..` path segments.
    /// * [`Error::MagicTildes`]: the title contains `~~~`.
    /// * [`Error::TooLong`]: the title exceeds 255 bytes (512 for special pages).
    /// * [`Error::LeadingColon`]: a colon remains at the start of the title.
    fn secure_and_split(
        &self,
        input: &str,
        default_namespace: i32,
    ) -> Result<Title> {
        let mut namespace = default_namespace;
        // Strip Unicode bidi override characters.
        // Clean up whitespace.
        let mut dbkey = normalize_title_chars(input);
        let mut fragment = None;
        let mut interwiki = None;
        let mut local_interwiki = false;

        // U+FFFD is the replacement character
        if dbkey.contains('\u{FFFD}') {
            // Contained illegal UTF-8 sequences or forbidden Unicode chars.
            return Err(Error::IllegalUtf8(input.to_string()));
        }
        // No further UTF-8 validity check is made here,
        // because all Rust strings are valid UTF-8.

        // Initial colon indicates main namespace rather than specified default
        // but should not create invalid {ns,title} pairs such as {0,Project:Foo}
        if dbkey.get(0..1) == Some(":") {
            namespace = NS_MAIN;
            // remove the colon but continue processing
            dbkey.drain(..1);
            // remove any subsequent whitespace
            trim_title_whitespace(&mut dbkey);
        }
        if dbkey.is_empty() {
            return Err(Error::Empty(input.to_string()));
        }

        // Returns the text before a colon with trailing underscores removed,
        // or `None` if that text is empty or the range is not a valid slice.
        // Emptiness is tested before trimming, so a prefix made up only of
        // underscores comes back as `Some("")`.
        fn get_nonempty_trimmed(
            s: &str,
            range_to: std::ops::RangeTo<usize>,
        ) -> Option<&str> {
            s.get(range_to)
                .filter(|p| !p.is_empty())
                .map(|s| s.trim_end_matches('_'))
        }

        // Namespace or interwiki prefix
        // `MediaWikiTitleCodec` uses a regex here, but we're going to use string
        // parsing instead.
        // The loop body runs again only via the `continue` taken after a
        // local interwiki prefix; every other path falls through to `break`.
        loop {
            if let Some(colon_pos) = dbkey.find(':') {
                if let Some(prefix) = get_nonempty_trimmed(&dbkey, ..colon_pos)
                {
                    if let Some(ns) = self.namespace_map.get_id(prefix) {
                        // Ordinary namespace
                        namespace = ns;
                        dbkey.drain(..colon_pos + 1);
                        trim_title_whitespace(&mut dbkey);
                        // For Talk:X pages, check if X has a "namespace" prefix
                        if ns == NS_TALK {
                            if let Some(colon_pos) = dbkey.find(':') {
                                // Disallow Talk:File:x or Talk:Interwiki:x type titles ...
                                if let Some(prefix) =
                                    get_nonempty_trimmed(&dbkey, ..colon_pos)
                                {
                                    if self
                                        .namespace_map
                                        .get_id(prefix)
                                        .is_some()
                                        || self.interwiki_set.contains(prefix)
                                    {
                                        return Err(Error::TalkNamespace(
                                            input.to_string(),
                                        ));
                                    }
                                }
                            }
                        }
                    } else if self.interwiki_set.contains(prefix) {
                        // Check this using prefix before we mutably borrow dbkey
                        let is_local_interwiki =
                            self.local_interwiki_set.contains(prefix);
                        interwiki = Some(prefix.to_lowercase());
                        dbkey.drain(..colon_pos + 1);
                        trim_title_whitespace(&mut dbkey);

                        if is_local_interwiki {
                            if dbkey.is_empty() {
                                // Empty self-links should point to the Main Page, to ensure
                                // compatibility with cross-wiki transclusions and the like.
                                return Ok(self
                                    .new_title(&self.main_page)
                                    .map(|mut title| {
                                        title.local_interwiki = true;
                                        title
                                    })
                                    .unwrap_or_else(|_| {
                                        // Fallback to hardcoded "Main Page" if the configured main page
                                        // value is unparseable
                                        Title {
                                            namespace: NS_MAIN,
                                            dbkey: "Main_Page".to_string(),
                                            fragment: None,
                                            interwiki: None,
                                            local_interwiki: true,
                                        }
                                    }));
                            }
                            interwiki = None;
                            // local interwikis should behave like initial-colon links
                            local_interwiki = true;

                            // Do another namespace split...
                            continue;
                        }

                        // If there's an initial colon after the interwiki, that also
                        // resets the default namespace
                        if dbkey.starts_with(':') {
                            namespace = NS_MAIN;
                            dbkey.drain(..1);
                            trim_title_whitespace(&mut dbkey);
                        }
                    }
                }
            }
            // If there's no recognized interwiki or namespace,
            // then let the colon expression be part of the title.
            break;
        }

        // Everything after the first '#' is the fragment; it is stored in
        // text form (underscores turned back into spaces).
        if let Some((key, f)) = dbkey.split_once('#') {
            fragment = Some(f.replace('_', " "));
            let key_len = key.len(); // to satisfy borrow checker
            dbkey.truncate(key_len);
            // remove whitespace again: prevents "Foo_bar_#"
            // becoming "Foo_bar_"
            trim_title_whitespace(&mut dbkey);
        }

        // Reject illegal characters.
        if self.illegal_patterns.is_match(dbkey.as_bytes()) {
            return Err(Error::Characters(input.to_string()));
        }

        // Pages with "/./" or "/../" appearing in the URLs will often be un-
        // reachable due to the way web browsers deal with 'relative' URLs.
        // Also, they conflict with subpage syntax.  Forbid them explicitly.
        if dbkey == "."
            || dbkey == ".."
            || dbkey.starts_with("./")
            || dbkey.starts_with("../")
            || dbkey.contains("/./")
            || dbkey.contains("/../")
            || dbkey.ends_with("/.")
            || dbkey.ends_with("/..")
        {
            return Err(Error::Relative(input.to_string()));
        }

        // Magic tilde sequences? Nu-uh!
        if dbkey.contains("~~~") {
            return Err(Error::MagicTildes(input.to_string()));
        }

        // Limit the size of titles to 255 bytes. This is typically the size of the
        // underlying database field. We make an exception for special pages, which
        // don't need to be stored in the database, and may edge over 255 bytes due
        // to subpage syntax for long titles, e.g. [[Special:Block/Long name]]
        let max_length = if namespace == NS_SPECIAL { 512 } else { 255 };
        if dbkey.len() > max_length {
            return Err(Error::TooLong(input.to_string()));
        }

        // Normally, all wiki links are forced to have an initial capital letter so [[foo]]
        // and [[Foo]] point to the same place.  Don't force it for interwikis, since the
        // other site might be case-sensitive.
        if interwiki.is_none()
            && self
                .namespace_map
                .is_capitalized(namespace)
                .unwrap_or(false)
        {
            uppercase_first(&self.lang, &mut dbkey);
        }

        // Can't make a link to a namespace alone... "empty" local links can only be
        // self-links with a fragment identifier.
        // MediaWiki allows for links with just a fragment, but we won't.
        // NOTE(review): an empty dbkey in `NS_MAIN` (e.g. input "#frag") is
        // not rejected by this condition; confirm that this is intended.
        if dbkey.is_empty() && interwiki.is_none() && namespace != NS_MAIN {
            return Err(Error::Empty(input.to_string()));
        }

        // User and user-talk titles are passed through `sanitize_ip`.
        if namespace == NS_USER || namespace == NS_USER_TALK {
            sanitize_ip(&mut dbkey);
        }

        // Any remaining initial :s are illegal.
        if dbkey.starts_with(':') {
            return Err(Error::LeadingColon(input.to_string()));
        }

        Ok(Title {
            namespace,
            dbkey,
            fragment,
            interwiki,
            local_interwiki,
        })
    }
520}
521
/// Tells whether a code point counts as whitespace inside a title.
///
/// The set is every code point with the White_Space property
/// (see [PropList.txt](https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt)),
/// minus the control characters U+0009–U+000D (tab, newline, vertical tab,
/// form feed, carriage return) and U+0085 (next line),
/// plus U+005F (LOW LINE) and U+180E (MONGOLIAN VOWEL SEPARATOR),
/// the latter being a format character (General Category: Cf).
///
/// The control characters U+0009–U+000D are rejected
/// by the `illegal_patterns` regex;
/// U+0085 is accepted as a valid character.
#[rustfmt::skip]
fn is_title_whitespace(c: char) -> bool {
    match c {
        // U+0020 SPACE, U+005F LOW LINE
        ' ' | '_' => true,
        // U+00A0 NO-BREAK SPACE, U+1680 OGHAM SPACE MARK,
        // U+180E MONGOLIAN VOWEL SEPARATOR
        '\u{A0}' | '\u{1680}' | '\u{180E}' => true,
        // U+2000-U+200A: EN QUAD, EM QUAD, EN SPACE, EM SPACE,
        // THREE-PER-EM SPACE, FOUR-PER-EM SPACE, SIX-PER-EM SPACE,
        // FIGURE SPACE, PUNCTUATION SPACE, THIN SPACE, HAIR SPACE
        '\u{2000}'..='\u{200A}' => true,
        // U+2028 LINE SEPARATOR, U+2029 PARAGRAPH SEPARATOR
        '\u{2028}' | '\u{2029}' => true,
        // U+202F NARROW NO-BREAK SPACE, U+205F MEDIUM MATHEMATICAL SPACE,
        // U+3000 IDEOGRAPHIC SPACE
        '\u{202F}' | '\u{205F}' | '\u{3000}' => true,
        _ => false,
    }
}
552
/// Tells whether a character is a directional formatting character
/// that is removed from titles.
///
/// MediaWiki strips some [directional formatting characters](https://www.unicode.org/reports/tr9/#Directional_Formatting_Characters) from titles:
/// U+200E and U+200F (LEFT-TO-RIGHT MARK, RIGHT-TO-LEFT MARK)
/// and U+202A–U+202E (LEFT-TO-RIGHT EMBEDDING, RIGHT-TO-LEFT EMBEDDING,
/// POP DIRECTIONAL FORMATTING, LEFT-TO-RIGHT OVERRIDE, RIGHT-TO-LEFT OVERRIDE).
/// All of these date back to Unicode 1.1 and are called
/// bidi override characters in the source code
/// of `MediaWikiTitleCodec::splitTitleString()`.
///
/// The directional formatting characters added
/// in [Unicode 6.3](https://www.unicode.org/versions/Unicode6.3.0/) (2013)
/// are not stripped:
/// U+061C (ARABIC LETTER MARK)
/// and U+2066–U+2069 (LEFT-TO-RIGHT ISOLATE, RIGHT-TO-LEFT ISOLATE,
/// FIRST STRONG ISOLATE, POP DIRECTIONAL ISOLATE).
fn is_bidirectional_override(c: char) -> bool {
    // U+200E, U+200F: the two directional marks.
    let is_mark = ('\u{200E}'..='\u{200F}').contains(&c);
    // U+202A-U+202E: embeddings, overrides and the pop character.
    let is_embedding_or_override = ('\u{202A}'..='\u{202E}').contains(&c);
    is_mark || is_embedding_or_override
}
574
575/**
576 * Normalizes characters in a title.
577 *
578 * Removes the banned directional formatting characters (see [`is_bidirectional_override`]),
579 * strips title whitespace characters (see [`is_title_whitespace`])
580 * from the beginning and end of the title,
581 * and replaces sequences of one or more title whitespace characters with a single underscore.
582 */
583fn normalize_title_chars(title: &str) -> String {
584    // This gets the minimum possible length of the normalized title.
585    // It will be longer than this if there is any untrimmed whitespace.
586    let mut out = String::with_capacity(
587        title
588            .chars()
589            .filter(|c| {
590                !(is_title_whitespace(*c) || is_bidirectional_override(*c))
591            })
592            .count(),
593    );
594    let mut prev_whitespace = false;
595    for c in title.chars() {
596        let cur_whitespace = is_title_whitespace(c);
597        if !(cur_whitespace || is_bidirectional_override(c)) {
598            if prev_whitespace && !out.is_empty() {
599                out.push('_');
600            }
601            out.push(c);
602        }
603        prev_whitespace = cur_whitespace;
604    }
605    out
606}
607
#[test]
fn normalize_title_chars_strips_and_collapses_title_whitespace() {
    // (input, expected): leading, trailing, and repeated whitespace.
    let cases = [
        (" a b", "a_b"),
        ("a b ", "a_b"),
        ("a  b", "a_b"),
        ("a__b", "a_b"),
    ];
    for (input, expected) in cases {
        assert_eq!(normalize_title_chars(input), expected);
    }
}
615
#[test]
fn normalize_title_chars_removes_directional_control_characters() {
    // (input, expected): U+200E at the start, middle, and end of a title.
    let cases = [
        ("\u{200E}_a_b", "a_b"),
        ("a\u{200E}_b ", "a_b"),
        ("a_b\u{200E}", "a_b"),
        ("a_\u{200E}_b", "a_b"),
    ];
    for (input, expected) in cases {
        assert_eq!(normalize_title_chars(input), expected);
    }
}
623
/// Removes all leading and trailing underscores from `s` in place.
///
/// A string consisting only of underscores becomes empty.
fn trim_title_whitespace(s: &mut String) {
    // Cut the tail first so that the leading count below is computed on
    // the already-shortened string; this keeps the all-underscore case
    // from removing the same bytes twice.
    let trimmed_end = s.trim_end_matches('_').len();
    s.truncate(trimmed_end);
    // '_' is a single ASCII byte, so both cut points are `char` boundaries.
    let leading = s.len() - s.trim_start_matches('_').len();
    s.drain(..leading);
}
633
#[test]
fn trim_title_whitespace_trims_underscores() {
    // Exercise `trim_title_whitespace` itself; going through
    // `normalize_title_chars` would leave the in-place trimming untested.
    for (input, expected) in
        [("_a_b", "a_b"), ("a_b_", "a_b"), ("_a_b_", "a_b")]
    {
        let mut trimmed = input.to_string();
        trim_title_whitespace(&mut trimmed);
        assert_eq!(trimmed, expected);
    }
}
640
641const UPPERCASE_DOTTED_I_LANGUAGES: [&str; 4] = ["az", "kaa", "kk", "tr"];
642
643/// Functional equivalent of `Language::ucfirst()`.
644///
645/// This is probably not going to be identical because of different Unicode
646/// versions in use, but hopefully those cases are so rare we don't hit them.
647///
648/// Or we could just hardcode a special mapping like MediaWiki does for
649/// client-side JavaScript.
650fn uppercase_first(lang: &str, input: &mut String) {
651    if let Some(first) = input.chars().next() {
652        // `Language::ucfirst()` has special handling for the `i` character
653        // in some languages
654        if first == 'i' && UPPERCASE_DOTTED_I_LANGUAGES.contains(&lang) {
655            // i has len_utf8() of 1
656            input.drain(..1);
657            // İ has len_utf8() of 2
658            input.reserve(2);
659            input.insert(0, 'İ');
660        } else if php::ALREADY_UPPERCASE.contains(&first) {
661            // Skip, do nothing
662        } else if let Some(replace) = php::to_uppercase(first) {
663            input.drain(..first.len_utf8());
664            input.reserve(replace.len_utf8());
665            input.insert(0, replace);
666        } else if !first.is_uppercase() {
667            input.drain(..first.len_utf8());
668            input.reserve(first.to_uppercase().map(|c| c.len_utf8()).sum());
669            for c in first.to_uppercase() {
670                input.insert(0, c);
671            }
672        }
673    }
674}
675
#[test]
fn uppercase_first_respects_dotted_i_langs() {
    // Runs `uppercase_first` on a copy of `input` and returns the result.
    let capitalize = |lang: &str, input: &str| {
        let mut capitalized = input.to_string();
        uppercase_first(lang, &mut capitalized);
        capitalized
    };

    assert_eq!(capitalize("en", "abc"), "Abc");
    assert_eq!(capitalize("en", "istanbul"), "Istanbul");
    // Turkish is one of the languages with a dotted capital I.
    assert_eq!(capitalize("tr", "istanbul"), "İstanbul");
}
mwtitle/codec.rs

mwtitle/
codec.rs