mwtitle 0.2.0-alpha.2

MediaWiki title validation and formatting
Documentation
/*
Copyright (C) 2016, 2019 Ed Sanders
Copyright (C) 2021 Kunal Mehta <legoktm@debian.org>

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*!
 * Non-Unicode uppercase mapping used in MediaWiki titles.
 *
 * Some MediaWiki titles have their first code point (letter) uppercased.
 * Most code points are uppercased according to the [Unicode uppercase mapping],
 * but some have a different uppercase mapping,
 * following the `mb_strtoupper` function from PHP version 7.2 and earlier.
 * In [PHP 7.3], `mb_strtoupper` was updated to follow Unicode casemapping,
 * but MediaWiki titles still use the old uppercasing for compatibility.
 * `ALREADY_UPPERCASE` lists code points that are unchanged by uppercasing
 * and `to_uppercase` maps some code points to their non-Unicode uppercasing.
 *
 * In general, MediaWiki title casing doesn't uppercase one code point to two or three code points
 * as Unicode does with some code points (see [SpecialCasing.txt] for a full list).
 * It leaves some of these code points unchanged,
 * like ﬀ (U+FB00 LATIN SMALL LIGATURE FF),
 * whose Unicode uppercase is
 * FF (U+0046 LATIN CAPITAL LETTER F, U+0046 LATIN CAPITAL LETTER F).
 * It maps other code points to a different single code point,
 * like ᾳ (U+1FB3 GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI),
 * whose Unicode uppercase is
 * ΑΙ (U+0391 GREEK CAPITAL LETTER ALPHA, U+0399 GREEK CAPITAL LETTER IOTA)
 * but whose MediaWiki title uppercase is
 * ᾼ (U+1FBC GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI).
 *
 * MediaWiki title casing also leaves some code points unchanged
 * even when Unicode gives them a one-to-one uppercase mapping.
 * Some of these code points had uppercase mappings
 * in [UnicodeData.txt] for version 1.1 of Unicode,
 * like ⓐ (U+24D0 CIRCLED LATIN SMALL LETTER A)
 * to Ⓐ (U+24B6 CIRCLED LATIN CAPITAL LETTER A).
 * Others had uppercase mappings added in a later version of Unicode;
 * for instance, lowercase ა (U+10D0 GEORGIAN LETTER AN) began to be mapped
 * to uppercase Ა (U+1C90 GEORGIAN MTAVRULI CAPITAL LETTER AN) in Unicode 11.0 (2018-06-05).
 *
 * [PHP 7.3]: https://www.zend.com/blog/guide-to-php-73
 * [SpecialCasing.txt]: http://ftp.unicode.org/Public/UNIDATA/SpecialCasing.txt
 * [UnicodeData.txt]: http://ftp.unicode.org/Public/1.1-Update/UnicodeData-1.1.5.txt
 * [Unicode uppercase mapping]: https://www.unicode.org/reports/tr44/#Casemapping
 */

/// Characters that PHP 7.2 and earlier considers to be uppercase but Unicode does not.
///
/// MediaWiki title casing leaves each of these code points unchanged when
/// uppercasing, even though Unicode defines an uppercase mapping for them.
/// The table is a fixed-size array of exactly 204 `char`s.
///
/// NOTE(review): many entries are code points that do not render in every
/// viewer and can look like empty literals. Avoid hand-editing individual
/// entries; compare against the upstream JSON instead, and keep the declared
/// length in sync with the number of entries.
///
/// Taken from MediaWiki's mediawiki.Title/phpCharToUpper.json @ 58233ac5af17d
pub(crate) const ALREADY_UPPERCASE: [char; 204] = [
    'ß', 'ʼn', 'ǰ', 'ʂ', 'ͅ', 'ΐ', 'ΰ', 'և', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '𖹠', '𖹡', '𖹢', '𖹣', '𖹤', '𖹥', '𖹦', '𖹧',
    '𖹨', '𖹩', '𖹪', '𖹫', '𖹬', '𖹭', '𖹮', '𖹯', '𖹰', '𖹱', '𖹲', '𖹳', '𖹴', '𖹵', '𖹶',
    '𖹷', '𖹸', '𖹹', '𖹺', '𖹻', '𖹼', '𖹽', '𖹾', '𖹿',
];

/// Characters that PHP 7.2 and earlier maps to uppercase differently than Unicode.
///
/// Looks up the MediaWiki-title uppercase form of `input`. Every override maps
/// one code point to exactly one code point.
///
/// Returns `Some(upper)` when `input` has an override listed here, and `None`
/// for every other character. `None` only means "no override in this table";
/// it does not mean `input` is already uppercase. The caller decides what to
/// do next (presumably consult `ALREADY_UPPERCASE` and otherwise fall back to
/// standard Unicode uppercasing; confirm at the call site).
///
/// NOTE(review): the code points in the arms below may not render in every
/// viewer; compare against the upstream JSON rather than editing by eye.
///
/// Taken from MediaWiki's mediawiki.Title/phpCharToUpper.json @ 58233ac5af17d
pub(crate) fn to_uppercase(input: char) -> Option<char> {
    match input {
        // Each arm pairs one input code point with its single-code-point override.
        '' => Some(''),
        '' => Some(''),
        '' => Some(''),
        '' => Some(''),
        '' => Some(''),
        '' => Some(''),
        '' => Some(''),
        '' => Some(''),
        '' => Some(''),
        '' => Some(''),
        '' => Some(''),
        '' => Some(''),
        '' => Some(''),
        '' => Some(''),
        '' => Some(''),
        '' => Some(''),
        '' => Some(''),
        '' => Some(''),
        '' => Some(''),
        '' => Some(''),
        '' => Some(''),
        '' => Some(''),
        '' => Some(''),
        '' => Some(''),
        '' => Some(''),
        '' => Some(''),
        '' => Some(''),
        // No override for this character. We can't default to
        // input.to_uppercase() here because that returns an iterator of chars
        // (a Unicode uppercase form may span several code points) rather than
        // a single one, so the fallback is left to the caller.
        _ => None,
    }
}