encoding-next 0.3.0

Character encoding support for Rust
Documentation
// This is a part of encoding-next.
// Copyright (c) 2013-2015, Kang Seonghoon.
// See README.md and LICENSE.txt for details.

//! An interface for retrieving an encoding (or a set of encodings) from a string/numeric label.

use crate::all;
use crate::types::EncodingRef;

/// Returns an encoding from given label, defined in the WHATWG Encoding standard, if any.
/// Implements "get an encoding" algorithm: <http://encoding.spec.whatwg.org/#concept-encoding-get>
pub fn encoding_from_whatwg_label(label: &str) -> Option<EncodingRef> {
    let label = label.trim_matches(&[' ', '\n', '\r', '\t', '\x0C'][..]);
    let label: String = label
        .chars()
        .map(|c| match c {
            'A'..='Z' => (c as u8 + 32) as char,
            _ => c,
        })
        .collect();
    match &label[..] {
        "unicode-1-1-utf-8" | "utf-8" | "utf8" => Some(all::UTF_8 as EncodingRef),
        "866" | "cp866" | "csibm866" | "ibm866" => Some(all::IBM866 as EncodingRef),
        "csisolatin2" | "iso-8859-2" | "iso-ir-101" | "iso8859-2" | "iso88592" | "iso_8859-2"
        | "iso_8859-2:1987" | "l2" | "latin2" => Some(all::ISO_8859_2 as EncodingRef),
        "csisolatin3" | "iso-8859-3" | "iso-ir-109" | "iso8859-3" | "iso88593" | "iso_8859-3"
        | "iso_8859-3:1988" | "l3" | "latin3" => Some(all::ISO_8859_3 as EncodingRef),
        "csisolatin4" | "iso-8859-4" | "iso-ir-110" | "iso8859-4" | "iso88594" | "iso_8859-4"
        | "iso_8859-4:1988" | "l4" | "latin4" => Some(all::ISO_8859_4 as EncodingRef),
        "csisolatincyrillic" | "cyrillic" | "iso-8859-5" | "iso-ir-144" | "iso8859-5"
        | "iso88595" | "iso_8859-5" | "iso_8859-5:1988" => Some(all::ISO_8859_5 as EncodingRef),
        "arabic" | "asmo-708" | "csiso88596e" | "csiso88596i" | "csisolatinarabic" | "ecma-114"
        | "iso-8859-6" | "iso-8859-6-e" | "iso-8859-6-i" | "iso-ir-127" | "iso8859-6"
        | "iso88596" | "iso_8859-6" | "iso_8859-6:1987" => Some(all::ISO_8859_6 as EncodingRef),
        "csisolatingreek" | "ecma-118" | "elot_928" | "greek" | "greek8" | "iso-8859-7"
        | "iso-ir-126" | "iso8859-7" | "iso88597" | "iso_8859-7" | "iso_8859-7:1987"
        | "sun_eu_greek" => Some(all::ISO_8859_7 as EncodingRef),
        "csiso88598e" | "csisolatinhebrew" | "hebrew" | "iso-8859-8" | "iso-8859-8-e"
        | "iso-ir-138" | "iso8859-8" | "iso88598" | "iso_8859-8" | "iso_8859-8:1988" | "visual" => {
            Some(all::ISO_8859_8 as EncodingRef)
        }
        "csiso88598i" | "iso-8859-8-i" | "logical" => {
            Some(all::whatwg::ISO_8859_8_I as EncodingRef)
        }
        "csisolatin6" | "iso-8859-10" | "iso-ir-157" | "iso8859-10" | "iso885910" | "l6"
        | "latin6" => Some(all::ISO_8859_10 as EncodingRef),
        "iso-8859-13" | "iso8859-13" | "iso885913" => Some(all::ISO_8859_13 as EncodingRef),
        "iso-8859-14" | "iso8859-14" | "iso885914" => Some(all::ISO_8859_14 as EncodingRef),
        "csisolatin9" | "iso-8859-15" | "iso8859-15" | "iso885915" | "iso_8859-15" | "l9" => {
            Some(all::ISO_8859_15 as EncodingRef)
        }
        "iso-8859-16" => Some(all::ISO_8859_16 as EncodingRef),
        "cskoi8r" | "koi" | "koi8" | "koi8-r" | "koi8_r" => Some(all::KOI8_R as EncodingRef),
        "koi8-ru" | "koi8-u" => Some(all::KOI8_U as EncodingRef),
        "csmacintosh" | "mac" | "macintosh" | "x-mac-roman" => Some(all::MAC_ROMAN as EncodingRef),
        "dos-874" | "iso-8859-11" | "iso8859-11" | "iso885911" | "tis-620" | "windows-874" => {
            Some(all::WINDOWS_874 as EncodingRef)
        }
        "cp1250" | "windows-1250" | "x-cp1250" => Some(all::WINDOWS_1250 as EncodingRef),
        "cp1251" | "windows-1251" | "x-cp1251" => Some(all::WINDOWS_1251 as EncodingRef),
        "ansi_x3.4-1968" | "ascii" | "cp1252" | "cp819" | "csisolatin1" | "ibm819"
        | "iso-8859-1" | "iso-ir-100" | "iso8859-1" | "iso88591" | "iso_8859-1"
        | "iso_8859-1:1987" | "l1" | "latin1" | "us-ascii" | "windows-1252" | "x-cp1252" => {
            Some(all::WINDOWS_1252 as EncodingRef)
        }
        "cp1253" | "windows-1253" | "x-cp1253" => Some(all::WINDOWS_1253 as EncodingRef),
        "cp1254" | "csisolatin5" | "iso-8859-9" | "iso-ir-148" | "iso8859-9" | "iso88599"
        | "iso_8859-9" | "iso_8859-9:1989" | "l5" | "latin5" | "windows-1254" | "x-cp1254" => {
            Some(all::WINDOWS_1254 as EncodingRef)
        }
        "cp1255" | "windows-1255" | "x-cp1255" => Some(all::WINDOWS_1255 as EncodingRef),
        "cp1256" | "windows-1256" | "x-cp1256" => Some(all::WINDOWS_1256 as EncodingRef),
        "cp1257" | "windows-1257" | "x-cp1257" => Some(all::WINDOWS_1257 as EncodingRef),
        "cp1258" | "windows-1258" | "x-cp1258" => Some(all::WINDOWS_1258 as EncodingRef),
        "x-mac-cyrillic" | "x-mac-ukrainian" => Some(all::MAC_CYRILLIC as EncodingRef),
        "chinese" | "csgb2312" | "csiso58gb231280" | "gb2312" | "gb_2312" | "gb_2312-80"
        | "gbk" | "iso-ir-58" | "x-gbk" => Some(all::GBK as EncodingRef),
        "gb18030" => Some(all::GB18030 as EncodingRef),
        "big5" | "big5-hkscs" | "cn-big5" | "csbig5" | "x-x-big5" => {
            Some(all::BIG5_2003 as EncodingRef)
        }
        "cseucpkdfmtjapanese" | "euc-jp" | "x-euc-jp" => Some(all::EUC_JP as EncodingRef),
        "csiso2022jp" | "iso-2022-jp" => Some(all::ISO_2022_JP as EncodingRef),
        "csshiftjis" | "ms932" | "ms_kanji" | "shift-jis" | "shift_jis" | "sjis"
        | "windows-31j" | "x-sjis" => Some(all::WINDOWS_31J as EncodingRef),
        "cseuckr" | "csksc56011987" | "euc-kr" | "iso-ir-149" | "korean" | "ks_c_5601-1987"
        | "ks_c_5601-1989" | "ksc5601" | "ksc_5601" | "windows-949" => {
            Some(all::WINDOWS_949 as EncodingRef)
        }
        "csiso2022kr" | "hz-gb-2312" | "iso-2022-kr" | "iso-2022-cn" | "iso-2022-cn-ext" => {
            Some(all::whatwg::REPLACEMENT as EncodingRef)
        }
        "utf-16be" => Some(all::UTF_16BE as EncodingRef),
        "utf-16" | "utf-16le" => Some(all::UTF_16LE as EncodingRef),
        "x-user-defined" => Some(all::whatwg::X_USER_DEFINED as EncodingRef),
        _ => None,
    }
}

/// Returns an encoding from Windows code page number.
/// <http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756%28v=vs.85%29.aspx>
/// Sometimes it can return a *superset* of the requested encoding, e.g. for several CJK encodings.
pub fn encoding_from_windows_code_page(cp: usize) -> Option<EncodingRef> {
    match cp {
        65001 => Some(all::UTF_8 as EncodingRef),
        866 => Some(all::IBM866 as EncodingRef),
        28591 => Some(all::ISO_8859_1 as EncodingRef),
        28592 => Some(all::ISO_8859_2 as EncodingRef),
        28593 => Some(all::ISO_8859_3 as EncodingRef),
        28594 => Some(all::ISO_8859_4 as EncodingRef),
        28595 => Some(all::ISO_8859_5 as EncodingRef),
        28596 => Some(all::ISO_8859_6 as EncodingRef),
        28597 => Some(all::ISO_8859_7 as EncodingRef),
        28598 => Some(all::ISO_8859_8 as EncodingRef),
        38598 => Some(all::whatwg::ISO_8859_8_I as EncodingRef),
        28603 => Some(all::ISO_8859_13 as EncodingRef),
        28605 => Some(all::ISO_8859_15 as EncodingRef),
        20866 => Some(all::KOI8_R as EncodingRef),
        21866 => Some(all::KOI8_U as EncodingRef),
        10000 => Some(all::MAC_ROMAN as EncodingRef),
        874 => Some(all::WINDOWS_874 as EncodingRef),
        1250 => Some(all::WINDOWS_1250 as EncodingRef),
        1251 => Some(all::WINDOWS_1251 as EncodingRef),
        1252 => Some(all::WINDOWS_1252 as EncodingRef),
        1253 => Some(all::WINDOWS_1253 as EncodingRef),
        1254 => Some(all::WINDOWS_1254 as EncodingRef),
        1255 => Some(all::WINDOWS_1255 as EncodingRef),
        1256 => Some(all::WINDOWS_1256 as EncodingRef),
        1257 => Some(all::WINDOWS_1257 as EncodingRef),
        1258 => Some(all::WINDOWS_1258 as EncodingRef),
        1259 => Some(all::MAC_CYRILLIC as EncodingRef),
        936 | 54936 => Some(all::GB18030 as EncodingRef), // XXX technically wrong
        52936 => Some(all::HZ as EncodingRef),
        950 => Some(all::BIG5_2003 as EncodingRef),
        20932 => Some(all::EUC_JP as EncodingRef),
        50220 => Some(all::ISO_2022_JP as EncodingRef),
        932 => Some(all::WINDOWS_31J as EncodingRef),
        949 => Some(all::WINDOWS_949 as EncodingRef),
        1201 => Some(all::UTF_16BE as EncodingRef),
        1200 => Some(all::UTF_16LE as EncodingRef),
        _ => None,
    }
}

#[cfg(test)]
mod tests {
    extern crate test;
    use super::encoding_from_whatwg_label;
    use crate::all;

    #[test]
    fn test_encoding_from_whatwg_label() {
        assert!(encoding_from_whatwg_label("utf-8").is_some());
        assert!(encoding_from_whatwg_label("UTF-8").is_some());
        assert!(encoding_from_whatwg_label("\t\n\x0C\r utf-8\t\n\x0C\r ").is_some());
        assert!(
            encoding_from_whatwg_label("\u{A0}utf-8").is_none(),
            "Non-ASCII whitespace should not be trimmed"
        );
        assert!(encoding_from_whatwg_label("greek").is_some());
        assert!(
            encoding_from_whatwg_label("gree\u{212A}").is_none(),
            "Case-insensitive matching should be ASCII only. Kelvin sign does not match k."
        );

        // checks if the `whatwg_name` method returns the label that resolves back to that encoding
        for encoding in all::encodings() {
            if let Some(whatwg_name) = encoding.whatwg_name() {
                if whatwg_name == "replacement" {
                    continue;
                }
                assert_eq!(
                    encoding_from_whatwg_label(whatwg_name).and_then(|e| e.whatwg_name()),
                    Some(whatwg_name)
                );
            }
        }
    }

    #[bench]
    fn bench_encoding_from_whatwg_label(bencher: &mut test::Bencher) {
        bencher.iter(|| test::black_box(encoding_from_whatwg_label("iso-8859-bazinga")))
    }
}