encoding/
label.rs

// This is a part of encoding-next.
// Copyright (c) 2013-2015, Kang Seonghoon.
// See README.md and LICENSE.txt for details.

//! An interface for retrieving an encoding (or a set of encodings) from a string/numeric label.
6
7use crate::all;
8use crate::types::EncodingRef;
9
10/// Returns an encoding from given label, defined in the WHATWG Encoding standard, if any.
11/// Implements "get an encoding" algorithm: <http://encoding.spec.whatwg.org/#concept-encoding-get>
12pub fn encoding_from_whatwg_label(label: &str) -> Option<EncodingRef> {
13    let label = label.trim_matches(&[' ', '\n', '\r', '\t', '\x0C'][..]);
14    let label: String = label
15        .chars()
16        .map(|c| match c {
17            'A'..='Z' => (c as u8 + 32) as char,
18            _ => c,
19        })
20        .collect();
21    match &label[..] {
22        "unicode-1-1-utf-8" | "utf-8" | "utf8" => Some(all::UTF_8 as EncodingRef),
23        "866" | "cp866" | "csibm866" | "ibm866" => Some(all::IBM866 as EncodingRef),
24        "csisolatin2" | "iso-8859-2" | "iso-ir-101" | "iso8859-2" | "iso88592" | "iso_8859-2"
25        | "iso_8859-2:1987" | "l2" | "latin2" => Some(all::ISO_8859_2 as EncodingRef),
26        "csisolatin3" | "iso-8859-3" | "iso-ir-109" | "iso8859-3" | "iso88593" | "iso_8859-3"
27        | "iso_8859-3:1988" | "l3" | "latin3" => Some(all::ISO_8859_3 as EncodingRef),
28        "csisolatin4" | "iso-8859-4" | "iso-ir-110" | "iso8859-4" | "iso88594" | "iso_8859-4"
29        | "iso_8859-4:1988" | "l4" | "latin4" => Some(all::ISO_8859_4 as EncodingRef),
30        "csisolatincyrillic" | "cyrillic" | "iso-8859-5" | "iso-ir-144" | "iso8859-5"
31        | "iso88595" | "iso_8859-5" | "iso_8859-5:1988" => Some(all::ISO_8859_5 as EncodingRef),
32        "arabic" | "asmo-708" | "csiso88596e" | "csiso88596i" | "csisolatinarabic" | "ecma-114"
33        | "iso-8859-6" | "iso-8859-6-e" | "iso-8859-6-i" | "iso-ir-127" | "iso8859-6"
34        | "iso88596" | "iso_8859-6" | "iso_8859-6:1987" => Some(all::ISO_8859_6 as EncodingRef),
35        "csisolatingreek" | "ecma-118" | "elot_928" | "greek" | "greek8" | "iso-8859-7"
36        | "iso-ir-126" | "iso8859-7" | "iso88597" | "iso_8859-7" | "iso_8859-7:1987"
37        | "sun_eu_greek" => Some(all::ISO_8859_7 as EncodingRef),
38        "csiso88598e" | "csisolatinhebrew" | "hebrew" | "iso-8859-8" | "iso-8859-8-e"
39        | "iso-ir-138" | "iso8859-8" | "iso88598" | "iso_8859-8" | "iso_8859-8:1988" | "visual" => {
40            Some(all::ISO_8859_8 as EncodingRef)
41        }
42        "csiso88598i" | "iso-8859-8-i" | "logical" => {
43            Some(all::whatwg::ISO_8859_8_I as EncodingRef)
44        }
45        "csisolatin6" | "iso-8859-10" | "iso-ir-157" | "iso8859-10" | "iso885910" | "l6"
46        | "latin6" => Some(all::ISO_8859_10 as EncodingRef),
47        "iso-8859-13" | "iso8859-13" | "iso885913" => Some(all::ISO_8859_13 as EncodingRef),
48        "iso-8859-14" | "iso8859-14" | "iso885914" => Some(all::ISO_8859_14 as EncodingRef),
49        "csisolatin9" | "iso-8859-15" | "iso8859-15" | "iso885915" | "iso_8859-15" | "l9" => {
50            Some(all::ISO_8859_15 as EncodingRef)
51        }
52        "iso-8859-16" => Some(all::ISO_8859_16 as EncodingRef),
53        "cskoi8r" | "koi" | "koi8" | "koi8-r" | "koi8_r" => Some(all::KOI8_R as EncodingRef),
54        "koi8-ru" | "koi8-u" => Some(all::KOI8_U as EncodingRef),
55        "csmacintosh" | "mac" | "macintosh" | "x-mac-roman" => Some(all::MAC_ROMAN as EncodingRef),
56        "dos-874" | "iso-8859-11" | "iso8859-11" | "iso885911" | "tis-620" | "windows-874" => {
57            Some(all::WINDOWS_874 as EncodingRef)
58        }
59        "cp1250" | "windows-1250" | "x-cp1250" => Some(all::WINDOWS_1250 as EncodingRef),
60        "cp1251" | "windows-1251" | "x-cp1251" => Some(all::WINDOWS_1251 as EncodingRef),
61        "ansi_x3.4-1968" | "ascii" | "cp1252" | "cp819" | "csisolatin1" | "ibm819"
62        | "iso-8859-1" | "iso-ir-100" | "iso8859-1" | "iso88591" | "iso_8859-1"
63        | "iso_8859-1:1987" | "l1" | "latin1" | "us-ascii" | "windows-1252" | "x-cp1252" => {
64            Some(all::WINDOWS_1252 as EncodingRef)
65        }
66        "cp1253" | "windows-1253" | "x-cp1253" => Some(all::WINDOWS_1253 as EncodingRef),
67        "cp1254" | "csisolatin5" | "iso-8859-9" | "iso-ir-148" | "iso8859-9" | "iso88599"
68        | "iso_8859-9" | "iso_8859-9:1989" | "l5" | "latin5" | "windows-1254" | "x-cp1254" => {
69            Some(all::WINDOWS_1254 as EncodingRef)
70        }
71        "cp1255" | "windows-1255" | "x-cp1255" => Some(all::WINDOWS_1255 as EncodingRef),
72        "cp1256" | "windows-1256" | "x-cp1256" => Some(all::WINDOWS_1256 as EncodingRef),
73        "cp1257" | "windows-1257" | "x-cp1257" => Some(all::WINDOWS_1257 as EncodingRef),
74        "cp1258" | "windows-1258" | "x-cp1258" => Some(all::WINDOWS_1258 as EncodingRef),
75        "x-mac-cyrillic" | "x-mac-ukrainian" => Some(all::MAC_CYRILLIC as EncodingRef),
76        "chinese" | "csgb2312" | "csiso58gb231280" | "gb2312" | "gb_2312" | "gb_2312-80"
77        | "gbk" | "iso-ir-58" | "x-gbk" => Some(all::GBK as EncodingRef),
78        "gb18030" => Some(all::GB18030 as EncodingRef),
79        "big5" | "big5-hkscs" | "cn-big5" | "csbig5" | "x-x-big5" => {
80            Some(all::BIG5_2003 as EncodingRef)
81        }
82        "cseucpkdfmtjapanese" | "euc-jp" | "x-euc-jp" => Some(all::EUC_JP as EncodingRef),
83        "csiso2022jp" | "iso-2022-jp" => Some(all::ISO_2022_JP as EncodingRef),
84        "csshiftjis" | "ms932" | "ms_kanji" | "shift-jis" | "shift_jis" | "sjis"
85        | "windows-31j" | "x-sjis" => Some(all::WINDOWS_31J as EncodingRef),
86        "cseuckr" | "csksc56011987" | "euc-kr" | "iso-ir-149" | "korean" | "ks_c_5601-1987"
87        | "ks_c_5601-1989" | "ksc5601" | "ksc_5601" | "windows-949" => {
88            Some(all::WINDOWS_949 as EncodingRef)
89        }
90        "csiso2022kr" | "hz-gb-2312" | "iso-2022-kr" | "iso-2022-cn" | "iso-2022-cn-ext" => {
91            Some(all::whatwg::REPLACEMENT as EncodingRef)
92        }
93        "utf-16be" => Some(all::UTF_16BE as EncodingRef),
94        "utf-16" | "utf-16le" => Some(all::UTF_16LE as EncodingRef),
95        "x-user-defined" => Some(all::whatwg::X_USER_DEFINED as EncodingRef),
96        _ => None,
97    }
98}
99
100/// Returns an encoding from Windows code page number.
101/// <http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756%28v=vs.85%29.aspx>
102/// Sometimes it can return a *superset* of the requested encoding, e.g. for several CJK encodings.
103pub fn encoding_from_windows_code_page(cp: usize) -> Option<EncodingRef> {
104    match cp {
105        65001 => Some(all::UTF_8 as EncodingRef),
106        866 => Some(all::IBM866 as EncodingRef),
107        28591 => Some(all::ISO_8859_1 as EncodingRef),
108        28592 => Some(all::ISO_8859_2 as EncodingRef),
109        28593 => Some(all::ISO_8859_3 as EncodingRef),
110        28594 => Some(all::ISO_8859_4 as EncodingRef),
111        28595 => Some(all::ISO_8859_5 as EncodingRef),
112        28596 => Some(all::ISO_8859_6 as EncodingRef),
113        28597 => Some(all::ISO_8859_7 as EncodingRef),
114        28598 => Some(all::ISO_8859_8 as EncodingRef),
115        38598 => Some(all::whatwg::ISO_8859_8_I as EncodingRef),
116        28603 => Some(all::ISO_8859_13 as EncodingRef),
117        28605 => Some(all::ISO_8859_15 as EncodingRef),
118        20866 => Some(all::KOI8_R as EncodingRef),
119        21866 => Some(all::KOI8_U as EncodingRef),
120        10000 => Some(all::MAC_ROMAN as EncodingRef),
121        874 => Some(all::WINDOWS_874 as EncodingRef),
122        1250 => Some(all::WINDOWS_1250 as EncodingRef),
123        1251 => Some(all::WINDOWS_1251 as EncodingRef),
124        1252 => Some(all::WINDOWS_1252 as EncodingRef),
125        1253 => Some(all::WINDOWS_1253 as EncodingRef),
126        1254 => Some(all::WINDOWS_1254 as EncodingRef),
127        1255 => Some(all::WINDOWS_1255 as EncodingRef),
128        1256 => Some(all::WINDOWS_1256 as EncodingRef),
129        1257 => Some(all::WINDOWS_1257 as EncodingRef),
130        1258 => Some(all::WINDOWS_1258 as EncodingRef),
131        1259 => Some(all::MAC_CYRILLIC as EncodingRef),
132        936 | 54936 => Some(all::GB18030 as EncodingRef), // XXX technically wrong
133        52936 => Some(all::HZ as EncodingRef),
134        950 => Some(all::BIG5_2003 as EncodingRef),
135        20932 => Some(all::EUC_JP as EncodingRef),
136        50220 => Some(all::ISO_2022_JP as EncodingRef),
137        932 => Some(all::WINDOWS_31J as EncodingRef),
138        949 => Some(all::WINDOWS_949 as EncodingRef),
139        1201 => Some(all::UTF_16BE as EncodingRef),
140        1200 => Some(all::UTF_16LE as EncodingRef),
141        _ => None,
142    }
143}
144
#[cfg(test)]
mod tests {
    extern crate test;
    use super::encoding_from_whatwg_label;
    use crate::all;

    /// Checks label normalization (ASCII-only trimming and case folding) and that every
    /// WHATWG name reported by an encoding resolves back to an encoding with the same name.
    #[test]
    fn test_encoding_from_whatwg_label() {
        // Plain, upper-cased and ASCII-whitespace-padded spellings must all be accepted.
        for &label in ["utf-8", "UTF-8", "\t\n\x0C\r utf-8\t\n\x0C\r "].iter() {
            assert!(encoding_from_whatwg_label(label).is_some());
        }
        assert!(
            encoding_from_whatwg_label("\u{A0}utf-8").is_none(),
            "Non-ASCII whitespace should not be trimmed"
        );
        assert!(encoding_from_whatwg_label("greek").is_some());
        assert!(
            encoding_from_whatwg_label("gree\u{212A}").is_none(),
            "Case-insensitive matching should be ASCII only. Kelvin sign does not match k."
        );

        // checks if the `whatwg_name` method returns the label that resolves back to that encoding
        for encoding in all::encodings() {
            let name = match encoding.whatwg_name() {
                // "replacement" is excluded from the round-trip check.
                Some(name) if name != "replacement" => name,
                _ => continue,
            };
            let round_trip = encoding_from_whatwg_label(name).and_then(|found| found.whatwg_name());
            assert_eq!(round_trip, Some(name));
        }
    }

    /// Benchmarks a lookup of a label that is not in the table, i.e. one that falls through
    /// to the catch-all arm.
    #[bench]
    fn bench_encoding_from_whatwg_label(bencher: &mut test::Bencher) {
        bencher.iter(|| {
            let looked_up = encoding_from_whatwg_label("iso-8859-bazinga");
            test::black_box(looked_up)
        })
    }
}