1use crate::all;
8use crate::types::EncodingRef;
9
10pub fn encoding_from_whatwg_label(label: &str) -> Option<EncodingRef> {
13 let label = label.trim_matches(&[' ', '\n', '\r', '\t', '\x0C'][..]);
14 let label: String = label
15 .chars()
16 .map(|c| match c {
17 'A'..='Z' => (c as u8 + 32) as char,
18 _ => c,
19 })
20 .collect();
21 match &label[..] {
22 "unicode-1-1-utf-8" | "utf-8" | "utf8" => Some(all::UTF_8 as EncodingRef),
23 "866" | "cp866" | "csibm866" | "ibm866" => Some(all::IBM866 as EncodingRef),
24 "csisolatin2" | "iso-8859-2" | "iso-ir-101" | "iso8859-2" | "iso88592" | "iso_8859-2"
25 | "iso_8859-2:1987" | "l2" | "latin2" => Some(all::ISO_8859_2 as EncodingRef),
26 "csisolatin3" | "iso-8859-3" | "iso-ir-109" | "iso8859-3" | "iso88593" | "iso_8859-3"
27 | "iso_8859-3:1988" | "l3" | "latin3" => Some(all::ISO_8859_3 as EncodingRef),
28 "csisolatin4" | "iso-8859-4" | "iso-ir-110" | "iso8859-4" | "iso88594" | "iso_8859-4"
29 | "iso_8859-4:1988" | "l4" | "latin4" => Some(all::ISO_8859_4 as EncodingRef),
30 "csisolatincyrillic" | "cyrillic" | "iso-8859-5" | "iso-ir-144" | "iso8859-5"
31 | "iso88595" | "iso_8859-5" | "iso_8859-5:1988" => Some(all::ISO_8859_5 as EncodingRef),
32 "arabic" | "asmo-708" | "csiso88596e" | "csiso88596i" | "csisolatinarabic" | "ecma-114"
33 | "iso-8859-6" | "iso-8859-6-e" | "iso-8859-6-i" | "iso-ir-127" | "iso8859-6"
34 | "iso88596" | "iso_8859-6" | "iso_8859-6:1987" => Some(all::ISO_8859_6 as EncodingRef),
35 "csisolatingreek" | "ecma-118" | "elot_928" | "greek" | "greek8" | "iso-8859-7"
36 | "iso-ir-126" | "iso8859-7" | "iso88597" | "iso_8859-7" | "iso_8859-7:1987"
37 | "sun_eu_greek" => Some(all::ISO_8859_7 as EncodingRef),
38 "csiso88598e" | "csisolatinhebrew" | "hebrew" | "iso-8859-8" | "iso-8859-8-e"
39 | "iso-ir-138" | "iso8859-8" | "iso88598" | "iso_8859-8" | "iso_8859-8:1988" | "visual" => {
40 Some(all::ISO_8859_8 as EncodingRef)
41 }
42 "csiso88598i" | "iso-8859-8-i" | "logical" => {
43 Some(all::whatwg::ISO_8859_8_I as EncodingRef)
44 }
45 "csisolatin6" | "iso-8859-10" | "iso-ir-157" | "iso8859-10" | "iso885910" | "l6"
46 | "latin6" => Some(all::ISO_8859_10 as EncodingRef),
47 "iso-8859-13" | "iso8859-13" | "iso885913" => Some(all::ISO_8859_13 as EncodingRef),
48 "iso-8859-14" | "iso8859-14" | "iso885914" => Some(all::ISO_8859_14 as EncodingRef),
49 "csisolatin9" | "iso-8859-15" | "iso8859-15" | "iso885915" | "iso_8859-15" | "l9" => {
50 Some(all::ISO_8859_15 as EncodingRef)
51 }
52 "iso-8859-16" => Some(all::ISO_8859_16 as EncodingRef),
53 "cskoi8r" | "koi" | "koi8" | "koi8-r" | "koi8_r" => Some(all::KOI8_R as EncodingRef),
54 "koi8-ru" | "koi8-u" => Some(all::KOI8_U as EncodingRef),
55 "csmacintosh" | "mac" | "macintosh" | "x-mac-roman" => Some(all::MAC_ROMAN as EncodingRef),
56 "dos-874" | "iso-8859-11" | "iso8859-11" | "iso885911" | "tis-620" | "windows-874" => {
57 Some(all::WINDOWS_874 as EncodingRef)
58 }
59 "cp1250" | "windows-1250" | "x-cp1250" => Some(all::WINDOWS_1250 as EncodingRef),
60 "cp1251" | "windows-1251" | "x-cp1251" => Some(all::WINDOWS_1251 as EncodingRef),
61 "ansi_x3.4-1968" | "ascii" | "cp1252" | "cp819" | "csisolatin1" | "ibm819"
62 | "iso-8859-1" | "iso-ir-100" | "iso8859-1" | "iso88591" | "iso_8859-1"
63 | "iso_8859-1:1987" | "l1" | "latin1" | "us-ascii" | "windows-1252" | "x-cp1252" => {
64 Some(all::WINDOWS_1252 as EncodingRef)
65 }
66 "cp1253" | "windows-1253" | "x-cp1253" => Some(all::WINDOWS_1253 as EncodingRef),
67 "cp1254" | "csisolatin5" | "iso-8859-9" | "iso-ir-148" | "iso8859-9" | "iso88599"
68 | "iso_8859-9" | "iso_8859-9:1989" | "l5" | "latin5" | "windows-1254" | "x-cp1254" => {
69 Some(all::WINDOWS_1254 as EncodingRef)
70 }
71 "cp1255" | "windows-1255" | "x-cp1255" => Some(all::WINDOWS_1255 as EncodingRef),
72 "cp1256" | "windows-1256" | "x-cp1256" => Some(all::WINDOWS_1256 as EncodingRef),
73 "cp1257" | "windows-1257" | "x-cp1257" => Some(all::WINDOWS_1257 as EncodingRef),
74 "cp1258" | "windows-1258" | "x-cp1258" => Some(all::WINDOWS_1258 as EncodingRef),
75 "x-mac-cyrillic" | "x-mac-ukrainian" => Some(all::MAC_CYRILLIC as EncodingRef),
76 "chinese" | "csgb2312" | "csiso58gb231280" | "gb2312" | "gb_2312" | "gb_2312-80"
77 | "gbk" | "iso-ir-58" | "x-gbk" => Some(all::GBK as EncodingRef),
78 "gb18030" => Some(all::GB18030 as EncodingRef),
79 "big5" | "big5-hkscs" | "cn-big5" | "csbig5" | "x-x-big5" => {
80 Some(all::BIG5_2003 as EncodingRef)
81 }
82 "cseucpkdfmtjapanese" | "euc-jp" | "x-euc-jp" => Some(all::EUC_JP as EncodingRef),
83 "csiso2022jp" | "iso-2022-jp" => Some(all::ISO_2022_JP as EncodingRef),
84 "csshiftjis" | "ms932" | "ms_kanji" | "shift-jis" | "shift_jis" | "sjis"
85 | "windows-31j" | "x-sjis" => Some(all::WINDOWS_31J as EncodingRef),
86 "cseuckr" | "csksc56011987" | "euc-kr" | "iso-ir-149" | "korean" | "ks_c_5601-1987"
87 | "ks_c_5601-1989" | "ksc5601" | "ksc_5601" | "windows-949" => {
88 Some(all::WINDOWS_949 as EncodingRef)
89 }
90 "csiso2022kr" | "hz-gb-2312" | "iso-2022-kr" | "iso-2022-cn" | "iso-2022-cn-ext" => {
91 Some(all::whatwg::REPLACEMENT as EncodingRef)
92 }
93 "utf-16be" => Some(all::UTF_16BE as EncodingRef),
94 "utf-16" | "utf-16le" => Some(all::UTF_16LE as EncodingRef),
95 "x-user-defined" => Some(all::whatwg::X_USER_DEFINED as EncodingRef),
96 _ => None,
97 }
98}
99
100pub fn encoding_from_windows_code_page(cp: usize) -> Option<EncodingRef> {
104 match cp {
105 65001 => Some(all::UTF_8 as EncodingRef),
106 866 => Some(all::IBM866 as EncodingRef),
107 28591 => Some(all::ISO_8859_1 as EncodingRef),
108 28592 => Some(all::ISO_8859_2 as EncodingRef),
109 28593 => Some(all::ISO_8859_3 as EncodingRef),
110 28594 => Some(all::ISO_8859_4 as EncodingRef),
111 28595 => Some(all::ISO_8859_5 as EncodingRef),
112 28596 => Some(all::ISO_8859_6 as EncodingRef),
113 28597 => Some(all::ISO_8859_7 as EncodingRef),
114 28598 => Some(all::ISO_8859_8 as EncodingRef),
115 38598 => Some(all::whatwg::ISO_8859_8_I as EncodingRef),
116 28603 => Some(all::ISO_8859_13 as EncodingRef),
117 28605 => Some(all::ISO_8859_15 as EncodingRef),
118 20866 => Some(all::KOI8_R as EncodingRef),
119 21866 => Some(all::KOI8_U as EncodingRef),
120 10000 => Some(all::MAC_ROMAN as EncodingRef),
121 874 => Some(all::WINDOWS_874 as EncodingRef),
122 1250 => Some(all::WINDOWS_1250 as EncodingRef),
123 1251 => Some(all::WINDOWS_1251 as EncodingRef),
124 1252 => Some(all::WINDOWS_1252 as EncodingRef),
125 1253 => Some(all::WINDOWS_1253 as EncodingRef),
126 1254 => Some(all::WINDOWS_1254 as EncodingRef),
127 1255 => Some(all::WINDOWS_1255 as EncodingRef),
128 1256 => Some(all::WINDOWS_1256 as EncodingRef),
129 1257 => Some(all::WINDOWS_1257 as EncodingRef),
130 1258 => Some(all::WINDOWS_1258 as EncodingRef),
131 1259 => Some(all::MAC_CYRILLIC as EncodingRef),
132 936 | 54936 => Some(all::GB18030 as EncodingRef), 52936 => Some(all::HZ as EncodingRef),
134 950 => Some(all::BIG5_2003 as EncodingRef),
135 20932 => Some(all::EUC_JP as EncodingRef),
136 50220 => Some(all::ISO_2022_JP as EncodingRef),
137 932 => Some(all::WINDOWS_31J as EncodingRef),
138 949 => Some(all::WINDOWS_949 as EncodingRef),
139 1201 => Some(all::UTF_16BE as EncodingRef),
140 1200 => Some(all::UTF_16LE as EncodingRef),
141 _ => None,
142 }
143}
144
#[cfg(test)]
mod tests {
    extern crate test;
    use super::encoding_from_whatwg_label;
    use crate::all;

    /// Exercises label normalization and checks that each encoding's own WHATWG name
    /// resolves back to an encoding reporting that same name.
    #[test]
    fn test_encoding_from_whatwg_label() {
        // Exact, upper-case and ASCII-whitespace-padded spellings must all resolve.
        let utf8_spellings = ["utf-8", "UTF-8", "\t\n\x0C\r utf-8\t\n\x0C\r "];
        for spelling in utf8_spellings.iter() {
            assert!(encoding_from_whatwg_label(spelling).is_some());
        }
        assert!(
            encoding_from_whatwg_label("\u{A0}utf-8").is_none(),
            "Non-ASCII whitespace should not be trimmed"
        );

        assert!(encoding_from_whatwg_label("greek").is_some());
        assert!(
            encoding_from_whatwg_label("gree\u{212A}").is_none(),
            "Case-insensitive matching should be ASCII only. Kelvin sign does not match k."
        );

        for encoding in all::encodings() {
            // Encodings without a WHATWG name, and the "replacement" one, are not checked.
            let whatwg_name = match encoding.whatwg_name() {
                Some(name) if name != "replacement" => name,
                _ => continue,
            };
            let resolved = encoding_from_whatwg_label(whatwg_name);
            assert_eq!(resolved.and_then(|e| e.whatwg_name()), Some(whatwg_name));
        }
    }

    /// Measures the lookup of a label that is not in the table.
    #[bench]
    fn bench_encoding_from_whatwg_label(bencher: &mut test::Bencher) {
        bencher.iter(|| {
            let lookup = encoding_from_whatwg_label("iso-8859-bazinga");
            test::black_box(lookup)
        })
    }
}