1use std::io::{self, Read};
16
/// The first four bytes of the input, kept in stream order.
///
/// `detect` fills this from the bytes it reads up front and
/// `detect_byte_order_mark` pattern-matches on it.
#[derive(Debug)]
struct Bom(
    u8, // first byte of the stream
    u8, // second byte
    u8, // third byte
    u8, // fourth byte
);
19
/// Encoding family recorded in a `Descriptor`.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
enum Flavour {
    /// Used by the 32-bit `UCS_4_*` descriptors.
    UCS,
    /// Used by the `UTF_8` and `UTF_16_*` descriptors.
    UTF,
    /// Used by the `EBCDIC` descriptor.
    EBCDIC,
    /// Used by the single-byte `ASCII_8BIT` descriptor.
    ASCII,
    /// Family not determined; used by the multi-byte `ASCII_*BIT_*` descriptors.
    Unknown,
}
28
/// Position of the significant byte within each code unit of the input.
///
/// `search` uses this to decide which byte of every chunk to sample.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
enum ByteOrder {
    /// Significant byte is the last byte of each chunk.
    BigEndian,
    /// Significant byte is the first byte of each chunk.
    LittleEndian,
    /// Unusual 32-bit order "2143"; `search` samples the byte at offset 2.
    Unusual2143,
    /// Unusual 32-bit order "3412"; `search` samples the byte at offset 1.
    Unusual3412,
    /// Single-byte code units, so byte order has no meaning.
    NotApplicable,
}
37
/// Size of one code unit.
///
/// Each discriminant is the width in bits; `search` divides it by eight to
/// obtain the chunk size in bytes.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
enum Width {
    /// One byte per code unit.
    EightBit = 8,
    /// Two bytes per code unit.
    SixteenBit = 16,
    /// Four bytes per code unit.
    ThirtyTwoBit = 32,
}
44
45#[derive(Clone, Debug, PartialEq, Eq)]
46struct Descriptor(Flavour, Width, ByteOrder);
47
48const UCS_4_BE: Descriptor = Descriptor(Flavour::UCS, Width::ThirtyTwoBit, ByteOrder::BigEndian);
50const UCS_4_LE: Descriptor = Descriptor(Flavour::UCS, Width::ThirtyTwoBit, ByteOrder::LittleEndian);
51const UCS_4_2143: Descriptor =
52 Descriptor(Flavour::UCS, Width::ThirtyTwoBit, ByteOrder::Unusual2143);
53const UCS_4_3412: Descriptor =
54 Descriptor(Flavour::UCS, Width::ThirtyTwoBit, ByteOrder::Unusual3412);
55
56const UTF_16_BE: Descriptor = Descriptor(Flavour::UTF, Width::SixteenBit, ByteOrder::BigEndian);
58const UTF_16_LE: Descriptor = Descriptor(Flavour::UTF, Width::SixteenBit, ByteOrder::LittleEndian);
59
60const UTF_8: Descriptor = Descriptor(Flavour::UTF, Width::EightBit, ByteOrder::NotApplicable);
61const EBCDIC: Descriptor = Descriptor(Flavour::EBCDIC, Width::EightBit, ByteOrder::NotApplicable);
62
63const ASCII_32BIT_BE: Descriptor =
65 Descriptor(Flavour::Unknown, Width::ThirtyTwoBit, ByteOrder::BigEndian);
66const ASCII_32BIT_LE: Descriptor = Descriptor(
67 Flavour::Unknown,
68 Width::ThirtyTwoBit,
69 ByteOrder::LittleEndian,
70);
71const ASCII_16BIT_BE: Descriptor =
72 Descriptor(Flavour::Unknown, Width::SixteenBit, ByteOrder::BigEndian);
73const ASCII_16BIT_LE: Descriptor =
74 Descriptor(Flavour::Unknown, Width::SixteenBit, ByteOrder::LittleEndian);
75const ASCII_8BIT: Descriptor =
76 Descriptor(Flavour::ASCII, Width::EightBit, ByteOrder::NotApplicable);
77
78pub fn detect<R: Read>(reader: &mut R, hint: Option<String>) -> Result<Vec<String>, io::Error> {
98 let mut first_four_bytes = [0u8; 4];
100 reader.read_exact(&mut first_four_bytes)?;
101
102 let bom = Bom(
103 first_four_bytes[0],
104 first_four_bytes[1],
105 first_four_bytes[2],
106 first_four_bytes[3],
107 );
108
109 let possible_encoding = detect_byte_order_mark(&bom);
110
111 let mut buf = [0u8; 512];
114 loop {
115 match reader.read(&mut buf) {
116 Ok(0) => return Ok(Vec::new()), Ok(_n) => break,
118 Err(ref err) if err.kind() == io::ErrorKind::Interrupted => {} Err(err) => return Err(err),
120 };
121 }
122
123 let mut candidates = Vec::with_capacity(3);
124
125 search("encoding=", &buf, possible_encoding.as_ref())
127 .or_else(|| search("charset=", &buf, possible_encoding.as_ref()))
128 .map(normalise)
129 .map(|encoding| {
130 push_if_not_contains(
131 &mut candidates,
132 endianify(&encoding, possible_encoding.as_ref()),
133 )
134 });
135
136 hint.map(normalise).map(|encoding| {
138 push_if_not_contains(
139 &mut candidates,
140 endianify(&encoding, possible_encoding.as_ref()),
141 )
142 });
143
144 match possible_encoding {
146 Some(UCS_4_LE) => Some("ucs-4le"),
147 Some(UCS_4_BE) => Some("ucs-4be"),
148 Some(UTF_16_LE) => Some("utf-16le"),
149 Some(UTF_16_BE) => Some("utf-16be"),
150 Some(Descriptor(Flavour::UTF, Width::EightBit, _)) => Some("utf-8"),
151 Some(EBCDIC) => Some("ebcdic"),
152 _ => None,
153 }
154 .map(|encoding| push_if_not_contains(&mut candidates, encoding.to_string()));
155
156 if candidates.is_empty() && std::str::from_utf8(&buf).is_ok() {
158 candidates.push("utf-8".to_string());
159 }
160
161 Ok(candidates)
162}
163
164fn detect_byte_order_mark(bom: &Bom) -> Option<Descriptor> {
165 match *bom {
192 Bom(0x00, 0x00, 0xFE, 0xFF) => Some(UCS_4_BE),
194 Bom(0xFF, 0xFE, 0x00, 0x00) => Some(UCS_4_LE),
195 Bom(0x00, 0x00, 0xFF, 0xFE) => Some(UCS_4_2143),
196 Bom(0xFE, 0xFF, 0x00, 0x00) => Some(UCS_4_3412),
197 Bom(0xFE, 0xFF, c, d) if c > 0 || d > 0 => Some(UTF_16_BE),
198 Bom(0xFF, 0xFE, c, d) if c > 0 || d > 0 => Some(UTF_16_LE),
199 Bom(0xEF, 0xBB, 0xBF, _) => Some(UTF_8),
200
201 Bom(0x00, 0x00, 0x00, 0x3C) => Some(ASCII_32BIT_BE),
203 Bom(0x3C, 0x00, 0x00, 0x00) => Some(ASCII_32BIT_LE),
204 Bom(0x00, 0x00, 0x3C, 0x00) => Some(Descriptor(
205 Flavour::Unknown,
206 Width::ThirtyTwoBit,
207 ByteOrder::Unusual2143,
208 )),
209 Bom(0x00, 0x3C, 0x00, 0x00) => Some(Descriptor(
210 Flavour::Unknown,
211 Width::ThirtyTwoBit,
212 ByteOrder::Unusual3412,
213 )),
214 Bom(0x00, 0x3C, 0x00, 0x3F) => Some(ASCII_16BIT_BE),
215 Bom(0x3C, 0x00, 0x3F, 0x00) => Some(ASCII_16BIT_LE),
216 Bom(0x3C, 0x3F, 0x78, 0x6D) => Some(ASCII_8BIT),
217 Bom(0x4C, 0x6F, 0xA7, 0x94) => Some(EBCDIC),
218 _ => None,
221 }
222}
223
/// Lower-cases an encoding name and rewrites a few common aliases.
///
/// The rewrites are plain substring replacements applied in order:
/// `us-ascii` becomes `ascii`, `utf8` becomes `utf-8` and `shift-jis`
/// becomes `shift_jis`.
fn normalise<S: AsRef<str>>(encoding: S) -> String {
    const ALIASES: [(&str, &str); 3] = [
        ("us-ascii", "ascii"),
        ("utf8", "utf-8"),
        ("shift-jis", "shift_jis"),
    ];

    let lowered = encoding.as_ref().to_lowercase();
    ALIASES
        .iter()
        .fold(lowered, |name, &(alias, canonical)| name.replace(alias, canonical))
}
232
/// Appends `item` to `vec` unless an equal element is already present.
fn push_if_not_contains<T: PartialEq>(vec: &mut Vec<T>, item: T) {
    let already_present = vec.iter().any(|existing| *existing == item);
    if already_present {
        return;
    }
    vec.push(item);
}
238
239fn endianify(encoding: &str, descriptor: Option<&Descriptor>) -> String {
240 let ascii = ASCII_8BIT;
241 let &Descriptor(_, _, ref order) = descriptor.unwrap_or(&ascii);
242
243 match encoding {
244 "utf-16" => match *order {
245 ByteOrder::LittleEndian => "utf-16le".to_string(),
246 ByteOrder::BigEndian => "utf-16be".to_string(),
247 _ => encoding.to_string(),
248 },
249 _ => encoding.to_string(),
250 }
251}
252
253fn search(needle: &str, haystack: &[u8], descriptor: Option<&Descriptor>) -> Option<String> {
254 let ascii = ASCII_8BIT;
255 let &Descriptor(_, ref width, ref order) = descriptor.unwrap_or(&ascii);
256 let chunk_size = (*width as usize) / 8;
257
258 let mut index = match *order {
259 ByteOrder::NotApplicable | ByteOrder::LittleEndian => 0,
260 ByteOrder::BigEndian => chunk_size - 1,
261 ByteOrder::Unusual2143 => 2,
262 ByteOrder::Unusual3412 => 1,
263 };
264
265 let mut ascii_bytes = Vec::with_capacity(haystack.len() / chunk_size);
266 while index < haystack.len() {
267 ascii_bytes.push(haystack[index]);
268 index += chunk_size;
269 }
270
271 let ascii_haystack = String::from_utf8_lossy(&ascii_bytes);
272
273 ascii_haystack.find(needle).map(|pos| {
274 ascii_haystack[pos + needle.len()..]
276 .chars()
277 .skip_while(|char| *char == '"' || *char == '\'')
278 .take_while(|char| *char != '"' && *char != '\'')
279 .collect()
280 })
281}
282
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// An empty input cannot supply the first four bytes, so detection fails.
    #[test]
    fn test_detect_empty() {
        let mut empty_input = Cursor::new("");
        assert!(detect(&mut empty_input, None).is_err());
    }

    /// Exactly four bytes leave nothing to sample, so no candidates are returned.
    #[test]
    fn test_detect_4_bytes() {
        let mut four_byte_input = Cursor::new("1234");
        let candidates = detect(&mut four_byte_input, None).unwrap();
        assert_eq!(candidates.len(), 0);
    }
}