xhtmlchardet/
lib.rs

1//! Basic character set detection for XML and HTML in Rust.
2//!
3//! ## Example
4//!
5//! ```rust
6//! use std::io::Cursor;
7//! extern crate xhtmlchardet;
8//!
9//! let text: &[u8] = b"<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?><channel><title>Example</title></channel>";
10//! let mut text_cursor = Cursor::new(text);
11//! let detected_charsets: Vec<String> = xhtmlchardet::detect(&mut text_cursor, None).unwrap();
12//! assert_eq!(detected_charsets, vec!["iso-8859-1".to_string()]);
13//! ```
14
15use std::io::{self, Read};
16
/// Four consecutive bytes grouped so they can be pattern-matched as a unit.
///
/// `detect` fills this with the first four bytes of the stream and hands it to
/// `detect_byte_order_mark`.
#[derive(Debug)]
struct Bom(u8, u8, u8, u8);
19
/// Encoding family suggested by the leading bytes of a stream.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
enum Flavour {
    /// UCS-4 encodings.
    UCS,
    /// UTF-8 and UTF-16 encodings.
    UTF,
    /// An EBCDIC encoding.
    EBCDIC,
    /// An 8-bit encoding that is ASCII compatible.
    ASCII,
    /// The family could not be determined from the leading bytes.
    Unknown,
}
28
/// Order in which the bytes of a multi-byte code unit appear in the stream.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
enum ByteOrder {
    /// Most significant byte first.
    BigEndian,
    /// Least significant byte first.
    LittleEndian,
    /// Unusual 32-bit octet order "2143".
    Unusual2143,
    /// Unusual 32-bit octet order "3412".
    Unusual3412,
    /// Used where byte order has no meaning, e.g. for 8-bit encodings.
    NotApplicable,
}
37
/// Size of one code unit; each variant's discriminant is that size in bits.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
enum Width {
    /// One byte per code unit.
    EightBit = 8,
    /// Two bytes per code unit.
    SixteenBit = 16,
    /// Four bytes per code unit.
    ThirtyTwoBit = 32,
}
44
45#[derive(Clone, Debug, PartialEq, Eq)]
46struct Descriptor(Flavour, Width, ByteOrder);
47
48// 32-Bit Encodings
49const UCS_4_BE: Descriptor = Descriptor(Flavour::UCS, Width::ThirtyTwoBit, ByteOrder::BigEndian);
50const UCS_4_LE: Descriptor = Descriptor(Flavour::UCS, Width::ThirtyTwoBit, ByteOrder::LittleEndian);
51const UCS_4_2143: Descriptor =
52    Descriptor(Flavour::UCS, Width::ThirtyTwoBit, ByteOrder::Unusual2143);
53const UCS_4_3412: Descriptor =
54    Descriptor(Flavour::UCS, Width::ThirtyTwoBit, ByteOrder::Unusual3412);
55
56// 16-Bit Encodings
57const UTF_16_BE: Descriptor = Descriptor(Flavour::UTF, Width::SixteenBit, ByteOrder::BigEndian);
58const UTF_16_LE: Descriptor = Descriptor(Flavour::UTF, Width::SixteenBit, ByteOrder::LittleEndian);
59
60const UTF_8: Descriptor = Descriptor(Flavour::UTF, Width::EightBit, ByteOrder::NotApplicable);
61const EBCDIC: Descriptor = Descriptor(Flavour::EBCDIC, Width::EightBit, ByteOrder::NotApplicable);
62
63// ASCII compatible encodings
64const ASCII_32BIT_BE: Descriptor =
65    Descriptor(Flavour::Unknown, Width::ThirtyTwoBit, ByteOrder::BigEndian);
66const ASCII_32BIT_LE: Descriptor = Descriptor(
67    Flavour::Unknown,
68    Width::ThirtyTwoBit,
69    ByteOrder::LittleEndian,
70);
71const ASCII_16BIT_BE: Descriptor =
72    Descriptor(Flavour::Unknown, Width::SixteenBit, ByteOrder::BigEndian);
73const ASCII_16BIT_LE: Descriptor =
74    Descriptor(Flavour::Unknown, Width::SixteenBit, ByteOrder::LittleEndian);
75const ASCII_8BIT: Descriptor =
76    Descriptor(Flavour::ASCII, Width::EightBit, ByteOrder::NotApplicable);
77
78/// Attempt to detect the character set of the supplied byte stream.
79///
80/// `reader` is expected to be positioned at the start of the stream. `detect` will read up to 512
81/// bytes in order to determine the encoding.
82///
83/// The optional `hint` is a possible encoding name for the text that may have been received
84/// externally to the text itself, such as from HTTP header.
85///
86/// ### Example
87///
88/// ```
89/// use std::io::Cursor;
90/// extern crate xhtmlchardet;
91///
92/// let text = b"<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?><channel><title>Example</title></channel>";
93/// let mut text_cursor = Cursor::new(text.to_vec());
94/// let detected_charsets = xhtmlchardet::detect(&mut text_cursor, None);
95/// assert_eq!(detected_charsets.unwrap_or(vec![]), vec!["iso-8859-1".to_string()]);
96/// ```
97pub fn detect<R: Read>(reader: &mut R, hint: Option<String>) -> Result<Vec<String>, io::Error> {
98    // Read the first 4 bytes and see if they help
99    let mut first_four_bytes = [0u8; 4];
100    reader.read_exact(&mut first_four_bytes)?;
101
102    let bom = Bom(
103        first_four_bytes[0],
104        first_four_bytes[1],
105        first_four_bytes[2],
106        first_four_bytes[3],
107    );
108
109    let possible_encoding = detect_byte_order_mark(&bom);
110
111    // Now that byte size may have been determined try reading the first 512ish bytes to read an
112    // encoding declaration
113    let mut buf = [0u8; 512];
114    loop {
115        match reader.read(&mut buf) {
116            Ok(0) => return Ok(Vec::new()), // eof
117            Ok(_n) => break,
118            Err(ref err) if err.kind() == io::ErrorKind::Interrupted => {} // retry
119            Err(err) => return Err(err),
120        };
121    }
122
123    let mut candidates = Vec::with_capacity(3);
124
125    // Look for encoding="", charset="?"?
126    search("encoding=", &buf, possible_encoding.as_ref())
127        .or_else(|| search("charset=", &buf, possible_encoding.as_ref()))
128        .map(normalise)
129        .map(|encoding| {
130            push_if_not_contains(
131                &mut candidates,
132                endianify(&encoding, possible_encoding.as_ref()),
133            )
134        });
135
136    // Consider hint
137    hint.map(normalise).map(|encoding| {
138        push_if_not_contains(
139            &mut candidates,
140            endianify(&encoding, possible_encoding.as_ref()),
141        )
142    });
143
144    // Include info from BOM detection
145    match possible_encoding {
146        Some(UCS_4_LE) => Some("ucs-4le"),
147        Some(UCS_4_BE) => Some("ucs-4be"),
148        Some(UTF_16_LE) => Some("utf-16le"),
149        Some(UTF_16_BE) => Some("utf-16be"),
150        Some(Descriptor(Flavour::UTF, Width::EightBit, _)) => Some("utf-8"),
151        Some(EBCDIC) => Some("ebcdic"),
152        _ => None,
153    }
154    .map(|encoding| push_if_not_contains(&mut candidates, encoding.to_string()));
155
156    // Otherwise test if UTF-8
157    if candidates.is_empty() && std::str::from_utf8(&buf).is_ok() {
158        candidates.push("utf-8".to_string());
159    }
160
161    Ok(candidates)
162}
163
164fn detect_byte_order_mark(bom: &Bom) -> Option<Descriptor> {
165    // Can do below without the Bom type if slice pattern syntax becomes non-experimental
166    // let possible = match first_four_bytes {
167    //     // With Byte Order Mark
168    //     [0x00, 0x00, 0xFE, 0xFF]             => Some("UCS-4BE"),
169    //     [0xFF, 0xFE, 0x00, 0x00]             => Some("UCS-4LE"),
170    //     [0x00, 0x00, 0xFF, 0xFE]             => Some("UCS-4OE"),
171    //     [0xFE, 0xFF, 0x00, 0x00]             => Some("UCS-4EO"),
172    //     [0xFE, 0xFF, c, d] if c > 0 && d > 0 => Some("UTF-16BE"),
173    //     [0xFF, 0xFE, c, d] if c > 0 && d > 0 => Some("UCS-16LE"),
174    //     [0xEF, 0xBB, 0xBF, _   ]             => Some("UTF-8"),
175
176    //     //  Without Byte Order Mark
177    //     [0x00, 0x00, 0x00, 0x3C] |
178    //     [0x3C, 0x00, 0x00, 0x00] |
179    //     [0x00, 0x00, 0x3C, 0x00] |
180    //     [0x00, 0x3C, 0x00, 0x00]             => Some("32-bit"),
181    //     [0x00, 0x3C, 0x00, 0x3F]             => Some("16-bit Big Endian"),
182    //     [0x3C, 0x00, 0x3F, 0x00]             => Some("16-bit Little Endian"),
183    //     [0x3C, 0x3F, 0x78, 0x6D]             => Some("8-bit"),
184    //     [0x4C, 0x6F, 0xA7, 0x94]             => Some("EBCDIC"),
185    //     // This may be UTF-8 without an encoding declaration as this is not required
186    //     // for UTF-8
187    //     _                                    => Some("Other"),
188    // };
189
190    // http://www.w3.org/TR/2004/REC-xml-20040204/#sec-guessing-no-ext-info
191    match *bom {
192        // With Byte Order Mark
193        Bom(0x00, 0x00, 0xFE, 0xFF) => Some(UCS_4_BE),
194        Bom(0xFF, 0xFE, 0x00, 0x00) => Some(UCS_4_LE),
195        Bom(0x00, 0x00, 0xFF, 0xFE) => Some(UCS_4_2143),
196        Bom(0xFE, 0xFF, 0x00, 0x00) => Some(UCS_4_3412),
197        Bom(0xFE, 0xFF, c, d) if c > 0 || d > 0 => Some(UTF_16_BE),
198        Bom(0xFF, 0xFE, c, d) if c > 0 || d > 0 => Some(UTF_16_LE),
199        Bom(0xEF, 0xBB, 0xBF, _) => Some(UTF_8),
200
201        //  Without Byte Order Mark
202        Bom(0x00, 0x00, 0x00, 0x3C) => Some(ASCII_32BIT_BE),
203        Bom(0x3C, 0x00, 0x00, 0x00) => Some(ASCII_32BIT_LE),
204        Bom(0x00, 0x00, 0x3C, 0x00) => Some(Descriptor(
205            Flavour::Unknown,
206            Width::ThirtyTwoBit,
207            ByteOrder::Unusual2143,
208        )),
209        Bom(0x00, 0x3C, 0x00, 0x00) => Some(Descriptor(
210            Flavour::Unknown,
211            Width::ThirtyTwoBit,
212            ByteOrder::Unusual3412,
213        )),
214        Bom(0x00, 0x3C, 0x00, 0x3F) => Some(ASCII_16BIT_BE),
215        Bom(0x3C, 0x00, 0x3F, 0x00) => Some(ASCII_16BIT_LE),
216        Bom(0x3C, 0x3F, 0x78, 0x6D) => Some(ASCII_8BIT),
217        Bom(0x4C, 0x6F, 0xA7, 0x94) => Some(EBCDIC),
218        // This may be UTF-8 without an encoding declaration as this is not required
219        // for UTF-8
220        _ => None,
221    }
222}
223
/// Lower-case an encoding name and rewrite the alternative spellings `us-ascii`, `utf8` and
/// `shift-jis` to `ascii`, `utf-8` and `shift_jis`.
///
/// The rewrites are applied one after the other, wherever the spelling occurs in the name.
fn normalise<S: AsRef<str>>(encoding: S) -> String {
    const ALIASES: [(&str, &str); 3] = [
        ("us-ascii", "ascii"),
        ("utf8", "utf-8"),
        ("shift-jis", "shift_jis"),
    ];

    let mut name = encoding.as_ref().to_lowercase();
    for &(alias, canonical) in ALIASES.iter() {
        name = name.replace(alias, canonical);
    }
    name
}
232
/// Append `item` to `vec` unless an equal element is already present.
fn push_if_not_contains<T: PartialEq>(vec: &mut Vec<T>, item: T) {
    let already_present = vec.iter().any(|existing| *existing == item);
    if already_present {
        return;
    }
    vec.push(item);
}
238
239fn endianify(encoding: &str, descriptor: Option<&Descriptor>) -> String {
240    let ascii = ASCII_8BIT;
241    let &Descriptor(_, _, ref order) = descriptor.unwrap_or(&ascii);
242
243    match encoding {
244        "utf-16" => match *order {
245            ByteOrder::LittleEndian => "utf-16le".to_string(),
246            ByteOrder::BigEndian => "utf-16be".to_string(),
247            _ => encoding.to_string(),
248        },
249        _ => encoding.to_string(),
250    }
251}
252
253fn search(needle: &str, haystack: &[u8], descriptor: Option<&Descriptor>) -> Option<String> {
254    let ascii = ASCII_8BIT;
255    let &Descriptor(_, ref width, ref order) = descriptor.unwrap_or(&ascii);
256    let chunk_size = (*width as usize) / 8;
257
258    let mut index = match *order {
259        ByteOrder::NotApplicable | ByteOrder::LittleEndian => 0,
260        ByteOrder::BigEndian => chunk_size - 1,
261        ByteOrder::Unusual2143 => 2,
262        ByteOrder::Unusual3412 => 1,
263    };
264
265    let mut ascii_bytes = Vec::with_capacity(haystack.len() / chunk_size);
266    while index < haystack.len() {
267        ascii_bytes.push(haystack[index]);
268        index += chunk_size;
269    }
270
271    let ascii_haystack = String::from_utf8_lossy(&ascii_bytes);
272
273    ascii_haystack.find(needle).map(|pos| {
274        // Skip to the matching byte + length of the needle
275        ascii_haystack[pos + needle.len()..]
276            .chars()
277            .skip_while(|char| *char == '"' || *char == '\'')
278            .take_while(|char| *char != '"' && *char != '\'')
279            .collect()
280    })
281}
282
#[cfg(test)]
mod tests {
    use super::detect;
    use std::io::Cursor;

    /// An empty stream cannot supply the four bytes needed for byte order mark detection.
    #[test]
    fn test_detect_empty() {
        let mut empty_input = Cursor::new("");
        let outcome = detect(&mut empty_input, None);
        assert!(outcome.is_err()); // UnexpectedEof
    }

    /// A stream that ends right after its first four bytes produces no candidates.
    #[test]
    fn test_detect_4_bytes() {
        let mut four_byte_input = Cursor::new("1234");
        let charsets = detect(&mut four_byte_input, None).unwrap();
        assert!(charsets.is_empty());
    }
}