Skip to main content

xml_syntax_reader/
encoding.rs

1use crate::types::{is_xml_whitespace, DeclaredEncoding, Encoding};
2
3/// Result of probing the encoding of an XML document.
4#[derive(Debug, Clone, PartialEq, Eq)]
5pub struct ProbeResult {
6    /// Detected encoding.
7    pub encoding: Encoding,
8    /// Number of BOM bytes consumed (0 if no BOM).
9    pub bom_length: usize,
10}
11
12/// Probe the encoding of an XML document from its initial bytes.
13///
14/// Inspects up to the first ~128 bytes for:
15/// 1. Byte Order Mark (BOM): UTF-8, UTF-16 LE/BE, UTF-32 LE/BE
16/// 2. XML declaration `<?xml ... encoding="..."?>`
17///
18/// The main parser assumes UTF-8. Callers that need to support other
19/// encodings should use this function to detect the encoding and
20/// transcode before feeding data to `Reader::parse()`.
21///
22/// Returns `Encoding::Unknown` if the input is too short to determine encoding.
23pub fn probe_encoding(data: &[u8]) -> ProbeResult {
24    if data.len() < 2 {
25        return ProbeResult {
26            encoding: Encoding::Unknown,
27            bom_length: 0,
28        };
29    }
30
31    // Check for BOM
32    if data.len() >= 4 {
33        // UTF-32 LE: FF FE 00 00
34        if data[0] == 0xFF && data[1] == 0xFE && data[2] == 0x00 && data[3] == 0x00 {
35            return ProbeResult {
36                encoding: Encoding::Utf32Le,
37                bom_length: 4,
38            };
39        }
40        // UTF-32 BE: 00 00 FE FF
41        if data[0] == 0x00 && data[1] == 0x00 && data[2] == 0xFE && data[3] == 0xFF {
42            return ProbeResult {
43                encoding: Encoding::Utf32Be,
44                bom_length: 4,
45            };
46        }
47    }
48
49    if data.len() >= 3 {
50        // UTF-8 BOM: EF BB BF
51        if data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF {
52            return ProbeResult {
53                encoding: Encoding::Utf8,
54                bom_length: 3,
55            };
56        }
57    }
58
59    // UTF-16 LE: FF FE (must check after UTF-32 LE)
60    if data[0] == 0xFF && data[1] == 0xFE {
61        return ProbeResult {
62            encoding: Encoding::Utf16Le,
63            bom_length: 2,
64        };
65    }
66    // UTF-16 BE: FE FF
67    if data[0] == 0xFE && data[1] == 0xFF {
68        return ProbeResult {
69            encoding: Encoding::Utf16Be,
70            bom_length: 2,
71        };
72    }
73
74    // No BOM - check for XML declaration and encoding attribute.
75    // Heuristic: if the first bytes look like "<?xml" in some encoding,
76    // try to extract the encoding declaration.
77
78    // Check for UTF-16 without BOM by looking at null byte patterns
79    if data.len() >= 4 {
80        // 00 3C 00 3F → UTF-16 BE without BOM (< ?)
81        if data[0] == 0x00 && data[1] == 0x3C && data[2] == 0x00 && data[3] == 0x3F {
82            return ProbeResult {
83                encoding: Encoding::Utf16Be,
84                bom_length: 0,
85            };
86        }
87        // 3C 00 3F 00 → UTF-16 LE without BOM (< ?)
88        if data[0] == 0x3C && data[1] == 0x00 && data[2] == 0x3F && data[3] == 0x00 {
89            return ProbeResult {
90                encoding: Encoding::Utf16Le,
91                bom_length: 0,
92            };
93        }
94        // 00 00 00 3C → UTF-32 BE without BOM
95        if data[0] == 0x00 && data[1] == 0x00 && data[2] == 0x00 && data[3] == 0x3C {
96            return ProbeResult {
97                encoding: Encoding::Utf32Be,
98                bom_length: 0,
99            };
100        }
101        // 3C 00 00 00 → UTF-32 LE without BOM
102        if data[0] == 0x3C && data[1] == 0x00 && data[2] == 0x00 && data[3] == 0x00 {
103            return ProbeResult {
104                encoding: Encoding::Utf32Le,
105                bom_length: 0,
106            };
107        }
108    }
109
110    // Looks like ASCII-compatible (UTF-8 or single-byte). Try to read encoding from XML decl.
111    if let Some(enc) = extract_encoding_from_decl(data) {
112        return ProbeResult {
113            encoding: Encoding::Declared(enc),
114            bom_length: 0,
115        };
116    }
117
118    // Default: assume UTF-8 if it starts with `<` or looks like ASCII
119    if data[0] == b'<' || data[0].is_ascii() {
120        return ProbeResult {
121            encoding: Encoding::Utf8,
122            bom_length: 0,
123        };
124    }
125
126    ProbeResult {
127        encoding: Encoding::Unknown,
128        bom_length: 0,
129    }
130}
131
132/// Try to extract an `encoding="..."` value from an XML declaration in ASCII-compatible data.
133///
134/// Scans for `<?xml` followed by `encoding` attribute. Returns `None` if not found
135/// or if the data is too short to contain a complete declaration.
136fn extract_encoding_from_decl(data: &[u8]) -> Option<DeclaredEncoding> {
137    // Need at least "<?xml encoding='x'?>" = 22 bytes
138    if data.len() < 22 {
139        return None;
140    }
141
142    // Must start with "<?xml" (case-sensitive per XML spec)
143    if !data.starts_with(b"<?xml") {
144        return None;
145    }
146
147    // Byte after "<?xml" must be whitespace
148    if data.len() <= 5 || !is_xml_whitespace(data[5]) {
149        return None;
150    }
151
152    // Scan for "encoding" within the first 256 bytes or until "?>"
153    let limit = data.len().min(256);
154    let search = &data[6..limit];
155
156    // Find "encoding"
157    let enc_pos = find_subsequence(search, b"encoding")?;
158    let after_enc = enc_pos + 8; // skip "encoding"
159
160    if after_enc >= search.len() {
161        return None;
162    }
163
164    // Skip whitespace and '='
165    let mut pos = after_enc;
166    while pos < search.len() && is_xml_whitespace(search[pos]) {
167        pos += 1;
168    }
169    if pos >= search.len() || search[pos] != b'=' {
170        return None;
171    }
172    pos += 1; // skip '='
173    while pos < search.len() && is_xml_whitespace(search[pos]) {
174        pos += 1;
175    }
176
177    if pos >= search.len() {
178        return None;
179    }
180
181    // Read quoted value
182    let quote = search[pos];
183    if quote != b'"' && quote != b'\'' {
184        return None;
185    }
186    pos += 1; // skip opening quote
187
188    let value_start = pos;
189    while pos < search.len() && search[pos] != quote {
190        pos += 1;
191    }
192    if pos >= search.len() {
193        return None;
194    }
195
196    let value = &search[value_start..pos];
197    DeclaredEncoding::new(value)
198}
199
/// Find the first occurrence of `needle` in `haystack`.
///
/// Returns the byte offset of the first match, or `None` when `needle`
/// does not occur (including when it is longer than `haystack`).
/// An empty `needle` matches at offset 0, mirroring `str::find("")`.
fn find_subsequence(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    // `slice::windows` panics on a window size of zero, so the empty needle
    // is handled up front.
    if needle.is_empty() {
        return Some(0);
    }
    haystack
        .windows(needle.len())
        .position(|window| window == needle)
}
206
#[cfg(test)]
mod tests {
    use super::*;

    /// Probe `input` and check both the detected encoding and the BOM length.
    #[track_caller]
    fn expect_probe(input: &[u8], encoding: Encoding, bom_length: usize) {
        let probed = probe_encoding(input);
        assert_eq!(probed.encoding, encoding);
        assert_eq!(probed.bom_length, bom_length);
    }

    /// Check that `probed` carries a declared encoding whose name is `label`.
    #[track_caller]
    fn expect_declared(probed: ProbeResult, label: &str) {
        match probed.encoding {
            Encoding::Declared(enc) => {
                assert_eq!(enc.as_str(), Some(label));
            }
            other => panic!("expected Declared, got {other:?}"),
        }
    }

    // --- Byte order marks ---

    #[test]
    fn utf8_bom() {
        expect_probe(
            b"\xEF\xBB\xBF<?xml version=\"1.0\"?>",
            Encoding::Utf8,
            3,
        );
    }

    #[test]
    fn utf16_le_bom() {
        expect_probe(
            b"\xFF\xFE<\x00?\x00x\x00m\x00l\x00",
            Encoding::Utf16Le,
            2,
        );
    }

    #[test]
    fn utf16_be_bom() {
        expect_probe(
            b"\xFE\xFF\x00<\x00?\x00x\x00m\x00l",
            Encoding::Utf16Be,
            2,
        );
    }

    #[test]
    fn utf32_le_bom() {
        expect_probe(b"\xFF\xFE\x00\x00<\x00\x00\x00", Encoding::Utf32Le, 4);
    }

    #[test]
    fn utf32_be_bom() {
        expect_probe(b"\x00\x00\xFE\xFF\x00\x00\x00<", Encoding::Utf32Be, 4);
    }

    // --- Wide encodings without a BOM ---

    #[test]
    fn utf16_be_no_bom() {
        expect_probe(b"\x00<\x00?\x00x\x00m\x00l", Encoding::Utf16Be, 0);
    }

    #[test]
    fn utf16_le_no_bom() {
        expect_probe(b"<\x00?\x00x\x00m\x00l\x00", Encoding::Utf16Le, 0);
    }

    // --- Encoding taken from the XML declaration ---

    #[test]
    fn encoding_declaration() {
        let probed = probe_encoding(b"<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>");
        assert_eq!(probed.bom_length, 0);
        expect_declared(probed, "ISO-8859-1");
    }

    #[test]
    fn encoding_declaration_single_quotes() {
        expect_declared(
            probe_encoding(b"<?xml version='1.0' encoding='Shift_JIS'?>"),
            "Shift_JIS",
        );
    }

    #[test]
    fn no_encoding_declaration() {
        // No encoding attribute, so UTF-8 is assumed (input starts with '<').
        expect_probe(b"<?xml version=\"1.0\"?><root/>", Encoding::Utf8, 0);
    }

    #[test]
    fn plain_utf8_document() {
        expect_probe(b"<root>hello</root>", Encoding::Utf8, 0);
    }

    // --- Inputs too short to classify ---

    #[test]
    fn empty_input() {
        assert_eq!(probe_encoding(b"").encoding, Encoding::Unknown);
    }

    #[test]
    fn single_byte() {
        assert_eq!(probe_encoding(b"<").encoding, Encoding::Unknown);
    }

    #[test]
    fn encoding_with_spaces_around_eq() {
        expect_declared(
            probe_encoding(b"<?xml version = \"1.0\" encoding = \"windows-1252\" ?>"),
            "windows-1252",
        );
    }
}