xml_syntax_reader/
encoding.rs1use crate::types::{is_xml_whitespace, DeclaredEncoding, Encoding};
2
3#[derive(Debug, Clone, PartialEq, Eq)]
5pub struct ProbeResult {
6 pub encoding: Encoding,
8 pub bom_length: usize,
10}
11
12pub fn probe_encoding(data: &[u8]) -> ProbeResult {
24 if data.len() < 2 {
25 return ProbeResult {
26 encoding: Encoding::Unknown,
27 bom_length: 0,
28 };
29 }
30
31 if data.len() >= 4 {
33 if data[0] == 0xFF && data[1] == 0xFE && data[2] == 0x00 && data[3] == 0x00 {
35 return ProbeResult {
36 encoding: Encoding::Utf32Le,
37 bom_length: 4,
38 };
39 }
40 if data[0] == 0x00 && data[1] == 0x00 && data[2] == 0xFE && data[3] == 0xFF {
42 return ProbeResult {
43 encoding: Encoding::Utf32Be,
44 bom_length: 4,
45 };
46 }
47 }
48
49 if data.len() >= 3 {
50 if data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF {
52 return ProbeResult {
53 encoding: Encoding::Utf8,
54 bom_length: 3,
55 };
56 }
57 }
58
59 if data[0] == 0xFF && data[1] == 0xFE {
61 return ProbeResult {
62 encoding: Encoding::Utf16Le,
63 bom_length: 2,
64 };
65 }
66 if data[0] == 0xFE && data[1] == 0xFF {
68 return ProbeResult {
69 encoding: Encoding::Utf16Be,
70 bom_length: 2,
71 };
72 }
73
74 if data.len() >= 4 {
80 if data[0] == 0x00 && data[1] == 0x3C && data[2] == 0x00 && data[3] == 0x3F {
82 return ProbeResult {
83 encoding: Encoding::Utf16Be,
84 bom_length: 0,
85 };
86 }
87 if data[0] == 0x3C && data[1] == 0x00 && data[2] == 0x3F && data[3] == 0x00 {
89 return ProbeResult {
90 encoding: Encoding::Utf16Le,
91 bom_length: 0,
92 };
93 }
94 if data[0] == 0x00 && data[1] == 0x00 && data[2] == 0x00 && data[3] == 0x3C {
96 return ProbeResult {
97 encoding: Encoding::Utf32Be,
98 bom_length: 0,
99 };
100 }
101 if data[0] == 0x3C && data[1] == 0x00 && data[2] == 0x00 && data[3] == 0x00 {
103 return ProbeResult {
104 encoding: Encoding::Utf32Le,
105 bom_length: 0,
106 };
107 }
108 }
109
110 if let Some(enc) = extract_encoding_from_decl(data) {
112 return ProbeResult {
113 encoding: Encoding::Declared(enc),
114 bom_length: 0,
115 };
116 }
117
118 if data[0] == b'<' || data[0].is_ascii() {
120 return ProbeResult {
121 encoding: Encoding::Utf8,
122 bom_length: 0,
123 };
124 }
125
126 ProbeResult {
127 encoding: Encoding::Unknown,
128 bom_length: 0,
129 }
130}
131
132fn extract_encoding_from_decl(data: &[u8]) -> Option<DeclaredEncoding> {
137 if data.len() < 22 {
139 return None;
140 }
141
142 if !data.starts_with(b"<?xml") {
144 return None;
145 }
146
147 if data.len() <= 5 || !is_xml_whitespace(data[5]) {
149 return None;
150 }
151
152 let limit = data.len().min(256);
154 let search = &data[6..limit];
155
156 let enc_pos = find_subsequence(search, b"encoding")?;
158 let after_enc = enc_pos + 8; if after_enc >= search.len() {
161 return None;
162 }
163
164 let mut pos = after_enc;
166 while pos < search.len() && is_xml_whitespace(search[pos]) {
167 pos += 1;
168 }
169 if pos >= search.len() || search[pos] != b'=' {
170 return None;
171 }
172 pos += 1; while pos < search.len() && is_xml_whitespace(search[pos]) {
174 pos += 1;
175 }
176
177 if pos >= search.len() {
178 return None;
179 }
180
181 let quote = search[pos];
183 if quote != b'"' && quote != b'\'' {
184 return None;
185 }
186 pos += 1; let value_start = pos;
189 while pos < search.len() && search[pos] != quote {
190 pos += 1;
191 }
192 if pos >= search.len() {
193 return None;
194 }
195
196 let value = &search[value_start..pos];
197 DeclaredEncoding::new(value)
198}
199
/// Returns the index of the first occurrence of `needle` inside `haystack`,
/// or `None` if it does not occur.
///
/// An empty `needle` matches at index `0`. It is handled explicitly because
/// `slice::windows` panics when asked for windows of size zero.
fn find_subsequence(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    if needle.is_empty() {
        return Some(0);
    }
    haystack
        .windows(needle.len())
        .position(|window| window == needle)
}
206
#[cfg(test)]
mod tests {
    use super::*;

    /// Probes `data` and checks both the detected encoding and the BOM length.
    #[track_caller]
    fn assert_probe(data: &[u8], encoding: Encoding, bom_length: usize) {
        let probed = probe_encoding(data);
        assert_eq!(probed.encoding, encoding);
        assert_eq!(probed.bom_length, bom_length);
    }

    /// Checks that `probed` carries a declared encoding whose name is `name`.
    #[track_caller]
    fn assert_declared(probed: ProbeResult, name: &str) {
        match probed.encoding {
            Encoding::Declared(enc) => {
                assert_eq!(enc.as_str(), Some(name));
            }
            other => panic!("expected Declared, got {other:?}"),
        }
    }

    // --- Byte-order marks ---------------------------------------------------

    #[test]
    fn utf8_bom() {
        assert_probe(b"\xEF\xBB\xBF<?xml version=\"1.0\"?>", Encoding::Utf8, 3);
    }

    #[test]
    fn utf16_le_bom() {
        assert_probe(b"\xFF\xFE<\x00?\x00x\x00m\x00l\x00", Encoding::Utf16Le, 2);
    }

    #[test]
    fn utf16_be_bom() {
        assert_probe(b"\xFE\xFF\x00<\x00?\x00x\x00m\x00l", Encoding::Utf16Be, 2);
    }

    #[test]
    fn utf32_le_bom() {
        assert_probe(b"\xFF\xFE\x00\x00<\x00\x00\x00", Encoding::Utf32Le, 4);
    }

    #[test]
    fn utf32_be_bom() {
        assert_probe(b"\x00\x00\xFE\xFF\x00\x00\x00<", Encoding::Utf32Be, 4);
    }

    // --- UTF-16 without a BOM -----------------------------------------------

    #[test]
    fn utf16_be_no_bom() {
        assert_probe(b"\x00<\x00?\x00x\x00m\x00l", Encoding::Utf16Be, 0);
    }

    #[test]
    fn utf16_le_no_bom() {
        assert_probe(b"<\x00?\x00x\x00m\x00l\x00", Encoding::Utf16Le, 0);
    }

    // --- Encoding declarations ----------------------------------------------

    #[test]
    fn encoding_declaration() {
        let probed = probe_encoding(b"<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>");
        assert_eq!(probed.bom_length, 0);
        assert_declared(probed, "ISO-8859-1");
    }

    #[test]
    fn encoding_declaration_single_quotes() {
        let probed = probe_encoding(b"<?xml version='1.0' encoding='Shift_JIS'?>");
        assert_declared(probed, "Shift_JIS");
    }

    #[test]
    fn no_encoding_declaration() {
        assert_probe(b"<?xml version=\"1.0\"?><root/>", Encoding::Utf8, 0);
    }

    #[test]
    fn plain_utf8_document() {
        assert_probe(b"<root>hello</root>", Encoding::Utf8, 0);
    }

    // --- Inputs too short to classify ---------------------------------------

    #[test]
    fn empty_input() {
        assert_eq!(probe_encoding(b"").encoding, Encoding::Unknown);
    }

    #[test]
    fn single_byte() {
        assert_eq!(probe_encoding(b"<").encoding, Encoding::Unknown);
    }

    #[test]
    fn encoding_with_spaces_around_eq() {
        let probed =
            probe_encoding(b"<?xml version = \"1.0\" encoding = \"windows-1252\" ?>");
        assert_declared(probed, "windows-1252");
    }
}