1use crate::document::{Document, Node};
2use crate::element::Element;
3use crate::error::{Error, Result};
4use encoding_rs::Decoder;
5use encoding_rs::{Encoding, UTF_16BE, UTF_16LE, UTF_8};
6use quick_xml::events::{BytesDecl, BytesStart, Event};
7use quick_xml::Reader;
8use std::borrow::Cow;
9use std::collections::HashMap;
10use std::io::{BufRead, Read};
11
12pub(crate) struct DecodeReader<R: Read> {
13 decoder: Option<Decoder>,
14 inner: R,
15 undecoded: Box<[u8]>,
16 undecoded_pos: usize,
17 undecoded_cap: usize,
18 remaining: [u8; 32], decoded: Box<[u8]>,
20 decoded_pos: usize,
21 decoded_cap: usize,
22 done: bool,
23}
24
25impl<R: Read> DecodeReader<R> {
26 pub(crate) fn new(reader: R, decoder: Option<Decoder>) -> DecodeReader<R> {
28 DecodeReader {
29 decoder,
30 inner: reader,
31 undecoded: vec![0; 4096].into_boxed_slice(),
32 undecoded_pos: 0,
33 undecoded_cap: 0,
34 remaining: [0; 32],
35 decoded: vec![0; 12288].into_boxed_slice(),
36 decoded_pos: 0,
37 decoded_cap: 0,
38 done: false,
39 }
40 }
41
42 pub(crate) fn set_encoding(&mut self, encoding: Option<&'static Encoding>) {
43 self.decoder = encoding.map(|e| e.new_decoder_without_bom_handling());
44 self.done = false;
45 }
46
47 fn fill_buf_decode(&mut self) -> std::io::Result<&[u8]> {
49 if self.decoded_pos >= self.decoded_cap {
50 debug_assert!(self.decoded_pos == self.decoded_cap);
51 if self.done {
52 return Ok(&[]);
53 }
54 let remaining = self.undecoded_cap - self.undecoded_pos;
55 if remaining <= 32 {
56 self.remaining[..remaining]
58 .copy_from_slice(&self.undecoded[self.undecoded_pos..self.undecoded_cap]);
59 self.undecoded[..remaining].copy_from_slice(&self.remaining[..remaining]);
60 let read = self.inner.read(&mut self.undecoded[remaining..])?;
62 self.done = read == 0;
63 self.undecoded_pos = 0;
64 self.undecoded_cap = remaining + read;
65 }
66
67 let (_res, read, written, _replaced) = self.decoder.as_mut().unwrap().decode_to_utf8(
69 &self.undecoded[self.undecoded_pos..self.undecoded_cap],
70 &mut self.decoded,
71 self.done,
72 );
73 self.undecoded_pos += read;
74 self.decoded_cap = written;
75 self.decoded_pos = 0;
76 }
77 Ok(&self.decoded[self.decoded_pos..self.decoded_cap])
78 }
79
80 fn fill_buf_without_decode(&mut self) -> std::io::Result<&[u8]> {
81 if self.undecoded_pos >= self.undecoded_cap {
82 debug_assert!(self.undecoded_pos == self.undecoded_cap);
83 self.undecoded_cap = self.inner.read(&mut self.undecoded)?;
84 self.undecoded_pos = 0;
85 }
86 Ok(&self.undecoded[self.undecoded_pos..self.undecoded_cap])
87 }
88}
89
90impl<R: Read> Read for DecodeReader<R> {
91 fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
92 (&self.decoded[..]).read(buf)
93 }
94}
95
96impl<R: Read> BufRead for DecodeReader<R> {
97 fn fill_buf(&mut self) -> std::io::Result<&[u8]> {
99 match &self.decoder {
100 Some(_) => self.fill_buf_decode(),
101 None => self.fill_buf_without_decode(),
102 }
103 }
104 fn consume(&mut self, amt: usize) {
105 match &self.decoder {
106 Some(_) => {
107 self.decoded_pos = std::cmp::min(self.decoded_pos + amt, self.decoded_cap);
108 }
109 None => {
110 self.undecoded_pos = std::cmp::min(self.undecoded_pos + amt, self.undecoded_cap);
111 }
112 }
113 }
114}
115
/// Options that control how a document is read.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ReadOptions {
    /// When set, an element closed by an end tag that has no children receives
    /// a single empty text node. Elements written as empty tags are unaffected.
    pub empty_text_node: bool,
    /// Forwarded to the underlying XML reader's `trim_text` setting.
    pub trim_text: bool,
    /// When set, text events consisting solely of spaces, tabs, carriage
    /// returns or line feeds are dropped.
    pub ignore_whitespace_only: bool,
    /// When set, parsing fails unless the first significant event is an XML
    /// declaration.
    pub require_decl: bool,
    /// Encoding label to start decoding with, resolved through
    /// `Encoding::for_label`. An unknown label makes parsing fail. If the XML
    /// declaration names a different encoding, the reader switches to it.
    pub encoding: Option<String>,
    /// When set and an XML declaration is present, parsing fails if `encoding`
    /// is `None` or if the declared encoding differs from it.
    pub enforce_encoding: bool,
}
150
151impl Default for ReadOptions {
152 fn default() -> Self {
153 ReadOptions {
154 empty_text_node: true,
155 trim_text: true,
156 ignore_whitespace_only: false,
157 require_decl: true,
158 encoding: None,
159 enforce_encoding: false,
160 }
161 }
162}
163
164pub(crate) struct DocumentParser {
166 doc: Document,
167 read_opts: ReadOptions,
168 encoding: Option<&'static Encoding>,
169 element_stack: Vec<Element>,
170}
171
172impl DocumentParser {
173 pub(crate) fn parse_reader<R: Read>(reader: R, opts: ReadOptions) -> Result<Document> {
174 let doc = Document::new();
175 let element_stack = vec![doc.container()];
176 let mut parser = DocumentParser {
177 doc,
178 read_opts: opts,
179 encoding: None,
180 element_stack,
181 };
182 parser.parse_start(reader)?;
183 Ok(parser.doc)
184 }
185
186 fn handle_decl(&mut self, ev: &BytesDecl) -> Result<()> {
187 self.doc.version = String::from_utf8(ev.version()?.to_vec())?;
188 self.encoding = match ev.encoding() {
189 Some(res) => {
190 let encoding = Encoding::for_label(&res?).ok_or(Error::CannotDecode)?;
191 if encoding == UTF_8 {
192 None
193 } else {
194 Some(encoding)
195 }
196 }
197 None => None,
198 };
199 self.doc.standalone = match ev.standalone() {
200 Some(res) => {
201 let val = std::str::from_utf8(&res?)?.to_lowercase();
202 match val.as_str() {
203 "yes" => true,
204 "no" => false,
205 _ => {
206 return Err(Error::MalformedXML(
207 "Standalone Document Declaration has non boolean value".to_string(),
208 ))
209 }
210 }
211 }
212 None => false,
213 };
214 Ok(())
215 }
216
217 fn create_element(&mut self, parent: Element, ev: &BytesStart) -> Result<Element> {
218 let full_name = String::from_utf8(ev.name().into_inner().to_vec())?;
219 let mut namespace_decls = HashMap::new();
220 let mut attributes = HashMap::new();
221 for attr in ev.attributes() {
222 let mut attr = attr?;
223 attr.value = Cow::Owned(normalize_space(&attr.value));
224 let key = String::from_utf8(attr.key.into_inner().to_vec())?;
225 let value = String::from_utf8(attr.unescape_value()?.as_bytes().to_vec())?;
226 if key == "xmlns" {
227 namespace_decls.insert(String::new(), value);
228 continue;
229 } else if let Some(prefix) = key.strip_prefix("xmlns:") {
230 namespace_decls.insert(prefix.to_owned(), value);
231 continue;
232 }
233 attributes.insert(key, value);
234 }
235 let elem = Element::with_data(&mut self.doc, full_name, attributes, namespace_decls);
236 parent
237 .push_child(&mut self.doc, Node::Element(elem))
238 .unwrap();
239 Ok(elem)
240 }
241
242 fn handle_event(&mut self, event: Event) -> Result<bool> {
244 match event {
245 Event::Start(ref ev) => {
246 let parent = *self
247 .element_stack
248 .last()
249 .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
250 let element = self.create_element(parent, ev)?;
251 self.element_stack.push(element);
252 Ok(false)
253 }
254 Event::End(_) => {
255 let elem = self
256 .element_stack
257 .pop()
258 .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?; if self.read_opts.empty_text_node {
260 if !elem.has_children(&self.doc) {
262 elem.push_child(&mut self.doc, Node::Text(String::new()))
263 .unwrap();
264 }
265 }
266 Ok(false)
267 }
268 Event::Empty(ref ev) => {
269 let parent = *self
270 .element_stack
271 .last()
272 .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
273 self.create_element(parent, ev)?;
274 Ok(false)
275 }
276 Event::Text(ev) => {
279 if self.read_opts.ignore_whitespace_only && only_has_whitespace(&ev) {
280 return Ok(false);
281 }
282 if ev.is_empty() {
284 return Ok(false);
285 }
286 let content = ev.unescape()?.to_string();
287 let node = Node::Text(content);
288 let parent = *self
289 .element_stack
290 .last()
291 .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
292 parent.push_child(&mut self.doc, node).unwrap();
293 Ok(false)
294 }
295 Event::DocType(ev) => {
296 let content = ev.unescape()?;
298 let raw = content.as_bytes();
299 let content = if !raw.is_empty() && raw[0] == b' ' {
300 String::from_utf8(raw[1..].to_vec())?
301 } else {
302 String::from_utf8(raw.to_vec())?
303 };
304 let node = Node::DocType(content);
305 let parent = *self
306 .element_stack
307 .last()
308 .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
309 parent.push_child(&mut self.doc, node).unwrap();
310 Ok(false)
311 }
312 Event::Comment(ev) => {
313 let content = String::from_utf8(ev.to_vec())?;
314 let node = Node::Comment(content);
315 let parent = *self
316 .element_stack
317 .last()
318 .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
319 parent.push_child(&mut self.doc, node).unwrap();
320 Ok(false)
321 }
322 Event::CData(ev) => {
323 let content = String::from_utf8(ev.to_vec())?;
324 let node = Node::CData(content);
325 let parent = *self
326 .element_stack
327 .last()
328 .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
329 parent.push_child(&mut self.doc, node).unwrap();
330 Ok(false)
331 }
332 Event::PI(ev) => {
333 let content = String::from_utf8(ev.to_vec())?;
334 let node = Node::PI(content);
335 let parent = *self
336 .element_stack
337 .last()
338 .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
339 parent.push_child(&mut self.doc, node).unwrap();
340 Ok(false)
341 }
342 Event::Decl(_) => Err(Error::MalformedXML(
343 "XML declaration found in the middle of the document".to_string(),
344 )),
345 Event::Eof => Ok(true),
346 }
347 }
348
349 fn sniff_encoding<R: Read>(
351 &mut self,
352 decodereader: &mut DecodeReader<R>,
353 ) -> Result<Option<&'static Encoding>> {
354 let bytes = decodereader.fill_buf()?;
355 let encoding = match bytes {
356 [0x3c, 0x3f, ..] => None, [0xfe, 0xff, ..] => {
358 decodereader.consume(2);
360 Some(UTF_16BE)
361 }
362 [0xff, 0xfe, ..] => {
363 decodereader.consume(2);
365 Some(UTF_16LE)
366 }
367 [0xef, 0xbb, 0xbf, ..] => {
368 decodereader.consume(3);
370 None
371 }
372 [0x00, 0x3c, 0x00, 0x3f, ..] => Some(UTF_16BE),
373 [0x3c, 0x00, 0x3f, 0x00, ..] => Some(UTF_16LE),
374 _ => None, };
376 Ok(encoding)
377 }
378
379 fn parse_start<R: Read>(&mut self, reader: R) -> Result<()> {
381 let mut decodereader = DecodeReader::new(reader, None);
382 let mut init_encoding = self.sniff_encoding(&mut decodereader)?;
383 let requested_encoding = self
384 .read_opts
385 .encoding
386 .as_ref()
387 .map(|enc| Encoding::for_label(enc.as_bytes()).ok_or(Error::CannotDecode))
388 .transpose()?;
389
390 if requested_encoding.is_some() {
391 if requested_encoding == Some(UTF_8) {
393 init_encoding = None;
394 } else {
395 init_encoding = requested_encoding;
396 }
397 }
398
399 decodereader.set_encoding(init_encoding);
400 let mut xmlreader = Reader::from_reader(decodereader);
401 xmlreader.trim_text(self.read_opts.trim_text);
402
403 let mut buf = Vec::with_capacity(200);
404
405 let event = match xmlreader.read_event_into(&mut buf)? {
407 Event::Text(ev) => {
408 let should_ignore = {
409 let is_empty = ev.len() == 0;
410 let is_whitespace = only_has_whitespace(&ev);
411 is_empty || (self.read_opts.ignore_whitespace_only && is_whitespace)
412 };
413 if should_ignore {
414 xmlreader.read_event_into(&mut buf)?
415 } else {
416 Event::Text(ev)
417 }
418 }
419 ev => ev,
420 };
421
422 if let Event::Decl(ev) = event {
423 self.handle_decl(&ev)?;
424 if self.read_opts.enforce_encoding {
425 if requested_encoding.is_none() {
429 return Err(Error::CannotDecode);
430 }
431 if requested_encoding == Some(UTF_8) {
432 if self.encoding.is_some() {
433 return Err(Error::CannotDecode);
434 }
435 } else if self.encoding != requested_encoding {
436 return Err(Error::CannotDecode);
437 }
438 }
439 if self.encoding != init_encoding
441 && !(self.encoding == Some(UTF_16LE) && init_encoding == Some(UTF_16BE))
442 {
443 let mut decode_reader = xmlreader.into_inner();
444 decode_reader.set_encoding(self.encoding);
445 xmlreader = Reader::from_reader(decode_reader);
446 xmlreader.trim_text(self.read_opts.trim_text);
447 }
448 } else if self.read_opts.require_decl {
449 return Err(Error::MalformedXML(
450 "Didn't find XML Declaration at the start of file".to_string(),
451 ));
452 } else if self.handle_event(event)? {
453 return Ok(());
454 }
455 self.parse_content(xmlreader)
457 }
458
459 fn parse_content<B: BufRead>(&mut self, mut reader: Reader<B>) -> Result<()> {
460 let mut buf = Vec::with_capacity(200); loop {
463 let ev = reader.read_event_into(&mut buf)?;
464
465 if self.handle_event(ev)? {
466 return if self.element_stack.len() == 1 {
467 Ok(())
469 } else {
470 Err(Error::MalformedXML("Closing tag not found.".to_string()))
471 };
472 }
473 }
474 }
475}
476
/// Returns `true` for the four XML whitespace bytes: space, tab, line feed and
/// carriage return.
fn is_whitespace(byte: u8) -> bool {
    byte == b' ' || byte == b'\t' || byte == b'\n' || byte == b'\r'
}
481
482fn only_has_whitespace(bytes: &[u8]) -> bool {
484 bytes.iter().all(|b| is_whitespace(*b))
485}
486
487pub fn normalize_space(bytes: &[u8]) -> Vec<u8> {
491 let mut normalized = Vec::with_capacity(bytes.len());
492 let mut char_found = false;
493 let mut last_space = false;
494 for &byte in bytes {
495 if is_whitespace(byte) {
496 if char_found && !last_space {
497 normalized.push(b' ');
498 last_space = true;
499 }
500 } else {
501 char_found = true;
502 last_space = false;
503 normalized.push(byte);
504 }
505 }
506 if normalized.last() == Some(&b' ') {
508 normalized.pop();
509 }
510 normalized
511}