xml_doc/
parser.rs

1use crate::document::{Document, Node};
2use crate::element::Element;
3use crate::error::{Error, Result};
4use encoding_rs::Decoder;
5use encoding_rs::{Encoding, UTF_16BE, UTF_16LE, UTF_8};
6use quick_xml::events::{BytesDecl, BytesStart, Event};
7use quick_xml::Reader;
8use std::borrow::Cow;
9use std::collections::HashMap;
10use std::io::{BufRead, Read};
11
// Debug-build helper: prints the `Debug` representation of an expression
// to stdout. The macro only exists when `debug_assertions` is enabled, so
// every call site has to carry the same `cfg` attribute.
#[cfg(debug_assertions)]
macro_rules! debug {
    ($value:expr) => {
        println!("{:?}", $value)
    };
}
18
/// Buffered reader that can transcode the bytes of `inner` to UTF-8.
///
/// While `decoder` is `None`, raw bytes are served unchanged from
/// `undecoded`. Once a decoder is set, bytes are decoded from `undecoded`
/// into `decoded` and served from there.
pub(crate) struct DecodeReader<R: Read> {
    /// Decoder for the source encoding; `None` means bytes are passed through as-is.
    decoder: Option<Decoder>,
    /// Underlying byte source.
    inner: R,
    /// Raw bytes read from `inner`.
    undecoded: Box<[u8]>,
    /// Index of the first byte in `undecoded` that has not been consumed/decoded yet.
    undecoded_pos: usize,
    /// End (exclusive) of the valid bytes in `undecoded`.
    undecoded_cap: usize,
    /// Scratch space used while moving a short undecoded tail to the start of `undecoded`.
    remaining: [u8; 32], // Is there an encoding with > 32 bytes for a char?
    /// UTF-8 output produced by the decoder.
    decoded: Box<[u8]>,
    /// Index of the first byte in `decoded` that has not been consumed yet.
    decoded_pos: usize,
    /// End (exclusive) of the valid bytes in `decoded`.
    decoded_cap: usize,
    /// Set once a read from `inner` returned zero bytes while decoding.
    done: bool,
}
31
32impl<R: Read> DecodeReader<R> {
33    // If Decoder is not set, don't decode.
34    pub(crate) fn new(reader: R, decoder: Option<Decoder>) -> DecodeReader<R> {
35        DecodeReader {
36            decoder,
37            inner: reader,
38            undecoded: vec![0; 4096].into_boxed_slice(),
39            undecoded_pos: 0,
40            undecoded_cap: 0,
41            remaining: [0; 32],
42            decoded: vec![0; 12288].into_boxed_slice(),
43            decoded_pos: 0,
44            decoded_cap: 0,
45            done: false,
46        }
47    }
48
49    pub(crate) fn set_encoding(&mut self, encoding: Option<&'static Encoding>) {
50        self.decoder = encoding.map(|e| e.new_decoder_without_bom_handling());
51        self.done = false;
52    }
53
54    // Call this only when decoder is Some
55    fn fill_buf_decode(&mut self) -> std::io::Result<&[u8]> {
56        if self.decoded_pos >= self.decoded_cap {
57            debug_assert!(self.decoded_pos == self.decoded_cap);
58            if self.done {
59                return Ok(&[]);
60            }
61            let remaining = self.undecoded_cap - self.undecoded_pos;
62            if remaining <= 32 {
63                // Move remaining undecoded bytes at the end to start
64                self.remaining[..remaining]
65                    .copy_from_slice(&self.undecoded[self.undecoded_pos..self.undecoded_cap]);
66                self.undecoded[..remaining].copy_from_slice(&self.remaining[..remaining]);
67                // Fill undecoded buffer
68                let read = self.inner.read(&mut self.undecoded[remaining..])?;
69                self.done = read == 0;
70                self.undecoded_pos = 0;
71                self.undecoded_cap = remaining + read;
72            }
73
74            // Fill decoded buffer
75            let (_res, read, written, _replaced) = self.decoder.as_mut().unwrap().decode_to_utf8(
76                &self.undecoded[self.undecoded_pos..self.undecoded_cap],
77                &mut self.decoded,
78                self.done,
79            );
80            self.undecoded_pos += read;
81            self.decoded_cap = written;
82            self.decoded_pos = 0;
83        }
84        Ok(&self.decoded[self.decoded_pos..self.decoded_cap])
85    }
86
87    fn fill_buf_without_decode(&mut self) -> std::io::Result<&[u8]> {
88        if self.undecoded_pos >= self.undecoded_cap {
89            debug_assert!(self.undecoded_pos == self.undecoded_cap);
90            self.undecoded_cap = self.inner.read(&mut self.undecoded)?;
91            self.undecoded_pos = 0;
92        }
93        Ok(&self.undecoded[self.undecoded_pos..self.undecoded_cap])
94    }
95}
96
97impl<R: Read> Read for DecodeReader<R> {
98    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
99        (&self.decoded[..]).read(buf)
100    }
101}
102
103impl<R: Read> BufRead for DecodeReader<R> {
104    // Decoder may change from None to Some.
105    fn fill_buf(&mut self) -> std::io::Result<&[u8]> {
106        match &self.decoder {
107            Some(_) => self.fill_buf_decode(),
108            None => self.fill_buf_without_decode(),
109        }
110    }
111    fn consume(&mut self, amt: usize) {
112        match &self.decoder {
113            Some(_) => {
114                self.decoded_pos = std::cmp::min(self.decoded_pos + amt, self.decoded_cap);
115            }
116            None => {
117                self.undecoded_pos = std::cmp::min(self.undecoded_pos + amt, self.undecoded_cap);
118            }
119        }
120    }
121}
122
/// Options when parsing xml.
///
/// Implements [`Default`], so struct update syntax works:
/// `ReadOptions { trim_text: false, ..Default::default() }`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ReadOptions {
    /// `<tag></tag>` will have a `Node::Text("")` as its children, while `<tag />` won't.
    /// Default: `true`
    pub empty_text_node: bool,
    /// Trims leading and ending whitespaces in `Node::Text`, and ignore node if it is empty.
    /// Default: `true`
    pub trim_text: bool,
    /// Ignore `Node::Text` that only has whitespaces.
    /// Only makes sense if `trim_text` is `false`. (If both are `true`, performance takes a hit for no gain)
    /// Default: `false`
    pub ignore_whitespace_only: bool,
    /// Returns error if document doesn't start with XML declaration.
    /// If there is no XML declaration, the parser won't be able to decode encodings other than UTF-8, unless `encoding` below is set.
    /// Default: `true`
    pub require_decl: bool,
    /// If this is set, the parser will start reading with this encoding.
    /// But it will switch to XML declaration's encoding value if it has a different value.
    /// See [`encoding_rs::Encoding::for_label`] for valid values.
    /// Default: `None`
    pub encoding: Option<String>,
}

impl ReadOptions {
    /// Create ReadOptions with default options.
    ///
    /// Kept as an inherent function for backward compatibility; it returns
    /// the same value as the [`Default`] implementation.
    pub fn default() -> ReadOptions {
        <ReadOptions as Default>::default()
    }
}

impl Default for ReadOptions {
    /// Returns the documented defaults of every field.
    fn default() -> Self {
        ReadOptions {
            empty_text_node: true,
            trim_text: true,
            ignore_whitespace_only: false,
            require_decl: true,
            encoding: None,
        }
    }
}
159
// TODO: make sure no `unwrap()` reachable from the parsing path can panic on invalid XML input.
/// Builds a [`Document`] from the events of a quick-xml reader.
pub(crate) struct DocumentParser {
    /// Document being populated.
    doc: Document,
    /// Options supplied by the caller.
    read_opts: ReadOptions,
    /// Non-UTF-8 encoding named by the XML declaration; `None` when the
    /// declaration names UTF-8 or no encoding at all.
    encoding: Option<&'static Encoding>,
    /// Currently open elements, innermost last. Starts with the document's
    /// container element as the only entry.
    element_stack: Vec<Element>,
}
167
168impl DocumentParser {
169    pub(crate) fn parse_reader<R: Read>(reader: R, opts: ReadOptions) -> Result<Document> {
170        let doc = Document::new();
171        let element_stack = vec![doc.container()];
172        let mut parser = DocumentParser {
173            doc,
174            read_opts: opts,
175            encoding: None,
176            element_stack: element_stack,
177        };
178        parser.parse_start(reader)?;
179        Ok(parser.doc)
180    }
181
182    fn handle_decl(&mut self, ev: &BytesDecl) -> Result<()> {
183        self.doc.version = String::from_utf8(ev.version()?.to_vec())?;
184        self.encoding = match ev.encoding() {
185            Some(res) => {
186                let encoding = Encoding::for_label(&res?).ok_or(Error::CannotDecode)?;
187                if encoding == UTF_8 {
188                    None
189                } else {
190                    Some(encoding)
191                }
192            }
193            None => None,
194        };
195        self.doc.standalone = match ev.standalone() {
196            Some(res) => {
197                let val = std::str::from_utf8(&res?)?.to_lowercase();
198                match val.as_str() {
199                    "yes" => true,
200                    "no" => false,
201                    _ => {
202                        return Err(Error::MalformedXML(
203                            "Standalone Document Declaration has non boolean value".to_string(),
204                        ))
205                    }
206                }
207            }
208            None => false,
209        };
210        Ok(())
211    }
212
213    fn create_element(&mut self, parent: Element, ev: &BytesStart) -> Result<Element> {
214        let full_name = String::from_utf8(ev.name().to_vec())?;
215        let mut namespace_decls = HashMap::new();
216        let mut attributes = HashMap::new();
217        for attr in ev.attributes() {
218            let mut attr = attr?;
219            attr.value = Cow::Owned(normalize_space(&attr.value));
220            let key = String::from_utf8(attr.key.to_vec())?;
221            let value = String::from_utf8(attr.unescaped_value()?.to_vec())?;
222            if key == "xmlns" {
223                namespace_decls.insert(String::new(), value);
224                continue;
225            } else if let Some(prefix) = key.strip_prefix("xmlns:") {
226                namespace_decls.insert(prefix.to_owned(), value);
227                continue;
228            }
229            attributes.insert(key, value);
230        }
231
232        let elem = Element::with_data(&mut self.doc, full_name, attributes, namespace_decls);
233        parent
234            .push_child(&mut self.doc, Node::Element(elem))
235            .unwrap();
236        Ok(elem)
237    }
238
239    // Returns true if document parsing is finished.
240    fn handle_event(&mut self, event: Event) -> Result<bool> {
241        match event {
242            Event::Start(ref ev) => {
243                let parent = *self
244                    .element_stack
245                    .last()
246                    .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
247                let element = self.create_element(parent, ev)?;
248                self.element_stack.push(element);
249                Ok(false)
250            }
251            Event::End(_) => {
252                let elem = self
253                    .element_stack
254                    .pop()
255                    .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?; // quick-xml checks if tag names match for us
256                if self.read_opts.empty_text_node {
257                    // distinguish <tag></tag> and <tag />
258                    if !elem.has_children(&self.doc) {
259                        elem.push_child(&mut self.doc, Node::Text(String::new()))
260                            .unwrap();
261                    }
262                }
263                Ok(false)
264            }
265            Event::Empty(ref ev) => {
266                let parent = *self
267                    .element_stack
268                    .last()
269                    .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
270                self.create_element(parent, ev)?;
271                Ok(false)
272            }
273            // Comment, CData, and PI content should not be escaped,
274            // but quick-xml assumes only CDATA is not escaped.
275            Event::Text(ev) => {
276                if self.read_opts.ignore_whitespace_only && only_has_whitespace(&ev) {
277                    return Ok(false);
278                }
279                // when trim_text, ignore_whitespace_only, empty_text_node are all false
280                if ev.is_empty() {
281                    return Ok(false);
282                }
283                let content = String::from_utf8(ev.unescaped()?.to_vec())?;
284                let node = Node::Text(content);
285                let parent = *self
286                    .element_stack
287                    .last()
288                    .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
289                parent.push_child(&mut self.doc, node).unwrap();
290                Ok(false)
291            }
292            Event::DocType(ev) => {
293                // Event::DocType comes with one leading whitespace. Strip the whitespace.
294                let raw = ev.unescaped()?;
295                let content = if !raw.is_empty() && raw[0] == b' ' {
296                    String::from_utf8(raw[1..].to_vec())?
297                } else {
298                    String::from_utf8(raw.to_vec())?
299                };
300                let node = Node::DocType(content);
301                let parent = *self
302                    .element_stack
303                    .last()
304                    .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
305                parent.push_child(&mut self.doc, node).unwrap();
306                Ok(false)
307            }
308            Event::Comment(ev) => {
309                let content = String::from_utf8(ev.escaped().to_vec())?;
310                let node = Node::Comment(content);
311                let parent = *self
312                    .element_stack
313                    .last()
314                    .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
315                parent.push_child(&mut self.doc, node).unwrap();
316                Ok(false)
317            }
318            Event::CData(ev) => {
319                let content = String::from_utf8(ev.unescaped()?.to_vec())?;
320                let node = Node::CData(content);
321                let parent = *self
322                    .element_stack
323                    .last()
324                    .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
325                parent.push_child(&mut self.doc, node).unwrap();
326                Ok(false)
327            }
328            Event::PI(ev) => {
329                let content = String::from_utf8(ev.escaped().to_vec())?;
330                let node = Node::PI(content);
331                let parent = *self
332                    .element_stack
333                    .last()
334                    .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
335                parent.push_child(&mut self.doc, node).unwrap();
336                Ok(false)
337            }
338            Event::Decl(_) => Err(Error::MalformedXML(
339                "XML declaration found in the middle of the document".to_string(),
340            )),
341            Event::Eof => Ok(true),
342        }
343    }
344
345    // Sniff encoding and consume BOM
346    fn sniff_encoding<R: Read>(
347        &mut self,
348        decodereader: &mut DecodeReader<R>,
349    ) -> Result<Option<&'static Encoding>> {
350        let bytes = decodereader.fill_buf()?;
351        let encoding = match bytes {
352            [0x3c, 0x3f, ..] => None, // UTF-8 '<?'
353            [0xfe, 0xff, ..] => {
354                // UTF-16 BE BOM
355                decodereader.consume(2);
356                Some(UTF_16BE)
357            }
358            [0xff, 0xfe, ..] => {
359                // UTF-16 LE BOM
360                decodereader.consume(2);
361                Some(UTF_16LE)
362            }
363            [0xef, 0xbb, 0xbf, ..] => {
364                // UTF-8 BOM
365                decodereader.consume(3);
366                None
367            }
368            [0x00, 0x3c, 0x00, 0x3f, ..] => Some(UTF_16BE),
369            [0x3c, 0x00, 0x3f, 0x00, ..] => Some(UTF_16LE),
370            _ => None, // Try decoding it with UTF-8
371        };
372        Ok(encoding)
373    }
374
375    // Look at the document decl and figure out the document encoding
376    fn parse_start<R: Read>(&mut self, reader: R) -> Result<()> {
377        let mut decodereader = DecodeReader::new(reader, None);
378        let mut init_encoding = self.sniff_encoding(&mut decodereader)?;
379        if let Some(enc) = &self.read_opts.encoding {
380            init_encoding = Some(Encoding::for_label(enc.as_bytes()).ok_or(Error::CannotDecode)?)
381        }
382        decodereader.set_encoding(init_encoding);
383        let mut xmlreader = Reader::from_reader(decodereader);
384        xmlreader.trim_text(self.read_opts.trim_text);
385
386        let mut buf = Vec::with_capacity(200);
387
388        // Skip first event if it only has whitespace
389        let event = match xmlreader.read_event(&mut buf)? {
390            Event::Text(ev) => {
391                if ev.len() == 0 {
392                    xmlreader.read_event(&mut buf)?
393                } else if self.read_opts.ignore_whitespace_only && only_has_whitespace(&ev) {
394                    xmlreader.read_event(&mut buf)?
395                } else {
396                    Event::Text(ev)
397                }
398            }
399            ev => ev,
400        };
401        #[cfg(debug_assertions)]
402        debug!(event);
403        if let Event::Decl(ev) = event {
404            self.handle_decl(&ev)?;
405            // Encoding::for_label("UTF-16") defaults to UTF-16 LE, even though it could be UTF-16 BE
406            if self.encoding != init_encoding
407                && !(self.encoding == Some(UTF_16LE) && init_encoding == Some(UTF_16BE))
408            {
409                let mut decode_reader = xmlreader.into_underlying_reader();
410                decode_reader.set_encoding(self.encoding);
411                xmlreader = Reader::from_reader(decode_reader);
412                xmlreader.trim_text(self.read_opts.trim_text);
413            }
414        } else if self.read_opts.require_decl {
415            return Err(Error::MalformedXML(
416                "Didn't find XML Declaration at the start of file".to_string(),
417            ));
418        } else if self.handle_event(event)? {
419            return Ok(());
420        }
421        // Handle rest of the events
422        self.parse_content(xmlreader)
423    }
424
425    fn parse_content<B: BufRead>(&mut self, mut reader: Reader<B>) -> Result<()> {
426        let mut buf = Vec::with_capacity(200); // reduce time increasing capacity at start.
427
428        loop {
429            let ev = reader.read_event(&mut buf)?;
430            #[cfg(debug_assertions)]
431            debug!(ev);
432            if self.handle_event(ev)? {
433                if self.element_stack.len() == 1 {
434                    // Should only have container remaining in element_stack
435                    return Ok(());
436                } else {
437                    return Err(Error::MalformedXML("Closing tag not found.".to_string()));
438                }
439            }
440        }
441    }
442}
443
/// Returns true if `byte` is one of the XML whitespace characters:
/// space, tab, line feed or carriage return.
fn is_whitespace(byte: u8) -> bool {
    matches!(byte, b' ' | b'\t' | b'\n' | b'\r')
}
451
452/// Returns true if bytes.len() == 0 or bytes only has a whitespace-like character.
453fn only_has_whitespace(bytes: &[u8]) -> bool {
454    bytes.iter().all(|b| is_whitespace(*b))
455}
456
457/// #xD(\r), #xA(\n), #x9(\t) is normalized into #x20.
458/// Leading and trailing spaces(#x20) are discarded
459/// and sequence of spaces are replaced by a single space.
460pub fn normalize_space(bytes: &[u8]) -> Vec<u8> {
461    let mut normalized = Vec::with_capacity(bytes.len());
462    let mut char_found = false;
463    let mut last_space = false;
464    for &byte in bytes {
465        if is_whitespace(byte) {
466            if char_found && !last_space {
467                normalized.push(b' ');
468                last_space = true;
469            }
470        } else {
471            char_found = true;
472            last_space = false;
473            normalized.push(byte);
474        }
475    }
476    // There can't be multiple whitespaces
477    if normalized.last() == Some(&b' ') {
478        normalized.pop();
479    }
480    normalized
481}