biodivine_xml_doc/
parser.rs

1use crate::document::{Document, Node};
2use crate::element::Element;
3use crate::error::{Error, Result};
4use encoding_rs::Decoder;
5use encoding_rs::{Encoding, UTF_16BE, UTF_16LE, UTF_8};
6use quick_xml::events::{BytesDecl, BytesStart, Event};
7use quick_xml::Reader;
8use std::borrow::Cow;
9use std::collections::HashMap;
10use std::io::{BufRead, Read};
11
12pub(crate) struct DecodeReader<R: Read> {
13    decoder: Option<Decoder>,
14    inner: R,
15    undecoded: Box<[u8]>,
16    undecoded_pos: usize,
17    undecoded_cap: usize,
18    remaining: [u8; 32], // Is there an encoding with > 32 bytes for a char?
19    decoded: Box<[u8]>,
20    decoded_pos: usize,
21    decoded_cap: usize,
22    done: bool,
23}
24
25impl<R: Read> DecodeReader<R> {
26    // If Decoder is not set, don't decode.
27    pub(crate) fn new(reader: R, decoder: Option<Decoder>) -> DecodeReader<R> {
28        DecodeReader {
29            decoder,
30            inner: reader,
31            undecoded: vec![0; 4096].into_boxed_slice(),
32            undecoded_pos: 0,
33            undecoded_cap: 0,
34            remaining: [0; 32],
35            decoded: vec![0; 12288].into_boxed_slice(),
36            decoded_pos: 0,
37            decoded_cap: 0,
38            done: false,
39        }
40    }
41
42    pub(crate) fn set_encoding(&mut self, encoding: Option<&'static Encoding>) {
43        self.decoder = encoding.map(|e| e.new_decoder_without_bom_handling());
44        self.done = false;
45    }
46
47    // Call this only when decoder is Some
48    fn fill_buf_decode(&mut self) -> std::io::Result<&[u8]> {
49        if self.decoded_pos >= self.decoded_cap {
50            debug_assert!(self.decoded_pos == self.decoded_cap);
51            if self.done {
52                return Ok(&[]);
53            }
54            let remaining = self.undecoded_cap - self.undecoded_pos;
55            if remaining <= 32 {
56                // Move remaining undecoded bytes at the end to start
57                self.remaining[..remaining]
58                    .copy_from_slice(&self.undecoded[self.undecoded_pos..self.undecoded_cap]);
59                self.undecoded[..remaining].copy_from_slice(&self.remaining[..remaining]);
60                // Fill undecoded buffer
61                let read = self.inner.read(&mut self.undecoded[remaining..])?;
62                self.done = read == 0;
63                self.undecoded_pos = 0;
64                self.undecoded_cap = remaining + read;
65            }
66
67            // Fill decoded buffer
68            let (_res, read, written, _replaced) = self.decoder.as_mut().unwrap().decode_to_utf8(
69                &self.undecoded[self.undecoded_pos..self.undecoded_cap],
70                &mut self.decoded,
71                self.done,
72            );
73            self.undecoded_pos += read;
74            self.decoded_cap = written;
75            self.decoded_pos = 0;
76        }
77        Ok(&self.decoded[self.decoded_pos..self.decoded_cap])
78    }
79
80    fn fill_buf_without_decode(&mut self) -> std::io::Result<&[u8]> {
81        if self.undecoded_pos >= self.undecoded_cap {
82            debug_assert!(self.undecoded_pos == self.undecoded_cap);
83            self.undecoded_cap = self.inner.read(&mut self.undecoded)?;
84            self.undecoded_pos = 0;
85        }
86        Ok(&self.undecoded[self.undecoded_pos..self.undecoded_cap])
87    }
88}
89
90impl<R: Read> Read for DecodeReader<R> {
91    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
92        (&self.decoded[..]).read(buf)
93    }
94}
95
96impl<R: Read> BufRead for DecodeReader<R> {
97    // Decoder may change from None to Some.
98    fn fill_buf(&mut self) -> std::io::Result<&[u8]> {
99        match &self.decoder {
100            Some(_) => self.fill_buf_decode(),
101            None => self.fill_buf_without_decode(),
102        }
103    }
104    fn consume(&mut self, amt: usize) {
105        match &self.decoder {
106            Some(_) => {
107                self.decoded_pos = std::cmp::min(self.decoded_pos + amt, self.decoded_cap);
108            }
109            None => {
110                self.undecoded_pos = std::cmp::min(self.undecoded_pos + amt, self.undecoded_cap);
111            }
112        }
113    }
114}
115
/// Settings that control how an XML document is parsed.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ReadOptions {
    /// When `true`, an element written as `<tag></tag>` gets an empty `Node::Text("")`
    /// child, which distinguishes it from the self-closing form `<tag />`.
    ///
    /// Default: `true`
    pub empty_text_node: bool,
    /// Trim leading and trailing whitespace of `Node::Text`; text that ends up empty is
    /// not added to the document.
    ///
    /// Default: `true`
    pub trim_text: bool,
    /// Skip `Node::Text` that consists only of whitespace.
    ///
    /// Only useful when `trim_text` is `false`; enabling both costs performance for no
    /// gain.
    ///
    /// Default: `false`
    pub ignore_whitespace_only: bool,
    /// Fail if the document does not start with an XML declaration.
    ///
    /// Without a declaration the parser cannot decode encodings other than UTF-8,
    /// unless `encoding` below is set.
    ///
    /// Default: `true`
    pub require_decl: bool,
    /// Encoding label the parser starts reading with.
    ///
    /// If `enforce_encoding` is `false`, the parser switches to the encoding named in
    /// the XML declaration when that one is different.
    ///
    /// See [`Encoding::for_label`] for valid values.
    ///
    /// Default: `None`
    pub encoding: Option<String>,
    /// When `true`, parsing fails with [Error::CannotDecode] if `encoding` is set and
    /// the encoding declared in the XML declaration is a different one.
    ///
    /// With this set to `true` and no `encoding` given, the parser always fails.
    ///
    /// Default: `false`
    pub enforce_encoding: bool,
}
150
151impl Default for ReadOptions {
152    fn default() -> Self {
153        ReadOptions {
154            empty_text_node: true,
155            trim_text: true,
156            ignore_whitespace_only: false,
157            require_decl: true,
158            encoding: None,
159            enforce_encoding: false,
160        }
161    }
162}
163
164//TODO: don't unwrap element_stack.last() or pop(). Invalid XML file can crash the software.
165pub(crate) struct DocumentParser {
166    doc: Document,
167    read_opts: ReadOptions,
168    encoding: Option<&'static Encoding>,
169    element_stack: Vec<Element>,
170}
171
172impl DocumentParser {
173    pub(crate) fn parse_reader<R: Read>(reader: R, opts: ReadOptions) -> Result<Document> {
174        let doc = Document::new();
175        let element_stack = vec![doc.container()];
176        let mut parser = DocumentParser {
177            doc,
178            read_opts: opts,
179            encoding: None,
180            element_stack,
181        };
182        parser.parse_start(reader)?;
183        Ok(parser.doc)
184    }
185
186    fn handle_decl(&mut self, ev: &BytesDecl) -> Result<()> {
187        self.doc.version = String::from_utf8(ev.version()?.to_vec())?;
188        self.encoding = match ev.encoding() {
189            Some(res) => {
190                let encoding = Encoding::for_label(&res?).ok_or(Error::CannotDecode)?;
191                if encoding == UTF_8 {
192                    None
193                } else {
194                    Some(encoding)
195                }
196            }
197            None => None,
198        };
199        self.doc.standalone = match ev.standalone() {
200            Some(res) => {
201                let val = std::str::from_utf8(&res?)?.to_lowercase();
202                match val.as_str() {
203                    "yes" => true,
204                    "no" => false,
205                    _ => {
206                        return Err(Error::MalformedXML(
207                            "Standalone Document Declaration has non boolean value".to_string(),
208                        ))
209                    }
210                }
211            }
212            None => false,
213        };
214        Ok(())
215    }
216
217    fn create_element(&mut self, parent: Element, ev: &BytesStart) -> Result<Element> {
218        let full_name = String::from_utf8(ev.name().into_inner().to_vec())?;
219        let mut namespace_decls = HashMap::new();
220        let mut attributes = HashMap::new();
221        for attr in ev.attributes() {
222            let mut attr = attr?;
223            attr.value = Cow::Owned(normalize_space(&attr.value));
224            let key = String::from_utf8(attr.key.into_inner().to_vec())?;
225            let value = String::from_utf8(attr.unescape_value()?.as_bytes().to_vec())?;
226            if key == "xmlns" {
227                namespace_decls.insert(String::new(), value);
228                continue;
229            } else if let Some(prefix) = key.strip_prefix("xmlns:") {
230                namespace_decls.insert(prefix.to_owned(), value);
231                continue;
232            }
233            attributes.insert(key, value);
234        }
235        let elem = Element::with_data(&mut self.doc, full_name, attributes, namespace_decls);
236        parent
237            .push_child(&mut self.doc, Node::Element(elem))
238            .unwrap();
239        Ok(elem)
240    }
241
242    // Returns true if document parsing is finished.
243    fn handle_event(&mut self, event: Event) -> Result<bool> {
244        match event {
245            Event::Start(ref ev) => {
246                let parent = *self
247                    .element_stack
248                    .last()
249                    .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
250                let element = self.create_element(parent, ev)?;
251                self.element_stack.push(element);
252                Ok(false)
253            }
254            Event::End(_) => {
255                let elem = self
256                    .element_stack
257                    .pop()
258                    .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?; // quick-xml checks if tag names match for us
259                if self.read_opts.empty_text_node {
260                    // distinguish <tag></tag> and <tag />
261                    if !elem.has_children(&self.doc) {
262                        elem.push_child(&mut self.doc, Node::Text(String::new()))
263                            .unwrap();
264                    }
265                }
266                Ok(false)
267            }
268            Event::Empty(ref ev) => {
269                let parent = *self
270                    .element_stack
271                    .last()
272                    .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
273                self.create_element(parent, ev)?;
274                Ok(false)
275            }
276            // Comment, CData, and PI content should not be escaped,
277            // but quick-xml assumes only CDATA is not escaped.
278            Event::Text(ev) => {
279                if self.read_opts.ignore_whitespace_only && only_has_whitespace(&ev) {
280                    return Ok(false);
281                }
282                // when trim_text, ignore_whitespace_only, empty_text_node are all false
283                if ev.is_empty() {
284                    return Ok(false);
285                }
286                let content = ev.unescape()?.to_string();
287                let node = Node::Text(content);
288                let parent = *self
289                    .element_stack
290                    .last()
291                    .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
292                parent.push_child(&mut self.doc, node).unwrap();
293                Ok(false)
294            }
295            Event::DocType(ev) => {
296                // Event::DocType comes with one leading whitespace. Strip the whitespace.
297                let content = ev.unescape()?;
298                let raw = content.as_bytes();
299                let content = if !raw.is_empty() && raw[0] == b' ' {
300                    String::from_utf8(raw[1..].to_vec())?
301                } else {
302                    String::from_utf8(raw.to_vec())?
303                };
304                let node = Node::DocType(content);
305                let parent = *self
306                    .element_stack
307                    .last()
308                    .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
309                parent.push_child(&mut self.doc, node).unwrap();
310                Ok(false)
311            }
312            Event::Comment(ev) => {
313                let content = String::from_utf8(ev.to_vec())?;
314                let node = Node::Comment(content);
315                let parent = *self
316                    .element_stack
317                    .last()
318                    .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
319                parent.push_child(&mut self.doc, node).unwrap();
320                Ok(false)
321            }
322            Event::CData(ev) => {
323                let content = String::from_utf8(ev.to_vec())?;
324                let node = Node::CData(content);
325                let parent = *self
326                    .element_stack
327                    .last()
328                    .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
329                parent.push_child(&mut self.doc, node).unwrap();
330                Ok(false)
331            }
332            Event::PI(ev) => {
333                let content = String::from_utf8(ev.to_vec())?;
334                let node = Node::PI(content);
335                let parent = *self
336                    .element_stack
337                    .last()
338                    .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
339                parent.push_child(&mut self.doc, node).unwrap();
340                Ok(false)
341            }
342            Event::Decl(_) => Err(Error::MalformedXML(
343                "XML declaration found in the middle of the document".to_string(),
344            )),
345            Event::Eof => Ok(true),
346        }
347    }
348
349    // Sniff encoding and consume BOM
350    fn sniff_encoding<R: Read>(
351        &mut self,
352        decodereader: &mut DecodeReader<R>,
353    ) -> Result<Option<&'static Encoding>> {
354        let bytes = decodereader.fill_buf()?;
355        let encoding = match bytes {
356            [0x3c, 0x3f, ..] => None, // UTF-8 '<?'
357            [0xfe, 0xff, ..] => {
358                // UTF-16 BE BOM
359                decodereader.consume(2);
360                Some(UTF_16BE)
361            }
362            [0xff, 0xfe, ..] => {
363                // UTF-16 LE BOM
364                decodereader.consume(2);
365                Some(UTF_16LE)
366            }
367            [0xef, 0xbb, 0xbf, ..] => {
368                // UTF-8 BOM
369                decodereader.consume(3);
370                None
371            }
372            [0x00, 0x3c, 0x00, 0x3f, ..] => Some(UTF_16BE),
373            [0x3c, 0x00, 0x3f, 0x00, ..] => Some(UTF_16LE),
374            _ => None, // Try decoding it with UTF-8
375        };
376        Ok(encoding)
377    }
378
379    // Look at the document decl and figure out the document encoding
380    fn parse_start<R: Read>(&mut self, reader: R) -> Result<()> {
381        let mut decodereader = DecodeReader::new(reader, None);
382        let mut init_encoding = self.sniff_encoding(&mut decodereader)?;
383        let requested_encoding = self
384            .read_opts
385            .encoding
386            .as_ref()
387            .map(|enc| Encoding::for_label(enc.as_bytes()).ok_or(Error::CannotDecode))
388            .transpose()?;
389
390        if requested_encoding.is_some() {
391            // If UTF is requested, the correct value to use is `None`, because UTF is the default.
392            if requested_encoding == Some(UTF_8) {
393                init_encoding = None;
394            } else {
395                init_encoding = requested_encoding;
396            }
397        }
398
399        decodereader.set_encoding(init_encoding);
400        let mut xmlreader = Reader::from_reader(decodereader);
401        xmlreader.trim_text(self.read_opts.trim_text);
402
403        let mut buf = Vec::with_capacity(200);
404
405        // Skip first event if it only has whitespace
406        let event = match xmlreader.read_event_into(&mut buf)? {
407            Event::Text(ev) => {
408                let should_ignore = {
409                    let is_empty = ev.len() == 0;
410                    let is_whitespace = only_has_whitespace(&ev);
411                    is_empty || (self.read_opts.ignore_whitespace_only && is_whitespace)
412                };
413                if should_ignore {
414                    xmlreader.read_event_into(&mut buf)?
415                } else {
416                    Event::Text(ev)
417                }
418            }
419            ev => ev,
420        };
421
422        if let Event::Decl(ev) = event {
423            self.handle_decl(&ev)?;
424            if self.read_opts.enforce_encoding {
425                // User requested encoding X, but Y was actually found in the document declaration.
426                // Note that if the declaration contains UTF-8, then self.encoding is actually
427                // `None`, so we have to account for that.
428                if requested_encoding.is_none() {
429                    return Err(Error::CannotDecode);
430                }
431                if requested_encoding == Some(UTF_8) {
432                    if self.encoding.is_some() {
433                        return Err(Error::CannotDecode);
434                    }
435                } else if self.encoding != requested_encoding {
436                    return Err(Error::CannotDecode);
437                }
438            }
439            // Encoding::for_label("UTF-16") defaults to UTF-16 LE, even though it could be UTF-16 BE
440            if self.encoding != init_encoding
441                && !(self.encoding == Some(UTF_16LE) && init_encoding == Some(UTF_16BE))
442            {
443                let mut decode_reader = xmlreader.into_inner();
444                decode_reader.set_encoding(self.encoding);
445                xmlreader = Reader::from_reader(decode_reader);
446                xmlreader.trim_text(self.read_opts.trim_text);
447            }
448        } else if self.read_opts.require_decl {
449            return Err(Error::MalformedXML(
450                "Didn't find XML Declaration at the start of file".to_string(),
451            ));
452        } else if self.handle_event(event)? {
453            return Ok(());
454        }
455        // Handle rest of the events
456        self.parse_content(xmlreader)
457    }
458
459    fn parse_content<B: BufRead>(&mut self, mut reader: Reader<B>) -> Result<()> {
460        let mut buf = Vec::with_capacity(200); // reduce time increasing capacity at start.
461
462        loop {
463            let ev = reader.read_event_into(&mut buf)?;
464
465            if self.handle_event(ev)? {
466                return if self.element_stack.len() == 1 {
467                    // Should only have container remaining in element_stack
468                    Ok(())
469                } else {
470                    Err(Error::MalformedXML("Closing tag not found.".to_string()))
471                };
472            }
473        }
474    }
475}
476
/// Tells whether `byte` is one of the four XML whitespace characters:
/// space, tab, carriage return or line feed.
fn is_whitespace(byte: u8) -> bool {
    b" \t\r\n".contains(&byte)
}
481
482/// Returns true if bytes.len() == 0 or bytes only has a whitespace-like character.
483fn only_has_whitespace(bytes: &[u8]) -> bool {
484    bytes.iter().all(|b| is_whitespace(*b))
485}
486
487/// #xD(\r), #xA(\n), #x9(\t) is normalized into #x20.
488/// Leading and trailing spaces(#x20) are discarded
489/// and sequence of spaces are replaced by a single space.
490pub fn normalize_space(bytes: &[u8]) -> Vec<u8> {
491    let mut normalized = Vec::with_capacity(bytes.len());
492    let mut char_found = false;
493    let mut last_space = false;
494    for &byte in bytes {
495        if is_whitespace(byte) {
496            if char_found && !last_space {
497                normalized.push(b' ');
498                last_space = true;
499            }
500        } else {
501            char_found = true;
502            last_space = false;
503            normalized.push(byte);
504        }
505    }
506    // There can't be multiple whitespaces
507    if normalized.last() == Some(&b' ') {
508        normalized.pop();
509    }
510    normalized
511}