xml_doc_log4rs/
parser.rs

1use crate::document::{Document, Node};
2use crate::element::Element;
3use crate::error::{Error, Result};
4use encoding_rs::Decoder;
5use encoding_rs::{Encoding, UTF_16BE, UTF_16LE, UTF_8};
6use quick_xml::events::{BytesDecl, BytesStart, Event};
7use quick_xml::Reader;
8use std::borrow::Cow;
9use std::collections::HashMap;
10use std::io::{BufRead, Read};
11use log4rs_macros::debug;
12
13pub(crate) struct DecodeReader<R: Read> {
14    decoder: Option<Decoder>,
15    inner: R,
16    undecoded: Box<[u8]>,
17    undecoded_pos: usize,
18    undecoded_cap: usize,
19    remaining: [u8; 32], // Is there an encoding with > 32 bytes for a char?
20    decoded: Box<[u8]>,
21    decoded_pos: usize,
22    decoded_cap: usize,
23    done: bool,
24}
25
26impl<R: Read> DecodeReader<R> {
27    // If Decoder is not set, don't decode.
28    pub(crate) fn new(reader: R, decoder: Option<Decoder>) -> DecodeReader<R> {
29        DecodeReader {
30            decoder,
31            inner: reader,
32            undecoded: vec![0; 4096].into_boxed_slice(),
33            undecoded_pos: 0,
34            undecoded_cap: 0,
35            remaining: [0; 32],
36            decoded: vec![0; 12288].into_boxed_slice(),
37            decoded_pos: 0,
38            decoded_cap: 0,
39            done: false,
40        }
41    }
42
43    pub(crate) fn set_encoding(&mut self, encoding: Option<&'static Encoding>) {
44        self.decoder = encoding.map(|e| e.new_decoder_without_bom_handling());
45        self.done = false;
46    }
47
48    // Call this only when decoder is Some
49    fn fill_buf_decode(&mut self) -> std::io::Result<&[u8]> {
50        if self.decoded_pos >= self.decoded_cap {
51            debug_assert!(self.decoded_pos == self.decoded_cap);
52            if self.done {
53                return Ok(&[]);
54            }
55            let remaining = self.undecoded_cap - self.undecoded_pos;
56            if remaining <= 32 {
57                // Move remaining undecoded bytes at the end to start
58                self.remaining[..remaining]
59                    .copy_from_slice(&self.undecoded[self.undecoded_pos..self.undecoded_cap]);
60                self.undecoded[..remaining].copy_from_slice(&self.remaining[..remaining]);
61                // Fill undecoded buffer
62                let read = self.inner.read(&mut self.undecoded[remaining..])?;
63                self.done = read == 0;
64                self.undecoded_pos = 0;
65                self.undecoded_cap = remaining + read;
66            }
67
68            // Fill decoded buffer
69            let (_res, read, written, _replaced) = self.decoder.as_mut().unwrap().decode_to_utf8(
70                &self.undecoded[self.undecoded_pos..self.undecoded_cap],
71                &mut self.decoded,
72                self.done,
73            );
74            self.undecoded_pos += read;
75            self.decoded_cap = written;
76            self.decoded_pos = 0;
77        }
78        Ok(&self.decoded[self.decoded_pos..self.decoded_cap])
79    }
80
81    fn fill_buf_without_decode(&mut self) -> std::io::Result<&[u8]> {
82        if self.undecoded_pos >= self.undecoded_cap {
83            debug_assert!(self.undecoded_pos == self.undecoded_cap);
84            self.undecoded_cap = self.inner.read(&mut self.undecoded)?;
85            self.undecoded_pos = 0;
86        }
87        Ok(&self.undecoded[self.undecoded_pos..self.undecoded_cap])
88    }
89}
90
91impl<R: Read> Read for DecodeReader<R> {
92    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
93        (&self.decoded[..]).read(buf)
94    }
95}
96
97impl<R: Read> BufRead for DecodeReader<R> {
98    // Decoder may change from None to Some.
99    fn fill_buf(&mut self) -> std::io::Result<&[u8]> {
100        match &self.decoder {
101            Some(_) => self.fill_buf_decode(),
102            None => self.fill_buf_without_decode(),
103        }
104    }
105    fn consume(&mut self, amt: usize) {
106        match &self.decoder {
107            Some(_) => {
108                self.decoded_pos = std::cmp::min(self.decoded_pos + amt, self.decoded_cap);
109            }
110            None => {
111                self.undecoded_pos = std::cmp::min(self.undecoded_pos + amt, self.undecoded_cap);
112            }
113        }
114    }
115}
116
/// Options when parsing xml.
///
/// Use [`ReadOptions::default()`] for the defaults documented on each field,
/// optionally combined with struct update syntax:
/// `ReadOptions { trim_text: false, ..Default::default() }`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ReadOptions {
    /// `<tag></tag>` will have a `Node::Text("")` as its children, while `<tag />` won't.
    /// Default: `true`
    pub empty_text_node: bool,
    /// Trims leading and trailing whitespace in `Node::Text`, and ignores the node if it is empty.
    /// Default: `true`
    pub trim_text: bool,
    /// Ignore `Node::Text` that only has whitespace.
    /// Only makes sense if `trim_text` is `false`. (If both are `true`, performance takes a hit for no gain)
    /// Default: `false`
    pub ignore_whitespace_only: bool,
    /// Returns an error if the document doesn't start with an XML declaration.
    /// If there is no XML declaration, the parser won't be able to decode encodings other than UTF-8,
    /// unless `encoding` below is set.
    /// Default: `true`
    pub require_decl: bool,
    /// If this is set, the parser will start reading with this encoding.
    /// But it will switch to the XML declaration's encoding value if it has a different value.
    /// See [`encoding_rs::Encoding::for_label`] for valid values.
    /// Default: `None`
    pub encoding: Option<String>,
}

impl Default for ReadOptions {
    /// Create ReadOptions with default options.
    fn default() -> ReadOptions {
        ReadOptions {
            empty_text_node: true,
            trim_text: true,
            ignore_whitespace_only: false,
            require_decl: true,
            encoding: None,
        }
    }
}
153
// TODO: don't unwrap the results of push_child(); a failure there would still panic.
// (element_stack.last() and pop() already return Error::MalformedXML instead of panicking.)
155pub(crate) struct DocumentParser {
156    doc: Document,
157    read_opts: ReadOptions,
158    encoding: Option<&'static Encoding>,
159    element_stack: Vec<Element>,
160}
161
162impl DocumentParser {
163    pub(crate) fn parse_reader<R: Read>(reader: R, opts: ReadOptions) -> Result<Document> {
164        let doc = Document::new();
165        let element_stack = vec![doc.container()];
166        let mut parser = DocumentParser {
167            doc,
168            read_opts: opts,
169            encoding: None,
170            element_stack: element_stack,
171        };
172        parser.parse_start(reader)?;
173        Ok(parser.doc)
174    }
175
176    fn handle_decl(&mut self, ev: &BytesDecl) -> Result<()> {
177        self.doc.version = String::from_utf8(ev.version()?.to_vec())?;
178        self.encoding = match ev.encoding() {
179            Some(res) => {
180                let encoding = Encoding::for_label(&res?).ok_or(Error::CannotDecode)?;
181                if encoding == UTF_8 {
182                    None
183                } else {
184                    Some(encoding)
185                }
186            }
187            None => None,
188        };
189        self.doc.standalone = match ev.standalone() {
190            Some(res) => {
191                let val = std::str::from_utf8(&res?)?.to_lowercase();
192                match val.as_str() {
193                    "yes" => true,
194                    "no" => false,
195                    _ => {
196                        return Err(Error::MalformedXML(
197                            "Standalone Document Declaration has non boolean value".to_string(),
198                        ))
199                    }
200                }
201            }
202            None => false,
203        };
204        Ok(())
205    }
206
207    fn create_element(&mut self, parent: Element, ev: &BytesStart) -> Result<Element> {
208        let full_name = String::from_utf8(ev.name().to_vec())?;
209        let mut namespace_decls = HashMap::new();
210        let mut attributes = HashMap::new();
211        for attr in ev.attributes() {
212            let mut attr = attr?;
213            attr.value = Cow::Owned(normalize_space(&attr.value));
214            let key = String::from_utf8(attr.key.to_vec())?;
215            let value = String::from_utf8(attr.unescaped_value()?.to_vec())?;
216            if key == "xmlns" {
217                namespace_decls.insert(String::new(), value);
218                continue;
219            } else if let Some(prefix) = key.strip_prefix("xmlns:") {
220                namespace_decls.insert(prefix.to_owned(), value);
221                continue;
222            }
223            attributes.insert(key, value);
224        }
225
226        let elem = Element::with_data(&mut self.doc, full_name, attributes, namespace_decls);
227        parent
228            .push_child(&mut self.doc, Node::Element(elem))
229            .unwrap();
230        Ok(elem)
231    }
232
233    // Returns true if document parsing is finished.
234    fn handle_event(&mut self, event: Event) -> Result<bool> {
235        match event {
236            Event::Start(ref ev) => {
237                let parent = *self
238                    .element_stack
239                    .last()
240                    .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
241                let element = self.create_element(parent, ev)?;
242                self.element_stack.push(element);
243                Ok(false)
244            }
245            Event::End(_) => {
246                let elem = self
247                    .element_stack
248                    .pop()
249                    .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?; // quick-xml checks if tag names match for us
250                if self.read_opts.empty_text_node {
251                    // distinguish <tag></tag> and <tag />
252                    if !elem.has_children(&self.doc) {
253                        elem.push_child(&mut self.doc, Node::Text(String::new()))
254                            .unwrap();
255                    }
256                }
257                Ok(false)
258            }
259            Event::Empty(ref ev) => {
260                let parent = *self
261                    .element_stack
262                    .last()
263                    .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
264                self.create_element(parent, ev)?;
265                Ok(false)
266            }
267            // Comment, CData, and PI content should not be escaped,
268            // but quick-xml assumes only CDATA is not escaped.
269            Event::Text(ev) => {
270                if self.read_opts.ignore_whitespace_only && only_has_whitespace(&ev) {
271                    return Ok(false);
272                }
273                // when trim_text, ignore_whitespace_only, empty_text_node are all false
274                if ev.is_empty() {
275                    return Ok(false);
276                }
277                let content = String::from_utf8(ev.unescaped()?.to_vec())?;
278                let node = Node::Text(content);
279                let parent = *self
280                    .element_stack
281                    .last()
282                    .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
283                parent.push_child(&mut self.doc, node).unwrap();
284                Ok(false)
285            }
286            Event::DocType(ev) => {
287                // Event::DocType comes with one leading whitespace. Strip the whitespace.
288                let raw = ev.unescaped()?;
289                let content = if !raw.is_empty() && raw[0] == b' ' {
290                    String::from_utf8(raw[1..].to_vec())?
291                } else {
292                    String::from_utf8(raw.to_vec())?
293                };
294                let node = Node::DocType(content);
295                let parent = *self
296                    .element_stack
297                    .last()
298                    .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
299                parent.push_child(&mut self.doc, node).unwrap();
300                Ok(false)
301            }
302            Event::Comment(ev) => {
303                let content = String::from_utf8(ev.escaped().to_vec())?;
304                let node = Node::Comment(content);
305                let parent = *self
306                    .element_stack
307                    .last()
308                    .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
309                parent.push_child(&mut self.doc, node).unwrap();
310                Ok(false)
311            }
312            Event::CData(ev) => {
313                let content = String::from_utf8(ev.unescaped()?.to_vec())?;
314                let node = Node::CData(content);
315                let parent = *self
316                    .element_stack
317                    .last()
318                    .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
319                parent.push_child(&mut self.doc, node).unwrap();
320                Ok(false)
321            }
322            Event::PI(ev) => {
323                let content = String::from_utf8(ev.escaped().to_vec())?;
324                let node = Node::PI(content);
325                let parent = *self
326                    .element_stack
327                    .last()
328                    .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
329                parent.push_child(&mut self.doc, node).unwrap();
330                Ok(false)
331            }
332            Event::Decl(_) => Err(Error::MalformedXML(
333                "XML declaration found in the middle of the document".to_string(),
334            )),
335            Event::Eof => Ok(true),
336        }
337    }
338
339    // Sniff encoding and consume BOM
340    fn sniff_encoding<R: Read>(
341        &mut self,
342        decodereader: &mut DecodeReader<R>,
343    ) -> Result<Option<&'static Encoding>> {
344        let bytes = decodereader.fill_buf()?;
345        let encoding = match bytes {
346            [0x3c, 0x3f, ..] => None, // UTF-8 '<?'
347            [0xfe, 0xff, ..] => {
348                // UTF-16 BE BOM
349                decodereader.consume(2);
350                Some(UTF_16BE)
351            }
352            [0xff, 0xfe, ..] => {
353                // UTF-16 LE BOM
354                decodereader.consume(2);
355                Some(UTF_16LE)
356            }
357            [0xef, 0xbb, 0xbf, ..] => {
358                // UTF-8 BOM
359                decodereader.consume(3);
360                None
361            }
362            [0x00, 0x3c, 0x00, 0x3f, ..] => Some(UTF_16BE),
363            [0x3c, 0x00, 0x3f, 0x00, ..] => Some(UTF_16LE),
364            _ => None, // Try decoding it with UTF-8
365        };
366        Ok(encoding)
367    }
368
369    // Look at the document decl and figure out the document encoding
370    fn parse_start<R: Read>(&mut self, reader: R) -> Result<()> {
371        let mut decodereader = DecodeReader::new(reader, None);
372        let mut init_encoding = self.sniff_encoding(&mut decodereader)?;
373        if let Some(enc) = &self.read_opts.encoding {
374            init_encoding = Some(Encoding::for_label(enc.as_bytes()).ok_or(Error::CannotDecode)?)
375        }
376        decodereader.set_encoding(init_encoding);
377        let mut xmlreader = Reader::from_reader(decodereader);
378        xmlreader.trim_text(self.read_opts.trim_text);
379
380        let mut buf = Vec::with_capacity(200);
381
382        // Skip first event if it only has whitespace
383        let event = match xmlreader.read_event(&mut buf)? {
384            Event::Text(ev) => {
385                if ev.len() == 0 {
386                    xmlreader.read_event(&mut buf)?
387                } else if self.read_opts.ignore_whitespace_only && only_has_whitespace(&ev) {
388                    xmlreader.read_event(&mut buf)?
389                } else {
390                    Event::Text(ev)
391                }
392            }
393            ev => ev,
394        };
395        #[cfg(debug_assertions)]
396        debug!(event);
397        if let Event::Decl(ev) = event {
398            self.handle_decl(&ev)?;
399            // Encoding::for_label("UTF-16") defaults to UTF-16 LE, even though it could be UTF-16 BE
400            if self.encoding != init_encoding
401                && !(self.encoding == Some(UTF_16LE) && init_encoding == Some(UTF_16BE))
402            {
403                let mut decode_reader = xmlreader.into_underlying_reader();
404                decode_reader.set_encoding(self.encoding);
405                xmlreader = Reader::from_reader(decode_reader);
406                xmlreader.trim_text(self.read_opts.trim_text);
407            }
408        } else if self.read_opts.require_decl {
409            return Err(Error::MalformedXML(
410                "Didn't find XML Declaration at the start of file".to_string(),
411            ));
412        } else if self.handle_event(event)? {
413            return Ok(());
414        }
415        // Handle rest of the events
416        self.parse_content(xmlreader)
417    }
418
419    fn parse_content<B: BufRead>(&mut self, mut reader: Reader<B>) -> Result<()> {
420        let mut buf = Vec::with_capacity(200); // reduce time increasing capacity at start.
421
422        loop {
423            let ev = reader.read_event(&mut buf)?;
424            #[cfg(debug_assertions)]
425            debug!(ev);
426            if self.handle_event(ev)? {
427                if self.element_stack.len() == 1 {
428                    // Should only have container remaining in element_stack
429                    return Ok(());
430                } else {
431                    return Err(Error::MalformedXML("Closing tag not found.".to_string()));
432                }
433            }
434        }
435    }
436}
437
/// Reports whether `byte` is XML whitespace: space, tab, line feed or carriage return.
fn is_whitespace(byte: u8) -> bool {
    matches!(byte, b' ' | b'\t' | b'\n' | b'\r')
}
445
446/// Returns true if bytes.len() == 0 or bytes only has a whitespace-like character.
447fn only_has_whitespace(bytes: &[u8]) -> bool {
448    bytes.iter().all(|b| is_whitespace(*b))
449}
450
451/// #xD(\r), #xA(\n), #x9(\t) is normalized into #x20.
452/// Leading and trailing spaces(#x20) are discarded
453/// and sequence of spaces are replaced by a single space.
454pub fn normalize_space(bytes: &[u8]) -> Vec<u8> {
455    let mut normalized = Vec::with_capacity(bytes.len());
456    let mut char_found = false;
457    let mut last_space = false;
458    for &byte in bytes {
459        if is_whitespace(byte) {
460            if char_found && !last_space {
461                normalized.push(b' ');
462                last_space = true;
463            }
464        } else {
465            char_found = true;
466            last_space = false;
467            normalized.push(byte);
468        }
469    }
470    // There can't be multiple whitespaces
471    if normalized.last() == Some(&b' ') {
472        normalized.pop();
473    }
474    normalized
475}