edit_xml/
parser.rs

1use crate::document::{Document, Node};
2use crate::element::Element;
3use crate::error::{DecodeError, EditXMLError, MalformedReason, Result};
4use crate::types::StandaloneValue;
5use crate::utils::HashMap;
6use crate::utils::{bytes_to_unescaped_string, XMLStringUtils};
7use encoding_rs::Decoder;
8use encoding_rs::{Encoding, UTF_16BE, UTF_16LE, UTF_8};
9use quick_xml::events::{BytesDecl, BytesStart, Event};
10use quick_xml::Reader;
11use std::borrow::Cow;
12use std::io::{BufRead, Read};
13use tracing::{debug, trace};
14
/// Buffering reader that optionally transcodes its input to UTF-8.
///
/// Raw bytes are read from `inner` into `undecoded`. When `decoder` is set they
/// are decoded into `decoded` and served from there; otherwise the raw bytes in
/// `undecoded` are served directly.
15pub(crate) struct DecodeReader<R: Read> {
    // Active decoder; `None` means bytes are passed through without decoding.
16    decoder: Option<Decoder>,
    // Underlying byte source.
17    inner: R,
    // Raw bytes read from `inner`; the valid window is `undecoded_pos..undecoded_cap`.
18    undecoded: Box<[u8]>,
19    undecoded_pos: usize,
20    undecoded_cap: usize,
    // Scratch space used to move a short unread tail of `undecoded` to its front
    // before refilling.
21    remaining: [u8; 32], // Is there an encoding with > 32 bytes for a char?
    // UTF-8 output of the decoder; the valid window is `decoded_pos..decoded_cap`.
22    decoded: Box<[u8]>,
23    decoded_pos: usize,
24    decoded_cap: usize,
    // Set once a read from `inner` returned zero bytes; cleared by `set_encoding`.
25    done: bool,
26}
27
28impl<R: Read> DecodeReader<R> {
29    // If Decoder is not set, don't decode.
30    pub(crate) fn new(reader: R, decoder: Option<Decoder>) -> DecodeReader<R> {
31        DecodeReader {
32            decoder,
33            inner: reader,
34            undecoded: vec![0; 4096].into_boxed_slice(),
35            undecoded_pos: 0,
36            undecoded_cap: 0,
37            remaining: [0; 32],
38            decoded: vec![0; 12288].into_boxed_slice(),
39            decoded_pos: 0,
40            decoded_cap: 0,
41            done: false,
42        }
43    }
44
45    pub(crate) fn set_encoding(&mut self, encoding: Option<&'static Encoding>) {
46        self.decoder = encoding.map(|e| e.new_decoder_without_bom_handling());
47        self.done = false;
48    }
49
50    // Call this only when decoder is Some
51    fn fill_buf_decode(&mut self) -> std::io::Result<&[u8]> {
52        if self.decoded_pos >= self.decoded_cap {
53            debug_assert!(self.decoded_pos == self.decoded_cap);
54            if self.done {
55                return Ok(&[]);
56            }
57            let remaining = self.undecoded_cap - self.undecoded_pos;
58            if remaining <= 32 {
59                // Move remaining undecoded bytes at the end to start
60                self.remaining[..remaining]
61                    .copy_from_slice(&self.undecoded[self.undecoded_pos..self.undecoded_cap]);
62                self.undecoded[..remaining].copy_from_slice(&self.remaining[..remaining]);
63                // Fill undecoded buffer
64                let read = self.inner.read(&mut self.undecoded[remaining..])?;
65                self.done = read == 0;
66                self.undecoded_pos = 0;
67                self.undecoded_cap = remaining + read;
68            }
69
70            // Fill decoded buffer
71            let (_res, read, written, _replaced) = self.decoder.as_mut().unwrap().decode_to_utf8(
72                &self.undecoded[self.undecoded_pos..self.undecoded_cap],
73                &mut self.decoded,
74                self.done,
75            );
76            self.undecoded_pos += read;
77            self.decoded_cap = written;
78            self.decoded_pos = 0;
79        }
80        Ok(&self.decoded[self.decoded_pos..self.decoded_cap])
81    }
82
83    fn fill_buf_without_decode(&mut self) -> std::io::Result<&[u8]> {
84        if self.undecoded_pos >= self.undecoded_cap {
85            debug_assert!(self.undecoded_pos == self.undecoded_cap);
86            self.undecoded_cap = self.inner.read(&mut self.undecoded)?;
87            self.undecoded_pos = 0;
88        }
89        Ok(&self.undecoded[self.undecoded_pos..self.undecoded_cap])
90    }
91}
92
93impl<R: Read> Read for DecodeReader<R> {
94    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
95        (&self.decoded[..]).read(buf)
96    }
97}
98
99impl<R: Read> BufRead for DecodeReader<R> {
100    // Decoder may change from None to Some.
101    fn fill_buf(&mut self) -> std::io::Result<&[u8]> {
102        match &self.decoder {
103            Some(_) => self.fill_buf_decode(),
104            None => self.fill_buf_without_decode(),
105        }
106    }
107    fn consume(&mut self, amt: usize) {
108        match &self.decoder {
109            Some(_) => {
110                self.decoded_pos = std::cmp::min(self.decoded_pos + amt, self.decoded_cap);
111            }
112            None => {
113                self.undecoded_pos = std::cmp::min(self.undecoded_pos + amt, self.undecoded_cap);
114            }
115        }
116    }
117}
118
/// Settings that control how an XML document is parsed.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ReadOptions {
    /// When set, `<tag></tag>` gets a `Node::Text("")` child while `<tag />` gets none,
    /// so the two forms can be told apart.
    /// Default: `true`
    pub empty_text_node: bool,
    /// Strip leading and trailing whitespace from `Node::Text`, dropping the node
    /// when nothing is left.
    /// Default: `true`
    pub trim_text: bool,
    /// Drop `Node::Text` nodes that consist of whitespace only.
    /// Only useful when `trim_text` is `false`; enabling both costs performance for no gain.
    /// Default: `false`
    pub ignore_whitespace_only: bool,
    /// Fail when the document does not begin with an XML declaration.
    /// Without a declaration the parser cannot decode anything but UTF-8, unless
    /// `encoding` below is set.
    /// Default: `true`
    pub require_decl: bool,
    /// Encoding the parser starts reading with.
    /// The parser still switches to the encoding named in the XML declaration when that differs.
    /// See [`encoding_rs::Encoding::for_label`] for valid values.
    /// Default: `None`
    pub encoding: Option<String>,
}

impl ReadOptions {
    /// Relaxed options: like the defaults, but the XML declaration is optional and
    /// whitespace-only text nodes are ignored.
    pub fn relaxed() -> Self {
        Self {
            ignore_whitespace_only: true,
            require_decl: false,
            ..Self::default()
        }
    }
}

impl Default for ReadOptions {
    fn default() -> Self {
        Self {
            encoding: None,
            require_decl: true,
            ignore_whitespace_only: false,
            trim_text: true,
            empty_text_node: true,
        }
    }
}
165
// TODO: `element_stack.last()` and `pop()` now return `MalformedXML` errors instead of panicking; the `push_child(...).unwrap()` calls are the remaining panic sites to review.
/// State carried while a [`Document`] is built from a stream of XML events.
167pub(crate) struct DocumentParser {
    // Document being populated.
168    doc: Document,
    // Caller-supplied parsing options.
169    read_opts: ReadOptions,
    // Encoding taken from the XML declaration; `None` when it is UTF-8 or not given.
170    encoding: Option<&'static Encoding>,
    // Elements that are currently open; the first entry is the document container.
171    element_stack: Vec<Element>,
172}
173
174impl DocumentParser {
175    pub(crate) fn parse_reader<R: Read>(reader: R, opts: ReadOptions) -> Result<Document> {
176        let doc = Document::new();
177        let element_stack = vec![doc.container()];
178        let mut parser = DocumentParser {
179            doc,
180            read_opts: opts,
181            encoding: None,
182            element_stack,
183        };
184        parser.parse_start(reader)?;
185        Ok(parser.doc)
186    }
187
188    fn handle_decl(&mut self, ev: &BytesDecl) -> Result<()> {
189        self.doc.version = String::from_utf8(ev.version()?.to_vec())?;
190        self.encoding = match ev.encoding() {
191            Some(res) => {
192                let encoding = Encoding::for_label(&res?).ok_or(DecodeError::MissingEncoding)?;
193                if encoding == UTF_8 {
194                    None
195                } else {
196                    Some(encoding)
197                }
198            }
199            None => None,
200        };
201        self.doc.standalone = match ev.standalone() {
202            Some(res) => {
203                let standalone_value = res?;
204                Some(StandaloneValue::try_from(standalone_value.as_ref())?)
205            }
206            None => None,
207        };
208        Ok(())
209    }
210
211    fn create_element(&mut self, parent: Element, ev: &BytesStart) -> Result<Element> {
212        let full_name = ev.name().into_string()?;
213        let mut namespace_decls = HashMap::new();
214        let mut attributes = HashMap::new();
215        for attr in ev.attributes() {
216            let mut attr = attr?;
217            attr.value = Cow::Owned(normalize_space(&attr.value));
218            let key = attr.key.into_string()?;
219            let value = bytes_to_unescaped_string(&attr.value)?;
220            if key == "xmlns" {
221                namespace_decls.insert(String::new(), value);
222                continue;
223            } else if let Some(prefix) = key.strip_prefix("xmlns:") {
224                namespace_decls.insert(prefix.to_owned(), value);
225                continue;
226            }
227            attributes.insert(key, value);
228        }
229        let elem = Element::with_data(&mut self.doc, full_name, attributes, namespace_decls);
230        parent
231            .push_child(&mut self.doc, Node::Element(elem))
232            .unwrap();
233        Ok(elem)
234    }
235
236    // Returns true if document parsing is finished.
237    fn handle_event(&mut self, event: Event) -> Result<bool> {
238        match event {
239            Event::Start(ref ev) => {
240                let parent = *self.element_stack.last().ok_or_else(|| {
241                    EditXMLError::MalformedXML(MalformedReason::GenericMalformedTree)
242                })?;
243                let element = self.create_element(parent, ev)?;
244                self.element_stack.push(element);
245                Ok(false)
246            }
247            Event::End(_) => {
248                let elem = self.element_stack.pop().ok_or_else(|| {
249                    EditXMLError::MalformedXML(MalformedReason::GenericMalformedTree)
250                })?; // quick-xml checks if tag names match for us
251                if self.read_opts.empty_text_node {
252                    // distinguish <tag></tag> and <tag />
253                    if !elem.has_children(&self.doc) {
254                        elem.push_child(&mut self.doc, Node::Text(String::new()))
255                            .unwrap();
256                    }
257                }
258                Ok(false)
259            }
260            Event::Empty(ref ev) => {
261                let parent = *self.element_stack.last().ok_or_else(|| {
262                    EditXMLError::MalformedXML(MalformedReason::GenericMalformedTree)
263                })?;
264                self.create_element(parent, ev)?;
265                Ok(false)
266            }
267            // Comment, CData, and PI content should not be escaped,
268            // but quick-xml assumes only CDATA is not escaped.
269            Event::Text(ev) => {
270                if self.read_opts.ignore_whitespace_only && only_has_whitespace(&ev) {
271                    return Ok(false);
272                }
273                // when trim_text, ignore_whitespace_only, empty_text_node are all false
274                if ev.is_empty() {
275                    return Ok(false);
276                }
277                // NOTE: Was Unescaped
278                let content = ev.unescape_to_string()?;
279                let node = Node::Text(content);
280                let parent = *self.element_stack.last().ok_or_else(|| {
281                    EditXMLError::MalformedXML(MalformedReason::GenericMalformedTree)
282                })?;
283                parent.push_child(&mut self.doc, node).unwrap();
284                Ok(false)
285            }
286            Event::DocType(ev) => {
287                // Event::DocType comes with one leading whitespace. Strip the whitespace.
288                let raw = ev.unescape_to_string()?.into_bytes();
289                let content = if !raw.is_empty() && raw[0] == b' ' {
290                    String::from_utf8(raw[1..].to_vec())?
291                } else {
292                    String::from_utf8(raw.to_vec())?
293                };
294                let node = Node::DocType(content);
295                let parent = *self.element_stack.last().ok_or_else(|| {
296                    EditXMLError::MalformedXML(MalformedReason::GenericMalformedTree)
297                })?;
298                parent.push_child(&mut self.doc, node).unwrap();
299                Ok(false)
300            }
301            Event::Comment(ev) => {
302                let content = String::from_utf8(ev.escape_ascii().collect())?;
303                let node = Node::Comment(content);
304                let parent = *self.element_stack.last().ok_or_else(|| {
305                    EditXMLError::MalformedXML(MalformedReason::GenericMalformedTree)
306                })?;
307                parent.push_child(&mut self.doc, node).unwrap();
308                Ok(false)
309            }
310            Event::CData(ev) => {
311                let content = String::from_utf8(ev.to_vec())?;
312                let node = Node::CData(content);
313                let parent = *self.element_stack.last().ok_or_else(|| {
314                    EditXMLError::MalformedXML(MalformedReason::GenericMalformedTree)
315                })?;
316                parent.push_child(&mut self.doc, node).unwrap();
317                Ok(false)
318            }
319            Event::PI(ev) => {
320                let content = ev.into_string()?;
321                let node = Node::PI(content);
322                let parent = *self.element_stack.last().ok_or_else(|| {
323                    EditXMLError::MalformedXML(MalformedReason::GenericMalformedTree)
324                })?;
325                parent.push_child(&mut self.doc, node).unwrap();
326                Ok(false)
327            }
328            Event::Decl(_) => Err(EditXMLError::MalformedXML(MalformedReason::UnexpectedItem(
329                "XML Declaration",
330            ))),
331            Event::Eof => Ok(true),
332        }
333    }
334
335    // Sniff encoding and consume BOM
336    fn sniff_encoding<R: Read>(
337        &mut self,
338        decodereader: &mut DecodeReader<R>,
339    ) -> Result<Option<&'static Encoding>> {
340        let bytes = decodereader.fill_buf()?;
341        let encoding = match bytes {
342            [0x3c, 0x3f, ..] => None, // UTF-8 '<?'
343            [0xfe, 0xff, ..] => {
344                // UTF-16 BE BOM
345                decodereader.consume(2);
346                Some(UTF_16BE)
347            }
348            [0xff, 0xfe, ..] => {
349                // UTF-16 LE BOM
350                decodereader.consume(2);
351                Some(UTF_16LE)
352            }
353            [0xef, 0xbb, 0xbf, ..] => {
354                // UTF-8 BOM
355                decodereader.consume(3);
356                None
357            }
358            [0x00, 0x3c, 0x00, 0x3f, ..] => Some(UTF_16BE),
359            [0x3c, 0x00, 0x3f, 0x00, ..] => Some(UTF_16LE),
360            _ => None, // Try decoding it with UTF-8
361        };
362        Ok(encoding)
363    }
364
365    // Look at the document decl and figure out the document encoding
366    fn parse_start<R: Read>(&mut self, reader: R) -> Result<()> {
367        debug!(?self.read_opts, "Parsing Start");
368        let mut decodereader = DecodeReader::new(reader, None);
369        let mut init_encoding = self.sniff_encoding(&mut decodereader)?;
370        if let Some(enc) = &self.read_opts.encoding {
371            init_encoding =
372                Some(Encoding::for_label(enc.as_bytes()).ok_or(DecodeError::MissingEncoding)?)
373        }
374        debug!(?init_encoding, "Initial Encoding");
375        decodereader.set_encoding(init_encoding);
376        let mut xmlreader = Reader::from_reader(decodereader);
377        xmlreader.config_mut().trim_text(self.read_opts.trim_text);
378
379        let mut buf = Vec::with_capacity(200);
380
381        // Skip first event if it only has whitespace
382        let event = match xmlreader.read_event_into(&mut buf)? {
383            Event::Text(ev) => {
384                if ev.len() == 0 {
385                    trace!("Skipping empty text event");
386                    xmlreader.read_event_into(&mut buf)?
387                } else if self.read_opts.ignore_whitespace_only && only_has_whitespace(&ev) {
388                    trace!("Skipping whitespace only text event");
389                    xmlreader.read_event_into(&mut buf)?
390                } else {
391                    trace!("First Event is Text");
392                    Event::Text(ev)
393                }
394            }
395            ev => ev,
396        };
397        debug!(?event, "First Event");
398        if let Event::Decl(ev) = event {
399            self.handle_decl(&ev)?;
400            // Encoding::for_label("UTF-16") defaults to UTF-16 LE, even though it could be UTF-16 BE
401            if self.encoding != init_encoding
402                && !(self.encoding == Some(UTF_16LE) && init_encoding == Some(UTF_16BE))
403            {
404                let mut decode_reader = xmlreader.into_inner();
405                decode_reader.set_encoding(self.encoding);
406                xmlreader = Reader::from_reader(decode_reader);
407                xmlreader.config_mut().trim_text(self.read_opts.trim_text);
408            }
409        } else if self.read_opts.require_decl {
410            debug!(?self.read_opts, ?event, "XML Declaration is required");
411            return Err(MalformedReason::MissingDeclaration.into());
412        } else if self.handle_event(event)? {
413            return Ok(());
414        }
415        // Handle rest of the events
416        self.parse_content(xmlreader)
417    }
418
419    fn parse_content<B: BufRead>(&mut self, mut reader: Reader<B>) -> Result<()> {
420        let mut buf = Vec::with_capacity(200); // reduce time increasing capacity at start.
421
422        loop {
423            let ev = reader.read_event_into(&mut buf)?;
424
425            if self.handle_event(ev)? {
426                if self.element_stack.len() == 1 {
427                    // Should only have container remaining in element_stack
428                    return Ok(());
429                } else {
430                    return Err(MalformedReason::MissingClosingTag.into());
431                }
432            }
433        }
434    }
435}
436
/// Reports whether `byte` is one of the XML whitespace characters:
/// space, tab, line feed or carriage return.
fn is_whitespace(byte: u8) -> bool {
    matches!(byte, b' ' | b'\t' | b'\n' | b'\r')
}
445
446/// Returns true if bytes.len() == 0 or bytes only has a whitespace-like character.
447fn only_has_whitespace(bytes: &[u8]) -> bool {
448    bytes.iter().all(|b| is_whitespace(*b))
449}
450
451/// #xD(\r), #xA(\n), #x9(\t) is normalized into #x20.
452/// Leading and trailing spaces(#x20) are discarded
453/// and sequence of spaces are replaced by a single space.
454pub fn normalize_space(bytes: &[u8]) -> Vec<u8> {
455    let mut normalized = Vec::with_capacity(bytes.len());
456    let mut char_found = false;
457    let mut last_space = false;
458    for &byte in bytes {
459        if is_whitespace(byte) {
460            if char_found && !last_space {
461                normalized.push(b' ');
462                last_space = true;
463            }
464        } else {
465            char_found = true;
466            last_space = false;
467            normalized.push(byte);
468        }
469    }
470    // There can't be multiple whitespaces
471    if normalized.last() == Some(&b' ') {
472        normalized.pop();
473    }
474    normalized
475}