quick_xml/parser/
pi.rs

1//! Contains a parser for an XML processing instruction.
2
3use crate::errors::SyntaxError;
4use crate::parser::Parser;
5use crate::utils::is_whitespace;
6
/// A parser that searches for a `?>` sequence in the slice.
///
/// To use the parser, create an instance of it and [`feed`] data into it.
/// After a successful search the parser returns [`Some`] with the position where
/// the processing instruction ends: the index of the `>` byte of the closing
/// `?>`, counted from the start of the slice passed to that [`feed`] call. If the
/// search was unsuccessful, [`None`] is returned. You would typically expect a
/// positive result of the search, so you should feed new data until you get it.
///
/// NOTE: after a successful match the parser is not returned to the initial
/// state and should not be used anymore. Create a new parser if you want to
/// perform a new search.
///
/// # Example
///
/// ```
/// # use pretty_assertions::assert_eq;
/// use quick_xml::parser::{Parser, PiParser};
///
/// let mut parser = PiParser::default();
///
/// // Parse `<?instruction with = 'some > and ?' inside?>and the text follow...`
/// // split into three chunks
/// assert_eq!(parser.feed(b"<?instruction"), None);
/// // ...get new chunk of data
/// assert_eq!(parser.feed(b" with = 'some > and ?"), None);
/// // ...get another chunk of data
/// assert_eq!(parser.feed(b"' inside?>and the text follow..."), Some(9));
/// //                       ^        ^
/// //                       0        9
/// ```
///
/// [`feed`]: Self::feed()
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct PiParser(
    /// A flag that indicates whether the `bytes` passed to the previous attempt
    /// to find the end ended with `?`. It lets a `?>` that is split across two
    /// chunks (`...?` followed by `>...`) still be recognized.
    pub bool,
);
45
46impl Parser for PiParser {
47    /// Determines the end position of a processing instruction in the provided slice.
48    /// Processing instruction ends on the first occurrence of `?>` which cannot be
49    /// escaped.
50    ///
51    /// Returns position after the `?>` or `None` if such sequence was not found.
52    ///
53    /// [Section 2.6]: Parameter entity references MUST NOT be recognized within
54    /// processing instructions, so parser do not search for them.
55    ///
56    /// # Parameters
57    /// - `bytes`: a slice to find the end of a processing instruction.
58    ///   Should contain text in ASCII-compatible encoding
59    ///
60    /// [Section 2.6]: https://www.w3.org/TR/xml11/#sec-pi
61    #[inline]
62    fn feed(&mut self, bytes: &[u8]) -> Option<usize> {
63        for i in memchr::memchr_iter(b'>', bytes) {
64            match i {
65                0 if self.0 => return Some(0),
66                // If the previous byte is `?`, then we found `?>`
67                i if i > 0 && bytes[i - 1] == b'?' => return Some(i),
68                _ => {}
69            }
70        }
71        self.0 = bytes.last().copied() == Some(b'?');
72        None
73    }
74
75    #[inline]
76    fn eof_error(self, content: &[u8]) -> SyntaxError {
77        // Check if content starts with "?xml" followed by whitespace, '?' or end.
78        // This determines whether to report an unclosed XML declaration or PI.
79        // FIXME: Add support for UTF-8/ASCII incompatible encodings (UTF-16)
80        let is_xml_decl = content.starts_with(b"?xml")
81            && content
82                .get(4)
83                .map_or(true, |&b| is_whitespace(b) || b == b'?');
84        if is_xml_decl {
85            SyntaxError::UnclosedXmlDecl
86        } else {
87            SyntaxError::UnclosedPI
88        }
89    }
90}
91
#[test]
fn pi() {
    use pretty_assertions::assert_eq;

    /// Feeds `bytes` into a fresh parser whose "previous chunk ended with `?`"
    /// flag is preset to `had_question_mark`.
    ///
    /// Gives `Ok(pos)` with the position reported by `feed` when the end of the
    /// processing instruction was found, or `Err(internal_state)` with the
    /// parser flag after the call when parsing is not done yet.
    fn parse_pi(bytes: &[u8], had_question_mark: bool) -> Result<usize, bool> {
        let mut parser = PiParser(had_question_mark);
        let found = parser.feed(bytes);
        found.ok_or(parser.0)
    }

    // Columns: the chunk passed to `feed`, whether the previous chunk ended
    // with `?`, and the expected outcome.
    //
    // The trailing comment shows which character was seen last before calling
    // `feed`: `x` means any character, the pipe denotes the start of the buffer
    // that is passed to `feed`.
    let cases: [(&[u8], bool, Result<usize, bool>); 10] = [
        (b"", false, Err(false)),   // x|
        (b"", true, Err(false)),    // ?|
        (b"?", false, Err(true)),   // x|?
        (b"?", true, Err(true)),    // ?|?
        (b">", false, Err(false)),  // x|>
        (b">", true, Ok(0)),        // ?|>
        (b"?>", false, Ok(1)),      // x|?>
        (b"?>", true, Ok(1)),       // ?|?>
        (b">?>", false, Ok(2)),     // x|>?>
        (b">?>", true, Ok(0)),      // ?|>?>
    ];

    for &(bytes, had_question_mark, expected) in cases.iter() {
        assert_eq!(
            parse_pi(bytes, had_question_mark),
            expected,
            "chunk: {:?}, previous chunk ended with `?`: {}",
            String::from_utf8_lossy(bytes),
            had_question_mark
        );
    }
}