quick_xml/parser/
comment.rs

1//! Contains a parser for an XML comment.
2
3use crate::errors::SyntaxError;
4use crate::parser::Parser;
5
/// A parser that searches for a `-->` sequence in the slice.
///
/// To use the parser, create an instance of it and [`feed`] data into it.
/// After a successful search the parser returns [`Some`] with the position where
/// the comment ends (the position after `-->`). If the search was unsuccessful,
/// [`None`] is returned. You would typically expect a positive result of the
/// search, so you should keep feeding new data until you get one.
///
/// NOTE: after a successful match the parser is not returned to the initial
/// state and should not be used anymore. Create a new parser if you want to perform
/// a new search.
///
/// # Example
///
/// ```
/// # use pretty_assertions::assert_eq;
/// use quick_xml::parser::{CommentParser, Parser};
///
/// let mut parser = CommentParser::default();
///
/// // Parse `<!-- comment with some -> and --- inside-->and the text follow...`
/// // split into three chunks
/// assert_eq!(parser.feed(b"<!-- comment"), None);
/// // ...get new chunk of data
/// assert_eq!(parser.feed(b" with some -> and -"), None);
/// // ...get another chunk of data
/// assert_eq!(parser.feed(b"-- inside-->and the text follow..."), Some(12));
/// //                       ^          ^
/// //                       0          11
/// ```
///
/// [`feed`]: Self::feed()
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum CommentParser {
    /// The parser has not yet seen any dashes at the end of the previous slice.
    Seen0,
    /// The parser has already seen one dash at the end of the previous slice.
    Seen1,
    /// The parser has already seen two dashes at the end of the previous slice.
    Seen2,
}
47
48impl Default for CommentParser {
49    #[inline]
50    fn default() -> Self {
51        Self::Seen0
52    }
53}
54
55impl Parser for CommentParser {
56    /// Determines the end position of an XML comment in the provided slice.
57    /// Comments is a pieces of text enclosed in `<!--` and `-->` braces.
58    /// Comment ends on the first occurrence of `-->` which cannot be escaped.
59    ///
60    /// Returns position after the `-->` or `None` if such sequence was not found.
61    ///
62    /// # Parameters
63    /// - `bytes`: a slice to find the end of a comment.
64    ///   Should contain text in ASCII-compatible encoding
65    #[inline]
66    fn feed(&mut self, bytes: &[u8]) -> Option<usize> {
67        let result = match self {
68            Self::Seen0 => seen0(bytes),
69            Self::Seen1 => seen1(bytes),
70            Self::Seen2 => seen2(bytes),
71        };
72        if let Some(r) = result {
73            return Some(r);
74        }
75        if bytes.ends_with(b"--") {
76            *self = Self::Seen2;
77        } else {
78            self.next_state(bytes.last().copied());
79        }
80        None
81    }
82
83    #[inline]
84    fn eof_error(self, _content: &[u8]) -> SyntaxError {
85        SyntaxError::UnclosedComment
86    }
87}
88
89impl CommentParser {
90    #[inline]
91    fn next_state(&mut self, last: Option<u8>) {
92        match (*self, last) {
93            (Self::Seen0, Some(b'-')) => *self = Self::Seen1,
94
95            (Self::Seen1, Some(b'-')) => *self = Self::Seen2,
96            (Self::Seen1, Some(_)) => *self = Self::Seen0,
97
98            (Self::Seen2, Some(b'-')) => {}
99            (Self::Seen2, Some(_)) => *self = Self::Seen0,
100
101            _ => {}
102        }
103    }
104}
105
106#[inline]
107fn seen0(bytes: &[u8]) -> Option<usize> {
108    for i in memchr::memchr_iter(b'>', bytes) {
109        if bytes[..i].ends_with(b"--") {
110            // +1 for `>` which should be included in event
111            return Some(i + 1);
112        }
113    }
114    None
115}
116
117#[inline]
118fn seen1(bytes: &[u8]) -> Option<usize> {
119    // -|->
120    if bytes.starts_with(b"->") {
121        return Some(2);
122    }
123    // Even if the first character is `-` it cannot be part of close sequence,
124    // because we checked that condition above. That means that we can forgot that
125    // we seen one `-` at the end of the previous chunk.
126    // -|x...
127    seen0(bytes)
128}
129
130#[inline]
131fn seen2(bytes: &[u8]) -> Option<usize> {
132    match bytes.get(0) {
133        // --|
134        None => None,
135        // --|>
136        Some(b'>') => Some(1),
137        // The end sequence here can be matched only if bytes starts with `->`
138        // which is handled in seen1().
139        // --|x...
140        Some(_) => seen1(bytes),
141    }
142}
143
#[test]
fn parse() {
    use pretty_assertions::assert_eq;
    use CommentParser::*;

    /// Feeds `chunk` into a parser that starts in the state `start`.
    ///
    /// Returns `Ok(pos)` with the position in the buffer where the comment
    /// ends, or `Err(state)` with the parser state after the call if the end
    /// was not found.
    fn feed_once(chunk: &[u8], start: CommentParser) -> Result<usize, CommentParser> {
        let mut parser = start;
        let found = parser.feed(chunk);
        found.ok_or(parser)
    }

    /// Checks the outcome of feeding `chunk` to parsers that start in
    /// `Seen0`, `Seen1` and `Seen2`; `expected` lists the outcomes in that order.
    fn check(chunk: &[u8], expected: [Result<usize, CommentParser>; 3]) {
        let starts = [Seen0, Seen1, Seen2];
        for (start, want) in starts.iter().zip(expected.iter()) {
            assert_eq!(feed_once(chunk, *start), *want);
        }
    }

    // Previously seen data: xx|         x-|         --|
    check(b"", [Err(Seen0), Err(Seen1), Err(Seen2)]);
    check(b"-", [Err(Seen1), Err(Seen2), Err(Seen2)]);
    check(b">", [Err(Seen0), Err(Seen0), Ok(1)]);
    check(b"--", [Err(Seen2), Err(Seen2), Err(Seen2)]);
    check(b"->", [Err(Seen0), Ok(2), Ok(2)]);
    check(b"-->", [Ok(3), Ok(3), Ok(3)]);
    check(b">-->", [Ok(4), Ok(4), Ok(1)]);
    check(b"->-->", [Ok(5), Ok(2), Ok(2)]);
}