// autosar_data/lexer.rs

use super::AutosarDataError;
use std::path::PathBuf;
use thiserror::Error;

5#[derive(Debug, Error, Eq, PartialEq, Clone, Copy)]
6#[non_exhaustive]
7/// `ArxmlLexerError` contains all errors that can occur while reading data
8pub enum ArxmlLexerError {
9    /// Incomplete data, closing '>' was not found
10    #[error("Incomplete data, closing '>' was not found")]
11    IncompleteData,
12
13    /// Invalid element: '<>'
14    #[error("Invalid element: '<>'")]
15    InvalidElement,
16
17    /// A processing instruction was started with '<?', but it did not end with '?>'
18    #[error("A processing instruction was started with '<?', but it did not end with '?>'")]
19    InvalidProcessingInstruction,
20
21    /// Invalid arxml header: The xml header of an arxml file must specify version="1.0" encoding="utf-8"
22    #[error("Invalid arxml header: The xml header of an arxml file must specify version=\"1.0\" encoding=\"utf-8\"")]
23    InvalidXmlHeader,
24
25    /// Invalid comment: Comments must start with '<!--' and end with '-->'
26    #[error("Invalid comment")]
27    InvalidComment,
28}
29
/// A single token produced by the `ArxmlLexer`.
///
/// All byte slices borrow directly from the lexer's input buffer; nothing is copied or unescaped.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum ArxmlEvent<'a> {
    /// The `<?xml ... ?>` header. The payload is the `standalone` attribute:
    /// `None` if it is absent, otherwise `Some(true)` exactly when its value is `yes`.
    ArxmlHeader(Option<bool>),
    /// An opening tag: the element name, followed by the raw text of its attributes
    /// (empty if the tag has none).
    BeginElement(&'a [u8], &'a [u8]),
    /// A closing tag (or the implicit end of a self-closing `<element/>`): the element name.
    EndElement(&'a [u8]),
    /// Text found between two tags. Runs that consist only of whitespace are not reported.
    Characters(&'a [u8]),
    /// A comment: the text between `<!--` and `-->`.
    Comment(&'a [u8]),
    /// The whole input has been consumed.
    EndOfFile,
}

/// Tokenizer that splits the raw bytes of an arxml file into `ArxmlEvent`s.
pub(crate) struct ArxmlLexer<'a> {
    /// the complete input; all events borrow from it
    buffer: &'a [u8],
    /// index into `buffer` of the next byte that has not been consumed yet
    bufpos: usize,
    /// current line number; starts at 1 and grows with every `\n` that is consumed
    line: usize,
    /// name range `(start, end)` inside `buffer` of a self-closing `<element/>`
    /// whose `EndElement` event still has to be emitted
    deferred_end: Option<(usize, usize)>,
    /// name of the input, copied into every error
    sourcefile: PathBuf,
}

48impl<'a> ArxmlLexer<'a> {
49    pub(crate) fn new(buffer: &'a [u8], name: PathBuf) -> Self {
50        // skip the byte-order mark, if it is present
51        let bufpos = if buffer.len() > 3 && buffer[0] == 239 && buffer[1] == 187 && buffer[2] == 191 {
52            3
53        } else {
54            0
55        };
56        Self {
57            buffer,
58            bufpos,
59            line: 1,
60            deferred_end: None,
61            sourcefile: name,
62        }
63    }
64
65    fn read_characters(&mut self) -> (ArxmlEvent<'a>, bool) {
66        debug_assert!(self.bufpos < self.buffer.len());
67
68        // the start of the next element '<' is the end of this block of characters
69        let mut endpos = self.bufpos;
70        let mut all_whitespace = true;
71        while endpos < self.buffer.len() && self.buffer[endpos] != b'<' {
72            // count the lines directly in this loop; it's faster than calling count_lines and this loop is quite hot in the profile...
73            if !self.buffer[endpos].is_ascii_whitespace() {
74                all_whitespace = false;
75            } else if self.buffer[endpos] == b'\n' {
76                self.line += 1;
77            }
78            endpos += 1;
79        }
80        debug_assert!(endpos > self.bufpos);
81
82        let text = &self.buffer[self.bufpos..endpos];
83        self.bufpos = endpos;
84        (ArxmlEvent::Characters(text), all_whitespace)
85    }
86
87    fn read_element_start(&mut self, endpos: usize) -> ArxmlEvent<'a> {
88        debug_assert!(self.bufpos < self.buffer.len());
89        debug_assert!(endpos > self.bufpos + 1);
90        debug_assert!(self.buffer[self.bufpos] == b'<');
91
92        let (text, is_end) = if self.buffer[endpos - 1] == b'/' {
93            (&self.buffer[self.bufpos + 1..endpos - 1], true)
94        } else {
95            (&self.buffer[self.bufpos + 1..endpos], false)
96        };
97
98        let (elemname, attributes) = if let Some(splitpos) = text.iter().position(u8::is_ascii_whitespace) {
99            (&text[..splitpos], &text[splitpos + 1..])
100        } else {
101            (text, &text[0..0])
102        };
103
104        // this is a <element/>, so a EndElement event needs to be generated next
105        if is_end {
106            // calculate the position of the element name inside the text
107            self.deferred_end = Some((self.bufpos + 1, self.bufpos + 1 + elemname.len()));
108        }
109
110        self.line += count_lines(text);
111        self.bufpos = endpos + 1;
112        ArxmlEvent::BeginElement(elemname, attributes)
113    }
114
115    fn read_element_end(&mut self, endpos: usize) -> ArxmlEvent<'a> {
116        debug_assert!(self.bufpos < self.buffer.len());
117        debug_assert!(endpos > self.bufpos + 1);
118        debug_assert!(self.buffer[self.bufpos] == b'<');
119
120        let text = &self.buffer[self.bufpos + 2..endpos];
121        self.bufpos = endpos + 1;
122
123        ArxmlEvent::EndElement(text)
124    }
125
126    fn read_xml_header(&mut self, endpos: usize) -> Option<Result<ArxmlEvent<'a>, AutosarDataError>> {
127        debug_assert!(self.bufpos < self.buffer.len());
128        debug_assert!(endpos > self.bufpos + 1);
129        debug_assert!(self.buffer[self.bufpos] == b'<');
130
131        if self.buffer[endpos - 1] != b'?' {
132            return Some(Err(self.error(ArxmlLexerError::InvalidProcessingInstruction)));
133        }
134
135        let text = &self.buffer[self.bufpos + 2..endpos - 1];
136        self.bufpos = endpos + 1;
137
138        let mut splitter = text.split(u8::is_ascii_whitespace);
139        let elemname = splitter.next().unwrap();
140
141        let result = if elemname == b"xml" {
142            let mut ver = &text[0..0];
143            let mut encoding = &text[0..0];
144            let mut standalone: Option<bool> = None;
145            for attr_text in splitter {
146                let (attr_name, attr_val) = if let Some(pos) = attr_text.iter().position(|c| *c == b'=') {
147                    (&attr_text[0..pos], &attr_text[pos + 2..attr_text.len() - 1])
148                } else {
149                    (attr_text, &attr_text[0..0])
150                };
151                if attr_name == b"version" {
152                    ver = attr_val;
153                } else if attr_name == b"encoding" {
154                    encoding = attr_val;
155                } else if attr_name == b"standalone" {
156                    standalone = Some(attr_val == b"yes");
157                }
158            }
159
160            if ver != b"1.0"
161                || (encoding != b"utf-8" && encoding != b"UTF-8" && encoding != b"utf8" && encoding != b"UTF8")
162            {
163                Some(Err(self.error(ArxmlLexerError::InvalidXmlHeader)))
164            } else {
165                Some(Ok(ArxmlEvent::ArxmlHeader(standalone)))
166            }
167        } else {
168            None
169        };
170
171        self.line += count_lines(text);
172        result
173    }
174
175    fn read_comment(&mut self, endpos: usize) -> Result<ArxmlEvent<'a>, AutosarDataError> {
176        debug_assert!(self.bufpos < self.buffer.len());
177        debug_assert!(endpos > self.bufpos + 1);
178
179        let startpos = self.bufpos;
180        let text = &self.buffer[startpos..endpos];
181        self.bufpos = endpos + 1;
182
183        if text.len() < 6 || !text.starts_with(b"<!--") || !text.ends_with(b"--") {
184            return Err(AutosarDataError::LexerError {
185                filename: self.sourcefile.clone(),
186                line: self.line,
187                source: ArxmlLexerError::InvalidComment,
188            });
189        }
190        self.line += count_lines(text);
191        let comment = &self.buffer[startpos + 4..endpos - 2];
192        Ok(ArxmlEvent::Comment(comment))
193    }
194}
195
196impl ArxmlLexer<'_> {
197    pub(crate) fn next<'a>(&'a mut self) -> Result<(usize, ArxmlEvent<'a>), AutosarDataError> {
198        // if an <element/> was found, then a BeginElement event is returned first, and the EndElement is deferred and must be returned next
199        if let Some((startpos, endpos)) = self.deferred_end {
200            self.deferred_end = None;
201            Ok((self.line, ArxmlEvent::EndElement(&self.buffer[startpos..endpos])))
202        } else {
203            loop {
204                if self.bufpos == self.buffer.len() {
205                    break Ok((self.line, ArxmlEvent::EndOfFile));
206                } else if self.buffer[self.bufpos] == b'<' {
207                    // start of an <element> or </element> or <!--comment-->
208                    // find a '>' character
209                    let findpos = self.buffer[self.bufpos + 1..]
210                        .iter()
211                        .position(|c| *c == b'>')
212                        .ok_or_else(|| self.error(ArxmlLexerError::IncompleteData))?;
213                    let endpos = self.bufpos + findpos + 1;
214
215                    if endpos == self.bufpos + 1 {
216                        // string is "<>"
217                        return Err(self.error(ArxmlLexerError::InvalidElement));
218                    }
219
220                    // got a non-empty sequence of characters that starts with '<' and ends with '>'
221                    match self.buffer[self.bufpos + 1] {
222                        b'/' => {
223                            // second char is '/' -> EndElement
224                            return Ok((self.line, self.read_element_end(endpos)));
225                        }
226                        b'?' => {
227                            // second char is '?' -> xml header or processing instruction
228                            // processing instructions are ignored, read_xml_header returns None
229                            if let Some(result) = self.read_xml_header(endpos) {
230                                let value = result?;
231                                return Ok((self.line, value));
232                            }
233                        }
234                        b'!' => {
235                            // second char is '!' -> parse a comment
236                            // we found a '>' character, but comments are allowed to contain unquoted '<' and '>'
237                            // this means we need to make sure the end is actually '-->', not just '>'
238                            let mut comment_endpos = endpos;
239                            while comment_endpos < self.buffer.len()
240                                && !self.buffer[comment_endpos - 2..].starts_with(b"-->")
241                            {
242                                comment_endpos += 1;
243                            }
244                            if comment_endpos < self.buffer.len() {
245                                return self.read_comment(comment_endpos).map(|res| (self.line, res));
246                            } else {
247                                // hit the end of the input -> unclosed comment
248                                return Err(self.error(ArxmlLexerError::InvalidComment));
249                            }
250                        }
251                        _ => {
252                            // any other second char -> BeginElement
253                            return Ok((self.line, self.read_element_start(endpos)));
254                        }
255                    }
256                } else {
257                    // start of character sequence
258                    if let (ArxmlEvent::Characters(text), false) = self.read_characters() {
259                        // found a character sequence which is not all whitespace
260                        return Ok((self.line, ArxmlEvent::Characters(text)));
261                    }
262                }
263                // loop if:
264                // - a processing instruction was ignored
265                // - empty character data found (whitespace only)
266            }
267        }
268    }
269
270    fn error(&self, err: ArxmlLexerError) -> AutosarDataError {
271        AutosarDataError::LexerError {
272            filename: self.sourcefile.clone(),
273            line: self.line,
274            source: err,
275        }
276    }
277}
278
/// Count the line feed (`\n`) bytes in `text`.
///
/// A carriage return on its own is not counted, so `\r\n` counts as a single line break.
fn count_lines(text: &[u8]) -> usize {
    text.iter().filter(|&&c| c == b'\n').count()
}

#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn test_basic_functionality() {
        let data =
            b"<?xml version=\"1.0\" encoding=\"utf-8\"?><element attr=\"gggg\" attr3>contained characters</element>";
        let mut lexer = ArxmlLexer::new(data, PathBuf::from("(buffer)"));
        assert!(matches!(lexer.next(), Ok((_, ArxmlEvent::ArxmlHeader(None)))));
        assert!(
            matches!(lexer.next(), Ok((_, ArxmlEvent::BeginElement(elem, attrs))) if elem == b"element" && attrs.len() == 17)
        );
        assert!(matches!(lexer.next(), Ok((_, ArxmlEvent::Characters(text))) if text == b"contained characters"));
        assert!(matches!(lexer.next(), Ok((_, ArxmlEvent::EndElement(elem))) if elem == b"element"));
        assert!(matches!(lexer.next(), Ok((_, ArxmlEvent::EndOfFile))));
    }

    /// a self-closing element produces a BeginElement followed by a deferred EndElement
    #[test]
    fn test_self_closing_element() {
        let data = b"<a><b attr=\"1\"/></a>";
        let mut lexer = ArxmlLexer::new(data, PathBuf::from("(buffer)"));
        assert!(
            matches!(lexer.next(), Ok((_, ArxmlEvent::BeginElement(elem, attrs))) if elem == b"a" && attrs.is_empty())
        );
        assert!(
            matches!(lexer.next(), Ok((_, ArxmlEvent::BeginElement(elem, attrs))) if elem == b"b" && attrs == b"attr=\"1\"")
        );
        assert!(matches!(lexer.next(), Ok((_, ArxmlEvent::EndElement(elem))) if elem == b"b"));
        assert!(matches!(lexer.next(), Ok((_, ArxmlEvent::EndElement(elem))) if elem == b"a"));
        assert!(matches!(lexer.next(), Ok((_, ArxmlEvent::EndOfFile))));
    }

    /// whitespace between elements is skipped, but its line breaks are still counted
    #[test]
    fn test_line_numbers() {
        let data = b"<a>\n  \n<b>text</b>";
        let mut lexer = ArxmlLexer::new(data, PathBuf::from("(buffer)"));
        assert!(matches!(lexer.next(), Ok((1, ArxmlEvent::BeginElement(elem, _))) if elem == b"a"));
        assert!(matches!(lexer.next(), Ok((3, ArxmlEvent::BeginElement(elem, _))) if elem == b"b"));
        assert!(matches!(lexer.next(), Ok((3, ArxmlEvent::Characters(text))) if text == b"text"));
        assert!(matches!(lexer.next(), Ok((3, ArxmlEvent::EndElement(elem))) if elem == b"b"));
        assert!(matches!(lexer.next(), Ok((3, ArxmlEvent::EndOfFile))));
    }

    #[test]
    fn skip_byte_order_mark() {
        let data =
            b"\xEF\xBB\xBF<?xml version=\"1.0\" encoding=\"utf-8\"?><element attr=\"gggg\" attr3>contained characters</element>";
        let mut lexer = ArxmlLexer::new(data, PathBuf::from("(buffer)"));
        assert!(matches!(lexer.next(), Ok((_, ArxmlEvent::ArxmlHeader(None)))));
    }

    #[test]
    fn test_incomplete_data() {
        let data = b"<element";
        let mut lexer = ArxmlLexer::new(data, PathBuf::from("(buffer)"));
        assert!(
            matches!(lexer.next(), Err(AutosarDataError::LexerError {source, ..}) if source == ArxmlLexerError::IncompleteData)
        );
    }

    #[test]
    fn test_invalid_element() {
        let data = b"<element><>";
        let mut lexer = ArxmlLexer::new(data, PathBuf::from("(buffer)"));
        assert!(lexer.next().is_ok());
        assert!(
            matches!(lexer.next(), Err(AutosarDataError::LexerError{source, ..}) if source == ArxmlLexerError::InvalidElement)
        );
    }

    #[test]
    fn test_invalid_processing_instruction() {
        let data = b"<element><?what>";
        let mut lexer = ArxmlLexer::new(data, PathBuf::from("(buffer)"));
        assert!(lexer.next().is_ok());
        assert!(
            matches!(lexer.next(), Err(AutosarDataError::LexerError{source, ..}) if source == ArxmlLexerError::InvalidProcessingInstruction)
        );
    }

    #[test]
    fn test_comment() {
        let data = b"<!-- foo--><element>";
        let mut lexer = ArxmlLexer::new(data, PathBuf::from("(buffer)"));
        assert!(matches!(lexer.next(), Ok((_, ArxmlEvent::Comment(_)))));
        assert!(matches!(lexer.next(), Ok((_, ArxmlEvent::BeginElement(_elem, _attrs)))));
    }

    #[test]
    fn test_invalid_comment() {
        let data = b"<element><!-- foo>";
        let mut lexer = ArxmlLexer::new(data, PathBuf::from("(buffer)"));
        assert!(lexer.next().is_ok());
        assert!(
            matches!(lexer.next(), Err(AutosarDataError::LexerError{source, ..}) if source == ArxmlLexerError::InvalidComment)
        );
    }

    #[test]
    fn test_invalid_xml_header() {
        let data = br#"<?xml version="1.0" encoding="cp1252"?>"#;
        let mut lexer = ArxmlLexer::new(data, PathBuf::from("(buffer)"));
        assert!(
            matches!(lexer.next(), Err(AutosarDataError::LexerError{source, ..}) if source == ArxmlLexerError::InvalidXmlHeader)
        );

        let data = br#"<?xml ?>"#;
        let mut lexer = ArxmlLexer::new(data, PathBuf::from("(buffer)"));
        assert!(
            matches!(lexer.next(), Err(AutosarDataError::LexerError{source, ..}) if source == ArxmlLexerError::InvalidXmlHeader)
        );
    }

    #[test]
    fn traits() {
        // ArxmlLexerError: Debug, Error, Eq, PartialEq, Clone
        let err = ArxmlLexerError::IncompleteData;
        let err2 = err;
        assert_eq!(err, err2);
        assert_eq!(format!("{err:#?}"), format!("{err2:#?}"));
        assert_eq!(format!("{err}"), format!("{err2}"));

        // ArxmlEvent: Debug
        let event = ArxmlEvent::ArxmlHeader(None);
        let _ = format!("{event:#?}");
    }

    /// github issue #15 - comments can contain '<' and '>'
    #[test]
    fn test_w3c_comment_example() {
        let data = b"<!-- declarations for <head> & <body> -->";
        let mut lexer = ArxmlLexer::new(data, PathBuf::from("(buffer)"));
        assert!(matches!(lexer.next(), Ok((_, ArxmlEvent::Comment(_)))));
    }
}