Skip to main content

autosar_data/
lexer.rs

1use super::AutosarDataError;
2use std::path::PathBuf;
3use thiserror::Error;
4
5#[derive(Debug, Error, Eq, PartialEq, Clone, Copy)]
6#[non_exhaustive]
7/// `ArxmlLexerError` contains all errors that can occur while reading data
8pub enum ArxmlLexerError {
9    /// Incomplete data, closing '>' was not found
10    #[error("Incomplete data, closing '>' was not found")]
11    IncompleteData,
12
13    /// Invalid element: '<>'
14    #[error("Invalid element: '<>'")]
15    InvalidElement,
16
17    /// A processing instruction was started with '<?', but it did not end with '?>'
18    #[error("A processing instruction was started with '<?', but it did not end with '?>'")]
19    InvalidProcessingInstruction,
20
21    /// Invalid arxml header: The xml header of an arxml file must specify version="1.0" encoding="utf-8"
22    #[error("Invalid arxml header: The xml header of an arxml file must specify version=\"1.0\" encoding=\"utf-8\"")]
23    InvalidXmlHeader,
24
25    /// Invalid comment: Comments must start with '<!--' and end with '-->'
26    #[error("Invalid comment")]
27    InvalidComment,
28}
29
/// A single token produced by the lexer; all byte slices borrow from the input buffer
#[derive(Debug)]
pub(crate) enum ArxmlEvent<'a> {
    /// The `<?xml ... ?>` header; carries the value of the `standalone` attribute if it was present
    ArxmlHeader(Option<bool>),
    /// An opening tag: the element name, followed by the raw, unparsed attribute text
    BeginElement(&'a [u8], &'a [u8]),
    /// A closing tag (or the implicit close of a self-closing element) with the element name
    EndElement(&'a [u8]),
    /// Character data found between two tags
    Characters(&'a [u8]),
    /// The text of a comment, without the surrounding `<!--` and `-->`
    Comment(&'a [u8]),
    /// The input buffer has been consumed completely
    EndOfFile,
}
39
/// Tokenizer that walks over an in-memory arxml document and yields `ArxmlEvent`s
pub(crate) struct ArxmlLexer<'a> {
    /// the complete input document
    buffer: &'a [u8],
    /// index in `buffer` of the first byte that has not been consumed yet
    bufpos: usize,
    /// running line counter; starts at 1 and grows with every newline that is consumed
    line: usize,
    /// (start, end) indices in `buffer` of the name of a self-closing element whose EndElement is still pending
    deferred_end: Option<(usize, usize)>,
    /// name of the input, attached to every error
    sourcefile: PathBuf,
}
47
48impl<'a> ArxmlLexer<'a> {
49    pub(crate) fn new(buffer: &'a [u8], name: PathBuf) -> Self {
50        // skip the byte-order mark, if it is present
51        let bufpos = if buffer.len() > 3 && buffer[0] == 239 && buffer[1] == 187 && buffer[2] == 191 {
52            3
53        } else {
54            0
55        };
56        Self {
57            buffer,
58            bufpos,
59            line: 1,
60            deferred_end: None,
61            sourcefile: name,
62        }
63    }
64
65    fn read_characters(&mut self) -> (ArxmlEvent<'a>, bool) {
66        debug_assert!(self.bufpos < self.buffer.len());
67
68        // the start of the next element '<' is the end of this block of characters
69        let mut endpos = self.bufpos;
70        let mut all_whitespace = true;
71        while endpos < self.buffer.len() && self.buffer[endpos] != b'<' {
72            // count the lines directly in this loop; it's faster than calling count_lines and this loop is quite hot in the profile...
73            if !self.buffer[endpos].is_ascii_whitespace() {
74                all_whitespace = false;
75            } else if self.buffer[endpos] == b'\n' {
76                self.line += 1;
77            }
78            endpos += 1;
79        }
80        debug_assert!(endpos > self.bufpos);
81
82        let text = &self.buffer[self.bufpos..endpos];
83        self.bufpos = endpos;
84        (ArxmlEvent::Characters(text), all_whitespace)
85    }
86
87    fn read_element_start(&mut self, endpos: usize) -> ArxmlEvent<'a> {
88        debug_assert!(self.bufpos < self.buffer.len());
89        debug_assert!(endpos > self.bufpos + 1);
90        debug_assert!(self.buffer[self.bufpos] == b'<');
91
92        let (text, is_end) = if self.buffer[endpos - 1] == b'/' {
93            (&self.buffer[self.bufpos + 1..endpos - 1], true)
94        } else {
95            (&self.buffer[self.bufpos + 1..endpos], false)
96        };
97
98        let (elemname, attributes) = if let Some(splitpos) = text.iter().position(u8::is_ascii_whitespace) {
99            (&text[..splitpos], &text[splitpos + 1..])
100        } else {
101            (text, &text[0..0])
102        };
103
104        // this is a <element/>, so a EndElement event needs to be generated next
105        if is_end {
106            // calculate the position of the element name inside the text
107            self.deferred_end = Some((self.bufpos + 1, self.bufpos + 1 + elemname.len()));
108        }
109
110        self.line += count_lines(text);
111        self.bufpos = endpos + 1;
112        ArxmlEvent::BeginElement(elemname, attributes)
113    }
114
115    fn read_element_end(&mut self, endpos: usize) -> ArxmlEvent<'a> {
116        debug_assert!(self.bufpos < self.buffer.len());
117        debug_assert!(endpos > self.bufpos + 1);
118        debug_assert!(self.buffer[self.bufpos] == b'<');
119
120        let text = &self.buffer[self.bufpos + 2..endpos];
121        self.bufpos = endpos + 1;
122
123        ArxmlEvent::EndElement(text)
124    }
125
126    fn read_xml_header(&mut self, endpos: usize) -> Option<Result<ArxmlEvent<'a>, AutosarDataError>> {
127        debug_assert!(self.bufpos < self.buffer.len());
128        debug_assert!(endpos > self.bufpos + 1);
129        debug_assert!(self.buffer[self.bufpos] == b'<');
130
131        if self.buffer[endpos - 1] != b'?' {
132            return Some(Err(self.error(ArxmlLexerError::InvalidProcessingInstruction)));
133        }
134
135        let text = &self.buffer[self.bufpos + 2..endpos - 1];
136        self.bufpos = endpos + 1;
137
138        let text_trimmed = text.trim_ascii();
139        let (elemname, mut rest) = if let Some(ws_pos) = text_trimmed.iter().position(|c| c.is_ascii_whitespace()) {
140            (&text_trimmed[..ws_pos], &text_trimmed[ws_pos..])
141        } else {
142            (text_trimmed, &text_trimmed[text_trimmed.len()..])
143        };
144
145        let result = if elemname == b"xml" {
146            let mut ver = &text[0..0];
147            let mut encoding = &text[0..0];
148            let mut standalone: Option<bool> = None;
149
150            let valid = loop {
151                rest = rest.trim_ascii_start();
152                if rest.is_empty() {
153                    break true;
154                }
155
156                let Some(eq_pos) = rest.iter().position(|c| *c == b'=') else {
157                    break false;
158                };
159
160                let attr_name = rest[..eq_pos].trim_ascii_end();
161                if attr_name.is_empty() || attr_name.iter().any(|c| c.is_ascii_whitespace()) {
162                    break false;
163                }
164
165                rest = rest[eq_pos + 1..].trim_ascii_start();
166                if rest.is_empty() || (rest[0] != b'"' && rest[0] != b'\'') {
167                    break false;
168                }
169
170                let quote = rest[0];
171                rest = &rest[1..];
172                let Some(end_quote_pos) = rest.iter().position(|c| *c == quote) else {
173                    break false;
174                };
175
176                let attr_val = &rest[..end_quote_pos];
177                rest = &rest[end_quote_pos + 1..];
178
179                if attr_name == b"version" {
180                    ver = attr_val;
181                } else if attr_name == b"encoding" {
182                    encoding = attr_val;
183                } else if attr_name == b"standalone" {
184                    standalone = Some(attr_val == b"yes");
185                }
186            };
187
188            if !valid
189                || ver != b"1.0"
190                || (encoding != b"utf-8" && encoding != b"UTF-8" && encoding != b"utf8" && encoding != b"UTF8")
191            {
192                Some(Err(self.error(ArxmlLexerError::InvalidXmlHeader)))
193            } else {
194                Some(Ok(ArxmlEvent::ArxmlHeader(standalone)))
195            }
196        } else {
197            None
198        };
199
200        self.line += count_lines(text);
201        result
202    }
203
204    fn read_comment(&mut self, endpos: usize) -> Result<ArxmlEvent<'a>, AutosarDataError> {
205        debug_assert!(self.bufpos < self.buffer.len());
206        debug_assert!(endpos > self.bufpos + 1);
207
208        let startpos = self.bufpos;
209        let text = &self.buffer[startpos..endpos];
210        self.bufpos = endpos + 1;
211
212        if text.len() < 6 || !text.starts_with(b"<!--") || !text.ends_with(b"--") {
213            return Err(AutosarDataError::LexerError {
214                filename: self.sourcefile.clone(),
215                line: self.line,
216                source: ArxmlLexerError::InvalidComment,
217            });
218        }
219        self.line += count_lines(text);
220        let comment = &self.buffer[startpos + 4..endpos - 2];
221        Ok(ArxmlEvent::Comment(comment))
222    }
223}
224
225impl ArxmlLexer<'_> {
226    pub(crate) fn next<'a>(&'a mut self) -> Result<(usize, ArxmlEvent<'a>), AutosarDataError> {
227        // if an <element/> was found, then a BeginElement event is returned first, and the EndElement is deferred and must be returned next
228        if let Some((startpos, endpos)) = self.deferred_end {
229            self.deferred_end = None;
230            Ok((self.line, ArxmlEvent::EndElement(&self.buffer[startpos..endpos])))
231        } else {
232            loop {
233                if self.bufpos == self.buffer.len() {
234                    break Ok((self.line, ArxmlEvent::EndOfFile));
235                } else if self.buffer[self.bufpos] == b'<' {
236                    // start of an <element> or </element> or <!--comment-->
237                    // find a '>' character
238                    let findpos = self.buffer[self.bufpos + 1..]
239                        .iter()
240                        .position(|c| *c == b'>')
241                        .ok_or_else(|| self.error(ArxmlLexerError::IncompleteData))?;
242                    let endpos = self.bufpos + findpos + 1;
243
244                    if endpos == self.bufpos + 1 {
245                        // string is "<>"
246                        return Err(self.error(ArxmlLexerError::InvalidElement));
247                    }
248
249                    // got a non-empty sequence of characters that starts with '<' and ends with '>'
250                    match self.buffer[self.bufpos + 1] {
251                        b'/' => {
252                            // second char is '/' -> EndElement
253                            return Ok((self.line, self.read_element_end(endpos)));
254                        }
255                        b'?' => {
256                            // second char is '?' -> xml header or processing instruction
257                            // processing instructions are ignored, read_xml_header returns None
258                            if let Some(result) = self.read_xml_header(endpos) {
259                                let value = result?;
260                                return Ok((self.line, value));
261                            }
262                        }
263                        b'!' => {
264                            // second char is '!' -> parse a comment
265                            // we found a '>' character, but comments are allowed to contain unquoted '<' and '>'
266                            // this means we need to make sure the end is actually '-->', not just '>'
267                            let mut comment_endpos = endpos;
268                            while comment_endpos < self.buffer.len()
269                                && !self.buffer[comment_endpos - 2..].starts_with(b"-->")
270                            {
271                                comment_endpos += 1;
272                            }
273                            if comment_endpos < self.buffer.len() {
274                                return self.read_comment(comment_endpos).map(|res| (self.line, res));
275                            } else {
276                                // hit the end of the input -> unclosed comment
277                                return Err(self.error(ArxmlLexerError::InvalidComment));
278                            }
279                        }
280                        _ => {
281                            // any other second char -> BeginElement
282                            return Ok((self.line, self.read_element_start(endpos)));
283                        }
284                    }
285                } else {
286                    // start of character sequence
287                    if let (ArxmlEvent::Characters(text), false) = self.read_characters() {
288                        // found a character sequence which is not all whitespace
289                        return Ok((self.line, ArxmlEvent::Characters(text)));
290                    }
291                }
292                // loop if:
293                // - a processing instruction was ignored
294                // - empty character data found (whitespace only)
295            }
296        }
297    }
298
299    fn error(&self, err: ArxmlLexerError) -> AutosarDataError {
300        AutosarDataError::LexerError {
301            filename: self.sourcefile.clone(),
302            line: self.line,
303            source: err,
304        }
305    }
306}
307
/// Returns the number of newline (`\n`) bytes in `text`.
fn count_lines(text: &[u8]) -> usize {
    text.iter().map(|&byte| usize::from(byte == b'\n')).sum()
}
311
#[cfg(test)]
mod test {
    use super::*;

    /// Creates a lexer over an in-memory buffer, using the same source name for every test.
    fn lexer_for(data: &[u8]) -> ArxmlLexer<'_> {
        ArxmlLexer::new(data, PathBuf::from("(buffer)"))
    }

    /// Returns true if `result` is a lexer error whose source is `expected`.
    fn is_lexer_error(
        result: Result<(usize, ArxmlEvent<'_>), AutosarDataError>,
        expected: ArxmlLexerError,
    ) -> bool {
        matches!(result, Err(AutosarDataError::LexerError { source, .. }) if source == expected)
    }

    #[test]
    fn test_basic_functionality() {
        let mut lexer = lexer_for(
            b"<?xml version=\"1.0\" encoding=\"utf-8\"?><element attr=\"gggg\" attr3>contained characters</element>",
        );

        assert!(matches!(lexer.next(), Ok((_, ArxmlEvent::ArxmlHeader(None)))));

        let Ok((_, ArxmlEvent::BeginElement(name, attributes))) = lexer.next() else {
            panic!("expected a BeginElement event");
        };
        assert!(name == b"element" && attributes.len() == 17);

        let Ok((_, ArxmlEvent::Characters(text))) = lexer.next() else {
            panic!("expected a Characters event");
        };
        assert!(text == b"contained characters");

        let Ok((_, ArxmlEvent::EndElement(name))) = lexer.next() else {
            panic!("expected an EndElement event");
        };
        assert!(name == b"element");

        assert!(matches!(lexer.next(), Ok((_, ArxmlEvent::EndOfFile))));
    }

    #[test]
    fn skip_byte_order_mark() {
        let mut lexer = lexer_for(
            b"\xEF\xBB\xBF<?xml version=\"1.0\" encoding=\"utf-8\"?><element attr=\"gggg\" attr3>contained characters</element>",
        );
        assert!(matches!(lexer.next(), Ok((_, ArxmlEvent::ArxmlHeader(None)))));
    }

    #[test]
    fn test_incomplete_data() {
        let mut lexer = lexer_for(b"<element");
        assert!(is_lexer_error(lexer.next(), ArxmlLexerError::IncompleteData));
    }

    #[test]
    fn test_invalid_element() {
        let mut lexer = lexer_for(b"<element><>");
        assert!(lexer.next().is_ok());
        assert!(is_lexer_error(lexer.next(), ArxmlLexerError::InvalidElement));
    }

    #[test]
    fn test_invalid_processing_instruction() {
        let mut lexer = lexer_for(b"<element><?what>");
        assert!(lexer.next().is_ok());
        assert!(is_lexer_error(
            lexer.next(),
            ArxmlLexerError::InvalidProcessingInstruction
        ));
    }

    #[test]
    fn test_comment() {
        let mut lexer = lexer_for(b"<!-- foo--><element>");
        assert!(matches!(lexer.next(), Ok((_, ArxmlEvent::Comment(_)))));
        assert!(matches!(lexer.next(), Ok((_, ArxmlEvent::BeginElement(_elem, _attrs)))));
    }

    #[test]
    fn test_invalid_comment() {
        let mut lexer = lexer_for(b"<element><!-- foo>");
        assert!(lexer.next().is_ok());
        assert!(is_lexer_error(lexer.next(), ArxmlLexerError::InvalidComment));
    }

    #[test]
    fn test_invalid_xml_header() {
        // an encoding other than utf-8 is rejected
        let mut lexer = lexer_for(br#"<?xml version="1.0" encoding="cp1252"?>"#);
        assert!(is_lexer_error(lexer.next(), ArxmlLexerError::InvalidXmlHeader));

        // a header without version and encoding is rejected
        let mut lexer = lexer_for(br#"<?xml ?>"#);
        assert!(is_lexer_error(lexer.next(), ArxmlLexerError::InvalidXmlHeader));
    }

    #[test]
    fn traits() {
        // ArxmlLexerError: Debug, Error, Eq, PartialEq, Clone
        let err = ArxmlLexerError::IncompleteData;
        let err2 = err;
        assert_eq!(err, err2);
        assert_eq!(format!("{err:#?}"), format!("{err2:#?}"));
        assert_eq!(format!("{err}"), format!("{err2}"));

        // ArxmlEvent: Debug
        let event = ArxmlEvent::ArxmlHeader(None);
        let _ = format!("{event:#?}");
    }

    /// github issue #15 - comments can contain '<' and '>'
    #[test]
    fn test_w3c_comment_example() {
        let mut lexer = lexer_for(b"<!-- declarations for <head> & <body> -->");
        assert!(matches!(lexer.next(), Ok((_, ArxmlEvent::Comment(_)))));
    }

    /// github issue #32 - extra spaces in the XML header should be tolerated
    #[test]
    fn test_xml_header_with_extra_spaces() {
        let mut lexer =
            lexer_for(b"<?xml   version =   \"1.0\"   encoding =   \"utf-8\"   standalone = \"yes\"  ?>");
        assert!(matches!(lexer.next(), Ok((_, ArxmlEvent::ArxmlHeader(Some(true))))));
    }
}