1use super::AutosarDataError;
2use std::path::PathBuf;
3use thiserror::Error;
4
/// Errors reported by the arxml lexer while splitting the input into events.
#[derive(Debug, Error, Eq, PartialEq, Clone, Copy)]
#[non_exhaustive]
pub enum ArxmlLexerError {
    /// A '<' was found, but the rest of the input contains no closing '>'.
    #[error("Incomplete data, closing '>' was not found")]
    IncompleteData,

    /// The closing '>' directly follows the opening '<', so there is no element name.
    #[error("Invalid element: '<>'")]
    InvalidElement,

    /// A tag that starts with "<?" does not have a '?' directly in front of its closing '>'.
    #[error("A processing instruction was started with '<?', but it did not end with '?>'")]
    InvalidProcessingInstruction,

    /// The xml header does not declare version 1.0 together with a utf-8 encoding.
    #[error("Invalid arxml header: The xml header of an arxml file must specify version=\"1.0\" encoding=\"utf-8\"")]
    InvalidXmlHeader,

    /// A tag that starts with "<!" is not a complete comment of the form `<!-- ... -->`.
    #[error("Invalid comment")]
    InvalidComment,
}
29
/// Events produced by the lexer. All byte slices are sub-slices of the lexer's input buffer.
#[derive(Debug)]
pub(crate) enum ArxmlEvent<'a> {
    /// The `<?xml ... ?>` header. Carries the `standalone` attribute: `Some(true)` if its value
    /// is "yes", `Some(false)` for any other value, `None` if the attribute is absent.
    ArxmlHeader(Option<bool>),
    /// An opening tag: the element name, followed by the raw text after the first whitespace
    /// inside the tag (empty if the tag contains no whitespace).
    BeginElement(&'a [u8], &'a [u8]),
    /// A closing tag, or the second half of a self-closing element; carries the element name.
    EndElement(&'a [u8]),
    /// Text found between two tags.
    Characters(&'a [u8]),
    /// The text between `<!--` and `-->`.
    Comment(&'a [u8]),
    /// The whole input buffer has been consumed.
    EndOfFile,
}
39
/// Lexer that turns a byte buffer containing arxml text into a sequence of `ArxmlEvent`s.
pub(crate) struct ArxmlLexer<'a> {
    /// The complete input data.
    buffer: &'a [u8],
    /// Index into `buffer` of the next byte that has not been consumed yet.
    bufpos: usize,
    /// Line counter; starts at 1 and grows by one for every '\n' that is consumed.
    line: usize,
    /// Start and end index in `buffer` of the name of a self-closing element whose
    /// `EndElement` event still has to be emitted.
    deferred_end: Option<(usize, usize)>,
    /// File name that is attached to every error created by the lexer.
    sourcefile: PathBuf,
}
47
48impl<'a> ArxmlLexer<'a> {
49 pub(crate) fn new(buffer: &'a [u8], name: PathBuf) -> Self {
50 let bufpos = if buffer.len() > 3 && buffer[0] == 239 && buffer[1] == 187 && buffer[2] == 191 {
52 3
53 } else {
54 0
55 };
56 Self {
57 buffer,
58 bufpos,
59 line: 1,
60 deferred_end: None,
61 sourcefile: name,
62 }
63 }
64
65 fn read_characters(&mut self) -> (ArxmlEvent<'a>, bool) {
66 debug_assert!(self.bufpos < self.buffer.len());
67
68 let mut endpos = self.bufpos;
70 let mut all_whitespace = true;
71 while endpos < self.buffer.len() && self.buffer[endpos] != b'<' {
72 if !self.buffer[endpos].is_ascii_whitespace() {
74 all_whitespace = false;
75 } else if self.buffer[endpos] == b'\n' {
76 self.line += 1;
77 }
78 endpos += 1;
79 }
80 debug_assert!(endpos > self.bufpos);
81
82 let text = &self.buffer[self.bufpos..endpos];
83 self.bufpos = endpos;
84 (ArxmlEvent::Characters(text), all_whitespace)
85 }
86
87 fn read_element_start(&mut self, endpos: usize) -> ArxmlEvent<'a> {
88 debug_assert!(self.bufpos < self.buffer.len());
89 debug_assert!(endpos > self.bufpos + 1);
90 debug_assert!(self.buffer[self.bufpos] == b'<');
91
92 let (text, is_end) = if self.buffer[endpos - 1] == b'/' {
93 (&self.buffer[self.bufpos + 1..endpos - 1], true)
94 } else {
95 (&self.buffer[self.bufpos + 1..endpos], false)
96 };
97
98 let (elemname, attributes) = if let Some(splitpos) = text.iter().position(u8::is_ascii_whitespace) {
99 (&text[..splitpos], &text[splitpos + 1..])
100 } else {
101 (text, &text[0..0])
102 };
103
104 if is_end {
106 self.deferred_end = Some((self.bufpos + 1, self.bufpos + 1 + elemname.len()));
108 }
109
110 self.line += count_lines(text);
111 self.bufpos = endpos + 1;
112 ArxmlEvent::BeginElement(elemname, attributes)
113 }
114
115 fn read_element_end(&mut self, endpos: usize) -> ArxmlEvent<'a> {
116 debug_assert!(self.bufpos < self.buffer.len());
117 debug_assert!(endpos > self.bufpos + 1);
118 debug_assert!(self.buffer[self.bufpos] == b'<');
119
120 let text = &self.buffer[self.bufpos + 2..endpos];
121 self.bufpos = endpos + 1;
122
123 ArxmlEvent::EndElement(text)
124 }
125
126 fn read_xml_header(&mut self, endpos: usize) -> Option<Result<ArxmlEvent<'a>, AutosarDataError>> {
127 debug_assert!(self.bufpos < self.buffer.len());
128 debug_assert!(endpos > self.bufpos + 1);
129 debug_assert!(self.buffer[self.bufpos] == b'<');
130
131 if self.buffer[endpos - 1] != b'?' {
132 return Some(Err(self.error(ArxmlLexerError::InvalidProcessingInstruction)));
133 }
134
135 let text = &self.buffer[self.bufpos + 2..endpos - 1];
136 self.bufpos = endpos + 1;
137
138 let mut splitter = text.split(u8::is_ascii_whitespace);
139 let elemname = splitter.next().unwrap();
140
141 let result = if elemname == b"xml" {
142 let mut ver = &text[0..0];
143 let mut encoding = &text[0..0];
144 let mut standalone: Option<bool> = None;
145 for attr_text in splitter {
146 let (attr_name, attr_val) = if let Some(pos) = attr_text.iter().position(|c| *c == b'=') {
147 (&attr_text[0..pos], &attr_text[pos + 2..attr_text.len() - 1])
148 } else {
149 (attr_text, &attr_text[0..0])
150 };
151 if attr_name == b"version" {
152 ver = attr_val;
153 } else if attr_name == b"encoding" {
154 encoding = attr_val;
155 } else if attr_name == b"standalone" {
156 standalone = Some(attr_val == b"yes");
157 }
158 }
159
160 if ver != b"1.0"
161 || (encoding != b"utf-8" && encoding != b"UTF-8" && encoding != b"utf8" && encoding != b"UTF8")
162 {
163 Some(Err(self.error(ArxmlLexerError::InvalidXmlHeader)))
164 } else {
165 Some(Ok(ArxmlEvent::ArxmlHeader(standalone)))
166 }
167 } else {
168 None
169 };
170
171 self.line += count_lines(text);
172 result
173 }
174
175 fn read_comment(&mut self, endpos: usize) -> Result<ArxmlEvent<'a>, AutosarDataError> {
176 debug_assert!(self.bufpos < self.buffer.len());
177 debug_assert!(endpos > self.bufpos + 1);
178
179 let startpos = self.bufpos;
180 let text = &self.buffer[startpos..endpos];
181 self.bufpos = endpos + 1;
182
183 if text.len() < 6 || !text.starts_with(b"<!--") || !text.ends_with(b"--") {
184 return Err(AutosarDataError::LexerError {
185 filename: self.sourcefile.clone(),
186 line: self.line,
187 source: ArxmlLexerError::InvalidComment,
188 });
189 }
190 self.line += count_lines(text);
191 let comment = &self.buffer[startpos + 4..endpos - 2];
192 Ok(ArxmlEvent::Comment(comment))
193 }
194}
195
196impl ArxmlLexer<'_> {
197 pub(crate) fn next<'a>(&'a mut self) -> Result<(usize, ArxmlEvent<'a>), AutosarDataError> {
198 if let Some((startpos, endpos)) = self.deferred_end {
200 self.deferred_end = None;
201 Ok((self.line, ArxmlEvent::EndElement(&self.buffer[startpos..endpos])))
202 } else {
203 loop {
204 if self.bufpos == self.buffer.len() {
205 break Ok((self.line, ArxmlEvent::EndOfFile));
206 } else if self.buffer[self.bufpos] == b'<' {
207 let findpos = self.buffer[self.bufpos + 1..]
210 .iter()
211 .position(|c| *c == b'>')
212 .ok_or_else(|| self.error(ArxmlLexerError::IncompleteData))?;
213 let endpos = self.bufpos + findpos + 1;
214
215 if endpos == self.bufpos + 1 {
216 return Err(self.error(ArxmlLexerError::InvalidElement));
218 }
219
220 match self.buffer[self.bufpos + 1] {
222 b'/' => {
223 return Ok((self.line, self.read_element_end(endpos)));
225 }
226 b'?' => {
227 if let Some(result) = self.read_xml_header(endpos) {
230 let value = result?;
231 return Ok((self.line, value));
232 }
233 }
234 b'!' => {
235 let mut comment_endpos = endpos;
239 while comment_endpos < self.buffer.len()
240 && !self.buffer[comment_endpos - 2..].starts_with(b"-->")
241 {
242 comment_endpos += 1;
243 }
244 if comment_endpos < self.buffer.len() {
245 return self.read_comment(comment_endpos).map(|res| (self.line, res));
246 } else {
247 return Err(self.error(ArxmlLexerError::InvalidComment));
249 }
250 }
251 _ => {
252 return Ok((self.line, self.read_element_start(endpos)));
254 }
255 }
256 } else {
257 if let (ArxmlEvent::Characters(text), false) = self.read_characters() {
259 return Ok((self.line, ArxmlEvent::Characters(text)));
261 }
262 }
263 }
267 }
268 }
269
270 fn error(&self, err: ArxmlLexerError) -> AutosarDataError {
271 AutosarDataError::LexerError {
272 filename: self.sourcefile.clone(),
273 line: self.line,
274 source: err,
275 }
276 }
277}
278
/// Returns the number of line feed (`\n`) bytes contained in `text`.
fn count_lines(text: &[u8]) -> usize {
    let mut newlines = 0;
    for &byte in text {
        if byte == b'\n' {
            newlines += 1;
        }
    }
    newlines
}
282
#[cfg(test)]
mod test {
    use super::*;

    /// Creates a lexer over `data`, using the placeholder file name shared by all tests.
    fn lexer_for(data: &[u8]) -> ArxmlLexer<'_> {
        ArxmlLexer::new(data, PathBuf::from("(buffer)"))
    }

    /// Asserts that the next call to `next()` fails with the lexer error `expected`.
    fn assert_next_fails_with(lexer: &mut ArxmlLexer<'_>, expected: ArxmlLexerError) {
        assert!(matches!(
            lexer.next(),
            Err(AutosarDataError::LexerError { source, .. }) if source == expected
        ));
    }

    /// A header, an element with attributes, text and the end tag are reported in order.
    #[test]
    fn test_basic_functionality() {
        let mut lexer = lexer_for(
            b"<?xml version=\"1.0\" encoding=\"utf-8\"?><element attr=\"gggg\" attr3>contained characters</element>",
        );

        assert!(matches!(lexer.next(), Ok((_, ArxmlEvent::ArxmlHeader(None)))));
        assert!(matches!(
            lexer.next(),
            Ok((_, ArxmlEvent::BeginElement(elem, attrs))) if elem == b"element" && attrs.len() == 17
        ));
        assert!(matches!(
            lexer.next(),
            Ok((_, ArxmlEvent::Characters(text))) if text == b"contained characters"
        ));
        assert!(matches!(
            lexer.next(),
            Ok((_, ArxmlEvent::EndElement(elem))) if elem == b"element"
        ));
        assert!(matches!(lexer.next(), Ok((_, ArxmlEvent::EndOfFile))));
    }

    /// A UTF-8 byte order mark in front of the header is ignored.
    #[test]
    fn skip_byte_order_mark() {
        let mut lexer = lexer_for(
            b"\xEF\xBB\xBF<?xml version=\"1.0\" encoding=\"utf-8\"?><element attr=\"gggg\" attr3>contained characters</element>",
        );
        assert!(matches!(lexer.next(), Ok((_, ArxmlEvent::ArxmlHeader(None)))));
    }

    /// A tag without a closing '>' is reported as incomplete data.
    #[test]
    fn test_incomplete_data() {
        let mut lexer = lexer_for(b"<element");
        assert_next_fails_with(&mut lexer, ArxmlLexerError::IncompleteData);
    }

    /// "<>" is not a valid element.
    #[test]
    fn test_invalid_element() {
        let mut lexer = lexer_for(b"<element><>");
        assert!(lexer.next().is_ok());
        assert_next_fails_with(&mut lexer, ArxmlLexerError::InvalidElement);
    }

    /// A processing instruction must end with "?>".
    #[test]
    fn test_invalid_processing_instruction() {
        let mut lexer = lexer_for(b"<element><?what>");
        assert!(lexer.next().is_ok());
        assert_next_fails_with(&mut lexer, ArxmlLexerError::InvalidProcessingInstruction);
    }

    /// A comment is reported as its own event and lexing continues after it.
    #[test]
    fn test_comment() {
        let mut lexer = lexer_for(b"<!-- foo--><element>");
        assert!(matches!(lexer.next(), Ok((_, ArxmlEvent::Comment(_)))));
        assert!(matches!(lexer.next(), Ok((_, ArxmlEvent::BeginElement(_elem, _attrs)))));
    }

    /// A comment that is never terminated by "-->" is an error.
    #[test]
    fn test_invalid_comment() {
        let mut lexer = lexer_for(b"<element><!-- foo>");
        assert!(lexer.next().is_ok());
        assert_next_fails_with(&mut lexer, ArxmlLexerError::InvalidComment);
    }

    /// Headers with an unsupported encoding or without any attributes are rejected.
    #[test]
    fn test_invalid_xml_header() {
        let mut lexer = lexer_for(br#"<?xml version="1.0" encoding="cp1252"?>"#);
        assert_next_fails_with(&mut lexer, ArxmlLexerError::InvalidXmlHeader);

        let mut lexer = lexer_for(br#"<?xml ?>"#);
        assert_next_fails_with(&mut lexer, ArxmlLexerError::InvalidXmlHeader);
    }

    /// Exercises the derived traits of the error and event types.
    #[test]
    fn traits() {
        let err = ArxmlLexerError::IncompleteData;
        // ArxmlLexerError is Copy, so `err` remains usable after the assignment
        let err2 = err;
        assert_eq!(err, err2);
        assert_eq!(format!("{err:#?}"), format!("{err2:#?}"));
        assert_eq!(format!("{err}"), format!("{err2}"));

        let event = ArxmlEvent::ArxmlHeader(None);
        let _ = format!("{event:#?}");
    }

    /// A comment may contain '<', '>' and '&'.
    #[test]
    fn test_w3c_comment_example() {
        let mut lexer = lexer_for(b"<!-- declarations for <head> & <body> -->");
        assert!(matches!(lexer.next(), Ok((_, ArxmlEvent::Comment(_)))));
    }
}