1use super::AutosarDataError;
2use std::path::PathBuf;
3use thiserror::Error;
4
/// Errors that the arxml lexer can report while splitting the input into events.
///
/// The enum is `#[non_exhaustive]`, so more variants may be added later.
#[derive(Debug, Error, Eq, PartialEq, Clone, Copy)]
#[non_exhaustive]
pub enum ArxmlLexerError {
    /// A `<` was found, but the remaining input contains no closing `>`.
    #[error("Incomplete data, closing '>' was not found")]
    IncompleteData,

    /// The input contains the empty element `<>`.
    #[error("Invalid element: '<>'")]
    InvalidElement,

    /// A processing instruction was opened with `<?` but is not terminated by `?>`.
    #[error("A processing instruction was started with '<?', but it did not end with '?>'")]
    InvalidProcessingInstruction,

    /// The `<?xml ... ?>` header is malformed, or it does not declare
    /// version 1.0 together with one of the accepted spellings of utf-8.
    #[error("Invalid arxml header: The xml header of an arxml file must specify version=\"1.0\" encoding=\"utf-8\"")]
    InvalidXmlHeader,

    /// Markup starting with `<!` is not a well-formed `<!-- ... -->` comment,
    /// or its terminating `-->` is missing.
    #[error("Invalid comment")]
    InvalidComment,
}
29
/// A single event produced by the lexer.
///
/// All byte slices borrow directly from the input buffer of the lexer.
#[derive(Debug)]
pub(crate) enum ArxmlEvent<'a> {
    /// The `<?xml ... ?>` header. The payload reflects the `standalone` attribute:
    /// `None` if it is absent, `Some(true)` for the value `yes`, `Some(false)` otherwise.
    ArxmlHeader(Option<bool>),
    /// An opening tag: the element name, followed by the raw, unparsed attribute text.
    BeginElement(&'a [u8], &'a [u8]),
    /// A closing tag: the bytes between `</` and `>`, or the name of a self-closing element.
    EndElement(&'a [u8]),
    /// Text found between two pieces of markup.
    Characters(&'a [u8]),
    /// The content of a comment, without the surrounding `<!--` and `-->`.
    Comment(&'a [u8]),
    /// The whole input has been consumed.
    EndOfFile,
}
39
/// Lexer that splits an in-memory arxml file into a sequence of `ArxmlEvent`s.
pub(crate) struct ArxmlLexer<'a> {
    /// The complete input that is being tokenized.
    buffer: &'a [u8],
    /// Index into `buffer` of the next byte that has not been consumed yet.
    bufpos: usize,
    /// Current line number; starts at 1 and grows with every `\n` that is consumed.
    line: usize,
    /// Set after a self-closing element (`<elem/>`): the start and end index of the
    /// element name inside `buffer`, to be reported as an `EndElement` by the next call.
    deferred_end: Option<(usize, usize)>,
    /// File name that is attached to every error reported by the lexer.
    sourcefile: PathBuf,
}
47
48impl<'a> ArxmlLexer<'a> {
49 pub(crate) fn new(buffer: &'a [u8], name: PathBuf) -> Self {
50 let bufpos = if buffer.len() > 3 && buffer[0] == 239 && buffer[1] == 187 && buffer[2] == 191 {
52 3
53 } else {
54 0
55 };
56 Self {
57 buffer,
58 bufpos,
59 line: 1,
60 deferred_end: None,
61 sourcefile: name,
62 }
63 }
64
65 fn read_characters(&mut self) -> (ArxmlEvent<'a>, bool) {
66 debug_assert!(self.bufpos < self.buffer.len());
67
68 let mut endpos = self.bufpos;
70 let mut all_whitespace = true;
71 while endpos < self.buffer.len() && self.buffer[endpos] != b'<' {
72 if !self.buffer[endpos].is_ascii_whitespace() {
74 all_whitespace = false;
75 } else if self.buffer[endpos] == b'\n' {
76 self.line += 1;
77 }
78 endpos += 1;
79 }
80 debug_assert!(endpos > self.bufpos);
81
82 let text = &self.buffer[self.bufpos..endpos];
83 self.bufpos = endpos;
84 (ArxmlEvent::Characters(text), all_whitespace)
85 }
86
87 fn read_element_start(&mut self, endpos: usize) -> ArxmlEvent<'a> {
88 debug_assert!(self.bufpos < self.buffer.len());
89 debug_assert!(endpos > self.bufpos + 1);
90 debug_assert!(self.buffer[self.bufpos] == b'<');
91
92 let (text, is_end) = if self.buffer[endpos - 1] == b'/' {
93 (&self.buffer[self.bufpos + 1..endpos - 1], true)
94 } else {
95 (&self.buffer[self.bufpos + 1..endpos], false)
96 };
97
98 let (elemname, attributes) = if let Some(splitpos) = text.iter().position(u8::is_ascii_whitespace) {
99 (&text[..splitpos], &text[splitpos + 1..])
100 } else {
101 (text, &text[0..0])
102 };
103
104 if is_end {
106 self.deferred_end = Some((self.bufpos + 1, self.bufpos + 1 + elemname.len()));
108 }
109
110 self.line += count_lines(text);
111 self.bufpos = endpos + 1;
112 ArxmlEvent::BeginElement(elemname, attributes)
113 }
114
115 fn read_element_end(&mut self, endpos: usize) -> ArxmlEvent<'a> {
116 debug_assert!(self.bufpos < self.buffer.len());
117 debug_assert!(endpos > self.bufpos + 1);
118 debug_assert!(self.buffer[self.bufpos] == b'<');
119
120 let text = &self.buffer[self.bufpos + 2..endpos];
121 self.bufpos = endpos + 1;
122
123 ArxmlEvent::EndElement(text)
124 }
125
126 fn read_xml_header(&mut self, endpos: usize) -> Option<Result<ArxmlEvent<'a>, AutosarDataError>> {
127 debug_assert!(self.bufpos < self.buffer.len());
128 debug_assert!(endpos > self.bufpos + 1);
129 debug_assert!(self.buffer[self.bufpos] == b'<');
130
131 if self.buffer[endpos - 1] != b'?' {
132 return Some(Err(self.error(ArxmlLexerError::InvalidProcessingInstruction)));
133 }
134
135 let text = &self.buffer[self.bufpos + 2..endpos - 1];
136 self.bufpos = endpos + 1;
137
138 let text_trimmed = text.trim_ascii();
139 let (elemname, mut rest) = if let Some(ws_pos) = text_trimmed.iter().position(|c| c.is_ascii_whitespace()) {
140 (&text_trimmed[..ws_pos], &text_trimmed[ws_pos..])
141 } else {
142 (text_trimmed, &text_trimmed[text_trimmed.len()..])
143 };
144
145 let result = if elemname == b"xml" {
146 let mut ver = &text[0..0];
147 let mut encoding = &text[0..0];
148 let mut standalone: Option<bool> = None;
149
150 let valid = loop {
151 rest = rest.trim_ascii_start();
152 if rest.is_empty() {
153 break true;
154 }
155
156 let Some(eq_pos) = rest.iter().position(|c| *c == b'=') else {
157 break false;
158 };
159
160 let attr_name = rest[..eq_pos].trim_ascii_end();
161 if attr_name.is_empty() || attr_name.iter().any(|c| c.is_ascii_whitespace()) {
162 break false;
163 }
164
165 rest = rest[eq_pos + 1..].trim_ascii_start();
166 if rest.is_empty() || (rest[0] != b'"' && rest[0] != b'\'') {
167 break false;
168 }
169
170 let quote = rest[0];
171 rest = &rest[1..];
172 let Some(end_quote_pos) = rest.iter().position(|c| *c == quote) else {
173 break false;
174 };
175
176 let attr_val = &rest[..end_quote_pos];
177 rest = &rest[end_quote_pos + 1..];
178
179 if attr_name == b"version" {
180 ver = attr_val;
181 } else if attr_name == b"encoding" {
182 encoding = attr_val;
183 } else if attr_name == b"standalone" {
184 standalone = Some(attr_val == b"yes");
185 }
186 };
187
188 if !valid
189 || ver != b"1.0"
190 || (encoding != b"utf-8" && encoding != b"UTF-8" && encoding != b"utf8" && encoding != b"UTF8")
191 {
192 Some(Err(self.error(ArxmlLexerError::InvalidXmlHeader)))
193 } else {
194 Some(Ok(ArxmlEvent::ArxmlHeader(standalone)))
195 }
196 } else {
197 None
198 };
199
200 self.line += count_lines(text);
201 result
202 }
203
204 fn read_comment(&mut self, endpos: usize) -> Result<ArxmlEvent<'a>, AutosarDataError> {
205 debug_assert!(self.bufpos < self.buffer.len());
206 debug_assert!(endpos > self.bufpos + 1);
207
208 let startpos = self.bufpos;
209 let text = &self.buffer[startpos..endpos];
210 self.bufpos = endpos + 1;
211
212 if text.len() < 6 || !text.starts_with(b"<!--") || !text.ends_with(b"--") {
213 return Err(AutosarDataError::LexerError {
214 filename: self.sourcefile.clone(),
215 line: self.line,
216 source: ArxmlLexerError::InvalidComment,
217 });
218 }
219 self.line += count_lines(text);
220 let comment = &self.buffer[startpos + 4..endpos - 2];
221 Ok(ArxmlEvent::Comment(comment))
222 }
223}
224
225impl ArxmlLexer<'_> {
226 pub(crate) fn next<'a>(&'a mut self) -> Result<(usize, ArxmlEvent<'a>), AutosarDataError> {
227 if let Some((startpos, endpos)) = self.deferred_end {
229 self.deferred_end = None;
230 Ok((self.line, ArxmlEvent::EndElement(&self.buffer[startpos..endpos])))
231 } else {
232 loop {
233 if self.bufpos == self.buffer.len() {
234 break Ok((self.line, ArxmlEvent::EndOfFile));
235 } else if self.buffer[self.bufpos] == b'<' {
236 let findpos = self.buffer[self.bufpos + 1..]
239 .iter()
240 .position(|c| *c == b'>')
241 .ok_or_else(|| self.error(ArxmlLexerError::IncompleteData))?;
242 let endpos = self.bufpos + findpos + 1;
243
244 if endpos == self.bufpos + 1 {
245 return Err(self.error(ArxmlLexerError::InvalidElement));
247 }
248
249 match self.buffer[self.bufpos + 1] {
251 b'/' => {
252 return Ok((self.line, self.read_element_end(endpos)));
254 }
255 b'?' => {
256 if let Some(result) = self.read_xml_header(endpos) {
259 let value = result?;
260 return Ok((self.line, value));
261 }
262 }
263 b'!' => {
264 let mut comment_endpos = endpos;
268 while comment_endpos < self.buffer.len()
269 && !self.buffer[comment_endpos - 2..].starts_with(b"-->")
270 {
271 comment_endpos += 1;
272 }
273 if comment_endpos < self.buffer.len() {
274 return self.read_comment(comment_endpos).map(|res| (self.line, res));
275 } else {
276 return Err(self.error(ArxmlLexerError::InvalidComment));
278 }
279 }
280 _ => {
281 return Ok((self.line, self.read_element_start(endpos)));
283 }
284 }
285 } else {
286 if let (ArxmlEvent::Characters(text), false) = self.read_characters() {
288 return Ok((self.line, ArxmlEvent::Characters(text)));
290 }
291 }
292 }
296 }
297 }
298
299 fn error(&self, err: ArxmlLexerError) -> AutosarDataError {
300 AutosarDataError::LexerError {
301 filename: self.sourcefile.clone(),
302 line: self.line,
303 source: err,
304 }
305 }
306}
307
/// Counts the line feed (`\n`) bytes contained in `text`.
fn count_lines(text: &[u8]) -> usize {
    let mut newlines = 0;
    for &byte in text {
        if byte == b'\n' {
            newlines += 1;
        }
    }
    newlines
}
311
#[cfg(test)]
mod test {
    use super::*;

    /// Creates a lexer for `data` with a placeholder file name.
    fn lexer_for(data: &[u8]) -> ArxmlLexer<'_> {
        ArxmlLexer::new(data, PathBuf::from("(buffer)"))
    }

    /// Checks whether the next result of `lexer` is a lexer error caused by `expected`.
    fn fails_with(lexer: &mut ArxmlLexer<'_>, expected: ArxmlLexerError) -> bool {
        matches!(lexer.next(), Err(AutosarDataError::LexerError { source, .. }) if source == expected)
    }

    #[test]
    fn test_basic_functionality() {
        let mut lexer = lexer_for(
            b"<?xml version=\"1.0\" encoding=\"utf-8\"?><element attr=\"gggg\" attr3>contained characters</element>",
        );

        let header = lexer.next();
        assert!(matches!(header, Ok((_, ArxmlEvent::ArxmlHeader(None)))));

        let begin = lexer.next();
        assert!(
            matches!(begin, Ok((_, ArxmlEvent::BeginElement(elem, attrs))) if elem == b"element" && attrs.len() == 17)
        );

        let characters = lexer.next();
        assert!(matches!(characters, Ok((_, ArxmlEvent::Characters(text))) if text == b"contained characters"));

        let end = lexer.next();
        assert!(matches!(end, Ok((_, ArxmlEvent::EndElement(elem))) if elem == b"element"));

        let eof = lexer.next();
        assert!(matches!(eof, Ok((_, ArxmlEvent::EndOfFile))));
    }

    #[test]
    fn skip_byte_order_mark() {
        let mut lexer = lexer_for(
            b"\xEF\xBB\xBF<?xml version=\"1.0\" encoding=\"utf-8\"?><element attr=\"gggg\" attr3>contained characters</element>",
        );
        // the header can only be recognized if the byte order mark was skipped
        assert!(matches!(lexer.next(), Ok((_, ArxmlEvent::ArxmlHeader(None)))));
    }

    #[test]
    fn test_incomplete_data() {
        let mut lexer = lexer_for(b"<element");
        assert!(fails_with(&mut lexer, ArxmlLexerError::IncompleteData));
    }

    #[test]
    fn test_invalid_element() {
        let mut lexer = lexer_for(b"<element><>");
        assert!(lexer.next().is_ok());
        assert!(fails_with(&mut lexer, ArxmlLexerError::InvalidElement));
    }

    #[test]
    fn test_invalid_processing_instruction() {
        let mut lexer = lexer_for(b"<element><?what>");
        assert!(lexer.next().is_ok());
        assert!(fails_with(&mut lexer, ArxmlLexerError::InvalidProcessingInstruction));
    }

    #[test]
    fn test_comment() {
        let mut lexer = lexer_for(b"<!-- foo--><element>");
        assert!(matches!(lexer.next(), Ok((_, ArxmlEvent::Comment(_)))));
        assert!(matches!(lexer.next(), Ok((_, ArxmlEvent::BeginElement(_elem, _attrs)))));
    }

    #[test]
    fn test_invalid_comment() {
        let mut lexer = lexer_for(b"<element><!-- foo>");
        assert!(lexer.next().is_ok());
        assert!(fails_with(&mut lexer, ArxmlLexerError::InvalidComment));
    }

    #[test]
    fn test_invalid_xml_header() {
        // unsupported encoding
        let mut lexer = lexer_for(br#"<?xml version="1.0" encoding="cp1252"?>"#);
        assert!(fails_with(&mut lexer, ArxmlLexerError::InvalidXmlHeader));

        // neither version nor encoding present
        let mut lexer = lexer_for(br#"<?xml ?>"#);
        assert!(fails_with(&mut lexer, ArxmlLexerError::InvalidXmlHeader));
    }

    #[test]
    fn traits() {
        // ArxmlLexerError: Copy, PartialEq, Debug and Display
        let err = ArxmlLexerError::IncompleteData;
        let err2 = err;
        assert_eq!(err, err2);
        assert_eq!(format!("{err:#?}"), format!("{err2:#?}"));
        assert_eq!(format!("{err}"), format!("{err2}"));

        // ArxmlEvent: Debug
        let event = ArxmlEvent::ArxmlHeader(None);
        let _ = format!("{event:#?}");
    }

    #[test]
    fn test_w3c_comment_example() {
        // a comment is allowed to contain '<' and '>'
        let mut lexer = lexer_for(b"<!-- declarations for <head> & <body> -->");
        assert!(matches!(lexer.next(), Ok((_, ArxmlEvent::Comment(_)))));
    }

    #[test]
    fn test_xml_header_with_extra_spaces() {
        let mut lexer = lexer_for(b"<?xml version = \"1.0\" encoding = \"utf-8\" standalone = \"yes\" ?>");
        assert!(matches!(lexer.next(), Ok((_, ArxmlEvent::ArxmlHeader(Some(true))))));
    }
}