1use crate::error::{Error, Position, Result};
7use crate::escape::unescape;
8use memchr::memchr;
9use std::borrow::Cow;
10
/// A single parsing event produced by [`XmlReader::next_event`].
///
/// String payloads are `Cow<'a, str>` values tied to the lifetime `'a` of the
/// input buffer handed to the reader.
#[derive(Debug, Clone, PartialEq)]
pub enum XmlEvent<'a> {
    /// An XML declaration, e.g. `<?xml version="1.0" encoding="UTF-8"?>`.
    XmlDecl {
        /// Value of the `version` pseudo-attribute; the reader substitutes
        /// `"1.0"` when the declaration does not contain one.
        version: Cow<'a, str>,
        /// Value of the `encoding` pseudo-attribute, if present.
        encoding: Option<Cow<'a, str>>,
        /// `Some(true)` when `standalone` is exactly `"yes"`, `Some(false)`
        /// for any other value, `None` when the pseudo-attribute is absent.
        standalone: Option<bool>,
    },
    /// An opening tag `<name ...>`.
    StartElement {
        /// Tag name as written in the document.
        name: Cow<'a, str>,
        /// Attributes in document order.
        attributes: Vec<Attribute<'a>>,
    },
    /// A closing tag `</name>`.
    EndElement {
        /// Tag name as written in the document.
        name: Cow<'a, str>,
    },
    /// A self-closing tag `<name ... />`.
    EmptyElement {
        /// Tag name as written in the document.
        name: Cow<'a, str>,
        /// Attributes in document order.
        attributes: Vec<Attribute<'a>>,
    },
    /// Character data between tags, trimmed and with escapes resolved.
    Text(Cow<'a, str>),
    /// Contents of a `<![CDATA[ ... ]]>` section, passed through unchanged.
    CData(Cow<'a, str>),
    /// Contents of a `<!-- ... -->` comment, trimmed.
    Comment(Cow<'a, str>),
    /// A processing instruction `<?target data?>` other than the XML declaration.
    ProcessingInstruction {
        /// The instruction target (the name right after `<?`).
        target: Cow<'a, str>,
        /// Trimmed text after the target, or `None` when it is empty.
        data: Option<Cow<'a, str>>,
    },
    /// End of input reached with no elements left open.
    Eof,
}
58
/// A single `name="value"` pair read from a tag or from the XML declaration.
///
/// Besides `PartialEq` the type is `Eq` and `Hash` (both fields are
/// `Cow<str>`), so attributes can be used as set members or map keys.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Attribute<'a> {
    /// Attribute name as written in the document.
    pub name: Cow<'a, str>,
    /// Attribute value with the surrounding quotes removed and escapes resolved.
    pub value: Cow<'a, str>,
}
67
/// Pull-style XML reader over a borrowed byte buffer.
///
/// The reader is `Debug` for diagnostics and `Clone` so that its state can be
/// snapshotted cheaply (the input itself is only borrowed, never copied).
#[derive(Debug, Clone)]
pub struct XmlReader<'a> {
    /// The complete input being parsed.
    input: &'a [u8],
    /// Byte offset into `input` of the next unread byte.
    pos: usize,
    /// Current line; incremented each time a `\n` is consumed.
    line: usize,
    /// Current column; reset when a `\n` is consumed, otherwise advanced
    /// once per consumed byte.
    col: usize,
    /// Names of the elements that are currently open, innermost last.
    element_stack: Vec<String>,
}
77
78impl<'a> XmlReader<'a> {
79 #[inline]
81 #[allow(clippy::should_implement_trait)]
82 pub fn from_str(s: &'a str) -> Self {
83 Self::from_bytes(s.as_bytes())
84 }
85
86 #[inline]
88 pub fn from_bytes(input: &'a [u8]) -> Self {
89 Self {
90 input,
91 pos: 0,
92 line: 1,
93 col: 1,
94 element_stack: Vec::new(),
95 }
96 }
97
98 #[inline]
100 pub fn position(&self) -> Position {
101 Position {
102 line: self.line,
103 column: self.col,
104 offset: self.pos,
105 }
106 }
107
108 #[inline]
110 pub fn depth(&self) -> usize {
111 self.element_stack.len()
112 }
113
114 pub fn next_event(&mut self) -> Result<XmlEvent<'a>> {
116 self.skip_whitespace();
117
118 if self.pos >= self.input.len() {
119 if !self.element_stack.is_empty() {
120 let tag = self.element_stack.pop().unwrap();
121 return Err(Error::unclosed_tag(tag).with_position(self.position()));
122 }
123 return Ok(XmlEvent::Eof);
124 }
125
126 if self.input[self.pos] == b'<' {
127 self.read_tag()
128 } else {
129 self.read_text()
130 }
131 }
132
133 fn skip_whitespace(&mut self) {
135 while self.pos < self.input.len() {
136 match self.input[self.pos] {
137 b' ' | b'\t' | b'\r' => {
138 self.pos += 1;
139 self.col += 1;
140 }
141 b'\n' => {
142 self.pos += 1;
143 self.line += 1;
144 self.col = 1;
145 }
146 _ => break,
147 }
148 }
149 }
150
151 fn read_text(&mut self) -> Result<XmlEvent<'a>> {
153 let start = self.pos;
154
155 while self.pos < self.input.len() && self.input[self.pos] != b'<' {
157 if self.input[self.pos] == b'\n' {
158 self.line += 1;
159 self.col = 1;
160 } else {
161 self.col += 1;
162 }
163 self.pos += 1;
164 }
165
166 let text = std::str::from_utf8(&self.input[start..self.pos])
167 .map_err(|_| Error::new(crate::error::ErrorKind::InvalidUtf8))?;
168
169 let trimmed = text.trim();
171 if trimmed.is_empty() {
172 return self.next_event();
174 }
175
176 match unescape(trimmed) {
178 Ok(unescaped) => Ok(XmlEvent::Text(unescaped)),
179 Err(e) => Err(Error::invalid_escape(e.entity)),
180 }
181 }
182
183 fn read_tag(&mut self) -> Result<XmlEvent<'a>> {
185 debug_assert_eq!(self.input[self.pos], b'<');
186 self.pos += 1;
187 self.col += 1;
188
189 if self.pos >= self.input.len() {
190 return Err(Error::unexpected_eof().with_position(self.position()));
191 }
192
193 match self.input[self.pos] {
194 b'/' => self.read_end_element(),
195 b'?' => self.read_processing_instruction(),
196 b'!' => self.read_special(),
197 _ => self.read_start_element(),
198 }
199 }
200
201 fn read_start_element(&mut self) -> Result<XmlEvent<'a>> {
203 let name = self.read_name()?;
204 let attributes = self.read_attributes()?;
205
206 self.skip_whitespace();
207
208 if self.pos >= self.input.len() {
209 return Err(Error::unexpected_eof().with_position(self.position()));
210 }
211
212 if self.input[self.pos] == b'/' {
213 self.pos += 1;
215 self.col += 1;
216 self.expect_char(b'>')?;
217 Ok(XmlEvent::EmptyElement {
218 name: Cow::Borrowed(name),
219 attributes,
220 })
221 } else if self.input[self.pos] == b'>' {
222 self.pos += 1;
224 self.col += 1;
225 self.element_stack.push(name.to_string());
226 Ok(XmlEvent::StartElement {
227 name: Cow::Borrowed(name),
228 attributes,
229 })
230 } else {
231 Err(Error::syntax("expected '>' or '/>'").with_position(self.position()))
232 }
233 }
234
235 fn read_end_element(&mut self) -> Result<XmlEvent<'a>> {
237 debug_assert_eq!(self.input[self.pos], b'/');
238 self.pos += 1;
239 self.col += 1;
240
241 let name = self.read_name()?;
242 self.skip_whitespace();
243 self.expect_char(b'>')?;
244
245 match self.element_stack.pop() {
247 Some(expected) if expected == name => Ok(XmlEvent::EndElement {
248 name: Cow::Borrowed(name),
249 }),
250 Some(expected) => Err(Error::mismatched_tag(expected, name.to_string()).with_position(self.position())),
251 None => Err(Error::syntax(format!("unexpected closing tag: {}", name))
252 .with_position(self.position())),
253 }
254 }
255
256 fn read_processing_instruction(&mut self) -> Result<XmlEvent<'a>> {
258 debug_assert_eq!(self.input[self.pos], b'?');
259 self.pos += 1;
260 self.col += 1;
261
262 let target = self.read_name()?;
263
264 if target.eq_ignore_ascii_case("xml") {
266 return self.read_xml_decl();
267 }
268
269 self.skip_whitespace();
270
271 let data_start = self.pos;
273 while self.pos + 1 < self.input.len() {
274 if self.input[self.pos] == b'?' && self.input[self.pos + 1] == b'>' {
275 let data = std::str::from_utf8(&self.input[data_start..self.pos])
276 .map_err(|_| Error::new(crate::error::ErrorKind::InvalidUtf8))?;
277 self.pos += 2;
278 self.col += 2;
279 return Ok(XmlEvent::ProcessingInstruction {
280 target: Cow::Borrowed(target),
281 data: if data.trim().is_empty() {
282 None
283 } else {
284 Some(Cow::Borrowed(data.trim()))
285 },
286 });
287 }
288 if self.input[self.pos] == b'\n' {
289 self.line += 1;
290 self.col = 1;
291 } else {
292 self.col += 1;
293 }
294 self.pos += 1;
295 }
296
297 Err(Error::syntax("unterminated processing instruction").with_position(self.position()))
298 }
299
300 fn read_xml_decl(&mut self) -> Result<XmlEvent<'a>> {
302 let attributes = self.read_attributes()?;
303 self.skip_whitespace();
304
305 if self.pos + 1 >= self.input.len()
306 || self.input[self.pos] != b'?'
307 || self.input[self.pos + 1] != b'>'
308 {
309 return Err(Error::syntax("expected '?>'").with_position(self.position()));
310 }
311 self.pos += 2;
312 self.col += 2;
313
314 let mut version = None;
315 let mut encoding = None;
316 let mut standalone = None;
317
318 for attr in attributes {
319 match attr.name.as_ref() {
320 "version" => version = Some(attr.value),
321 "encoding" => encoding = Some(attr.value),
322 "standalone" => {
323 standalone = Some(attr.value.as_ref() == "yes");
324 }
325 _ => {}
326 }
327 }
328
329 Ok(XmlEvent::XmlDecl {
330 version: version.unwrap_or(Cow::Borrowed("1.0")),
331 encoding,
332 standalone,
333 })
334 }
335
336 fn read_special(&mut self) -> Result<XmlEvent<'a>> {
338 debug_assert_eq!(self.input[self.pos], b'!');
339 self.pos += 1;
340 self.col += 1;
341
342 if self.pos >= self.input.len() {
343 return Err(Error::unexpected_eof().with_position(self.position()));
344 }
345
346 if self.pos + 1 < self.input.len()
348 && self.input[self.pos] == b'-'
349 && self.input[self.pos + 1] == b'-'
350 {
351 return self.read_comment();
352 }
353
354 if self.pos + 6 < self.input.len() && &self.input[self.pos..self.pos + 7] == b"[CDATA[" {
356 return self.read_cdata();
357 }
358
359 if self.pos + 6 < self.input.len() && self.input[self.pos..].starts_with(b"DOCTYPE") {
361 return self.skip_doctype();
362 }
363
364 Err(Error::syntax("unknown construct after '<!'").with_position(self.position()))
365 }
366
367 fn read_comment(&mut self) -> Result<XmlEvent<'a>> {
369 self.pos += 2; self.col += 2;
371 let start = self.pos;
372
373 while self.pos + 2 < self.input.len() {
374 if self.input[self.pos] == b'-'
375 && self.input[self.pos + 1] == b'-'
376 && self.input[self.pos + 2] == b'>'
377 {
378 let comment = std::str::from_utf8(&self.input[start..self.pos])
379 .map_err(|_| Error::new(crate::error::ErrorKind::InvalidUtf8))?;
380 self.pos += 3;
381 self.col += 3;
382 return Ok(XmlEvent::Comment(Cow::Borrowed(comment.trim())));
383 }
384 if self.input[self.pos] == b'\n' {
385 self.line += 1;
386 self.col = 1;
387 } else {
388 self.col += 1;
389 }
390 self.pos += 1;
391 }
392
393 Err(Error::syntax("unterminated comment").with_position(self.position()))
394 }
395
396 fn read_cdata(&mut self) -> Result<XmlEvent<'a>> {
398 self.pos += 7; self.col += 7;
400 let start = self.pos;
401
402 while self.pos + 2 < self.input.len() {
403 if self.input[self.pos] == b']'
404 && self.input[self.pos + 1] == b']'
405 && self.input[self.pos + 2] == b'>'
406 {
407 let data = std::str::from_utf8(&self.input[start..self.pos])
408 .map_err(|_| Error::new(crate::error::ErrorKind::InvalidUtf8))?;
409 self.pos += 3;
410 self.col += 3;
411 return Ok(XmlEvent::CData(Cow::Borrowed(data)));
412 }
413 if self.input[self.pos] == b'\n' {
414 self.line += 1;
415 self.col = 1;
416 } else {
417 self.col += 1;
418 }
419 self.pos += 1;
420 }
421
422 Err(Error::syntax("unterminated CDATA section").with_position(self.position()))
423 }
424
425 fn skip_doctype(&mut self) -> Result<XmlEvent<'a>> {
427 let mut depth = 1;
428
429 while self.pos < self.input.len() && depth > 0 {
430 match self.input[self.pos] {
431 b'<' => depth += 1,
432 b'>' => depth -= 1,
433 b'\n' => {
434 self.line += 1;
435 self.col = 1;
436 self.pos += 1;
437 continue;
438 }
439 _ => {}
440 }
441 self.col += 1;
442 self.pos += 1;
443 }
444
445 self.next_event()
447 }
448
449 fn read_name(&mut self) -> Result<&'a str> {
451 let start = self.pos;
452
453 if self.pos >= self.input.len() {
455 return Err(Error::unexpected_eof().with_position(self.position()));
456 }
457
458 let first = self.input[self.pos];
459 if !is_name_start_char(first) {
460 return Err(Error::invalid_name(format!("invalid name start character: {:?}", first as char))
461 .with_position(self.position()));
462 }
463 self.pos += 1;
464 self.col += 1;
465
466 while self.pos < self.input.len() && is_name_char(self.input[self.pos]) {
468 self.pos += 1;
469 self.col += 1;
470 }
471
472 std::str::from_utf8(&self.input[start..self.pos])
473 .map_err(|_| Error::new(crate::error::ErrorKind::InvalidUtf8))
474 }
475
476 fn read_attributes(&mut self) -> Result<Vec<Attribute<'a>>> {
478 let mut attributes = Vec::new();
479
480 loop {
481 self.skip_whitespace();
482
483 if self.pos >= self.input.len() {
484 break;
485 }
486
487 let c = self.input[self.pos];
489 if c == b'>' || c == b'/' || c == b'?' {
490 break;
491 }
492
493 let name = self.read_name()?;
495 self.skip_whitespace();
496
497 self.expect_char(b'=')?;
499 self.skip_whitespace();
500
501 let value = self.read_attribute_value()?;
503
504 attributes.push(Attribute {
505 name: Cow::Borrowed(name),
506 value,
507 });
508 }
509
510 Ok(attributes)
511 }
512
513 fn read_attribute_value(&mut self) -> Result<Cow<'a, str>> {
515 if self.pos >= self.input.len() {
516 return Err(Error::unexpected_eof().with_position(self.position()));
517 }
518
519 let quote = self.input[self.pos];
520 if quote != b'"' && quote != b'\'' {
521 return Err(Error::syntax("expected quote").with_position(self.position()));
522 }
523 self.pos += 1;
524 self.col += 1;
525
526 let start = self.pos;
527
528 match memchr(quote, &self.input[self.pos..]) {
530 Some(offset) => {
531 let value = std::str::from_utf8(&self.input[start..self.pos + offset])
532 .map_err(|_| Error::new(crate::error::ErrorKind::InvalidUtf8))?;
533 self.pos += offset + 1;
534 self.col += offset + 1;
535
536 match unescape(value) {
538 Ok(unescaped) => Ok(unescaped),
539 Err(e) => Err(Error::invalid_escape(e.entity)),
540 }
541 }
542 None => Err(Error::syntax("unterminated attribute value").with_position(self.position())),
543 }
544 }
545
546 fn expect_char(&mut self, expected: u8) -> Result<()> {
548 if self.pos >= self.input.len() {
549 return Err(Error::unexpected_eof().with_position(self.position()));
550 }
551
552 if self.input[self.pos] != expected {
553 return Err(Error::syntax(format!(
554 "expected '{}', found '{}'",
555 expected as char,
556 self.input[self.pos] as char
557 ))
558 .with_position(self.position()));
559 }
560
561 self.pos += 1;
562 self.col += 1;
563 Ok(())
564 }
565}
566
/// Returns `true` when `b` may begin an XML name: an ASCII letter, `_`, `:`,
/// or any byte >= 0x80 (non-ASCII bytes are accepted without inspection).
#[inline]
fn is_name_start_char(b: u8) -> bool {
    match b {
        b'_' | b':' => true,
        0x80..=0xFF => true,
        _ => b.is_ascii_alphabetic(),
    }
}
573
574#[inline]
576fn is_name_char(b: u8) -> bool {
577 is_name_start_char(b) || matches!(b, b'0'..=b'9' | b'-' | b'.')
578}
579
#[cfg(test)]
mod tests {
    use super::*;

    /// Reads events until `Eof`, panicking on the first parse error so that a
    /// failure is never mistaken for the end of the document.
    fn collect_events<'a>(reader: &mut XmlReader<'a>) -> Vec<XmlEvent<'a>> {
        let mut events = Vec::new();
        loop {
            match reader
                .next_event()
                .expect("document should parse without errors")
            {
                XmlEvent::Eof => return events,
                event => events.push(event),
            }
        }
    }

    #[test]
    fn test_simple_element() {
        let mut reader = XmlReader::from_str("<root></root>");

        match reader.next_event().unwrap() {
            XmlEvent::StartElement { name, attributes } => {
                assert_eq!(name, "root");
                assert!(attributes.is_empty());
            }
            _ => panic!("expected StartElement"),
        }

        match reader.next_event().unwrap() {
            XmlEvent::EndElement { name } => {
                assert_eq!(name, "root");
            }
            _ => panic!("expected EndElement"),
        }

        assert!(matches!(reader.next_event().unwrap(), XmlEvent::Eof));
    }

    #[test]
    fn test_empty_element() {
        let mut reader = XmlReader::from_str("<root/>");

        match reader.next_event().unwrap() {
            XmlEvent::EmptyElement { name, attributes } => {
                assert_eq!(name, "root");
                assert!(attributes.is_empty());
            }
            _ => panic!("expected EmptyElement"),
        }

        assert!(matches!(reader.next_event().unwrap(), XmlEvent::Eof));
    }

    #[test]
    fn test_attributes() {
        let mut reader = XmlReader::from_str(r#"<root id="1" name="test"/>"#);

        match reader.next_event().unwrap() {
            XmlEvent::EmptyElement { name, attributes } => {
                assert_eq!(name, "root");
                assert_eq!(attributes.len(), 2);
                assert_eq!(attributes[0].name, "id");
                assert_eq!(attributes[0].value, "1");
                assert_eq!(attributes[1].name, "name");
                assert_eq!(attributes[1].value, "test");
            }
            _ => panic!("expected EmptyElement"),
        }
    }

    #[test]
    fn test_text_content() {
        let mut reader = XmlReader::from_str("<root>Hello, World!</root>");

        // Skip `<root>`.
        reader.next_event().unwrap();

        match reader.next_event().unwrap() {
            XmlEvent::Text(text) => {
                assert_eq!(text, "Hello, World!");
            }
            _ => panic!("expected Text"),
        }
    }

    #[test]
    fn test_escaped_text() {
        // The angle brackets must be written as entities: a literal `<Hello>`
        // would be parsed as a start tag and never reach the text reader.
        let mut reader = XmlReader::from_str("<root>&lt;Hello&gt;</root>");

        // Skip `<root>`.
        reader.next_event().unwrap();

        match reader.next_event().unwrap() {
            XmlEvent::Text(text) => {
                assert_eq!(text, "<Hello>");
            }
            _ => panic!("expected Text"),
        }
    }

    #[test]
    fn test_xml_declaration() {
        let mut reader =
            XmlReader::from_str(r#"<?xml version="1.0" encoding="UTF-8"?><root/>"#);

        match reader.next_event().unwrap() {
            XmlEvent::XmlDecl {
                version,
                encoding,
                standalone,
            } => {
                assert_eq!(version, "1.0");
                assert_eq!(encoding.as_deref(), Some("UTF-8"));
                assert_eq!(standalone, None);
            }
            _ => panic!("expected XmlDecl"),
        }
    }

    #[test]
    fn test_comment() {
        let mut reader = XmlReader::from_str("<!-- This is a comment --><root/>");

        match reader.next_event().unwrap() {
            XmlEvent::Comment(comment) => {
                assert_eq!(comment, "This is a comment");
            }
            _ => panic!("expected Comment"),
        }
    }

    #[test]
    fn test_cdata() {
        let mut reader =
            XmlReader::from_str("<root><![CDATA[<special>content</special>]]></root>");

        // Skip `<root>`.
        reader.next_event().unwrap();

        match reader.next_event().unwrap() {
            XmlEvent::CData(data) => {
                assert_eq!(data, "<special>content</special>");
            }
            _ => panic!("expected CData"),
        }
    }

    #[test]
    fn test_nested_elements() {
        let xml = r#"<root><child1><grandchild/></child1><child2/></root>"#;
        let mut reader = XmlReader::from_str(xml);

        // A parse error panics inside `collect_events` instead of silently
        // shortening the event list.
        let events = collect_events(&mut reader);

        assert_eq!(events.len(), 6);
    }

    #[test]
    fn test_mismatched_tags() {
        let mut reader = XmlReader::from_str("<root></wrong>");

        // `<root>` parses; `</wrong>` does not match it.
        reader.next_event().unwrap();
        assert!(reader.next_event().is_err());
    }

    #[test]
    fn test_unclosed_tag() {
        let mut reader = XmlReader::from_str("<root>");

        // `<root>` parses; the end of input then finds it still open.
        reader.next_event().unwrap();
        assert!(reader.next_event().is_err());
    }

    #[test]
    fn test_processing_instruction() {
        let mut reader = XmlReader::from_str("<?target data?><root/>");

        match reader.next_event().unwrap() {
            XmlEvent::ProcessingInstruction { target, data } => {
                assert_eq!(target, "target");
                assert_eq!(data.as_deref(), Some("data"));
            }
            _ => panic!("expected ProcessingInstruction"),
        }
    }

    #[test]
    fn test_attribute_with_single_quotes() {
        let mut reader = XmlReader::from_str("<root attr='value'/>");

        match reader.next_event().unwrap() {
            XmlEvent::EmptyElement { attributes, .. } => {
                assert_eq!(attributes[0].value, "value");
            }
            _ => panic!("expected EmptyElement"),
        }
    }

    #[test]
    fn test_position_tracking() {
        let xml = "<root>\n <child/>\n</root>";
        let mut reader = XmlReader::from_str(xml);

        reader.next_event().unwrap(); // <root>
        reader.next_event().unwrap(); // <child/>, which sits on line 2

        let pos = reader.position();
        assert_eq!(pos.line, 2);
    }

    #[test]
    fn test_depth_tracking() {
        let mut reader = XmlReader::from_str("<a><b><c></c></b></a>");

        assert_eq!(reader.depth(), 0);

        reader.next_event().unwrap(); // <a>
        assert_eq!(reader.depth(), 1);

        reader.next_event().unwrap(); // <b>
        assert_eq!(reader.depth(), 2);

        reader.next_event().unwrap(); // <c>
        assert_eq!(reader.depth(), 3);

        reader.next_event().unwrap(); // </c>
        assert_eq!(reader.depth(), 2);
    }
}