1use crate::error::{Error, Position, Result};
7use crate::escape::unescape;
8use memchr::{memchr, memchr2};
9use std::borrow::Cow;
10
/// Byte-indexed lookup table: `true` for the four whitespace bytes that
/// `skip_whitespace_fast` steps over (space, tab, line feed, carriage return).
static IS_WHITESPACE: [bool; 256] = {
    let mut table = [false; 256];
    let mut idx = 0usize;
    // `for` loops are not available in a static initializer, hence `while`.
    while idx < 256 {
        table[idx] = matches!(idx as u8, b' ' | b'\t' | b'\n' | b'\r');
        idx += 1;
    }
    table
};
20
/// Byte-indexed lookup table of bytes accepted as the first byte of a name:
/// ASCII letters, `_`, `:`, and every byte in `0x80..=0xFF` (so the bytes of
/// multi-byte UTF-8 sequences are accepted without further classification).
static IS_NAME_START: [bool; 256] = {
    let mut table = [false; 256];
    let mut idx = 0usize;
    // `for` loops are not available in a static initializer, hence `while`.
    while idx < 256 {
        table[idx] = matches!(
            idx as u8,
            b'A'..=b'Z' | b'a'..=b'z' | b'_' | b':' | 0x80..=0xFF
        );
        idx += 1;
    }
    table
};
44
45static IS_NAME_CHAR: [bool; 256] = {
47 let mut lut = IS_NAME_START;
48 let mut i = b'0';
49 while i <= b'9' {
50 lut[i as usize] = true;
51 i += 1;
52 }
53 lut[b'-' as usize] = true;
54 lut[b'.' as usize] = true;
55 lut
56};
57
/// A single parse event produced by [`XmlReader::next_event`].
///
/// String payloads are `Cow<'a, str>` so they can borrow from the reader's
/// input. Names, comments, CDATA and processing-instruction payloads are always
/// borrowed; text and attribute values are whatever `unescape` returns.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum XmlEvent<'a> {
    /// The `<?xml ... ?>` declaration.
    XmlDecl {
        /// Value of the `version` pseudo-attribute; `"1.0"` when it is absent.
        version: Cow<'a, str>,
        /// Value of the `encoding` pseudo-attribute, if present.
        encoding: Option<Cow<'a, str>>,
        /// `Some(true)` when `standalone` is exactly `yes`, `Some(false)` for
        /// any other value, `None` when the pseudo-attribute is absent.
        standalone: Option<bool>,
    },
    /// An opening tag `<name ...>`.
    StartElement {
        /// Element name exactly as written in the input.
        name: Cow<'a, str>,
        /// Attributes in document order.
        attributes: Vec<Attribute<'a>>,
    },
    /// A closing tag `</name>`.
    EndElement {
        /// Element name exactly as written in the input.
        name: Cow<'a, str>,
    },
    /// A self-closing tag `<name ... />`.
    EmptyElement {
        /// Element name exactly as written in the input.
        name: Cow<'a, str>,
        /// Attributes in document order.
        attributes: Vec<Attribute<'a>>,
    },
    /// Character data between tags, trimmed and unescaped. Runs that are empty
    /// after trimming are never emitted.
    Text(Cow<'a, str>),
    /// Content of a `<![CDATA[ ... ]]>` section, verbatim (neither trimmed nor
    /// unescaped).
    CData(Cow<'a, str>),
    /// Content of a `<!-- ... -->` comment, trimmed.
    Comment(Cow<'a, str>),
    /// A processing instruction `<?target data?>` other than the XML declaration.
    ProcessingInstruction {
        /// The instruction target.
        target: Cow<'a, str>,
        /// Trimmed instruction data; `None` when nothing but whitespace follows
        /// the target.
        data: Option<Cow<'a, str>>,
    },
    /// End of input reached with no element left open.
    Eof,
}

/// A single `name="value"` attribute of an element or of the XML declaration.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Attribute<'a> {
    /// Attribute name exactly as written in the input.
    pub name: Cow<'a, str>,
    /// Attribute value with the surrounding quotes removed, after unescaping.
    pub value: Cow<'a, str>,
}
114
/// A pull-style XML reader over a borrowed byte buffer.
///
/// `Clone` is cheap apart from the open-element stack, so a clone can serve as
/// a checkpoint for look-ahead. Note that the derived `Debug` output prints the
/// whole input buffer.
#[derive(Debug, Clone)]
pub struct XmlReader<'a> {
    /// The complete input being parsed.
    input: &'a [u8],
    /// Byte offset of the next unread byte in `input`.
    pos: usize,
    /// Current line, starting at 1 and incremented on every `\n`.
    line: usize,
    /// Current column, starting at 1 and counted in bytes since the last `\n`.
    col: usize,
    /// Names of the elements that are currently open, innermost last.
    element_stack: Vec<String>,
}
124
125impl<'a> XmlReader<'a> {
126 #[inline]
128 #[allow(clippy::should_implement_trait)]
129 pub fn from_str(s: &'a str) -> Self {
130 Self::from_bytes(s.as_bytes())
131 }
132
133 #[inline]
135 pub fn from_bytes(input: &'a [u8]) -> Self {
136 Self {
137 input,
138 pos: 0,
139 line: 1,
140 col: 1,
141 element_stack: Vec::with_capacity(8), }
143 }
144
145 #[inline]
147 pub fn position(&self) -> Position {
148 Position {
149 line: self.line,
150 column: self.col,
151 offset: self.pos,
152 }
153 }
154
155 #[inline]
157 pub fn depth(&self) -> usize {
158 self.element_stack.len()
159 }
160
161 #[inline]
163 pub fn next_event(&mut self) -> Result<XmlEvent<'a>> {
164 self.skip_whitespace_fast();
165
166 if self.pos >= self.input.len() {
167 if let Some(tag) = self.element_stack.pop() {
168 return Err(Error::unclosed_tag(tag).with_position(self.position()));
169 }
170 return Ok(XmlEvent::Eof);
171 }
172
173 if self.input[self.pos] == b'<' {
174 self.read_tag()
175 } else {
176 self.read_text()
177 }
178 }
179
180 #[inline(always)]
182 fn skip_whitespace_fast(&mut self) {
183 while self.pos < self.input.len() {
184 let b = self.input[self.pos];
185 if !IS_WHITESPACE[b as usize] {
186 break;
187 }
188 if b == b'\n' {
189 self.line += 1;
190 self.col = 1;
191 } else {
192 self.col += 1;
193 }
194 self.pos += 1;
195 }
196 }
197
198 #[inline]
200 fn read_text(&mut self) -> Result<XmlEvent<'a>> {
201 let start = self.pos;
202
203 match memchr(b'<', &self.input[self.pos..]) {
205 Some(offset) => {
206 self.update_position_for_range(self.pos, self.pos + offset);
208 self.pos += offset;
209 }
210 None => {
211 self.update_position_for_range(self.pos, self.input.len());
212 self.pos = self.input.len();
213 }
214 }
215
216 let text = std::str::from_utf8(&self.input[start..self.pos])
217 .map_err(|_| Error::new(crate::error::ErrorKind::InvalidUtf8))?;
218
219 let trimmed = text.trim();
221 if trimmed.is_empty() {
222 return self.next_event();
223 }
224
225 match unescape(trimmed) {
227 Ok(unescaped) => Ok(XmlEvent::Text(unescaped)),
228 Err(e) => Err(Error::invalid_escape(e.entity)),
229 }
230 }
231
232 #[inline(always)]
234 fn update_position_for_range(&mut self, start: usize, end: usize) {
235 let slice = &self.input[start..end];
237 for &b in slice {
238 if b == b'\n' {
239 self.line += 1;
240 self.col = 1;
241 } else {
242 self.col += 1;
243 }
244 }
245 }
246
247 #[inline]
249 fn read_tag(&mut self) -> Result<XmlEvent<'a>> {
250 debug_assert_eq!(self.input[self.pos], b'<');
251 self.pos += 1;
252 self.col += 1;
253
254 if self.pos >= self.input.len() {
255 return Err(Error::unexpected_eof().with_position(self.position()));
256 }
257
258 match self.input[self.pos] {
259 b'/' => self.read_end_element(),
260 b'?' => self.read_processing_instruction(),
261 b'!' => self.read_special(),
262 _ => self.read_start_element(),
263 }
264 }
265
266 #[inline]
268 fn read_start_element(&mut self) -> Result<XmlEvent<'a>> {
269 let name = self.read_name()?;
270 let attributes = self.read_attributes()?;
271
272 self.skip_whitespace_fast();
273
274 if self.pos >= self.input.len() {
275 return Err(Error::unexpected_eof().with_position(self.position()));
276 }
277
278 if self.input[self.pos] == b'/' {
279 self.pos += 1;
281 self.col += 1;
282 self.expect_char(b'>')?;
283 Ok(XmlEvent::EmptyElement {
284 name: Cow::Borrowed(name),
285 attributes,
286 })
287 } else if self.input[self.pos] == b'>' {
288 self.pos += 1;
290 self.col += 1;
291 self.element_stack.push(name.to_string());
292 Ok(XmlEvent::StartElement {
293 name: Cow::Borrowed(name),
294 attributes,
295 })
296 } else {
297 Err(Error::syntax("expected '>' or '/>'").with_position(self.position()))
298 }
299 }
300
301 #[inline]
303 fn read_end_element(&mut self) -> Result<XmlEvent<'a>> {
304 debug_assert_eq!(self.input[self.pos], b'/');
305 self.pos += 1;
306 self.col += 1;
307
308 let name = self.read_name()?;
309 self.skip_whitespace_fast();
310 self.expect_char(b'>')?;
311
312 match self.element_stack.pop() {
314 Some(expected) if expected == name => Ok(XmlEvent::EndElement {
315 name: Cow::Borrowed(name),
316 }),
317 Some(expected) => Err(Error::mismatched_tag(expected, name.to_string()).with_position(self.position())),
318 None => Err(Error::syntax(format!("unexpected closing tag: {}", name))
319 .with_position(self.position())),
320 }
321 }
322
323 fn read_processing_instruction(&mut self) -> Result<XmlEvent<'a>> {
325 debug_assert_eq!(self.input[self.pos], b'?');
326 self.pos += 1;
327 self.col += 1;
328
329 let target = self.read_name()?;
330
331 if target.eq_ignore_ascii_case("xml") {
333 return self.read_xml_decl();
334 }
335
336 self.skip_whitespace_fast();
337
338 let data_start = self.pos;
340
341 while self.pos + 1 < self.input.len() {
342 if let Some(offset) = memchr(b'?', &self.input[self.pos..]) {
343 let check_pos = self.pos + offset;
344 if check_pos + 1 < self.input.len() && self.input[check_pos + 1] == b'>' {
345 self.update_position_for_range(self.pos, check_pos);
346 self.pos = check_pos;
347
348 let data = std::str::from_utf8(&self.input[data_start..self.pos])
349 .map_err(|_| Error::new(crate::error::ErrorKind::InvalidUtf8))?;
350 self.pos += 2;
351 self.col += 2;
352 return Ok(XmlEvent::ProcessingInstruction {
353 target: Cow::Borrowed(target),
354 data: if data.trim().is_empty() {
355 None
356 } else {
357 Some(Cow::Borrowed(data.trim()))
358 },
359 });
360 }
361 self.update_position_for_range(self.pos, check_pos + 1);
363 self.pos = check_pos + 1;
364 } else {
365 break;
366 }
367 }
368
369 Err(Error::syntax("unterminated processing instruction").with_position(self.position()))
370 }
371
372 fn read_xml_decl(&mut self) -> Result<XmlEvent<'a>> {
374 let attributes = self.read_attributes()?;
375 self.skip_whitespace_fast();
376
377 if self.pos + 1 >= self.input.len()
378 || self.input[self.pos] != b'?'
379 || self.input[self.pos + 1] != b'>'
380 {
381 return Err(Error::syntax("expected '?>'").with_position(self.position()));
382 }
383 self.pos += 2;
384 self.col += 2;
385
386 let mut version = None;
387 let mut encoding = None;
388 let mut standalone = None;
389
390 for attr in attributes {
391 match attr.name.as_ref() {
392 "version" => version = Some(attr.value),
393 "encoding" => encoding = Some(attr.value),
394 "standalone" => {
395 standalone = Some(attr.value.as_ref() == "yes");
396 }
397 _ => {}
398 }
399 }
400
401 Ok(XmlEvent::XmlDecl {
402 version: version.unwrap_or(Cow::Borrowed("1.0")),
403 encoding,
404 standalone,
405 })
406 }
407
408 fn read_special(&mut self) -> Result<XmlEvent<'a>> {
410 debug_assert_eq!(self.input[self.pos], b'!');
411 self.pos += 1;
412 self.col += 1;
413
414 if self.pos >= self.input.len() {
415 return Err(Error::unexpected_eof().with_position(self.position()));
416 }
417
418 if self.pos + 1 < self.input.len()
420 && self.input[self.pos] == b'-'
421 && self.input[self.pos + 1] == b'-'
422 {
423 return self.read_comment();
424 }
425
426 if self.pos + 6 < self.input.len() && &self.input[self.pos..self.pos + 7] == b"[CDATA[" {
428 return self.read_cdata();
429 }
430
431 if self.pos + 6 < self.input.len() && self.input[self.pos..].starts_with(b"DOCTYPE") {
433 return self.skip_doctype();
434 }
435
436 Err(Error::syntax("unknown construct after '<!'").with_position(self.position()))
437 }
438
439 fn read_comment(&mut self) -> Result<XmlEvent<'a>> {
441 self.pos += 2; self.col += 2;
443 let start = self.pos;
444
445 while self.pos + 2 < self.input.len() {
447 if let Some(offset) = memchr(b'-', &self.input[self.pos..]) {
448 let check_pos = self.pos + offset;
449 if check_pos + 2 < self.input.len()
450 && self.input[check_pos + 1] == b'-'
451 && self.input[check_pos + 2] == b'>'
452 {
453 self.update_position_for_range(self.pos, check_pos);
454 let comment = std::str::from_utf8(&self.input[start..check_pos])
455 .map_err(|_| Error::new(crate::error::ErrorKind::InvalidUtf8))?;
456 self.pos = check_pos + 3;
457 self.col += 3;
458 return Ok(XmlEvent::Comment(Cow::Borrowed(comment.trim())));
459 }
460 self.update_position_for_range(self.pos, check_pos + 1);
461 self.pos = check_pos + 1;
462 } else {
463 break;
464 }
465 }
466
467 Err(Error::syntax("unterminated comment").with_position(self.position()))
468 }
469
470 fn read_cdata(&mut self) -> Result<XmlEvent<'a>> {
472 self.pos += 7; self.col += 7;
474 let start = self.pos;
475
476 while self.pos + 2 < self.input.len() {
478 if let Some(offset) = memchr(b']', &self.input[self.pos..]) {
479 let check_pos = self.pos + offset;
480 if check_pos + 2 < self.input.len()
481 && self.input[check_pos + 1] == b']'
482 && self.input[check_pos + 2] == b'>'
483 {
484 self.update_position_for_range(self.pos, check_pos);
485 let data = std::str::from_utf8(&self.input[start..check_pos])
486 .map_err(|_| Error::new(crate::error::ErrorKind::InvalidUtf8))?;
487 self.pos = check_pos + 3;
488 self.col += 3;
489 return Ok(XmlEvent::CData(Cow::Borrowed(data)));
490 }
491 self.update_position_for_range(self.pos, check_pos + 1);
492 self.pos = check_pos + 1;
493 } else {
494 break;
495 }
496 }
497
498 Err(Error::syntax("unterminated CDATA section").with_position(self.position()))
499 }
500
501 fn skip_doctype(&mut self) -> Result<XmlEvent<'a>> {
503 let mut depth = 1;
504
505 while self.pos < self.input.len() && depth > 0 {
506 if let Some(offset) = memchr2(b'<', b'>', &self.input[self.pos..]) {
508 self.update_position_for_range(self.pos, self.pos + offset);
509 self.pos += offset;
510
511 match self.input[self.pos] {
512 b'<' => depth += 1,
513 b'>' => depth -= 1,
514 _ => {}
515 }
516 self.col += 1;
517 self.pos += 1;
518 } else {
519 self.update_position_for_range(self.pos, self.input.len());
520 self.pos = self.input.len();
521 break;
522 }
523 }
524
525 self.next_event()
527 }
528
529 #[inline]
531 fn read_name(&mut self) -> Result<&'a str> {
532 let start = self.pos;
533
534 if self.pos >= self.input.len() {
536 return Err(Error::unexpected_eof().with_position(self.position()));
537 }
538
539 let first = self.input[self.pos];
540 if !IS_NAME_START[first as usize] {
541 return Err(Error::invalid_name(format!("invalid name start character: {:?}", first as char))
542 .with_position(self.position()));
543 }
544 self.pos += 1;
545 self.col += 1;
546
547 while self.pos < self.input.len() && IS_NAME_CHAR[self.input[self.pos] as usize] {
549 self.pos += 1;
550 self.col += 1;
551 }
552
553 std::str::from_utf8(&self.input[start..self.pos])
554 .map_err(|_| Error::new(crate::error::ErrorKind::InvalidUtf8))
555 }
556
557 #[inline]
559 fn read_attributes(&mut self) -> Result<Vec<Attribute<'a>>> {
560 let mut attributes = Vec::with_capacity(4); loop {
563 self.skip_whitespace_fast();
564
565 if self.pos >= self.input.len() {
566 break;
567 }
568
569 let c = self.input[self.pos];
571 if c == b'>' || c == b'/' || c == b'?' {
572 break;
573 }
574
575 let name = self.read_name()?;
577 self.skip_whitespace_fast();
578
579 self.expect_char(b'=')?;
581 self.skip_whitespace_fast();
582
583 let value = self.read_attribute_value()?;
585
586 attributes.push(Attribute {
587 name: Cow::Borrowed(name),
588 value,
589 });
590 }
591
592 Ok(attributes)
593 }
594
595 #[inline]
597 fn read_attribute_value(&mut self) -> Result<Cow<'a, str>> {
598 if self.pos >= self.input.len() {
599 return Err(Error::unexpected_eof().with_position(self.position()));
600 }
601
602 let quote = self.input[self.pos];
603 if quote != b'"' && quote != b'\'' {
604 return Err(Error::syntax("expected quote").with_position(self.position()));
605 }
606 self.pos += 1;
607 self.col += 1;
608
609 let start = self.pos;
610
611 match memchr(quote, &self.input[self.pos..]) {
613 Some(offset) => {
614 let value = std::str::from_utf8(&self.input[start..self.pos + offset])
615 .map_err(|_| Error::new(crate::error::ErrorKind::InvalidUtf8))?;
616 self.pos += offset + 1;
617 self.col += offset + 1;
618
619 match unescape(value) {
621 Ok(unescaped) => Ok(unescaped),
622 Err(e) => Err(Error::invalid_escape(e.entity)),
623 }
624 }
625 None => Err(Error::syntax("unterminated attribute value").with_position(self.position())),
626 }
627 }
628
629 #[inline(always)]
631 fn expect_char(&mut self, expected: u8) -> Result<()> {
632 if self.pos >= self.input.len() {
633 return Err(Error::unexpected_eof().with_position(self.position()));
634 }
635
636 if self.input[self.pos] != expected {
637 return Err(Error::syntax(format!(
638 "expected '{}', found '{}'",
639 expected as char,
640 self.input[self.pos] as char
641 ))
642 .with_position(self.position()));
643 }
644
645 self.pos += 1;
646 self.col += 1;
647 Ok(())
648 }
649}
650
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_simple_element() {
        let mut reader = XmlReader::from_str("<root></root>");

        match reader.next_event().unwrap() {
            XmlEvent::StartElement { name, attributes } => {
                assert_eq!(name, "root");
                assert!(attributes.is_empty());
            }
            _ => panic!("expected StartElement"),
        }

        match reader.next_event().unwrap() {
            XmlEvent::EndElement { name } => {
                assert_eq!(name, "root");
            }
            _ => panic!("expected EndElement"),
        }

        assert!(matches!(reader.next_event().unwrap(), XmlEvent::Eof));
    }

    #[test]
    fn test_empty_element() {
        let mut reader = XmlReader::from_str("<root/>");

        match reader.next_event().unwrap() {
            XmlEvent::EmptyElement { name, attributes } => {
                assert_eq!(name, "root");
                assert!(attributes.is_empty());
            }
            _ => panic!("expected EmptyElement"),
        }

        assert!(matches!(reader.next_event().unwrap(), XmlEvent::Eof));
    }

    #[test]
    fn test_attributes() {
        let mut reader = XmlReader::from_str(r#"<root id="1" name="test"/>"#);

        match reader.next_event().unwrap() {
            XmlEvent::EmptyElement { name, attributes } => {
                assert_eq!(name, "root");
                assert_eq!(attributes.len(), 2);
                assert_eq!(attributes[0].name, "id");
                assert_eq!(attributes[0].value, "1");
                assert_eq!(attributes[1].name, "name");
                assert_eq!(attributes[1].value, "test");
            }
            _ => panic!("expected EmptyElement"),
        }
    }

    #[test]
    fn test_text_content() {
        let mut reader = XmlReader::from_str("<root>Hello, World!</root>");

        // Skip the start tag.
        reader.next_event().unwrap();

        match reader.next_event().unwrap() {
            XmlEvent::Text(text) => {
                assert_eq!(text, "Hello, World!");
            }
            _ => panic!("expected Text"),
        }
    }

    #[test]
    fn test_escaped_text() {
        // The markup characters must be written as entities in the input;
        // a literal `<Hello>` would be parsed as a start tag, not as text.
        let mut reader = XmlReader::from_str("<root>&lt;Hello&gt;</root>");

        // Skip the start tag.
        reader.next_event().unwrap();

        match reader.next_event().unwrap() {
            XmlEvent::Text(text) => {
                assert_eq!(text, "<Hello>");
            }
            _ => panic!("expected Text"),
        }
    }

    #[test]
    fn test_xml_declaration() {
        let mut reader =
            XmlReader::from_str(r#"<?xml version="1.0" encoding="UTF-8"?><root/>"#);

        match reader.next_event().unwrap() {
            XmlEvent::XmlDecl {
                version,
                encoding,
                standalone,
            } => {
                assert_eq!(version, "1.0");
                assert_eq!(encoding.as_deref(), Some("UTF-8"));
                assert_eq!(standalone, None);
            }
            _ => panic!("expected XmlDecl"),
        }
    }

    #[test]
    fn test_comment() {
        let mut reader = XmlReader::from_str("<!-- This is a comment --><root/>");

        match reader.next_event().unwrap() {
            XmlEvent::Comment(comment) => {
                assert_eq!(comment, "This is a comment");
            }
            _ => panic!("expected Comment"),
        }
    }

    #[test]
    fn test_cdata() {
        let mut reader =
            XmlReader::from_str("<root><![CDATA[<special>content</special>]]></root>");

        // Skip the start tag.
        reader.next_event().unwrap();

        match reader.next_event().unwrap() {
            XmlEvent::CData(data) => {
                assert_eq!(data, "<special>content</special>");
            }
            _ => panic!("expected CData"),
        }
    }

    #[test]
    fn test_nested_elements() {
        let xml = r#"<root><child1><grandchild/></child1><child2/></root>"#;
        let mut reader = XmlReader::from_str(xml);

        // Fail loudly on a parse error instead of silently ending the stream,
        // so a regression cannot hide behind a coincidentally matching count.
        let mut events = Vec::new();
        loop {
            match reader.next_event().expect("well-formed input must parse") {
                XmlEvent::Eof => break,
                event => events.push(event),
            }
        }

        assert_eq!(events.len(), 6);
    }

    #[test]
    fn test_mismatched_tags() {
        let mut reader = XmlReader::from_str("<root></wrong>");

        // The start tag parses; the mismatched end tag must not.
        reader.next_event().unwrap();
        assert!(reader.next_event().is_err());
    }

    #[test]
    fn test_unclosed_tag() {
        let mut reader = XmlReader::from_str("<root>");

        // The start tag parses; reaching end of input with it open must not.
        reader.next_event().unwrap();
        assert!(reader.next_event().is_err());
    }

    #[test]
    fn test_processing_instruction() {
        let mut reader = XmlReader::from_str("<?target data?><root/>");

        match reader.next_event().unwrap() {
            XmlEvent::ProcessingInstruction { target, data } => {
                assert_eq!(target, "target");
                assert_eq!(data.as_deref(), Some("data"));
            }
            _ => panic!("expected ProcessingInstruction"),
        }
    }

    #[test]
    fn test_attribute_with_single_quotes() {
        let mut reader = XmlReader::from_str("<root attr='value'/>");

        match reader.next_event().unwrap() {
            XmlEvent::EmptyElement { attributes, .. } => {
                assert_eq!(attributes[0].value, "value");
            }
            _ => panic!("expected EmptyElement"),
        }
    }

    #[test]
    fn test_position_tracking() {
        let xml = "<root>\n <child/>\n</root>";
        let mut reader = XmlReader::from_str(xml);

        // Read `<root>` and then `<child/>`, which sits on the second line.
        reader.next_event().unwrap();
        reader.next_event().unwrap();

        let pos = reader.position();
        assert!(pos.line >= 2);
    }

    #[test]
    fn test_depth_tracking() {
        let mut reader = XmlReader::from_str("<a><b><c></c></b></a>");

        assert_eq!(reader.depth(), 0);

        reader.next_event().unwrap(); // <a>
        assert_eq!(reader.depth(), 1);

        reader.next_event().unwrap(); // <b>
        assert_eq!(reader.depth(), 2);

        reader.next_event().unwrap(); // <c>
        assert_eq!(reader.depth(), 3);

        reader.next_event().unwrap(); // </c>
        assert_eq!(reader.depth(), 2);
    }
}