speedy_xml/
reader.rs

1//! An XML parser that tries to replicate RapidXML's *very incorrect* parsing behaviour.
2//!
3//! Prefixed names like `mod:findName` are implemented as an extension.
4//!
5//! Parsing is done using default flags.
6//! Note that the behaviour of non-default flags can usually be reconstructed after parsing
7//! with default flags.
8//! That is unless the implementation of the flag is "buggy" in RapidXML itself, for example
9//! `parse_pi_nodes` and `parse_declaration_node` change behaviour on some invalid input.
10
11use std::{
12    borrow::Cow,
13    fmt::{Debug, Display},
14    ops::Range,
15};
16
17use crate::{
18    escape::unescape,
19    lut::{is_invalid_attribute_name, is_invalid_name, is_whitespace},
20};
21
22#[derive(Debug, Clone, Copy)]
23/// An event emitted by start tags like `<hello name="value">` or empty tags like
24/// `<hello name="value"/>`.
25pub struct StartEvent<'a> {
26    text: &'a str,
27    prefix_end: usize,
28    name_end: usize,
29}
30
31impl<'a> StartEvent<'a> {
32    /// Returns the prefix component of this event's prefixed name, if present.
33    pub fn prefix(&self) -> Option<&'a str> {
34        (self.prefix_end > 0).then(|| &self.text[1..self.prefix_end])
35    }
36
37    /// Returns the name component of this event's prefixed name.
38    pub fn name(&self) -> &'a str {
39        &self.text[self.prefix_end + 1..self.name_end]
40    }
41
42    /// Returns `true` if this event is an empty tag.
43    ///
44    /// # Notes
45    ///
46    /// This method is not suitable for checking whether the content of this
47    /// element is empty, it only checks whether the tag itself is of the
48    /// self-closing variety.
49    pub fn is_empty(&self) -> bool {
50        self.text.as_bytes()[self.text.len() - 2] == b'/'
51    }
52
53    /// Returns the span of this tag in `reader`.
54    ///
55    /// # Panics
56    ///
57    /// May panic if `reader` is not the [`Reader`] that this event originated from.
58    pub fn position_in(&self, reader: &Reader) -> Range<usize> {
59        reader.range_for_ptrs(self.text.as_bytes().as_ptr_range())
60    }
61
62    /// Returns the span of this tag's name component in `reader`.
63    ///
64    /// # Panics
65    ///
66    /// May panic if `reader` is not the [`Reader`] that this event originated from.
67    pub fn name_position_in(&self, reader: &Reader) -> Range<usize> {
68        reader.range_for_ptrs(self.name().as_bytes().as_ptr_range())
69    }
70
71    /// Returns the span of this tag's prefix component in `reader`.
72    ///
73    /// # Panics
74    ///
75    /// May panic if `reader` is not the [`Reader`] that this event originated from.
76    pub fn prefix_position_in(&self, reader: &Reader) -> Option<Range<usize>> {
77        (self.prefix_end > 0)
78            .then(|| reader.range_for_ptrs(self.text.as_bytes()[1..self.prefix_end].as_ptr_range()))
79    }
80
81    /// Returns the span of this tag's prefixed name in `reader`.
82    ///
83    /// # Panics
84    ///
85    /// May panic if `reader` is not the [`Reader`] that this event originated from.
86    pub fn prefixed_name_position_in(&self, reader: &Reader) -> Range<usize> {
87        reader.range_for_ptrs(self.text.as_bytes()[1..self.name_end].as_ptr_range())
88    }
89
90    /// Returns an iterator over the attribute events of this start tag.
91    pub fn attributes(&self) -> Attributes<'a> {
92        Attributes(ParsingBuffer::new(&self.text[self.name_end..]))
93    }
94}
95
#[derive(Debug, Clone, Copy)]
/// An event returned by the [`Attributes`] iterator that represents a single attribute on a start tag.
pub struct AttributeEvent<'a> {
    // The attribute's text, from the first byte of its name through its closing quote.
    pub(crate) text: &'a str,
    // Offset into `text` one past the end of the attribute name.
    name_end: usize,
    // Offset into `text` of the first byte of the value (just after the opening quote).
    value_start: usize,
}
103
/// The quote character an attribute's value can be wrapped in.
///
/// Each variant's discriminant is the ASCII byte of the quote character it represents.
#[repr(u8)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum AttributeQuote {
    /// A single quote (`'`) character.
    Single = b'\'',
    /// A double quote (`"`) character.
    Double = b'"',
}

impl AttributeQuote {
    /// Returns the quote character converted to a [`char`].
    pub fn to_char(self) -> char {
        // The discriminant is the quote's ASCII byte, so the conversion is lossless.
        char::from(self as u8)
    }
}
119
120impl<'a> AttributeEvent<'a> {
121    /// Returns this attribute's name.
122    pub fn name(&self) -> &'a str {
123        &self.text[..self.name_end]
124    }
125
126    /// Returns this attribute's unescaped value.
127    pub fn value(&self) -> Cow<'a, str> {
128        unescape(self.raw_value())
129    }
130
131    /// Returns this attribute's escaped value.
132    pub fn raw_value(&self) -> &'a str {
133        &self.text[self.value_start..self.text.len() - 1]
134    }
135
136    /// Returns the quote character that this attribute's value was wrapped in.
137    pub fn quote(&self) -> AttributeQuote {
138        match self.text.bytes().last().unwrap() {
139            b'\'' => AttributeQuote::Single,
140            b'\"' => AttributeQuote::Double,
141            _ => unreachable!(),
142        }
143    }
144
145    /// Returns the span of this atribute in `reader`.
146    ///
147    /// # Panics
148    ///
149    /// May panic if `reader` is not the [`Reader`] that this event originated from.
150    pub fn position_in(&self, reader: &Reader) -> Range<usize> {
151        reader.range_for_ptrs(self.text.as_bytes().as_ptr_range())
152    }
153
154    /// Returns the span of this atribute's name in `reader`.
155    ///
156    /// # Panics
157    ///
158    /// May panic if `reader` is not the [`Reader`] that this event originated from.
159    pub fn name_position_in(&self, reader: &Reader) -> Range<usize> {
160        reader.range_for_ptrs(self.name().as_bytes().as_ptr_range())
161    }
162
163    /// Returns the span of this atribute's value in `reader`.
164    ///
165    /// # Panics
166    ///
167    /// May panic if `reader` is not the [`Reader`] that this event originated from.
168    pub fn value_position_in(&self, reader: &Reader) -> Range<usize> {
169        reader.range_for_ptrs(self.raw_value().as_bytes().as_ptr_range())
170    }
171}
172
173#[derive(Debug, Clone, Copy)]
174/// An event emitted by end tags like `</hello>`.
175pub struct EndEvent<'a> {
176    text: &'a str,
177    prefix_end: usize,
178    name_end: usize,
179}
180
181impl<'a> EndEvent<'a> {
182    /// Returns the prefix component of this event's prefixed name, if present.
183    pub fn prefix(&self) -> Option<&'a str> {
184        (self.prefix_end != 1).then(|| &self.text[2..self.prefix_end])
185    }
186
187    /// Returns the name component of this event's prefixed name, if present.
188    pub fn name(&self) -> &'a str {
189        debug_assert_ne!(self.prefix_end, 0);
190        &self.text[self.prefix_end + 1..self.name_end]
191    }
192
193    /// Returns the span of this event in `reader`.
194    ///
195    /// # Panics
196    ///
197    /// May panic if `reader` is not the [`Reader`] that this event originated from.
198    pub fn position_in(&self, reader: &Reader) -> Range<usize> {
199        reader.range_for_ptrs(self.text.as_bytes().as_ptr_range())
200    }
201}
202
203macro_rules! simple_text_event {
204    (@mkunescape raw_content) => {
205        /// Returns this event's unescaped content.
206        pub fn content(&self) -> Cow<'a, str> {
207            unescape(self.raw_content())
208        }
209    };
210    (@mkunescape content) => {};
211
212    ($name: ident$(, $prefix: literal, $suffix: literal)?, $content_type: ident, $emitted_by_what: literal, $what_content: literal) => {
213        #[derive(Debug, Clone, Copy)]
214        #[doc = concat!("An event emitted by ", $emitted_by_what, ".")]
215        pub struct $name<'a> {
216            pub(crate) text: &'a str,
217        }
218
219        impl<'a> $name<'a> {
220            simple_text_event!(@mkunescape $content_type);
221
222            #[doc = concat!("Returns this event's ", $what_content, " content.")]
223            pub fn $content_type(&self) -> &'a str {
224                &self.text$([$prefix.len()..self.text.len() - $suffix.len()])?
225            }
226
227            /// Returns the span of this event in `reader`.
228            ///
229            /// # Panics
230            ///
231            /// May panic if `reader` is not the [`Reader`] that this event originated from.
232            pub fn position_in(&self, parser: &Reader) -> Range<usize> {
233                parser.range_for_ptrs(self.text.as_bytes().as_ptr_range())
234            }
235        }
236    };
237}
238
239simple_text_event!(TextEvent, raw_content, "text content", "escaped");
240simple_text_event!(
241    CDataEvent,
242    "<![CDATA[",
243    "]]>",
244    content,
245    "cdata",
246    "unescaped"
247);
248simple_text_event!(
249    CommentEvent,
250    "<!--",
251    "-->",
252    content,
253    "comments",
254    "unescaped"
255);
256simple_text_event!(
257    DoctypeEvent,
258    "<!DOCTYPE ",
259    ">",
260    content,
261    "doctype declarations",
262    "unescaped"
263);
264
#[derive(Debug, Clone, Copy)]
/// An event emitted by the [`Reader`].
///
/// Every variant wraps a lightweight, copyable view into the reader's input text.
pub enum Event<'a> {
    /// An event emitted by start tags like `<hello name="value">`.
    Start(StartEvent<'a>),
    /// An event emitted by end tags like `</hello>`.
    End(EndEvent<'a>),
    /// An event emitted by empty tags like `<hello name="value"/>`.
    ///
    /// Carries the same payload type as [`Event::Start`].
    Empty(StartEvent<'a>),
    /// An event emitted by text content.
    Text(TextEvent<'a>),
    /// An event emitted by cdata like `<![CDATA[hello]]>`.
    CData(CDataEvent<'a>),
    /// An event emitted by comments like `<!-- hello -->`.
    Comment(CommentEvent<'a>),
    /// An event emitted by doctype declarations like `<!DOCTYPE hello>`.
    Doctype(DoctypeEvent<'a>),
}
283
/// An error that may be emitted by [`Reader`] while parsing XML.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ErrorKind {
    /// Top-level text was encountered while [`Options`] did not allow it.
    TopLevelText,
    /// An unclosed processing instruction tag was encountered.
    UnclosedPITag,

    /// The end of a tag was encountered when an element name was expected.
    ExpectedElementName,
    /// An invalid character was encountered when an element name was expected.
    InvalidElementName,
    /// An unclosed tag was encountered.
    UnclosedElementTag,
    /// An unclosed empty tag was encountered.
    UnclosedEmptyElementTag,
    /// An unclosed end tag was encountered.
    UnclosedEndTag,
    /// An unclosed element was encountered while [`Options`] did not allow it.
    UnclosedElement,

    /// The `=` character is missing after an attribute name.
    ExpectedAttributeEq,
    /// The attribute value is missing after the `=` character.
    ExpectedAttributeValue,
    /// An attribute value contained a null byte.
    InvalidAttributeValue,
    /// An unclosed attribute value was encountered.
    UnclosedAttributeValue,

    /// An unclosed comment was encountered.
    UnclosedComment,
    /// An unclosed cdata was encountered.
    UnclosedCData,
    /// An unclosed non-doctype `<!name >` tag was encountered.
    UnclosedUnknownSpecial,
    /// An unclosed doctype tag was encountered.
    DoctypeEof,
}

impl ErrorKind {
    /// Returns a short, human-readable description of this error kind.
    pub fn message(&self) -> &'static str {
        use ErrorKind as Kind;

        match *self {
            // Document-level problems.
            Kind::TopLevelText => "top-level text is forbidden",
            Kind::UnclosedPITag => "unclosed processing instruction",

            // Element tags.
            Kind::ExpectedElementName => "expected element name",
            Kind::InvalidElementName => "invalid element name",
            Kind::UnclosedElementTag => "expected a `>` or `/`",
            Kind::UnclosedEmptyElementTag | Kind::UnclosedEndTag => "expected a `>`",
            Kind::UnclosedElement => "unclosed element",

            // Attributes.
            Kind::ExpectedAttributeEq => "expected `=` after attribute name",
            Kind::ExpectedAttributeValue => {
                "expected an attribute value enclosed in either `'` or `\"`"
            }
            Kind::InvalidAttributeValue => "attribute value contains null byte",
            Kind::UnclosedAttributeValue => "unclosed attribute value",

            // `<!...>` constructs.
            Kind::UnclosedComment => "unclosed comment",
            Kind::UnclosedCData => "unclosed cdata",
            Kind::UnclosedUnknownSpecial => "unclosed unknown <! tag",
            Kind::DoctypeEof => "unexpected end of file in <!DOCTYPE",
        }
    }
}

impl Display for ErrorKind {
    /// Writes the same text that [`ErrorKind::message`] returns.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let message = self.message();
        f.write_str(message)
    }
}
358
359#[derive(Clone)]
360/// A spanned XML parse error consisting of a span and an [`ErrorKind`].
361pub struct Error {
362    kind: ErrorKind,
363    span: Range<usize>,
364}
365
366impl Error {
367    fn new(kind: ErrorKind, span: Range<usize>) -> Self {
368        Self { kind, span }
369    }
370
371    /// Returns this error's [`ErrorKind`].
372    pub fn kind(&self) -> ErrorKind {
373        self.kind
374    }
375
376    /// Returns this error's span.
377    pub fn span(&self) -> Range<usize> {
378        self.span.clone()
379    }
380}
381
382impl std::error::Error for Error {}
383
384impl Debug for Error {
385    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
386        <Self as Display>::fmt(self, f)
387    }
388}
389
390impl Display for Error {
391    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
392        write!(f, "parse error at {:?}: {}", self.span, self.kind)
393    }
394}
395
396struct ParsingBuffer<'a> {
397    text: &'a str,
398    current: usize,
399}
400
401impl<'a> ParsingBuffer<'a> {
402    pub fn new(text: &'a str) -> Self {
403        Self { text, current: 0 }
404    }
405
406    #[inline]
407    fn empty_range_here(&self) -> Range<usize> {
408        self.current..self.current
409    }
410
411    #[inline]
412    fn char_range_here(&self) -> Range<usize> {
413        self.current..(self.current + 1).min(self.text.len())
414    }
415
416    #[inline]
417    fn as_bytes(&self) -> &'a [u8] {
418        self.text.as_bytes()
419    }
420
421    #[inline(always)]
422    fn byte(&self, idx: usize) -> Option<u8> {
423        self.as_bytes().get(idx).copied()
424    }
425
426    #[inline]
427    fn position_or_end(&self, start: usize, fun: impl Fn(u8) -> bool) -> usize {
428        let mut current = start;
429        // Iterators seem to generate pretty bad code here, use a loop instead.
430        loop {
431            match self.as_bytes().get(current) {
432                Some(&b) if fun(b) => return current,
433                Some(_) => current += 1,
434                None => return self.text.len(),
435            }
436        }
437    }
438
439    #[inline]
440    fn memchr(&self, start: usize, needle: u8) -> Option<usize> {
441        memchr::memchr(needle, &self.text.as_bytes()[start..]).map(|i| i + start)
442    }
443
444    #[inline]
445    fn memchr2(&self, start: usize, needle1: u8, needle2: u8) -> Option<usize> {
446        memchr::memchr2(needle1, needle2, &self.text.as_bytes()[start..]).map(|i| i + start)
447    }
448
449    #[inline]
450    fn memmem(&self, needle: &[u8]) -> Option<usize> {
451        memchr::memmem::find(&self.text.as_bytes()[self.current..], needle)
452            .map(|value| value + self.current)
453    }
454
455    #[inline]
456    fn skip_whitespace(&mut self) {
457        self.current = self.position_or_end(self.current, |b| !is_whitespace(b));
458    }
459}
460
461/// An iterator over the attributes of a [`StartEvent`], obtained via [`StartEvent::attributes`].
462pub struct Attributes<'a>(ParsingBuffer<'a>);
463
464impl<'a> Iterator for Attributes<'a> {
465    type Item = AttributeEvent<'a>;
466
467    fn next(&mut self) -> Option<Self::Item> {
468        self.0.skip_whitespace();
469
470        let name_start = self.0.current;
471        let name_end = self
472            .0
473            .position_or_end(self.0.current, is_invalid_attribute_name);
474        if name_end == self.0.current {
475            return None;
476        }
477        self.0.current = name_end;
478
479        self.0.skip_whitespace();
480        self.0.current += 1;
481        self.0.skip_whitespace();
482
483        let quote = self.0.byte(self.0.current).unwrap();
484
485        self.0.current += 1;
486
487        let value_start = self.0.current;
488        let value_end = self.0.memchr(self.0.current, quote).unwrap();
489
490        self.0.current = value_end + 1;
491
492        Some(AttributeEvent {
493            text: &self.0.text[name_start..self.0.current],
494            name_end: name_end - name_start,
495            value_start: value_start - name_start,
496        })
497    }
498}
499
/// XML reader options.
///
/// All options default to `false` and are changed through the builder-style methods.
#[non_exhaustive]
#[derive(Default, Debug, Clone)]
pub struct Options {
    /// Whether text outside of any element is allowed.
    allow_top_level_text: bool,
    /// Whether end tags without a matching start tag are allowed.
    allow_unmatched_closing_tags: bool,
    /// Whether elements that are never closed are allowed.
    allow_unclosed_tags: bool,
}

impl Options {
    /// Changes whether top-level text should be allowed during parsing.
    pub fn allow_top_level_text(self, value: bool) -> Self {
        Self {
            allow_top_level_text: value,
            ..self
        }
    }

    /// Changes whether unmatched closing tags should be allowed during parsing.
    pub fn allow_unmatched_closing_tags(self, value: bool) -> Self {
        Self {
            allow_unmatched_closing_tags: value,
            ..self
        }
    }

    /// Changes whether unclosed tags should be allowed during parsing.
    pub fn allow_unclosed_tags(self, value: bool) -> Self {
        Self {
            allow_unclosed_tags: value,
            ..self
        }
    }
}
528
/// An XML reader.
///
/// The reader is an [`Iterator`] that yields [`Event`]s (or [`Error`]s) for the text it was
/// created with.
pub struct Reader<'a> {
    // The input text together with the current parse position.
    buffer: ParsingBuffer<'a>,
    // Element nesting depth: incremented by start tags, decremented by end tags.
    depth: u32,
    // The options this reader was created with.
    options: Options,
}
535
536impl<'a> Reader<'a> {
537    /// Creates a new XML reader that will parse the contents of `text`.
538    pub fn new(text: &'a str) -> Self {
539        Self {
540            buffer: ParsingBuffer::new(text),
541            depth: 0,
542            options: Default::default(),
543        }
544    }
545
546    /// Creates a new XML reader that will parse the contents of `text` with the provided [`Options`].
547    pub fn with_options(text: &'a str, options: Options) -> Self {
548        Self {
549            buffer: ParsingBuffer::new(text),
550            depth: 0,
551            options,
552        }
553    }
554
    /// Returns the string that this reader was originally created with.
    ///
    /// This is always the complete input, regardless of how far parsing has progressed.
    pub fn buffer(&self) -> &'a str {
        self.buffer.text
    }
559
    /// Returns the element depth the parser is currently at.
    ///
    /// The depth starts at zero, grows by one for every start tag and shrinks by one for
    /// every end tag. It is reset to zero when a parse error puts the reader into its
    /// error state.
    pub fn depth(&self) -> u32 {
        self.depth
    }
564
565    fn range_for_ptrs(&self, range: Range<*const u8>) -> Range<usize> {
566        let self_range = self.buffer.as_bytes().as_ptr_range();
567        assert!(
568            self_range.start <= range.start
569                && self_range.end >= range.end
570                && range.start <= range.end,
571            "Parser::range_for_ptrs called with invalid pointer range"
572        );
573
574        range.start.addr() - self_range.start.addr()..range.end.addr() - self_range.start.addr()
575    }
576
577    fn set_error_state(&mut self) {
578        self.buffer.current = self.buffer.text.len();
579        self.depth = 0;
580    }
581
582    #[inline]
583    fn bytes(&self) -> &'a [u8] {
584        self.buffer.as_bytes()
585    }
586
587    #[inline]
588    fn byte(&self, idx: usize) -> Option<u8> {
589        self.buffer.byte(idx)
590    }
591
592    fn skip_element_attributes(&mut self) -> Result<(), Error> {
593        loop {
594            self.buffer.skip_whitespace();
595
596            let name_start = self.buffer.current;
597            let name_end = self
598                .buffer
599                .position_or_end(self.buffer.current, is_invalid_attribute_name);
600            if name_end == self.buffer.current {
601                return Ok(());
602            }
603            self.buffer.current = name_end;
604
605            self.buffer.skip_whitespace();
606
607            if self.byte(self.buffer.current) != Some(b'=') {
608                self.set_error_state();
609                return Err(Error::new(
610                    ErrorKind::ExpectedAttributeEq,
611                    name_start..name_end,
612                ));
613            };
614
615            self.buffer.current += 1;
616
617            let eq_end = self.buffer.current;
618
619            self.buffer.skip_whitespace();
620
621            let Some(quote) = self
622                .byte(self.buffer.current)
623                .filter(|b| [b'\'', b'\"'].contains(b))
624            else {
625                self.set_error_state();
626                return Err(Error::new(
627                    ErrorKind::ExpectedAttributeValue,
628                    name_start..eq_end,
629                ));
630            };
631
632            self.buffer.current += 1;
633
634            let value_start = self.buffer.current;
635            let Some(value_end) = self.buffer.memchr2(self.buffer.current, quote, b'\0') else {
636                self.set_error_state();
637                return Err(Error::new(
638                    ErrorKind::UnclosedAttributeValue,
639                    self.buffer.current..(self.buffer.current + 1).min(self.buffer.text.len()),
640                ));
641            };
642
643            if self.bytes()[value_end] == b'\0' {
644                self.set_error_state();
645                return Err(Error::new(
646                    ErrorKind::InvalidAttributeValue,
647                    value_start..value_end + 1,
648                ));
649            }
650
651            self.buffer.current = value_end + 1;
652        }
653    }
654
655    fn skip_doctype(&mut self) -> Result<(), Error> {
656        loop {
657            match self.buffer.memchr2(self.buffer.current, b'>', b'[') {
658                Some(idx) if self.bytes()[idx] == b'[' => {
659                    self.buffer.current = idx + 1;
660                    let mut depth = 1;
661                    while depth > 0 {
662                        match self.buffer.memchr2(self.buffer.current, b'[', b']') {
663                            Some(idx) => {
664                                if self.bytes()[idx] == b'[' {
665                                    depth += 1;
666                                } else {
667                                    depth -= 1;
668                                }
669                                self.buffer.current = idx + 1;
670                            }
671                            None => {
672                                self.set_error_state();
673                                return Err(Error::new(
674                                    ErrorKind::DoctypeEof,
675                                    self.buffer.empty_range_here(),
676                                ));
677                            }
678                        }
679                    }
680                }
681                Some(idx) => {
682                    self.buffer.current = idx + 1;
683                    return Ok(());
684                }
685                None => {
686                    self.set_error_state();
687                    return Err(Error::new(
688                        ErrorKind::DoctypeEof,
689                        self.buffer.empty_range_here(),
690                    ));
691                }
692            }
693        }
694    }
695
696    fn take_prefixed_name(
697        &mut self,
698        start: usize,
699        prefix_end_default: usize,
700    ) -> Result<(usize, usize), Error> {
701        let first_end = self
702            .buffer
703            .position_or_end(self.buffer.current, is_invalid_name);
704        if first_end == self.buffer.current {
705            self.set_error_state();
706            return Err(Error::new(
707                ErrorKind::ExpectedElementName,
708                start..self.buffer.current,
709            ));
710        }
711
712        self.buffer.current = first_end;
713
714        let prefix_end;
715        let name_end;
716        if self.buffer.byte(self.buffer.current) == Some(b':') {
717            let second_end = self
718                .buffer
719                .position_or_end(self.buffer.current + 1, is_invalid_name);
720            if second_end == self.buffer.current {
721                self.set_error_state();
722                return Err(Error::new(
723                    ErrorKind::ExpectedElementName,
724                    start..self.buffer.current,
725                ));
726            }
727            self.buffer.current = second_end;
728            prefix_end = first_end;
729            name_end = second_end;
730        } else {
731            prefix_end = start + prefix_end_default;
732            name_end = first_end
733        }
734
735        Ok((prefix_end, name_end))
736    }
737
738    fn parse_node(&mut self) -> Result<Option<Event<'a>>, Error> {
739        let start = self.buffer.current;
740        self.buffer.current += 1;
741
742        match self.byte(self.buffer.current).ok_or_else(|| {
743            Error::new(
744                ErrorKind::InvalidElementName,
745                self.buffer.empty_range_here(),
746            )
747        })? {
748            b'?' => {
749                // xml declaration or processing instruction
750                // since both flags are disabled by default, they are treated the same.
751                //
752                // parse_declaration_node is disabled
753                // NOTE: This contains a glaring bug, it will fail on PIs like this:
754                //       <?hello something="?>"?>
755                //       But RapidXML doesn't care about this.
756                let Some(end) = self.buffer.memmem(b"?>") else {
757                    let name_range = self.buffer.current + 1
758                        ..self
759                            .buffer
760                            .position_or_end(self.buffer.current + 1, is_invalid_name);
761                    self.set_error_state();
762                    return Err(Error::new(ErrorKind::UnclosedPITag, name_range));
763                };
764                self.buffer.current = end + 2;
765
766                Ok(None)
767            }
768
769            b'!' => match self.byte(self.buffer.current + 1) {
770                Some(b'-') if self.byte(self.buffer.current + 2) == Some(b'-') => {
771                    self.buffer.current += 2;
772                    let Some(end) = self.buffer.memmem(b"-->") else {
773                        let span = start..self.buffer.current;
774                        self.set_error_state();
775                        return Err(Error::new(ErrorKind::UnclosedComment, span));
776                    };
777
778                    self.buffer.current = end + 3;
779                    Ok(Some(Event::Comment(CommentEvent {
780                        text: &self.buffer.text[start..self.buffer.current],
781                    })))
782                }
783                Some(b'[') if self.bytes()[self.buffer.current + 2..].starts_with(b"CDATA[") => {
784                    self.buffer.current += 8;
785                    let Some(end) = self.buffer.memmem(b"]]>") else {
786                        let span = start..self.buffer.current;
787                        self.set_error_state();
788                        return Err(Error::new(ErrorKind::UnclosedCData, span));
789                    };
790
791                    self.buffer.current = end + 3;
792                    Ok(Some(Event::CData(CDataEvent {
793                        text: &self.buffer.text[start..self.buffer.current],
794                    })))
795                }
796                Some(b'D')
797                    if self.bytes()[self.buffer.current + 2..].starts_with(b"OCTYPE")
798                        && self
799                            .byte(self.buffer.current + 8)
800                            .is_some_and(is_whitespace) =>
801                {
802                    self.buffer.current += 9;
803                    self.skip_doctype()?;
804                    Ok(Some(Event::Doctype(DoctypeEvent {
805                        text: &self.buffer.text[start..self.buffer.current],
806                    })))
807                }
808                _ => {
809                    let Some(end) = self.buffer.memchr(self.buffer.current + 1, b'>') else {
810                        let span = start..self.buffer.position_or_end(start + 2, is_invalid_name);
811                        self.set_error_state();
812                        return Err(Error::new(ErrorKind::UnclosedUnknownSpecial, span));
813                    };
814                    self.buffer.current = end + 1;
815                    Ok(None)
816                }
817            },
818
819            // TODO: make depth=0 a separate error condition instead
820            b'/' if self.depth > 0 || self.options.allow_unmatched_closing_tags => {
821                self.buffer.current += 1;
822                let (prefix_end, name_end) = self.take_prefixed_name(start, 1)?;
823
824                self.buffer.skip_whitespace();
825
826                if self.byte(self.buffer.current) != Some(b'>') {
827                    let span = self.buffer.char_range_here();
828                    self.set_error_state();
829                    return Err(Error::new(ErrorKind::UnclosedEndTag, span));
830                }
831
832                self.depth = self.depth.saturating_sub(1);
833                self.buffer.current += 1;
834                Ok(Some(Event::End(EndEvent {
835                    text: &self.buffer.text[start..self.buffer.current],
836                    prefix_end: prefix_end - start,
837                    name_end: name_end - start,
838                })))
839            }
840
841            _ => {
842                let (prefix_end, name_end) = self.take_prefixed_name(start, 0)?;
843
844                self.skip_element_attributes()?;
845                self.buffer.skip_whitespace();
846
847                match self.byte(self.buffer.current) {
848                    Some(b'>') => {
849                        self.buffer.current += 1;
850                        self.depth += 1;
851                        Ok(Some(Event::Start(StartEvent {
852                            text: &self.buffer.text[start..self.buffer.current],
853                            prefix_end: prefix_end - start,
854                            name_end: name_end - start,
855                        })))
856                    }
857                    Some(b'/') => {
858                        if self.byte(self.buffer.current + 1) != Some(b'>') {
859                            let span = self.buffer.char_range_here();
860                            self.set_error_state();
861                            return Err(Error::new(ErrorKind::UnclosedEmptyElementTag, span));
862                        }
863
864                        self.buffer.current += 2;
865                        Ok(Some(Event::Empty(StartEvent {
866                            text: &self.buffer.text[start..self.buffer.current],
867                            prefix_end: prefix_end - start,
868                            name_end: name_end - start,
869                        })))
870                    }
871                    _ => {
872                        let span = self.buffer.char_range_here();
873                        self.set_error_state();
874                        Err(Error::new(ErrorKind::UnclosedElementTag, span))
875                    }
876                }
877            }
878        }
879    }
880
881    /// Skips all events until the next end event on the same depth as the last seen start tag.
882    ///
883    /// # Examples
884    ///
885    /// ```
886    /// # use speedy_xml::reader::*;
887    /// let mut reader = Reader::new("<event>content</event><another/>");
888    /// assert!(matches!(reader.next(), Some(Ok(Event::Start(..)))));
889    /// assert!(matches!(reader.skip_to_end(), Ok(Some(..))));
890    /// assert!(matches!(reader.next(), Some(Ok(Event::Empty(..)))));
891    /// assert!(matches!(reader.next(), None));
892    /// ```
893    ///
894    /// # Errors
895    ///
896    /// Returns an error if a parse error occurred.
897    pub fn skip_to_end(&mut self) -> Result<Option<EndEvent<'a>>, Error> {
898        let end_depth = self.depth;
899
900        loop {
901            match self.next().transpose()? {
902                Some(Event::End(end)) if self.depth + 1 == end_depth => return Ok(Some(end)),
903                Some(_) => (),
904                None => return Ok(None),
905            }
906        }
907    }
908}
909
910impl<'a> Iterator for Reader<'a> {
911    type Item = Result<Event<'a>, Error>;
912
913    fn next(&mut self) -> Option<Result<Event<'a>, Error>> {
914        loop {
915            return match self.byte(self.buffer.current) {
916                Some(b'<') => match self.parse_node() {
917                    Ok(Some(event)) => Some(Ok(event)),
918                    Ok(None) => continue,
919                    Err(err) => Some(Err(err)),
920                },
921                Some(_) => {
922                    let node_start = self
923                        .buffer
924                        .memchr(self.buffer.current, b'<')
925                        .unwrap_or(self.buffer.text.len());
926                    let text_range = self.buffer.current..node_start;
927                    self.buffer.current = text_range.end;
928
929                    if self.depth == 0 && !self.options.allow_top_level_text {
930                        // SAFETY: node_start was just acquired from memchr or is equal to the length.
931                        //         self.buffer.current can also never be less than the string's length.
932                        if !unsafe { self.buffer.as_bytes().get_unchecked(text_range.clone()) }
933                            .iter()
934                            .copied()
935                            .all(is_whitespace)
936                        {
937                            self.set_error_state();
938                            return Some(Err(Error::new(ErrorKind::TopLevelText, text_range)));
939                        } else {
940                            self.buffer.current = text_range.end;
941                            continue;
942                        }
943                    }
944
945                    Some(Ok(Event::Text(TextEvent {
946                        // SAFETY: See above
947                        text: unsafe { self.buffer.text.get_unchecked(text_range) },
948                    })))
949                }
950                None if self.depth > 0 => {
951                    if self.options.allow_unclosed_tags {
952                        return None;
953                    }
954
955                    self.depth = 0;
956                    return Some(Err(Error::new(
957                        ErrorKind::UnclosedElement,
958                        self.buffer.empty_range_here(),
959                    )));
960                }
961                None => None,
962            };
963        }
964    }
965}
966
#[cfg(test)]
mod test {
    use super::Reader;

    // Extracts the payload of an expected event variant from an event expression.
    //
    // The second argument mirrors the shape of the value being unwrapped, e.g.
    // `Some(Ok(Start))` for the result of `Reader::next`. Each wrapper layer is peeled off
    // with `expect`, and the final layer panics if the event is a different variant.
    macro_rules! unwrap {
        // Outer `Option` layer.
        ($source: expr, Some($($rest: tt)*)) => {
            unwrap!($source.expect("unexpected end of event stream"), $($rest)*)
        };
        // `Result` layer.
        ($source: expr, Ok($variant: ident)) => {
            unwrap!($source.expect("parse error"), $variant)
        };
        // Bare event: match the requested variant.
        ($source: expr, $variant: ident) => {{
            match $source {
                super::Event::$variant(payload) => payload,
                other => panic!(
                    concat!("mismatched event, expected ", stringify!($variant), " got {:?}"),
                    other
                ),
            }
        }};
    }

    /// A single element with two attributes, escaped text content and a closing tag.
    #[test]
    fn element() {
        let mut reader = Reader::new(
            "   <hello attr =  \"value\" 0ther4ttr=\t'val&apos;ue'>con&#x20;ten&#32;t</hello>   ",
        );

        let open = unwrap!(reader.next(), Some(Ok(Start)));
        assert_eq!(open.name(), "hello");

        let mut attrs = open.attributes();

        let first = attrs.next().unwrap();
        assert_eq!(first.name(), "attr");
        assert_eq!(first.value(), "value");
        assert_eq!(first.raw_value(), "value");

        let second = attrs.next().unwrap();
        assert_eq!(second.name(), "0ther4ttr");
        assert_eq!(second.value(), "val'ue");
        assert_eq!(second.raw_value(), "val&apos;ue");

        assert!(attrs.next().is_none());

        let body = unwrap!(reader.next(), Some(Ok(Text)));
        assert_eq!(body.content(), "con ten t");
        assert_eq!(body.raw_content(), "con&#x20;ten&#32;t");

        let close = unwrap!(reader.next(), Some(Ok(End)));
        assert_eq!(close.name(), "hello");
    }

    /// Comment content is returned verbatim, including `&` and `--`.
    #[test]
    fn comments() {
        let expected = " this is a &comment -- text ";
        let document = format!("   <!--{expected}-->   ");
        let mut reader = Reader::new(&document);

        let comment = unwrap!(reader.next(), Some(Ok(Comment)));
        assert_eq!(comment.content(), expected);
    }

    /// Nested elements whose end tags deliberately do not match their start tags.
    #[test]
    fn element_tree() {
        let mut reader = Reader::new(
            r#"
            <tree>
                <ns:stuff1>one</stuff2>
                one is &lt; two
            </not:tree>
        "#,
        );

        let root = unwrap!(reader.next(), Some(Ok(Start)));
        assert_eq!(root.prefix(), None);
        assert_eq!(root.name(), "tree");
        assert!(root.attributes().next().is_none());

        let indent = unwrap!(reader.next(), Some(Ok(Text)));
        assert_eq!(indent.raw_content(), "\n                ");

        let child = unwrap!(reader.next(), Some(Ok(Start)));
        assert_eq!(child.prefix(), Some("ns"));
        assert_eq!(child.name(), "stuff1");
        assert!(child.attributes().next().is_none());

        let child_text = unwrap!(reader.next(), Some(Ok(Text)));
        assert_eq!(child_text.content(), "one");

        let child_close = unwrap!(reader.next(), Some(Ok(End)));
        assert_eq!(child_close.name(), "stuff2");

        let trailing = unwrap!(reader.next(), Some(Ok(Text)));
        assert_eq!(
            trailing.content(),
            "\n                one is < two\n            "
        );
        assert_eq!(
            trailing.raw_content(),
            "\n                one is &lt; two\n            "
        );

        let root_close = unwrap!(reader.next(), Some(Ok(End)));
        assert_eq!(root_close.prefix(), Some("not"));
        assert_eq!(root_close.name(), "tree");
    }

    /// CDATA content is returned verbatim, markup characters included.
    #[test]
    fn cdata() {
        let expected = "this is some cdata < > > & & !!";
        let document = format!("<![CDATA[{expected}]]>");
        let mut reader = Reader::new(&document);

        let cdata = unwrap!(reader.next(), Some(Ok(CData)));
        assert_eq!(cdata.content(), expected);
    }

    /// Doctype content with nested square brackets is returned verbatim.
    #[test]
    fn doctype() {
        let expected = "\tthis is a doctype [with] [many [brackets[[[]]][][]]]\n";
        let document = format!("<!DOCTYPE {expected}>");
        let mut reader = Reader::new(&document);

        let doctype = unwrap!(reader.next(), Some(Ok(Doctype)));
        assert_eq!(doctype.content(), expected);
    }
}