serde_xml/
reader.rs

1//! Low-level XML reader/tokenizer.
2//!
3//! This module provides a fast, zero-copy XML tokenizer that produces events
4//! for elements, attributes, text content, and other XML constructs.
5
6use crate::error::{Error, Position, Result};
7use crate::escape::unescape;
8use memchr::memchr;
9use std::borrow::Cow;
10
/// An XML event produced by the reader.
///
/// String payloads are `Cow<'a, str>`: element names, CDATA, comments and
/// processing-instruction parts are always borrowed from the reader's input,
/// while text and attribute values carry whatever `unescape` returns for them.
#[derive(Debug, Clone, PartialEq)]
pub enum XmlEvent<'a> {
    /// XML declaration: <?xml version="1.0"?>
    XmlDecl {
        /// XML version (e.g., "1.0"). The reader reports "1.0" when the
        /// declaration carries no `version` pseudo-attribute.
        version: Cow<'a, str>,
        /// Character encoding (e.g., "UTF-8"), if declared.
        encoding: Option<Cow<'a, str>>,
        /// Standalone declaration, if present. The reader maps the value
        /// "yes" to `true` and every other value to `false`.
        standalone: Option<bool>,
    },
    /// Start of an element: <name attr="value">
    StartElement {
        /// Element name.
        name: Cow<'a, str>,
        /// Element attributes, in document order.
        attributes: Vec<Attribute<'a>>,
    },
    /// End of an element: </name>
    EndElement {
        /// Element name.
        name: Cow<'a, str>,
    },
    /// Empty element: <name attr="value"/>
    ///
    /// No matching `EndElement` is emitted for an empty element.
    EmptyElement {
        /// Element name.
        name: Cow<'a, str>,
        /// Element attributes, in document order.
        attributes: Vec<Attribute<'a>>,
    },
    /// Text content between elements.
    ///
    /// The reader trims surrounding whitespace and unescapes entities before
    /// emitting this event; whitespace-only text is never emitted.
    Text(Cow<'a, str>),
    /// CDATA section: <![CDATA[...]]>
    ///
    /// The payload is the raw section content, neither trimmed nor unescaped.
    CData(Cow<'a, str>),
    /// Comment: <!-- ... -->
    ///
    /// The payload is the comment body with surrounding whitespace trimmed.
    Comment(Cow<'a, str>),
    /// Processing instruction: <?target data?>
    ProcessingInstruction {
        /// Processing instruction target.
        target: Cow<'a, str>,
        /// Processing instruction data, trimmed; `None` when empty.
        data: Option<Cow<'a, str>>,
    },
    /// End of document.
    Eof,
}
58
/// An XML attribute: one `name="value"` pair from a start tag, an
/// empty-element tag, or the XML declaration.
///
/// Equality is total (`Eq`) because both fields are plain strings.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Attribute<'a> {
    /// The attribute name, exactly as written in the input.
    pub name: Cow<'a, str>,
    /// The attribute value, without the surrounding quotes and with entities
    /// unescaped.
    pub value: Cow<'a, str>,
}
67
/// A fast, zero-copy XML reader.
///
/// The reader borrows its input for `'a` and hands out events that borrow
/// from that same buffer. It is `Clone`, so a copy can be taken as a cheap
/// checkpoint (only the stack of open element names is duplicated), and
/// `Debug` for diagnostics.
#[derive(Debug, Clone)]
pub struct XmlReader<'a> {
    /// The complete input being tokenized.
    input: &'a [u8],
    /// Byte offset of the next unread byte in `input`.
    pos: usize,
    /// 1-based line number of `pos`.
    line: usize,
    /// 1-based column of `pos`, counted in bytes.
    col: usize,
    /// Stack of open element names for validation.
    element_stack: Vec<String>,
}
77
78impl<'a> XmlReader<'a> {
79    /// Creates a new XML reader from a string.
80    #[inline]
81    #[allow(clippy::should_implement_trait)]
82    pub fn from_str(s: &'a str) -> Self {
83        Self::from_bytes(s.as_bytes())
84    }
85
86    /// Creates a new XML reader from bytes.
87    #[inline]
88    pub fn from_bytes(input: &'a [u8]) -> Self {
89        Self {
90            input,
91            pos: 0,
92            line: 1,
93            col: 1,
94            element_stack: Vec::new(),
95        }
96    }
97
98    /// Returns the current position in the input.
99    #[inline]
100    pub fn position(&self) -> Position {
101        Position {
102            line: self.line,
103            column: self.col,
104            offset: self.pos,
105        }
106    }
107
108    /// Returns whether there are any open elements.
109    #[inline]
110    pub fn depth(&self) -> usize {
111        self.element_stack.len()
112    }
113
114    /// Reads the next XML event.
115    pub fn next_event(&mut self) -> Result<XmlEvent<'a>> {
116        self.skip_whitespace();
117
118        if self.pos >= self.input.len() {
119            if !self.element_stack.is_empty() {
120                let tag = self.element_stack.pop().unwrap();
121                return Err(Error::unclosed_tag(tag).with_position(self.position()));
122            }
123            return Ok(XmlEvent::Eof);
124        }
125
126        if self.input[self.pos] == b'<' {
127            self.read_tag()
128        } else {
129            self.read_text()
130        }
131    }
132
133    /// Skips whitespace characters.
134    fn skip_whitespace(&mut self) {
135        while self.pos < self.input.len() {
136            match self.input[self.pos] {
137                b' ' | b'\t' | b'\r' => {
138                    self.pos += 1;
139                    self.col += 1;
140                }
141                b'\n' => {
142                    self.pos += 1;
143                    self.line += 1;
144                    self.col = 1;
145                }
146                _ => break,
147            }
148        }
149    }
150
151    /// Reads text content.
152    fn read_text(&mut self) -> Result<XmlEvent<'a>> {
153        let start = self.pos;
154
155        // Find the end of text (start of next tag or end of input)
156        while self.pos < self.input.len() && self.input[self.pos] != b'<' {
157            if self.input[self.pos] == b'\n' {
158                self.line += 1;
159                self.col = 1;
160            } else {
161                self.col += 1;
162            }
163            self.pos += 1;
164        }
165
166        let text = std::str::from_utf8(&self.input[start..self.pos])
167            .map_err(|_| Error::new(crate::error::ErrorKind::InvalidUtf8))?;
168
169        // Trim whitespace from text
170        let trimmed = text.trim();
171        if trimmed.is_empty() {
172            // Skip whitespace-only text and read the next event
173            return self.next_event();
174        }
175
176        // Unescape XML entities
177        match unescape(trimmed) {
178            Ok(unescaped) => Ok(XmlEvent::Text(unescaped)),
179            Err(e) => Err(Error::invalid_escape(e.entity)),
180        }
181    }
182
183    /// Reads a tag (element, comment, CDATA, PI, or declaration).
184    fn read_tag(&mut self) -> Result<XmlEvent<'a>> {
185        debug_assert_eq!(self.input[self.pos], b'<');
186        self.pos += 1;
187        self.col += 1;
188
189        if self.pos >= self.input.len() {
190            return Err(Error::unexpected_eof().with_position(self.position()));
191        }
192
193        match self.input[self.pos] {
194            b'/' => self.read_end_element(),
195            b'?' => self.read_processing_instruction(),
196            b'!' => self.read_special(),
197            _ => self.read_start_element(),
198        }
199    }
200
201    /// Reads a start element or empty element.
202    fn read_start_element(&mut self) -> Result<XmlEvent<'a>> {
203        let name = self.read_name()?;
204        let attributes = self.read_attributes()?;
205
206        self.skip_whitespace();
207
208        if self.pos >= self.input.len() {
209            return Err(Error::unexpected_eof().with_position(self.position()));
210        }
211
212        if self.input[self.pos] == b'/' {
213            // Empty element: <name/>
214            self.pos += 1;
215            self.col += 1;
216            self.expect_char(b'>')?;
217            Ok(XmlEvent::EmptyElement {
218                name: Cow::Borrowed(name),
219                attributes,
220            })
221        } else if self.input[self.pos] == b'>' {
222            // Start element: <name>
223            self.pos += 1;
224            self.col += 1;
225            self.element_stack.push(name.to_string());
226            Ok(XmlEvent::StartElement {
227                name: Cow::Borrowed(name),
228                attributes,
229            })
230        } else {
231            Err(Error::syntax("expected '>' or '/>'").with_position(self.position()))
232        }
233    }
234
235    /// Reads an end element.
236    fn read_end_element(&mut self) -> Result<XmlEvent<'a>> {
237        debug_assert_eq!(self.input[self.pos], b'/');
238        self.pos += 1;
239        self.col += 1;
240
241        let name = self.read_name()?;
242        self.skip_whitespace();
243        self.expect_char(b'>')?;
244
245        // Validate matching tags
246        match self.element_stack.pop() {
247            Some(expected) if expected == name => Ok(XmlEvent::EndElement {
248                name: Cow::Borrowed(name),
249            }),
250            Some(expected) => Err(Error::mismatched_tag(expected, name.to_string()).with_position(self.position())),
251            None => Err(Error::syntax(format!("unexpected closing tag: {}", name))
252                .with_position(self.position())),
253        }
254    }
255
256    /// Reads a processing instruction.
257    fn read_processing_instruction(&mut self) -> Result<XmlEvent<'a>> {
258        debug_assert_eq!(self.input[self.pos], b'?');
259        self.pos += 1;
260        self.col += 1;
261
262        let target = self.read_name()?;
263
264        // Check for XML declaration
265        if target.eq_ignore_ascii_case("xml") {
266            return self.read_xml_decl();
267        }
268
269        self.skip_whitespace();
270
271        // Read data until ?>
272        let data_start = self.pos;
273        while self.pos + 1 < self.input.len() {
274            if self.input[self.pos] == b'?' && self.input[self.pos + 1] == b'>' {
275                let data = std::str::from_utf8(&self.input[data_start..self.pos])
276                    .map_err(|_| Error::new(crate::error::ErrorKind::InvalidUtf8))?;
277                self.pos += 2;
278                self.col += 2;
279                return Ok(XmlEvent::ProcessingInstruction {
280                    target: Cow::Borrowed(target),
281                    data: if data.trim().is_empty() {
282                        None
283                    } else {
284                        Some(Cow::Borrowed(data.trim()))
285                    },
286                });
287            }
288            if self.input[self.pos] == b'\n' {
289                self.line += 1;
290                self.col = 1;
291            } else {
292                self.col += 1;
293            }
294            self.pos += 1;
295        }
296
297        Err(Error::syntax("unterminated processing instruction").with_position(self.position()))
298    }
299
300    /// Reads an XML declaration.
301    fn read_xml_decl(&mut self) -> Result<XmlEvent<'a>> {
302        let attributes = self.read_attributes()?;
303        self.skip_whitespace();
304
305        if self.pos + 1 >= self.input.len()
306            || self.input[self.pos] != b'?'
307            || self.input[self.pos + 1] != b'>'
308        {
309            return Err(Error::syntax("expected '?>'").with_position(self.position()));
310        }
311        self.pos += 2;
312        self.col += 2;
313
314        let mut version = None;
315        let mut encoding = None;
316        let mut standalone = None;
317
318        for attr in attributes {
319            match attr.name.as_ref() {
320                "version" => version = Some(attr.value),
321                "encoding" => encoding = Some(attr.value),
322                "standalone" => {
323                    standalone = Some(attr.value.as_ref() == "yes");
324                }
325                _ => {}
326            }
327        }
328
329        Ok(XmlEvent::XmlDecl {
330            version: version.unwrap_or(Cow::Borrowed("1.0")),
331            encoding,
332            standalone,
333        })
334    }
335
336    /// Reads special constructs (comments, CDATA, DOCTYPE).
337    fn read_special(&mut self) -> Result<XmlEvent<'a>> {
338        debug_assert_eq!(self.input[self.pos], b'!');
339        self.pos += 1;
340        self.col += 1;
341
342        if self.pos >= self.input.len() {
343            return Err(Error::unexpected_eof().with_position(self.position()));
344        }
345
346        // Check for comment: <!--
347        if self.pos + 1 < self.input.len()
348            && self.input[self.pos] == b'-'
349            && self.input[self.pos + 1] == b'-'
350        {
351            return self.read_comment();
352        }
353
354        // Check for CDATA: <![CDATA[
355        if self.pos + 6 < self.input.len() && &self.input[self.pos..self.pos + 7] == b"[CDATA[" {
356            return self.read_cdata();
357        }
358
359        // Check for DOCTYPE
360        if self.pos + 6 < self.input.len() && self.input[self.pos..].starts_with(b"DOCTYPE") {
361            return self.skip_doctype();
362        }
363
364        Err(Error::syntax("unknown construct after '<!'").with_position(self.position()))
365    }
366
367    /// Reads a comment.
368    fn read_comment(&mut self) -> Result<XmlEvent<'a>> {
369        self.pos += 2; // Skip --
370        self.col += 2;
371        let start = self.pos;
372
373        while self.pos + 2 < self.input.len() {
374            if self.input[self.pos] == b'-'
375                && self.input[self.pos + 1] == b'-'
376                && self.input[self.pos + 2] == b'>'
377            {
378                let comment = std::str::from_utf8(&self.input[start..self.pos])
379                    .map_err(|_| Error::new(crate::error::ErrorKind::InvalidUtf8))?;
380                self.pos += 3;
381                self.col += 3;
382                return Ok(XmlEvent::Comment(Cow::Borrowed(comment.trim())));
383            }
384            if self.input[self.pos] == b'\n' {
385                self.line += 1;
386                self.col = 1;
387            } else {
388                self.col += 1;
389            }
390            self.pos += 1;
391        }
392
393        Err(Error::syntax("unterminated comment").with_position(self.position()))
394    }
395
396    /// Reads a CDATA section.
397    fn read_cdata(&mut self) -> Result<XmlEvent<'a>> {
398        self.pos += 7; // Skip [CDATA[
399        self.col += 7;
400        let start = self.pos;
401
402        while self.pos + 2 < self.input.len() {
403            if self.input[self.pos] == b']'
404                && self.input[self.pos + 1] == b']'
405                && self.input[self.pos + 2] == b'>'
406            {
407                let data = std::str::from_utf8(&self.input[start..self.pos])
408                    .map_err(|_| Error::new(crate::error::ErrorKind::InvalidUtf8))?;
409                self.pos += 3;
410                self.col += 3;
411                return Ok(XmlEvent::CData(Cow::Borrowed(data)));
412            }
413            if self.input[self.pos] == b'\n' {
414                self.line += 1;
415                self.col = 1;
416            } else {
417                self.col += 1;
418            }
419            self.pos += 1;
420        }
421
422        Err(Error::syntax("unterminated CDATA section").with_position(self.position()))
423    }
424
425    /// Skips a DOCTYPE declaration.
426    fn skip_doctype(&mut self) -> Result<XmlEvent<'a>> {
427        let mut depth = 1;
428
429        while self.pos < self.input.len() && depth > 0 {
430            match self.input[self.pos] {
431                b'<' => depth += 1,
432                b'>' => depth -= 1,
433                b'\n' => {
434                    self.line += 1;
435                    self.col = 1;
436                    self.pos += 1;
437                    continue;
438                }
439                _ => {}
440            }
441            self.col += 1;
442            self.pos += 1;
443        }
444
445        // Skip to next event
446        self.next_event()
447    }
448
449    /// Reads an XML name.
450    fn read_name(&mut self) -> Result<&'a str> {
451        let start = self.pos;
452
453        // First character must be a name start char
454        if self.pos >= self.input.len() {
455            return Err(Error::unexpected_eof().with_position(self.position()));
456        }
457
458        let first = self.input[self.pos];
459        if !is_name_start_char(first) {
460            return Err(Error::invalid_name(format!("invalid name start character: {:?}", first as char))
461                .with_position(self.position()));
462        }
463        self.pos += 1;
464        self.col += 1;
465
466        // Subsequent characters
467        while self.pos < self.input.len() && is_name_char(self.input[self.pos]) {
468            self.pos += 1;
469            self.col += 1;
470        }
471
472        std::str::from_utf8(&self.input[start..self.pos])
473            .map_err(|_| Error::new(crate::error::ErrorKind::InvalidUtf8))
474    }
475
476    /// Reads element attributes.
477    fn read_attributes(&mut self) -> Result<Vec<Attribute<'a>>> {
478        let mut attributes = Vec::new();
479
480        loop {
481            self.skip_whitespace();
482
483            if self.pos >= self.input.len() {
484                break;
485            }
486
487            // Check for end of attributes
488            let c = self.input[self.pos];
489            if c == b'>' || c == b'/' || c == b'?' {
490                break;
491            }
492
493            // Read attribute name
494            let name = self.read_name()?;
495            self.skip_whitespace();
496
497            // Expect '='
498            self.expect_char(b'=')?;
499            self.skip_whitespace();
500
501            // Read attribute value
502            let value = self.read_attribute_value()?;
503
504            attributes.push(Attribute {
505                name: Cow::Borrowed(name),
506                value,
507            });
508        }
509
510        Ok(attributes)
511    }
512
513    /// Reads an attribute value.
514    fn read_attribute_value(&mut self) -> Result<Cow<'a, str>> {
515        if self.pos >= self.input.len() {
516            return Err(Error::unexpected_eof().with_position(self.position()));
517        }
518
519        let quote = self.input[self.pos];
520        if quote != b'"' && quote != b'\'' {
521            return Err(Error::syntax("expected quote").with_position(self.position()));
522        }
523        self.pos += 1;
524        self.col += 1;
525
526        let start = self.pos;
527
528        // Find closing quote
529        match memchr(quote, &self.input[self.pos..]) {
530            Some(offset) => {
531                let value = std::str::from_utf8(&self.input[start..self.pos + offset])
532                    .map_err(|_| Error::new(crate::error::ErrorKind::InvalidUtf8))?;
533                self.pos += offset + 1;
534                self.col += offset + 1;
535
536                // Unescape the value
537                match unescape(value) {
538                    Ok(unescaped) => Ok(unescaped),
539                    Err(e) => Err(Error::invalid_escape(e.entity)),
540                }
541            }
542            None => Err(Error::syntax("unterminated attribute value").with_position(self.position())),
543        }
544    }
545
546    /// Expects a specific character.
547    fn expect_char(&mut self, expected: u8) -> Result<()> {
548        if self.pos >= self.input.len() {
549            return Err(Error::unexpected_eof().with_position(self.position()));
550        }
551
552        if self.input[self.pos] != expected {
553            return Err(Error::syntax(format!(
554                "expected '{}', found '{}'",
555                expected as char,
556                self.input[self.pos] as char
557            ))
558            .with_position(self.position()));
559        }
560
561        self.pos += 1;
562        self.col += 1;
563        Ok(())
564    }
565}
566
/// Checks if a byte may begin an XML name.
///
/// ASCII letters, `_` and `:` qualify. Every non-ASCII byte (`0x80..=0xFF`)
/// is accepted as well: this is a simplified stand-in for decoding the
/// character and checking it against the XML name-start ranges.
#[inline]
fn is_name_start_char(b: u8) -> bool {
    match b {
        b'_' | b':' => true,
        _ => b.is_ascii_alphabetic() || !b.is_ascii(),
    }
}
573
574/// Checks if a byte is a valid XML name character.
575#[inline]
576fn is_name_char(b: u8) -> bool {
577    is_name_start_char(b) || matches!(b, b'0'..=b'9' | b'-' | b'.')
578}
579
#[cfg(test)]
mod tests {
    use super::*;

    /// Drains `reader`, panicking on the first error, and returns every event
    /// that precedes `Eof`.
    fn collect_events<'a>(reader: &mut XmlReader<'a>) -> Vec<XmlEvent<'a>> {
        let mut events = Vec::new();
        loop {
            match reader.next_event().expect("reader returned an error") {
                XmlEvent::Eof => return events,
                event => events.push(event),
            }
        }
    }

    #[test]
    fn test_simple_element() {
        let mut reader = XmlReader::from_str("<root></root>");

        match reader.next_event().unwrap() {
            XmlEvent::StartElement { name, attributes } => {
                assert_eq!(name, "root");
                assert!(attributes.is_empty());
            }
            other => panic!("expected StartElement, got {:?}", other),
        }

        match reader.next_event().unwrap() {
            XmlEvent::EndElement { name } => {
                assert_eq!(name, "root");
            }
            other => panic!("expected EndElement, got {:?}", other),
        }

        assert!(matches!(reader.next_event().unwrap(), XmlEvent::Eof));
    }

    #[test]
    fn test_empty_element() {
        let mut reader = XmlReader::from_str("<root/>");

        match reader.next_event().unwrap() {
            XmlEvent::EmptyElement { name, attributes } => {
                assert_eq!(name, "root");
                assert!(attributes.is_empty());
            }
            other => panic!("expected EmptyElement, got {:?}", other),
        }

        // An empty element must not be left on the open-element stack.
        assert_eq!(reader.depth(), 0);
        assert!(matches!(reader.next_event().unwrap(), XmlEvent::Eof));
    }

    #[test]
    fn test_attributes() {
        let mut reader = XmlReader::from_str(r#"<root id="1" name="test"/>"#);

        match reader.next_event().unwrap() {
            XmlEvent::EmptyElement { name, attributes } => {
                assert_eq!(name, "root");
                assert_eq!(attributes.len(), 2);
                assert_eq!(attributes[0].name, "id");
                assert_eq!(attributes[0].value, "1");
                assert_eq!(attributes[1].name, "name");
                assert_eq!(attributes[1].value, "test");
            }
            other => panic!("expected EmptyElement, got {:?}", other),
        }
    }

    #[test]
    fn test_text_content() {
        let mut reader = XmlReader::from_str("<root>Hello, World!</root>");

        reader.next_event().unwrap(); // StartElement

        match reader.next_event().unwrap() {
            XmlEvent::Text(text) => {
                assert_eq!(text, "Hello, World!");
            }
            other => panic!("expected Text, got {:?}", other),
        }
    }

    #[test]
    fn test_escaped_text() {
        let mut reader = XmlReader::from_str("<root>&lt;Hello&gt;</root>");

        reader.next_event().unwrap(); // StartElement

        match reader.next_event().unwrap() {
            XmlEvent::Text(text) => {
                assert_eq!(text, "<Hello>");
            }
            other => panic!("expected Text, got {:?}", other),
        }
    }

    #[test]
    fn test_xml_declaration() {
        let mut reader = XmlReader::from_str(r#"<?xml version="1.0" encoding="UTF-8"?><root/>"#);

        match reader.next_event().unwrap() {
            XmlEvent::XmlDecl { version, encoding, standalone } => {
                assert_eq!(version, "1.0");
                assert_eq!(encoding.as_deref(), Some("UTF-8"));
                assert_eq!(standalone, None);
            }
            other => panic!("expected XmlDecl, got {:?}", other),
        }
    }

    #[test]
    fn test_comment() {
        let mut reader = XmlReader::from_str("<!-- This is a comment --><root/>");

        match reader.next_event().unwrap() {
            XmlEvent::Comment(comment) => {
                assert_eq!(comment, "This is a comment");
            }
            other => panic!("expected Comment, got {:?}", other),
        }
    }

    #[test]
    fn test_cdata() {
        let mut reader = XmlReader::from_str("<root><![CDATA[<special>content</special>]]></root>");

        reader.next_event().unwrap(); // StartElement

        match reader.next_event().unwrap() {
            XmlEvent::CData(data) => {
                assert_eq!(data, "<special>content</special>");
            }
            other => panic!("expected CData, got {:?}", other),
        }
    }

    #[test]
    fn test_nested_elements() {
        let xml = r#"<root><child1><grandchild/></child1><child2/></root>"#;
        let mut reader = XmlReader::from_str(xml);

        // Errors must fail the test rather than silently end the iteration.
        let events = collect_events(&mut reader);

        assert_eq!(events.len(), 6);
        assert_eq!(reader.depth(), 0);
    }

    #[test]
    fn test_mismatched_tags() {
        let mut reader = XmlReader::from_str("<root></wrong>");
        reader.next_event().unwrap(); // StartElement
        assert!(reader.next_event().is_err());
    }

    #[test]
    fn test_unclosed_tag() {
        let mut reader = XmlReader::from_str("<root>");
        reader.next_event().unwrap(); // StartElement
        assert!(reader.next_event().is_err());
    }

    #[test]
    fn test_unexpected_closing_tag() {
        let mut reader = XmlReader::from_str("</root>");
        assert!(reader.next_event().is_err());
    }

    #[test]
    fn test_processing_instruction() {
        let mut reader = XmlReader::from_str("<?target data?><root/>");

        match reader.next_event().unwrap() {
            XmlEvent::ProcessingInstruction { target, data } => {
                assert_eq!(target, "target");
                assert_eq!(data.as_deref(), Some("data"));
            }
            other => panic!("expected ProcessingInstruction, got {:?}", other),
        }
    }

    #[test]
    fn test_processing_instruction_without_data() {
        let mut reader = XmlReader::from_str("<?target?><root/>");

        match reader.next_event().unwrap() {
            XmlEvent::ProcessingInstruction { target, data } => {
                assert_eq!(target, "target");
                assert_eq!(data, None);
            }
            other => panic!("expected ProcessingInstruction, got {:?}", other),
        }
    }

    #[test]
    fn test_attribute_with_single_quotes() {
        let mut reader = XmlReader::from_str("<root attr='value'/>");

        match reader.next_event().unwrap() {
            XmlEvent::EmptyElement { attributes, .. } => {
                assert_eq!(attributes.len(), 1);
                assert_eq!(attributes[0].name, "attr");
                assert_eq!(attributes[0].value, "value");
            }
            other => panic!("expected EmptyElement, got {:?}", other),
        }
    }

    #[test]
    fn test_position_tracking() {
        let xml = "<root>\n  <child/>\n</root>";
        let mut reader = XmlReader::from_str(xml);

        reader.next_event().unwrap(); // <root>
        reader.next_event().unwrap(); // <child/>

        // The cursor sits just past "<child/>" on the second line.
        let pos = reader.position();
        assert_eq!(pos.line, 2);
        assert_eq!(pos.column, 11);
        assert_eq!(pos.offset, 17);
    }

    #[test]
    fn test_depth_tracking() {
        let mut reader = XmlReader::from_str("<a><b><c></c></b></a>");

        assert_eq!(reader.depth(), 0);
        reader.next_event().unwrap(); // <a>
        assert_eq!(reader.depth(), 1);
        reader.next_event().unwrap(); // <b>
        assert_eq!(reader.depth(), 2);
        reader.next_event().unwrap(); // <c>
        assert_eq!(reader.depth(), 3);
        reader.next_event().unwrap(); // </c>
        assert_eq!(reader.depth(), 2);
        reader.next_event().unwrap(); // </b>
        assert_eq!(reader.depth(), 1);
        reader.next_event().unwrap(); // </a>
        assert_eq!(reader.depth(), 0);
        assert!(matches!(reader.next_event().unwrap(), XmlEvent::Eof));
    }
}
787}