facet_xml/
dom_parser.rs

1//! Streaming DomParser implementation for XML using quick-xml.
2
3extern crate alloc;
4
5use alloc::borrow::Cow;
6use alloc::string::String;
7use alloc::vec::Vec;
8use core::fmt;
9use std::io::Cursor;
10
11use facet_dom::{DomEvent, DomParser};
12use quick_xml::NsReader;
13use quick_xml::escape::resolve_xml_entity;
14use quick_xml::events::Event;
15use quick_xml::name::ResolveResult;
16
/// Errors reported by the XML parser.
#[derive(Clone, Debug)]
pub enum XmlError {
    /// A failure surfaced while reading or resolving XML, carried as its message text.
    Parse(String),
    /// The input ended unexpectedly.
    UnexpectedEof,
    /// Opening and closing tags do not pair up.
    UnbalancedTags,
    /// A byte sequence that had to be text was not valid UTF-8.
    InvalidUtf8(core::str::Utf8Error),
}

impl fmt::Display for XmlError {
    /// Writes a short human-readable description of the error.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::UnexpectedEof => f.write_str("Unexpected end of XML"),
            Self::UnbalancedTags => f.write_str("Unbalanced XML tags"),
            Self::Parse(detail) => write!(f, "XML parse error: {detail}"),
            Self::InvalidUtf8(cause) => write!(f, "Invalid UTF-8 in XML: {cause}"),
        }
    }
}

impl std::error::Error for XmlError {}
42
43/// Streaming XML parser implementing `DomParser`.
44pub struct XmlParser<'de> {
45    reader: NsReader<Cursor<&'de [u8]>>,
46    /// Original input for raw capture
47    input: &'de [u8],
48    /// Buffer for quick-xml events
49    buf: Vec<u8>,
50    /// Buffer for peeked event
51    peeked: Option<DomEvent<'de>>,
52    /// Stack tracking element depth for skip_node
53    depth: usize,
54    /// Pending attributes from the current element
55    pending_attrs: Vec<(Option<String>, String, String)>,
56    /// Index into pending_attrs
57    attr_idx: usize,
58    /// State machine for event generation
59    state: ParserState,
60    /// True if current element is empty (self-closing)
61    is_empty_element: bool,
62    /// Position where current node started (for raw capture)
63    node_start_pos: u64,
64}
65
/// Steps of the event-generation state machine driven by `read_next`.
#[derive(Clone, Copy, Debug, PartialEq)]
enum ParserState {
    /// Initial state: the next call reads an event from quick-xml.
    Ready,
    /// A `NodeStart` was emitted and buffered attributes are still to be emitted.
    EmittingAttrs,
    /// Attributes are finished; `ChildrenStart` is emitted next.
    NeedChildrenStart,
    /// Between `ChildrenStart` and `ChildrenEnd`: the next call reads an event
    /// from quick-xml.
    InChildren,
    /// `ChildrenEnd` is emitted next, followed by `NodeEnd`.
    NeedChildrenEnd,
    /// `NodeEnd` is emitted next.
    NeedNodeEnd,
    /// Nothing more will be produced.
    Done,
}
83
84impl<'de> XmlParser<'de> {
85    /// Create a new streaming XML parser.
86    pub fn new(input: &'de [u8]) -> Self {
87        trace!(input_len = input.len(), "creating XML parser");
88
89        let mut reader = NsReader::from_reader(Cursor::new(input));
90        reader.config_mut().trim_text(true);
91
92        Self {
93            reader,
94            input,
95            buf: Vec::new(),
96            peeked: None,
97            depth: 0,
98            pending_attrs: Vec::new(),
99            attr_idx: 0,
100            state: ParserState::Ready,
101            is_empty_element: false,
102            node_start_pos: 0,
103        }
104    }
105
106    /// Capture the current node as raw XML and skip past it.
107    /// Must be called right after a NodeStart event has been consumed.
108    fn do_capture_raw_node(&mut self) -> Result<Cow<'de, str>, XmlError> {
109        // Save start position before it gets overwritten by child elements
110        let start = self.node_start_pos as usize;
111        let start_depth = self.depth;
112
113        // Skip through the node - consume events until depth drops below starting
114        loop {
115            // Handle peeked event first
116            let event = if let Some(e) = self.peeked.take() {
117                Some(e)
118            } else {
119                self.read_next()?
120            };
121
122            match event {
123                Some(DomEvent::NodeEnd) if self.depth < start_depth => break,
124                None => break,
125                _ => {}
126            }
127        }
128
129        let end = self.reader.buffer_position() as usize;
130        let raw = &self.input[start..end];
131        let s = core::str::from_utf8(raw).map_err(XmlError::InvalidUtf8)?;
132        Ok(Cow::Borrowed(s))
133    }
134
135    /// Read the next raw event from quick-xml and convert to DomEvent.
136    fn read_next(&mut self) -> Result<Option<DomEvent<'de>>, XmlError> {
137        loop {
138            match self.state {
139                ParserState::Done => return Ok(None),
140
141                ParserState::EmittingAttrs => {
142                    if self.attr_idx < self.pending_attrs.len() {
143                        let (ns, name, value) = &self.pending_attrs[self.attr_idx];
144                        let event = DomEvent::Attribute {
145                            name: Cow::Owned(name.clone()),
146                            value: Cow::Owned(value.clone()),
147                            namespace: ns.clone().map(Cow::Owned),
148                        };
149                        self.attr_idx += 1;
150                        return Ok(Some(event));
151                    }
152                    // Done with attrs
153                    self.pending_attrs.clear();
154                    self.attr_idx = 0;
155                    self.state = ParserState::NeedChildrenStart;
156                }
157
158                ParserState::NeedChildrenStart => {
159                    if self.is_empty_element {
160                        self.state = ParserState::NeedChildrenEnd;
161                        self.is_empty_element = false;
162                    } else {
163                        self.state = ParserState::InChildren;
164                    }
165                    return Ok(Some(DomEvent::ChildrenStart));
166                }
167
168                ParserState::NeedChildrenEnd => {
169                    self.state = ParserState::NeedNodeEnd;
170                    return Ok(Some(DomEvent::ChildrenEnd));
171                }
172
173                ParserState::NeedNodeEnd => {
174                    self.depth -= 1;
175                    self.state = if self.depth == 0 {
176                        ParserState::Done
177                    } else {
178                        ParserState::InChildren
179                    };
180                    return Ok(Some(DomEvent::NodeEnd));
181                }
182
183                ParserState::Ready | ParserState::InChildren => {
184                    // Record position before reading (for raw capture)
185                    let pos_before = self.reader.buffer_position();
186
187                    self.buf.clear();
188                    let (resolve, event) = self
189                        .reader
190                        .read_resolved_event_into(&mut self.buf)
191                        .map_err(|e| XmlError::Parse(e.to_string()))?;
192
193                    // Resolve element namespace upfront
194                    let elem_ns = resolve_namespace(resolve)?;
195
196                    match event {
197                        Event::Start(ref e) | Event::Empty(ref e) => {
198                            let is_empty = matches!(event, Event::Empty(_));
199                            // Record start position for potential raw capture
200                            self.node_start_pos = pos_before;
201
202                            // Get element local name
203                            let local_name = e.local_name();
204                            let local = core::str::from_utf8(local_name.as_ref())
205                                .map_err(XmlError::InvalidUtf8)?;
206                            let local_owned = local.to_string();
207
208                            // Collect attributes
209                            self.pending_attrs.clear();
210                            self.attr_idx = 0;
211
212                            for attr in e.attributes() {
213                                let attr = attr.map_err(|e| XmlError::Parse(e.to_string()))?;
214
215                                // Skip xmlns declarations
216                                let key = attr.key;
217                                if key.as_ref() == b"xmlns" {
218                                    continue;
219                                }
220                                if let Some(prefix) = key.prefix()
221                                    && prefix.as_ref() == b"xmlns"
222                                {
223                                    continue;
224                                }
225
226                                let (attr_resolve, _) =
227                                    self.reader.resolver().resolve_attribute(key);
228                                let attr_ns = resolve_namespace(attr_resolve)?;
229                                let attr_local_name = key.local_name();
230                                let attr_local = core::str::from_utf8(attr_local_name.as_ref())
231                                    .map_err(XmlError::InvalidUtf8)?;
232                                let value = attr
233                                    .unescape_value()
234                                    .map_err(|e| XmlError::Parse(e.to_string()))?;
235
236                                self.pending_attrs.push((
237                                    attr_ns,
238                                    attr_local.to_string(),
239                                    value.into_owned(),
240                                ));
241                            }
242
243                            self.depth += 1;
244                            self.is_empty_element = is_empty;
245
246                            if self.pending_attrs.is_empty() {
247                                self.state = ParserState::NeedChildrenStart;
248                            } else {
249                                self.state = ParserState::EmittingAttrs;
250                            }
251
252                            return Ok(Some(DomEvent::NodeStart {
253                                tag: Cow::Owned(local_owned),
254                                namespace: elem_ns.map(Cow::Owned),
255                            }));
256                        }
257                        Event::End(_) => {
258                            self.state = ParserState::NeedChildrenEnd;
259                        }
260                        Event::Text(e) => {
261                            let text = e.decode().map_err(|e| XmlError::Parse(e.to_string()))?;
262                            let trimmed = text.trim();
263                            if !trimmed.is_empty() {
264                                return Ok(Some(DomEvent::Text(Cow::Owned(trimmed.to_string()))));
265                            }
266                        }
267                        Event::CData(e) => {
268                            let text =
269                                core::str::from_utf8(e.as_ref()).map_err(XmlError::InvalidUtf8)?;
270                            if !text.is_empty() {
271                                return Ok(Some(DomEvent::Text(Cow::Owned(text.to_string()))));
272                            }
273                        }
274                        Event::Comment(e) => {
275                            let text =
276                                core::str::from_utf8(e.as_ref()).map_err(XmlError::InvalidUtf8)?;
277                            return Ok(Some(DomEvent::Comment(Cow::Owned(text.to_string()))));
278                        }
279                        Event::PI(e) => {
280                            let content =
281                                core::str::from_utf8(e.as_ref()).map_err(XmlError::InvalidUtf8)?;
282                            let (target, data) = content
283                                .split_once(char::is_whitespace)
284                                .unwrap_or((content, ""));
285                            return Ok(Some(DomEvent::ProcessingInstruction {
286                                target: Cow::Owned(target.to_string()),
287                                data: Cow::Owned(data.trim().to_string()),
288                            }));
289                        }
290                        Event::Decl(_) => {
291                            // XML declaration - skip
292                        }
293                        Event::DocType(e) => {
294                            // Parse DOCTYPE declaration and emit as DomEvent
295                            let text =
296                                core::str::from_utf8(e.as_ref()).map_err(XmlError::InvalidUtf8)?;
297                            return Ok(Some(DomEvent::Doctype(Cow::Owned(text.to_string()))));
298                        }
299                        Event::Eof => {
300                            self.state = ParserState::Done;
301                            return Ok(None);
302                        }
303                        Event::GeneralRef(e) => {
304                            let raw = e.decode().map_err(|e| XmlError::Parse(e.to_string()))?;
305                            let resolved = resolve_entity(&raw)?;
306                            return Ok(Some(DomEvent::Text(Cow::Owned(resolved))));
307                        }
308                    }
309                }
310            }
311        }
312    }
313}
314
315impl<'de> DomParser<'de> for XmlParser<'de> {
316    type Error = XmlError;
317
318    fn next_event(&mut self) -> Result<Option<DomEvent<'de>>, Self::Error> {
319        if let Some(event) = self.peeked.take() {
320            return Ok(Some(event));
321        }
322        self.read_next()
323    }
324
325    fn peek_event(&mut self) -> Result<Option<&DomEvent<'de>>, Self::Error> {
326        if self.peeked.is_none() {
327            self.peeked = self.read_next()?;
328        }
329        Ok(self.peeked.as_ref())
330    }
331
332    fn skip_node(&mut self) -> Result<(), Self::Error> {
333        let start_depth = self.depth;
334
335        loop {
336            let event = self.next_event()?;
337            match event {
338                Some(DomEvent::NodeEnd) => {
339                    if self.depth < start_depth {
340                        break;
341                    }
342                }
343                None => break,
344                _ => {}
345            }
346        }
347
348        Ok(())
349    }
350
351    fn current_span(&self) -> Option<facet_reflect::Span> {
352        None
353    }
354
355    fn format_namespace(&self) -> Option<&'static str> {
356        Some("xml")
357    }
358
359    fn capture_raw_node(&mut self) -> Result<Option<Cow<'de, str>>, Self::Error> {
360        Ok(Some(self.do_capture_raw_node()?))
361    }
362}
363
364/// Resolve a namespace from quick-xml's ResolveResult.
365fn resolve_namespace(resolve: ResolveResult<'_>) -> Result<Option<String>, XmlError> {
366    match resolve {
367        ResolveResult::Bound(ns) => Ok(Some(String::from_utf8_lossy(ns.as_ref()).into_owned())),
368        ResolveResult::Unbound => Ok(None),
369        ResolveResult::Unknown(_) => Ok(None),
370    }
371}
372
373/// Resolve a general entity reference.
374fn resolve_entity(raw: &str) -> Result<String, XmlError> {
375    if let Some(resolved) = resolve_xml_entity(raw) {
376        return Ok(resolved.into());
377    }
378
379    if let Some(rest) = raw.strip_prefix('#') {
380        let code = if let Some(hex) = rest.strip_prefix('x').or_else(|| rest.strip_prefix('X')) {
381            u32::from_str_radix(hex, 16)
382                .map_err(|_| XmlError::Parse(format!("Invalid hex entity: #{}", rest)))?
383        } else {
384            rest.parse::<u32>()
385                .map_err(|_| XmlError::Parse(format!("Invalid decimal entity: #{}", rest)))?
386        };
387
388        let ch = char::from_u32(code)
389            .ok_or_else(|| XmlError::Parse(format!("Invalid Unicode: {}", code)))?;
390        return Ok(ch.to_string());
391    }
392
393    Ok(format!("&{};", raw))
394}