facet_xml/
parser.rs

1extern crate alloc;
2
3use alloc::borrow::Cow;
4use alloc::string::String;
5use alloc::vec::Vec;
6use core::fmt;
7
8use facet_format::{
9    ContainerKind, FieldEvidence, FieldKey, FieldLocationHint, FormatParser, ParseEvent,
10    ProbeStream, ScalarValue,
11};
12use quick_xml::NsReader;
13use quick_xml::escape::resolve_xml_entity;
14use quick_xml::events::Event;
15use quick_xml::name::ResolveResult;
16use std::io::Cursor;
17
/// An XML name identified by its namespace URI and local part.
///
/// Prefixes are only document-local aliases: `android:label` and `a:label`
/// denote the same name whenever both prefixes are bound to the same
/// namespace URI, which is why the URI (not the prefix) is stored here.
#[derive(Debug, Clone, PartialEq, Eq)]
#[allow(dead_code)] // Will be used in Phase 2
struct QName {
    /// Namespace URI, or `None` when the name is in no namespace.
    ///
    /// - An unprefixed element with no default `xmlns` in scope has no namespace.
    /// - An unprefixed attribute never has a namespace, even under a default `xmlns`.
    /// - A prefixed element/attribute takes the URI its prefix is bound to.
    namespace: Option<String>,
    /// The part of the name after the prefix (or the whole name if unprefixed).
    local_name: String,
}

#[allow(dead_code)] // Will be used in Phase 2
impl QName {
    /// Build a name that belongs to no namespace.
    fn local(name: impl Into<String>) -> Self {
        let local_name = name.into();
        Self {
            namespace: None,
            local_name,
        }
    }

    /// Build a name that belongs to the namespace `namespace`.
    fn with_ns(namespace: impl Into<String>, local_name: impl Into<String>) -> Self {
        let namespace = Some(namespace.into());
        let local_name = local_name.into();
        Self {
            namespace,
            local_name,
        }
    }

    /// Test this name against a local name and an optional namespace constraint.
    ///
    /// With `expected_ns == None` only the local name is compared, so a name in
    /// any namespace (or in none) matches. With `Some(ns)` the namespace URI
    /// must equal `ns` as well.
    fn matches(&self, local_name: &str, expected_ns: Option<&str>) -> bool {
        let namespace_ok = match expected_ns {
            Some(ns) => self.namespace.as_deref() == Some(ns),
            // No constraint: every namespace is acceptable.
            None => true,
        };
        namespace_ok && self.local_name == local_name
    }
}

impl fmt::Display for QName {
    /// Writes `{namespace}local` for namespaced names and just `local` otherwise.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self.namespace.as_deref() {
            None => write!(f, "{}", self.local_name),
            Some(ns) => write!(f, "{{{}}}{}", ns, self.local_name),
        }
    }
}
78
79pub struct XmlParser<'de> {
80    events: Vec<ParseEvent<'de>>,
81    idx: usize,
82    pending_error: Option<XmlError>,
83}
84
85impl<'de> XmlParser<'de> {
86    pub fn new(input: &'de [u8]) -> Self {
87        match build_events(input) {
88            Ok(events) => Self {
89                events,
90                idx: 0,
91                pending_error: None,
92            },
93            Err(err) => Self {
94                events: Vec::new(),
95                idx: 0,
96                pending_error: Some(err),
97            },
98        }
99    }
100}
101
/// Errors reported while turning an XML document into parse events.
#[derive(Debug, Clone)]
pub enum XmlError {
    /// A failure described by a free-form message.
    ParseError(String),
    /// The event stream or the document ended before a complete value was seen.
    UnexpectedEof,
    /// Opening and closing tags do not pair up.
    UnbalancedTags,
    /// A name or text section is not valid UTF-8.
    InvalidUtf8(core::str::Utf8Error),
    /// More than one top-level element was found.
    MultipleRoots,
}

impl fmt::Display for XmlError {
    /// Render the human-readable message for this error.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Variants without a payload map to a fixed message.
        let fixed = match self {
            XmlError::ParseError(msg) => return write!(f, "XML parse error: {}", msg),
            XmlError::InvalidUtf8(e) => return write!(f, "Invalid UTF-8 in XML: {}", e),
            XmlError::UnexpectedEof => "Unexpected end of XML",
            XmlError::UnbalancedTags => "Unbalanced XML tags",
            XmlError::MultipleRoots => "XML document has multiple root elements",
        };
        f.write_str(fixed)
    }
}
122
123impl<'de> FormatParser<'de> for XmlParser<'de> {
124    type Error = XmlError;
125    type Probe<'a>
126        = XmlProbe<'de>
127    where
128        Self: 'a;
129
130    fn next_event(&mut self) -> Result<Option<ParseEvent<'de>>, Self::Error> {
131        if let Some(err) = &self.pending_error {
132            return Err(err.clone());
133        }
134        if self.idx >= self.events.len() {
135            return Ok(None);
136        }
137        let event = self.events[self.idx].clone();
138        self.idx += 1;
139        Ok(Some(event))
140    }
141
142    fn peek_event(&mut self) -> Result<Option<ParseEvent<'de>>, Self::Error> {
143        if let Some(err) = &self.pending_error {
144            return Err(err.clone());
145        }
146        Ok(self.events.get(self.idx).cloned())
147    }
148
149    fn skip_value(&mut self) -> Result<(), Self::Error> {
150        // Track how many "pending field values" we have at each struct depth.
151        // When we see FieldKey, we expect a value to follow.
152        // When that value is consumed (Scalar or StructEnd/SequenceEnd), we're done with that field.
153        let mut struct_depth = 0usize;
154        let mut pending_field_value = false;
155
156        loop {
157            let event = self.next_event()?.ok_or(XmlError::UnexpectedEof)?;
158            match event {
159                ParseEvent::StructStart(_) | ParseEvent::SequenceStart(_) => {
160                    // If we were waiting for a field value, this struct/seq IS that value
161                    pending_field_value = false;
162                    struct_depth += 1;
163                }
164                ParseEvent::StructEnd | ParseEvent::SequenceEnd => {
165                    if struct_depth == 0 {
166                        // We were skipping a struct/seq value and now it's closed
167                        break;
168                    } else {
169                        struct_depth -= 1;
170                        // If we just closed the top-level value, we're done
171                        if struct_depth == 0 && !pending_field_value {
172                            break;
173                        }
174                    }
175                }
176                ParseEvent::Scalar(_) | ParseEvent::VariantTag(_) => {
177                    if struct_depth == 0 && !pending_field_value {
178                        // This scalar IS the value we were asked to skip
179                        break;
180                    }
181                    // If we were waiting for a field value, this scalar is it
182                    pending_field_value = false;
183                }
184                ParseEvent::FieldKey(_) | ParseEvent::OrderedField => {
185                    // A field key means a value will follow
186                    pending_field_value = true;
187                }
188            }
189        }
190        Ok(())
191    }
192
193    fn begin_probe(&mut self) -> Result<Self::Probe<'_>, Self::Error> {
194        // Look ahead in the remaining events to build field evidence
195        let evidence = self.build_probe();
196        Ok(XmlProbe { evidence, idx: 0 })
197    }
198}
199
200impl<'de> XmlParser<'de> {
201    /// Build field evidence by looking ahead at remaining events.
202    fn build_probe(&self) -> Vec<FieldEvidence<'de>> {
203        let mut evidence = Vec::new();
204
205        // Check if we're about to read a struct
206        if self.idx >= self.events.len() {
207            return evidence;
208        }
209
210        if !matches!(
211            self.events.get(self.idx),
212            Some(ParseEvent::StructStart(ContainerKind::Element))
213        ) {
214            return evidence;
215        }
216
217        // Scan the struct's fields
218        let mut i = self.idx + 1;
219        let mut depth = 0usize;
220
221        while i < self.events.len() {
222            match &self.events[i] {
223                ParseEvent::StructStart(_) | ParseEvent::SequenceStart(_) => {
224                    depth += 1;
225                    i += 1;
226                }
227                ParseEvent::StructEnd | ParseEvent::SequenceEnd => {
228                    if depth == 0 {
229                        // End of the struct we're probing
230                        break;
231                    }
232                    depth -= 1;
233                    i += 1;
234                }
235                ParseEvent::FieldKey(key) if depth == 0 => {
236                    // This is a top-level field in the struct we're probing
237                    // Look at the next event to see if it's a scalar
238                    let scalar_value = if let Some(next_event) = self.events.get(i + 1) {
239                        match next_event {
240                            ParseEvent::Scalar(sv) => Some(sv.clone()),
241                            _ => None,
242                        }
243                    } else {
244                        None
245                    };
246
247                    if let Some(sv) = scalar_value {
248                        evidence.push(FieldEvidence::with_scalar_value(
249                            key.name.clone(),
250                            key.location,
251                            None,
252                            sv,
253                            key.namespace.clone(),
254                        ));
255                    } else {
256                        evidence.push(FieldEvidence::new(
257                            key.name.clone(),
258                            key.location,
259                            None,
260                            key.namespace.clone(),
261                        ));
262                    }
263                    i += 1;
264                }
265                _ => {
266                    i += 1;
267                }
268            }
269        }
270
271        evidence
272    }
273}
274
275pub struct XmlProbe<'de> {
276    evidence: Vec<FieldEvidence<'de>>,
277    idx: usize,
278}
279
280impl<'de> ProbeStream<'de> for XmlProbe<'de> {
281    type Error = XmlError;
282
283    fn next(&mut self) -> Result<Option<FieldEvidence<'de>>, Self::Error> {
284        if self.idx >= self.evidence.len() {
285            Ok(None)
286        } else {
287            let ev = self.evidence[self.idx].clone();
288            self.idx += 1;
289            Ok(Some(ev))
290        }
291    }
292}
293
294/// Resolve a namespace from quick-xml's ResolveResult.
295fn resolve_namespace(resolve: ResolveResult<'_>) -> Result<Option<String>, XmlError> {
296    match resolve {
297        ResolveResult::Bound(ns) => Ok(Some(String::from_utf8_lossy(ns.as_ref()).into_owned())),
298        ResolveResult::Unbound => Ok(None),
299        ResolveResult::Unknown(_) => {
300            // Unknown prefix - treat as no namespace
301            Ok(None)
302        }
303    }
304}
305
306/// Resolve a general entity reference to its character value.
307/// Handles both named entities (lt, gt, amp, etc.) and numeric entities (&#10;, &#x09;, etc.)
308fn resolve_entity(raw: &str) -> Result<String, XmlError> {
309    // Try named entity first (e.g., "lt" -> "<")
310    if let Some(resolved) = resolve_xml_entity(raw) {
311        return Ok(resolved.into());
312    }
313
314    // Try numeric entity (e.g., "#10" -> "\n", "#x09" -> "\t")
315    if let Some(rest) = raw.strip_prefix('#') {
316        let code = if let Some(hex) = rest.strip_prefix('x').or_else(|| rest.strip_prefix('X')) {
317            // Hexadecimal numeric entity
318            u32::from_str_radix(hex, 16).map_err(|_| {
319                XmlError::ParseError(format!("Invalid hex numeric entity: #{}", rest))
320            })?
321        } else {
322            // Decimal numeric entity
323            rest.parse::<u32>().map_err(|_| {
324                XmlError::ParseError(format!("Invalid decimal numeric entity: #{}", rest))
325            })?
326        };
327
328        let ch = char::from_u32(code)
329            .ok_or_else(|| XmlError::ParseError(format!("Invalid Unicode code point: {}", code)))?;
330        return Ok(ch.to_string());
331    }
332
333    // Unknown entity - return as-is with & and ;
334    Ok(format!("&{};", raw))
335}
336
337#[derive(Debug, Clone)]
338struct Element {
339    name: QName,
340    attributes: Vec<(QName, String)>,
341    children: Vec<Element>,
342    text: String,
343}
344
345impl Element {
346    fn new(name: QName, attributes: Vec<(QName, String)>) -> Self {
347        Self {
348            name,
349            attributes,
350            children: Vec::new(),
351            text: String::new(),
352        }
353    }
354
355    fn push_text(&mut self, text: &str) {
356        self.push_text_impl(text, true);
357    }
358
359    fn push_text_raw(&mut self, text: &str) {
360        self.push_text_impl(text, false);
361    }
362
363    fn push_text_impl(&mut self, text: &str, should_trim: bool) {
364        let content = if should_trim { text.trim() } else { text };
365        if content.is_empty() {
366            return;
367        }
368        self.text.push_str(content);
369    }
370}
371
372fn build_events<'de>(input: &'de [u8]) -> Result<Vec<ParseEvent<'de>>, XmlError> {
373    let mut reader = NsReader::from_reader(Cursor::new(input));
374    reader.config_mut().trim_text(true);
375
376    let mut buf = Vec::new();
377    let mut stack: Vec<Element> = Vec::new();
378    let mut root: Option<Element> = None;
379
380    loop {
381        buf.clear();
382        let (resolve, event) = reader
383            .read_resolved_event_into(&mut buf)
384            .map_err(|e| XmlError::ParseError(e.to_string()))?;
385
386        match event {
387            Event::Start(ref e) | Event::Empty(ref e) => {
388                // Resolve element namespace
389                let ns = resolve_namespace(resolve)?;
390                let local = core::str::from_utf8(e.local_name().as_ref())
391                    .map_err(XmlError::InvalidUtf8)?
392                    .to_string();
393                let name = match ns {
394                    Some(uri) => QName::with_ns(uri, local),
395                    None => QName::local(local),
396                };
397
398                // Resolve attribute namespaces
399                let mut attributes = Vec::new();
400                for attr in e.attributes() {
401                    let attr = attr.map_err(|e| XmlError::ParseError(e.to_string()))?;
402
403                    // Skip xmlns declarations (xmlns and xmlns:*)
404                    let key = attr.key;
405                    if key.as_ref() == b"xmlns" {
406                        continue; // Skip default namespace declaration
407                    }
408                    if let Some(prefix) = key.prefix()
409                        && prefix.as_ref() == b"xmlns"
410                    {
411                        continue; // Skip prefixed namespace declarations
412                    }
413
414                    let (attr_resolve, _) = reader.resolve_attribute(key);
415                    let attr_ns = resolve_namespace(attr_resolve)?;
416                    let attr_local = core::str::from_utf8(key.local_name().as_ref())
417                        .map_err(XmlError::InvalidUtf8)?
418                        .to_string();
419                    let attr_qname = match attr_ns {
420                        Some(uri) => QName::with_ns(uri, attr_local),
421                        None => QName::local(attr_local),
422                    };
423                    let value = attr
424                        .unescape_value()
425                        .map_err(|e| XmlError::ParseError(e.to_string()))?
426                        .into_owned();
427                    attributes.push((attr_qname, value));
428                }
429
430                let elem = Element::new(name, attributes);
431
432                if matches!(event, Event::Start(_)) {
433                    stack.push(elem);
434                } else {
435                    // Empty element
436                    attach_element(stack.as_mut_slice(), elem, &mut root)?;
437                }
438            }
439            Event::End(_) => {
440                let elem = stack.pop().ok_or(XmlError::UnbalancedTags)?;
441                attach_element(stack.as_mut_slice(), elem, &mut root)?;
442            }
443            Event::Text(e) => {
444                if let Some(current) = stack.last_mut() {
445                    let text = e
446                        .decode()
447                        .map_err(|err| XmlError::ParseError(err.to_string()))?;
448                    current.push_text(text.as_ref());
449                }
450            }
451            Event::CData(e) => {
452                if let Some(current) = stack.last_mut() {
453                    let text = core::str::from_utf8(e.as_ref()).map_err(XmlError::InvalidUtf8)?;
454                    current.push_text(text);
455                }
456            }
457            Event::GeneralRef(e) => {
458                // General entity references (e.g., &lt;, &gt;, &amp;, &#10;, etc.)
459                // These are now reported separately in quick-xml 0.38+
460                if let Some(current) = stack.last_mut() {
461                    let raw = e
462                        .decode()
463                        .map_err(|err| XmlError::ParseError(err.to_string()))?;
464                    let resolved = resolve_entity(&raw)?;
465                    // Don't trim entity references - they may be intentional whitespace/control chars
466                    current.push_text_raw(&resolved);
467                }
468            }
469            Event::Decl(_) | Event::Comment(_) | Event::PI(_) | Event::DocType(_) => {}
470            Event::Eof => break,
471        }
472    }
473
474    if !stack.is_empty() {
475        return Err(XmlError::UnbalancedTags);
476    }
477
478    let root = root.ok_or(XmlError::UnexpectedEof)?;
479    let mut events = Vec::new();
480    emit_element_events(&root, &mut events);
481    Ok(events)
482}
483
484fn attach_element(
485    stack: &mut [Element],
486    elem: Element,
487    root: &mut Option<Element>,
488) -> Result<(), XmlError> {
489    if let Some(parent) = stack.last_mut() {
490        parent.children.push(elem);
491    } else if root.is_none() {
492        *root = Some(elem);
493    } else {
494        return Err(XmlError::MultipleRoots);
495    }
496    Ok(())
497}
498
499/// Emit ParseEvents directly from an Element, without intermediate XmlValue.
500fn emit_element_events<'de>(elem: &Element, events: &mut Vec<ParseEvent<'de>>) {
501    let text = elem.text.trim();
502    let has_attrs = !elem.attributes.is_empty();
503    let has_children = !elem.children.is_empty();
504
505    // Case 1: No attributes, no children - emit scalar from text
506    if !has_attrs && !has_children {
507        if text.is_empty() {
508            // Empty element is an empty object (for unit structs)
509            events.push(ParseEvent::StructStart(ContainerKind::Element));
510            events.push(ParseEvent::StructEnd);
511        } else {
512            emit_scalar_from_text(text, events);
513        }
514        return;
515    }
516
517    // Case 2: No attributes, multiple children with same name - emit as array
518    if !has_attrs && has_children && text.is_empty() && elem.children.len() > 1 {
519        let first = &elem.children[0].name;
520        if elem.children.iter().all(|child| &child.name == first) {
521            events.push(ParseEvent::SequenceStart(ContainerKind::Element));
522            for child in &elem.children {
523                emit_element_events(child, events);
524            }
525            events.push(ParseEvent::SequenceEnd);
526            return;
527        }
528    }
529
530    // Case 3: Has attributes or mixed children - emit as struct
531    events.push(ParseEvent::StructStart(ContainerKind::Element));
532
533    // Emit attributes as fields
534    for (qname, value) in &elem.attributes {
535        let mut key = FieldKey::new(
536            Cow::Owned(qname.local_name.clone()),
537            FieldLocationHint::Attribute,
538        );
539        if let Some(ns) = &qname.namespace {
540            key = key.with_namespace(Cow::Owned(ns.clone()));
541        }
542        events.push(ParseEvent::FieldKey(key));
543        // Attributes are always strings
544        events.push(ParseEvent::Scalar(ScalarValue::Str(Cow::Owned(
545            value.clone(),
546        ))));
547    }
548
549    // Emit children in order (preserving document order for xml::elements support)
550    // The deserializer is responsible for grouping same-named children into arrays
551    // or collecting them into xml::elements fields.
552    for child in &elem.children {
553        let mut key = FieldKey::new(
554            Cow::Owned(child.name.local_name.clone()),
555            FieldLocationHint::Child,
556        );
557        if let Some(ns) = &child.name.namespace {
558            key = key.with_namespace(Cow::Owned(ns.clone()));
559        }
560        events.push(ParseEvent::FieldKey(key));
561        emit_element_events(child, events);
562    }
563
564    // Emit text content if present (mixed content)
565    if !text.is_empty() {
566        let key = FieldKey::new(Cow::Borrowed("_text"), FieldLocationHint::Text);
567        events.push(ParseEvent::FieldKey(key));
568        events.push(ParseEvent::Scalar(ScalarValue::Str(Cow::Owned(
569            text.to_string(),
570        ))));
571    }
572
573    events.push(ParseEvent::StructEnd);
574}
575
576/// Parse text and emit appropriate scalar event.
577fn emit_scalar_from_text<'de>(text: &str, events: &mut Vec<ParseEvent<'de>>) {
578    if text.eq_ignore_ascii_case("null") {
579        events.push(ParseEvent::Scalar(ScalarValue::Null));
580        return;
581    }
582    if let Ok(b) = text.parse::<bool>() {
583        events.push(ParseEvent::Scalar(ScalarValue::Bool(b)));
584        return;
585    }
586    if let Ok(i) = text.parse::<i64>() {
587        events.push(ParseEvent::Scalar(ScalarValue::I64(i)));
588        return;
589    }
590    if let Ok(u) = text.parse::<u64>() {
591        events.push(ParseEvent::Scalar(ScalarValue::U64(u)));
592        return;
593    }
594    // Try i128/u128 before f64 to avoid precision loss for large integers.
595    // Emit as string so the deserializer can use parse_from_str.
596    if text.parse::<i128>().is_ok() || text.parse::<u128>().is_ok() {
597        events.push(ParseEvent::Scalar(ScalarValue::Str(Cow::Owned(
598            text.to_string(),
599        ))));
600        return;
601    }
602    if let Ok(f) = text.parse::<f64>() {
603        events.push(ParseEvent::Scalar(ScalarValue::F64(f)));
604        return;
605    }
606    events.push(ParseEvent::Scalar(ScalarValue::Str(Cow::Owned(
607        text.to_string(),
608    ))));
609}