facet_format_xml/
parser.rs

1extern crate alloc;
2
3use alloc::borrow::Cow;
4use alloc::collections::BTreeMap;
5use alloc::string::String;
6use alloc::vec::Vec;
7use core::fmt;
8
9use facet_format::{
10    ContainerKind, FieldEvidence, FieldKey, FieldLocationHint, FormatParser, ParseEvent,
11    ProbeStream, ScalarValue,
12};
13use quick_xml::NsReader;
14use quick_xml::escape::resolve_xml_entity;
15use quick_xml::events::Event;
16use quick_xml::name::ResolveResult;
17use std::io::Cursor;
18
/// An XML name qualified by an optional namespace URI.
///
/// XML identifies a namespace by its URI, never by the prefix a particular
/// document happens to use: `android:label` and `a:label` denote the same
/// name whenever both prefixes are bound to the same URI.
#[derive(Debug, Clone, PartialEq, Eq)]
#[allow(dead_code)] // Will be used in Phase 2
struct QName {
    /// Namespace URI; `None` stands for "no namespace".
    ///
    /// - An unprefixed element with no default `xmlns` in scope has no namespace.
    /// - An unprefixed attribute never has a namespace, even under a default `xmlns`.
    /// - A prefixed element or attribute takes the namespace bound to its prefix.
    namespace: Option<String>,
    /// Local part of the name (the name with any prefix removed).
    local_name: String,
}

#[allow(dead_code)] // Will be used in Phase 2
impl QName {
    /// Build a name that belongs to no namespace.
    fn local(name: impl Into<String>) -> Self {
        let local_name = name.into();
        Self {
            namespace: None,
            local_name,
        }
    }

    /// Build a name that belongs to the namespace identified by `namespace`.
    fn with_ns(namespace: impl Into<String>, local_name: impl Into<String>) -> Self {
        let namespace = Some(namespace.into());
        let local_name = local_name.into();
        Self {
            namespace,
            local_name,
        }
    }

    /// Test this name against a local name and an optional namespace constraint.
    ///
    /// The local name must always be equal. With `expected_ns == None` the
    /// namespace is not looked at (any namespace, or none, is accepted); with
    /// `Some(ns)` this name's namespace must be exactly `ns`.
    fn matches(&self, local_name: &str, expected_ns: Option<&str>) -> bool {
        let namespace_ok = match expected_ns {
            Some(ns) => self.namespace.as_deref() == Some(ns),
            None => true,
        };
        self.local_name == local_name && namespace_ok
    }
}

/// Formats as `{namespace-uri}local` when a namespace is present, otherwise
/// as the bare local name.
impl fmt::Display for QName {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        if let Some(ns) = &self.namespace {
            write!(f, "{{{}}}", ns)?;
        }
        f.write_str(&self.local_name)
    }
}
79
80pub struct XmlParser<'de> {
81    events: Vec<ParseEvent<'de>>,
82    idx: usize,
83    pending_error: Option<XmlError>,
84}
85
86impl<'de> XmlParser<'de> {
87    pub fn new(input: &'de [u8]) -> Self {
88        match build_events(input) {
89            Ok(events) => Self {
90                events,
91                idx: 0,
92                pending_error: None,
93            },
94            Err(err) => Self {
95                events: Vec::new(),
96                idx: 0,
97                pending_error: Some(err),
98            },
99        }
100    }
101}
102
/// Errors reported by the XML parser.
#[derive(Debug, Clone)]
pub enum XmlError {
    /// A problem described by a free-form message (for example an error
    /// reported by the underlying XML reader or an invalid numeric entity).
    ParseError(String),
    /// More events were requested than the document provides, or the
    /// document contains no root element.
    UnexpectedEof,
    /// Start and end tags do not pair up.
    UnbalancedTags,
    /// A name or CDATA section is not valid UTF-8.
    InvalidUtf8,
    /// More than one top-level element was found.
    MultipleRoots,
}

impl fmt::Display for XmlError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Only `ParseError` carries data; every other variant has a fixed message.
        let fixed = match self {
            XmlError::ParseError(msg) => return write!(f, "XML parse error: {}", msg),
            XmlError::UnexpectedEof => "Unexpected end of XML",
            XmlError::UnbalancedTags => "Unbalanced XML tags",
            XmlError::InvalidUtf8 => "Invalid UTF-8 in XML",
            XmlError::MultipleRoots => "XML document has multiple root elements",
        };
        f.write_str(fixed)
    }
}
123
124impl<'de> FormatParser<'de> for XmlParser<'de> {
125    type Error = XmlError;
126    type Probe<'a>
127        = XmlProbe<'de>
128    where
129        Self: 'a;
130
131    fn next_event(&mut self) -> Result<ParseEvent<'de>, Self::Error> {
132        if let Some(err) = &self.pending_error {
133            return Err(err.clone());
134        }
135        if self.idx >= self.events.len() {
136            return Err(XmlError::UnexpectedEof);
137        }
138        let event = self.events[self.idx].clone();
139        self.idx += 1;
140        Ok(event)
141    }
142
143    fn peek_event(&mut self) -> Result<ParseEvent<'de>, Self::Error> {
144        if let Some(err) = &self.pending_error {
145            return Err(err.clone());
146        }
147        self.events
148            .get(self.idx)
149            .cloned()
150            .ok_or(XmlError::UnexpectedEof)
151    }
152
153    fn skip_value(&mut self) -> Result<(), Self::Error> {
154        let mut depth = 0usize;
155        loop {
156            let event = self.next_event()?;
157            match event {
158                ParseEvent::StructStart(_) | ParseEvent::SequenceStart(_) => {
159                    depth += 1;
160                }
161                ParseEvent::StructEnd | ParseEvent::SequenceEnd => {
162                    if depth == 0 {
163                        break;
164                    } else {
165                        depth -= 1;
166                    }
167                }
168                ParseEvent::Scalar(_) | ParseEvent::VariantTag(_) => {
169                    if depth == 0 {
170                        break;
171                    }
172                }
173                ParseEvent::FieldKey(_) => {
174                    // Value will follow; treat as entering one more depth level.
175                    depth += 1;
176                }
177            }
178        }
179        Ok(())
180    }
181
182    fn begin_probe(&mut self) -> Result<Self::Probe<'_>, Self::Error> {
183        // Look ahead in the remaining events to build field evidence
184        let evidence = self.build_probe();
185        Ok(XmlProbe { evidence, idx: 0 })
186    }
187}
188
189impl<'de> XmlParser<'de> {
190    /// Build field evidence by looking ahead at remaining events.
191    fn build_probe(&self) -> Vec<FieldEvidence<'de>> {
192        let mut evidence = Vec::new();
193
194        // Check if we're about to read a struct
195        if self.idx >= self.events.len() {
196            return evidence;
197        }
198
199        if !matches!(
200            self.events.get(self.idx),
201            Some(ParseEvent::StructStart(ContainerKind::Element))
202        ) {
203            return evidence;
204        }
205
206        // Scan the struct's fields
207        let mut i = self.idx + 1;
208        let mut depth = 0usize;
209
210        while i < self.events.len() {
211            match &self.events[i] {
212                ParseEvent::StructStart(_) | ParseEvent::SequenceStart(_) => {
213                    depth += 1;
214                    i += 1;
215                }
216                ParseEvent::StructEnd | ParseEvent::SequenceEnd => {
217                    if depth == 0 {
218                        // End of the struct we're probing
219                        break;
220                    }
221                    depth -= 1;
222                    i += 1;
223                }
224                ParseEvent::FieldKey(key) if depth == 0 => {
225                    // This is a top-level field in the struct we're probing
226                    // Look at the next event to see if it's a scalar
227                    let scalar_value = if let Some(next_event) = self.events.get(i + 1) {
228                        match next_event {
229                            ParseEvent::Scalar(sv) => Some(sv.clone()),
230                            _ => None,
231                        }
232                    } else {
233                        None
234                    };
235
236                    if let Some(sv) = scalar_value {
237                        evidence.push(FieldEvidence::with_scalar_value(
238                            key.name.clone(),
239                            key.location,
240                            None,
241                            sv,
242                            key.namespace.clone(),
243                        ));
244                    } else {
245                        evidence.push(FieldEvidence::new(
246                            key.name.clone(),
247                            key.location,
248                            None,
249                            key.namespace.clone(),
250                        ));
251                    }
252                    i += 1;
253                }
254                _ => {
255                    i += 1;
256                }
257            }
258        }
259
260        evidence
261    }
262}
263
264pub struct XmlProbe<'de> {
265    evidence: Vec<FieldEvidence<'de>>,
266    idx: usize,
267}
268
269impl<'de> ProbeStream<'de> for XmlProbe<'de> {
270    type Error = XmlError;
271
272    fn next(&mut self) -> Result<Option<FieldEvidence<'de>>, Self::Error> {
273        if self.idx >= self.evidence.len() {
274            Ok(None)
275        } else {
276            let ev = self.evidence[self.idx].clone();
277            self.idx += 1;
278            Ok(Some(ev))
279        }
280    }
281}
282
283/// Resolve a namespace from quick-xml's ResolveResult.
284fn resolve_namespace(resolve: ResolveResult<'_>) -> Result<Option<String>, XmlError> {
285    match resolve {
286        ResolveResult::Bound(ns) => Ok(Some(String::from_utf8_lossy(ns.as_ref()).into_owned())),
287        ResolveResult::Unbound => Ok(None),
288        ResolveResult::Unknown(_) => {
289            // Unknown prefix - treat as no namespace
290            Ok(None)
291        }
292    }
293}
294
295/// Resolve a general entity reference to its character value.
296/// Handles both named entities (lt, gt, amp, etc.) and numeric entities (&#10;, &#x09;, etc.)
297fn resolve_entity(raw: &str) -> Result<String, XmlError> {
298    // Try named entity first (e.g., "lt" -> "<")
299    if let Some(resolved) = resolve_xml_entity(raw) {
300        return Ok(resolved.into());
301    }
302
303    // Try numeric entity (e.g., "#10" -> "\n", "#x09" -> "\t")
304    if let Some(rest) = raw.strip_prefix('#') {
305        let code = if let Some(hex) = rest.strip_prefix('x').or_else(|| rest.strip_prefix('X')) {
306            // Hexadecimal numeric entity
307            u32::from_str_radix(hex, 16).map_err(|_| {
308                XmlError::ParseError(format!("Invalid hex numeric entity: #{}", rest))
309            })?
310        } else {
311            // Decimal numeric entity
312            rest.parse::<u32>().map_err(|_| {
313                XmlError::ParseError(format!("Invalid decimal numeric entity: #{}", rest))
314            })?
315        };
316
317        let ch = char::from_u32(code)
318            .ok_or_else(|| XmlError::ParseError(format!("Invalid Unicode code point: {}", code)))?;
319        return Ok(ch.to_string());
320    }
321
322    // Unknown entity - return as-is with & and ;
323    Ok(format!("&{};", raw))
324}
325
326#[derive(Debug, Clone)]
327struct Element {
328    name: QName,
329    attributes: Vec<(QName, String)>,
330    children: Vec<Element>,
331    text: String,
332}
333
334impl Element {
335    fn new(name: QName, attributes: Vec<(QName, String)>) -> Self {
336        Self {
337            name,
338            attributes,
339            children: Vec::new(),
340            text: String::new(),
341        }
342    }
343
344    fn push_text(&mut self, text: &str) {
345        self.push_text_impl(text, true);
346    }
347
348    fn push_text_raw(&mut self, text: &str) {
349        self.push_text_impl(text, false);
350    }
351
352    fn push_text_impl(&mut self, text: &str, should_trim: bool) {
353        let content = if should_trim { text.trim() } else { text };
354        if content.is_empty() {
355            return;
356        }
357        self.text.push_str(content);
358    }
359}
360
361fn build_events<'de>(input: &'de [u8]) -> Result<Vec<ParseEvent<'de>>, XmlError> {
362    let mut reader = NsReader::from_reader(Cursor::new(input));
363    reader.config_mut().trim_text(true);
364
365    let mut buf = Vec::new();
366    let mut stack: Vec<Element> = Vec::new();
367    let mut root: Option<Element> = None;
368
369    loop {
370        buf.clear();
371        let (resolve, event) = reader
372            .read_resolved_event_into(&mut buf)
373            .map_err(|e| XmlError::ParseError(e.to_string()))?;
374
375        match event {
376            Event::Start(ref e) | Event::Empty(ref e) => {
377                // Resolve element namespace
378                let ns = resolve_namespace(resolve)?;
379                let local = core::str::from_utf8(e.local_name().as_ref())
380                    .map_err(|_| XmlError::InvalidUtf8)?
381                    .to_string();
382                let name = match ns {
383                    Some(uri) => QName::with_ns(uri, local),
384                    None => QName::local(local),
385                };
386
387                // Resolve attribute namespaces
388                let mut attributes = Vec::new();
389                for attr in e.attributes() {
390                    let attr = attr.map_err(|e| XmlError::ParseError(e.to_string()))?;
391
392                    // Skip xmlns declarations (xmlns and xmlns:*)
393                    let key = attr.key;
394                    if key.as_ref() == b"xmlns" {
395                        continue; // Skip default namespace declaration
396                    }
397                    if let Some(prefix) = key.prefix()
398                        && prefix.as_ref() == b"xmlns"
399                    {
400                        continue; // Skip prefixed namespace declarations
401                    }
402
403                    let (attr_resolve, _) = reader.resolve_attribute(key);
404                    let attr_ns = resolve_namespace(attr_resolve)?;
405                    let attr_local = core::str::from_utf8(key.local_name().as_ref())
406                        .map_err(|_| XmlError::InvalidUtf8)?
407                        .to_string();
408                    let attr_qname = match attr_ns {
409                        Some(uri) => QName::with_ns(uri, attr_local),
410                        None => QName::local(attr_local),
411                    };
412                    let value = attr
413                        .unescape_value()
414                        .map_err(|e| XmlError::ParseError(e.to_string()))?
415                        .into_owned();
416                    attributes.push((attr_qname, value));
417                }
418
419                let elem = Element::new(name, attributes);
420
421                if matches!(event, Event::Start(_)) {
422                    stack.push(elem);
423                } else {
424                    // Empty element
425                    attach_element(stack.as_mut_slice(), elem, &mut root)?;
426                }
427            }
428            Event::End(_) => {
429                let elem = stack.pop().ok_or(XmlError::UnbalancedTags)?;
430                attach_element(stack.as_mut_slice(), elem, &mut root)?;
431            }
432            Event::Text(e) => {
433                if let Some(current) = stack.last_mut() {
434                    let text = e
435                        .decode()
436                        .map_err(|err| XmlError::ParseError(err.to_string()))?;
437                    current.push_text(text.as_ref());
438                }
439            }
440            Event::CData(e) => {
441                if let Some(current) = stack.last_mut() {
442                    let text =
443                        core::str::from_utf8(e.as_ref()).map_err(|_| XmlError::InvalidUtf8)?;
444                    current.push_text(text);
445                }
446            }
447            Event::GeneralRef(e) => {
448                // General entity references (e.g., &lt;, &gt;, &amp;, &#10;, etc.)
449                // These are now reported separately in quick-xml 0.38+
450                if let Some(current) = stack.last_mut() {
451                    let raw = e
452                        .decode()
453                        .map_err(|err| XmlError::ParseError(err.to_string()))?;
454                    let resolved = resolve_entity(&raw)?;
455                    // Don't trim entity references - they may be intentional whitespace/control chars
456                    current.push_text_raw(&resolved);
457                }
458            }
459            Event::Decl(_) | Event::Comment(_) | Event::PI(_) | Event::DocType(_) => {}
460            Event::Eof => break,
461        }
462    }
463
464    if !stack.is_empty() {
465        return Err(XmlError::UnbalancedTags);
466    }
467
468    let root = root.ok_or(XmlError::UnexpectedEof)?;
469    let mut events = Vec::new();
470    emit_element_events(&root, &mut events);
471    Ok(events)
472}
473
474fn attach_element(
475    stack: &mut [Element],
476    elem: Element,
477    root: &mut Option<Element>,
478) -> Result<(), XmlError> {
479    if let Some(parent) = stack.last_mut() {
480        parent.children.push(elem);
481    } else if root.is_none() {
482        *root = Some(elem);
483    } else {
484        return Err(XmlError::MultipleRoots);
485    }
486    Ok(())
487}
488
489/// Emit ParseEvents directly from an Element, without intermediate XmlValue.
490fn emit_element_events<'de>(elem: &Element, events: &mut Vec<ParseEvent<'de>>) {
491    let text = elem.text.trim();
492    let has_attrs = !elem.attributes.is_empty();
493    let has_children = !elem.children.is_empty();
494
495    // Case 1: No attributes, no children - emit scalar from text
496    if !has_attrs && !has_children {
497        if text.is_empty() {
498            // Empty element is an empty object (for unit structs)
499            events.push(ParseEvent::StructStart(ContainerKind::Element));
500            events.push(ParseEvent::StructEnd);
501        } else {
502            emit_scalar_from_text(text, events);
503        }
504        return;
505    }
506
507    // Case 2: No attributes, multiple children with same name - emit as array
508    if !has_attrs && has_children && text.is_empty() && elem.children.len() > 1 {
509        let first = &elem.children[0].name;
510        if elem.children.iter().all(|child| &child.name == first) {
511            events.push(ParseEvent::SequenceStart(ContainerKind::Element));
512            for child in &elem.children {
513                emit_element_events(child, events);
514            }
515            events.push(ParseEvent::SequenceEnd);
516            return;
517        }
518    }
519
520    // Case 3: Has attributes or mixed children - emit as struct
521    events.push(ParseEvent::StructStart(ContainerKind::Element));
522
523    // Emit attributes as fields
524    for (qname, value) in &elem.attributes {
525        let mut key = FieldKey::new(
526            Cow::Owned(qname.local_name.clone()),
527            FieldLocationHint::Attribute,
528        );
529        if let Some(ns) = &qname.namespace {
530            key = key.with_namespace(Cow::Owned(ns.clone()));
531        }
532        events.push(ParseEvent::FieldKey(key));
533        // Attributes are always strings
534        events.push(ParseEvent::Scalar(ScalarValue::Str(Cow::Owned(
535            value.clone(),
536        ))));
537    }
538
539    // Group children by (local_name, namespace) to detect arrays
540    let mut grouped: BTreeMap<(&str, Option<&str>), Vec<&Element>> = BTreeMap::new();
541    for child in &elem.children {
542        let key = (
543            child.name.local_name.as_str(),
544            child.name.namespace.as_deref(),
545        );
546        grouped.entry(key).or_default().push(child);
547    }
548
549    // Emit children as fields
550    for ((local_name, namespace), children) in grouped {
551        let mut key = FieldKey::new(Cow::Owned(local_name.to_string()), FieldLocationHint::Child);
552        if let Some(ns) = namespace {
553            key = key.with_namespace(Cow::Owned(ns.to_string()));
554        }
555        events.push(ParseEvent::FieldKey(key));
556
557        if children.len() == 1 {
558            emit_element_events(children[0], events);
559        } else {
560            // Multiple children with same name -> array
561            events.push(ParseEvent::SequenceStart(ContainerKind::Element));
562            for child in children {
563                emit_element_events(child, events);
564            }
565            events.push(ParseEvent::SequenceEnd);
566        }
567    }
568
569    // Emit text content if present (mixed content)
570    if !text.is_empty() {
571        let key = FieldKey::new(Cow::Borrowed("_text"), FieldLocationHint::Text);
572        events.push(ParseEvent::FieldKey(key));
573        events.push(ParseEvent::Scalar(ScalarValue::Str(Cow::Owned(
574            text.to_string(),
575        ))));
576    }
577
578    events.push(ParseEvent::StructEnd);
579}
580
581/// Parse text and emit appropriate scalar event.
582fn emit_scalar_from_text<'de>(text: &str, events: &mut Vec<ParseEvent<'de>>) {
583    if text.eq_ignore_ascii_case("null") {
584        events.push(ParseEvent::Scalar(ScalarValue::Null));
585        return;
586    }
587    if let Ok(b) = text.parse::<bool>() {
588        events.push(ParseEvent::Scalar(ScalarValue::Bool(b)));
589        return;
590    }
591    if let Ok(i) = text.parse::<i64>() {
592        events.push(ParseEvent::Scalar(ScalarValue::I64(i)));
593        return;
594    }
595    if let Ok(u) = text.parse::<u64>() {
596        events.push(ParseEvent::Scalar(ScalarValue::U64(u)));
597        return;
598    }
599    // Try i128/u128 before f64 to avoid precision loss for large integers.
600    // Emit as string so the deserializer can use parse_from_str.
601    if text.parse::<i128>().is_ok() || text.parse::<u128>().is_ok() {
602        events.push(ParseEvent::Scalar(ScalarValue::Str(Cow::Owned(
603            text.to_string(),
604        ))));
605        return;
606    }
607    if let Ok(f) = text.parse::<f64>() {
608        events.push(ParseEvent::Scalar(ScalarValue::F64(f)));
609        return;
610    }
611    events.push(ParseEvent::Scalar(ScalarValue::Str(Cow::Owned(
612        text.to_string(),
613    ))));
614}