facet_format_xml/
parser.rs

1extern crate alloc;
2
3use alloc::borrow::Cow;
4use alloc::collections::BTreeMap;
5use alloc::string::String;
6use alloc::vec::Vec;
7use core::fmt;
8
9use facet_format::{
10    ContainerKind, FieldEvidence, FieldKey, FieldLocationHint, FormatParser, ParseEvent,
11    ProbeStream, ScalarValue,
12};
13use quick_xml::NsReader;
14use quick_xml::escape::resolve_xml_entity;
15use quick_xml::events::Event;
16use quick_xml::name::ResolveResult;
17use std::io::Cursor;
18
/// An XML name made of a local part and, optionally, a namespace URI.
///
/// Namespace identity in XML comes from the URI a prefix is bound to, not from
/// the prefix text used in the document: `android:label` and `a:label` are the
/// same name whenever both prefixes resolve to the same namespace URI.
#[derive(Debug, Clone, PartialEq, Eq)]
#[allow(dead_code)] // Will be used in Phase 2
struct QName {
    /// Namespace URI of the name; `None` stands for "no namespace".
    ///
    /// - An unprefixed element with no default `xmlns` has no namespace.
    /// - An unprefixed attribute never has a namespace (even with a default xmlns).
    /// - A prefixed element/attribute gets its namespace from the xmlns declarations.
    namespace: Option<String>,
    /// Local part of the name (the prefix is not included).
    local_name: String,
}

#[allow(dead_code)] // Will be used in Phase 2
impl QName {
    /// Builds a name that belongs to no namespace.
    fn local(name: impl Into<String>) -> Self {
        let local_name = name.into();
        Self {
            namespace: None,
            local_name,
        }
    }

    /// Builds a name that lives in the namespace identified by `namespace`.
    fn with_ns(namespace: impl Into<String>, local_name: impl Into<String>) -> Self {
        let namespace = Some(namespace.into());
        let local_name = local_name.into();
        Self {
            namespace,
            local_name,
        }
    }

    /// Tests this name against a local name and an optional namespace constraint.
    ///
    /// With `expected_ns == None` only the local name is compared, so any
    /// namespace (or none) is accepted. With `Some(ns)` the local name and the
    /// namespace must both be equal.
    fn matches(&self, local_name: &str, expected_ns: Option<&str>) -> bool {
        let same_local = self.local_name == local_name;
        // When a namespace is expected, comparing the two options directly is
        // the same as comparing against `Some(ns)`.
        let namespace_ok = expected_ns.is_none() || self.namespace.as_deref() == expected_ns;
        same_local && namespace_ok
    }
}

/// Formats the name as `{namespace}local` when a namespace is present, and as
/// the bare local name otherwise.
impl fmt::Display for QName {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        if let Some(ns) = self.namespace.as_deref() {
            write!(f, "{{{ns}}}")?;
        }
        f.write_str(&self.local_name)
    }
}
79
80pub struct XmlParser<'de> {
81    events: Vec<ParseEvent<'de>>,
82    idx: usize,
83    pending_error: Option<XmlError>,
84}
85
86impl<'de> XmlParser<'de> {
87    pub fn new(input: &'de [u8]) -> Self {
88        match build_events(input) {
89            Ok(events) => Self {
90                events,
91                idx: 0,
92                pending_error: None,
93            },
94            Err(err) => Self {
95                events: Vec::new(),
96                idx: 0,
97                pending_error: Some(err),
98            },
99        }
100    }
101}
102
/// Errors reported by the XML parser.
#[derive(Debug, Clone)]
pub enum XmlError {
    /// A failure described by a free-form message.
    ParseError(String),
    /// The XML ended before it was expected to.
    UnexpectedEof,
    /// Opening and closing tags do not pair up.
    UnbalancedTags,
    /// Some part of the XML is not valid UTF-8.
    InvalidUtf8,
    /// The document has more than one root element.
    MultipleRoots,
}

/// Renders a short human-readable message for each error.
impl fmt::Display for XmlError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let fixed_message = match self {
            // The only variant with a payload: prefix followed by the message.
            Self::ParseError(msg) => {
                f.write_str("XML parse error: ")?;
                return f.write_str(msg);
            }
            Self::UnexpectedEof => "Unexpected end of XML",
            Self::UnbalancedTags => "Unbalanced XML tags",
            Self::InvalidUtf8 => "Invalid UTF-8 in XML",
            Self::MultipleRoots => "XML document has multiple root elements",
        };
        f.write_str(fixed_message)
    }
}
123
124impl<'de> FormatParser<'de> for XmlParser<'de> {
125    type Error = XmlError;
126    type Probe<'a>
127        = XmlProbe<'de>
128    where
129        Self: 'a;
130
131    fn next_event(&mut self) -> Result<Option<ParseEvent<'de>>, Self::Error> {
132        if let Some(err) = &self.pending_error {
133            return Err(err.clone());
134        }
135        if self.idx >= self.events.len() {
136            return Ok(None);
137        }
138        let event = self.events[self.idx].clone();
139        self.idx += 1;
140        Ok(Some(event))
141    }
142
143    fn peek_event(&mut self) -> Result<Option<ParseEvent<'de>>, Self::Error> {
144        if let Some(err) = &self.pending_error {
145            return Err(err.clone());
146        }
147        Ok(self.events.get(self.idx).cloned())
148    }
149
150    fn skip_value(&mut self) -> Result<(), Self::Error> {
151        let mut depth = 0usize;
152        loop {
153            let event = self.next_event()?.ok_or(XmlError::UnexpectedEof)?;
154            match event {
155                ParseEvent::StructStart(_) | ParseEvent::SequenceStart(_) => {
156                    depth += 1;
157                }
158                ParseEvent::StructEnd | ParseEvent::SequenceEnd => {
159                    if depth == 0 {
160                        break;
161                    } else {
162                        depth -= 1;
163                    }
164                }
165                ParseEvent::Scalar(_) | ParseEvent::VariantTag(_) => {
166                    if depth == 0 {
167                        break;
168                    }
169                }
170                ParseEvent::FieldKey(_) | ParseEvent::OrderedField => {
171                    // Value will follow; treat as entering one more depth level.
172                    depth += 1;
173                }
174            }
175        }
176        Ok(())
177    }
178
179    fn begin_probe(&mut self) -> Result<Self::Probe<'_>, Self::Error> {
180        // Look ahead in the remaining events to build field evidence
181        let evidence = self.build_probe();
182        Ok(XmlProbe { evidence, idx: 0 })
183    }
184}
185
186impl<'de> XmlParser<'de> {
187    /// Build field evidence by looking ahead at remaining events.
188    fn build_probe(&self) -> Vec<FieldEvidence<'de>> {
189        let mut evidence = Vec::new();
190
191        // Check if we're about to read a struct
192        if self.idx >= self.events.len() {
193            return evidence;
194        }
195
196        if !matches!(
197            self.events.get(self.idx),
198            Some(ParseEvent::StructStart(ContainerKind::Element))
199        ) {
200            return evidence;
201        }
202
203        // Scan the struct's fields
204        let mut i = self.idx + 1;
205        let mut depth = 0usize;
206
207        while i < self.events.len() {
208            match &self.events[i] {
209                ParseEvent::StructStart(_) | ParseEvent::SequenceStart(_) => {
210                    depth += 1;
211                    i += 1;
212                }
213                ParseEvent::StructEnd | ParseEvent::SequenceEnd => {
214                    if depth == 0 {
215                        // End of the struct we're probing
216                        break;
217                    }
218                    depth -= 1;
219                    i += 1;
220                }
221                ParseEvent::FieldKey(key) if depth == 0 => {
222                    // This is a top-level field in the struct we're probing
223                    // Look at the next event to see if it's a scalar
224                    let scalar_value = if let Some(next_event) = self.events.get(i + 1) {
225                        match next_event {
226                            ParseEvent::Scalar(sv) => Some(sv.clone()),
227                            _ => None,
228                        }
229                    } else {
230                        None
231                    };
232
233                    if let Some(sv) = scalar_value {
234                        evidence.push(FieldEvidence::with_scalar_value(
235                            key.name.clone(),
236                            key.location,
237                            None,
238                            sv,
239                            key.namespace.clone(),
240                        ));
241                    } else {
242                        evidence.push(FieldEvidence::new(
243                            key.name.clone(),
244                            key.location,
245                            None,
246                            key.namespace.clone(),
247                        ));
248                    }
249                    i += 1;
250                }
251                _ => {
252                    i += 1;
253                }
254            }
255        }
256
257        evidence
258    }
259}
260
261pub struct XmlProbe<'de> {
262    evidence: Vec<FieldEvidence<'de>>,
263    idx: usize,
264}
265
266impl<'de> ProbeStream<'de> for XmlProbe<'de> {
267    type Error = XmlError;
268
269    fn next(&mut self) -> Result<Option<FieldEvidence<'de>>, Self::Error> {
270        if self.idx >= self.evidence.len() {
271            Ok(None)
272        } else {
273            let ev = self.evidence[self.idx].clone();
274            self.idx += 1;
275            Ok(Some(ev))
276        }
277    }
278}
279
280/// Resolve a namespace from quick-xml's ResolveResult.
281fn resolve_namespace(resolve: ResolveResult<'_>) -> Result<Option<String>, XmlError> {
282    match resolve {
283        ResolveResult::Bound(ns) => Ok(Some(String::from_utf8_lossy(ns.as_ref()).into_owned())),
284        ResolveResult::Unbound => Ok(None),
285        ResolveResult::Unknown(_) => {
286            // Unknown prefix - treat as no namespace
287            Ok(None)
288        }
289    }
290}
291
292/// Resolve a general entity reference to its character value.
293/// Handles both named entities (lt, gt, amp, etc.) and numeric entities (&#10;, &#x09;, etc.)
294fn resolve_entity(raw: &str) -> Result<String, XmlError> {
295    // Try named entity first (e.g., "lt" -> "<")
296    if let Some(resolved) = resolve_xml_entity(raw) {
297        return Ok(resolved.into());
298    }
299
300    // Try numeric entity (e.g., "#10" -> "\n", "#x09" -> "\t")
301    if let Some(rest) = raw.strip_prefix('#') {
302        let code = if let Some(hex) = rest.strip_prefix('x').or_else(|| rest.strip_prefix('X')) {
303            // Hexadecimal numeric entity
304            u32::from_str_radix(hex, 16).map_err(|_| {
305                XmlError::ParseError(format!("Invalid hex numeric entity: #{}", rest))
306            })?
307        } else {
308            // Decimal numeric entity
309            rest.parse::<u32>().map_err(|_| {
310                XmlError::ParseError(format!("Invalid decimal numeric entity: #{}", rest))
311            })?
312        };
313
314        let ch = char::from_u32(code)
315            .ok_or_else(|| XmlError::ParseError(format!("Invalid Unicode code point: {}", code)))?;
316        return Ok(ch.to_string());
317    }
318
319    // Unknown entity - return as-is with & and ;
320    Ok(format!("&{};", raw))
321}
322
323#[derive(Debug, Clone)]
324struct Element {
325    name: QName,
326    attributes: Vec<(QName, String)>,
327    children: Vec<Element>,
328    text: String,
329}
330
331impl Element {
332    fn new(name: QName, attributes: Vec<(QName, String)>) -> Self {
333        Self {
334            name,
335            attributes,
336            children: Vec::new(),
337            text: String::new(),
338        }
339    }
340
341    fn push_text(&mut self, text: &str) {
342        self.push_text_impl(text, true);
343    }
344
345    fn push_text_raw(&mut self, text: &str) {
346        self.push_text_impl(text, false);
347    }
348
349    fn push_text_impl(&mut self, text: &str, should_trim: bool) {
350        let content = if should_trim { text.trim() } else { text };
351        if content.is_empty() {
352            return;
353        }
354        self.text.push_str(content);
355    }
356}
357
358fn build_events<'de>(input: &'de [u8]) -> Result<Vec<ParseEvent<'de>>, XmlError> {
359    let mut reader = NsReader::from_reader(Cursor::new(input));
360    reader.config_mut().trim_text(true);
361
362    let mut buf = Vec::new();
363    let mut stack: Vec<Element> = Vec::new();
364    let mut root: Option<Element> = None;
365
366    loop {
367        buf.clear();
368        let (resolve, event) = reader
369            .read_resolved_event_into(&mut buf)
370            .map_err(|e| XmlError::ParseError(e.to_string()))?;
371
372        match event {
373            Event::Start(ref e) | Event::Empty(ref e) => {
374                // Resolve element namespace
375                let ns = resolve_namespace(resolve)?;
376                let local = core::str::from_utf8(e.local_name().as_ref())
377                    .map_err(|_| XmlError::InvalidUtf8)?
378                    .to_string();
379                let name = match ns {
380                    Some(uri) => QName::with_ns(uri, local),
381                    None => QName::local(local),
382                };
383
384                // Resolve attribute namespaces
385                let mut attributes = Vec::new();
386                for attr in e.attributes() {
387                    let attr = attr.map_err(|e| XmlError::ParseError(e.to_string()))?;
388
389                    // Skip xmlns declarations (xmlns and xmlns:*)
390                    let key = attr.key;
391                    if key.as_ref() == b"xmlns" {
392                        continue; // Skip default namespace declaration
393                    }
394                    if let Some(prefix) = key.prefix()
395                        && prefix.as_ref() == b"xmlns"
396                    {
397                        continue; // Skip prefixed namespace declarations
398                    }
399
400                    let (attr_resolve, _) = reader.resolve_attribute(key);
401                    let attr_ns = resolve_namespace(attr_resolve)?;
402                    let attr_local = core::str::from_utf8(key.local_name().as_ref())
403                        .map_err(|_| XmlError::InvalidUtf8)?
404                        .to_string();
405                    let attr_qname = match attr_ns {
406                        Some(uri) => QName::with_ns(uri, attr_local),
407                        None => QName::local(attr_local),
408                    };
409                    let value = attr
410                        .unescape_value()
411                        .map_err(|e| XmlError::ParseError(e.to_string()))?
412                        .into_owned();
413                    attributes.push((attr_qname, value));
414                }
415
416                let elem = Element::new(name, attributes);
417
418                if matches!(event, Event::Start(_)) {
419                    stack.push(elem);
420                } else {
421                    // Empty element
422                    attach_element(stack.as_mut_slice(), elem, &mut root)?;
423                }
424            }
425            Event::End(_) => {
426                let elem = stack.pop().ok_or(XmlError::UnbalancedTags)?;
427                attach_element(stack.as_mut_slice(), elem, &mut root)?;
428            }
429            Event::Text(e) => {
430                if let Some(current) = stack.last_mut() {
431                    let text = e
432                        .decode()
433                        .map_err(|err| XmlError::ParseError(err.to_string()))?;
434                    current.push_text(text.as_ref());
435                }
436            }
437            Event::CData(e) => {
438                if let Some(current) = stack.last_mut() {
439                    let text =
440                        core::str::from_utf8(e.as_ref()).map_err(|_| XmlError::InvalidUtf8)?;
441                    current.push_text(text);
442                }
443            }
444            Event::GeneralRef(e) => {
445                // General entity references (e.g., &lt;, &gt;, &amp;, &#10;, etc.)
446                // These are now reported separately in quick-xml 0.38+
447                if let Some(current) = stack.last_mut() {
448                    let raw = e
449                        .decode()
450                        .map_err(|err| XmlError::ParseError(err.to_string()))?;
451                    let resolved = resolve_entity(&raw)?;
452                    // Don't trim entity references - they may be intentional whitespace/control chars
453                    current.push_text_raw(&resolved);
454                }
455            }
456            Event::Decl(_) | Event::Comment(_) | Event::PI(_) | Event::DocType(_) => {}
457            Event::Eof => break,
458        }
459    }
460
461    if !stack.is_empty() {
462        return Err(XmlError::UnbalancedTags);
463    }
464
465    let root = root.ok_or(XmlError::UnexpectedEof)?;
466    let mut events = Vec::new();
467    emit_element_events(&root, &mut events);
468    Ok(events)
469}
470
471fn attach_element(
472    stack: &mut [Element],
473    elem: Element,
474    root: &mut Option<Element>,
475) -> Result<(), XmlError> {
476    if let Some(parent) = stack.last_mut() {
477        parent.children.push(elem);
478    } else if root.is_none() {
479        *root = Some(elem);
480    } else {
481        return Err(XmlError::MultipleRoots);
482    }
483    Ok(())
484}
485
486/// Emit ParseEvents directly from an Element, without intermediate XmlValue.
487fn emit_element_events<'de>(elem: &Element, events: &mut Vec<ParseEvent<'de>>) {
488    let text = elem.text.trim();
489    let has_attrs = !elem.attributes.is_empty();
490    let has_children = !elem.children.is_empty();
491
492    // Case 1: No attributes, no children - emit scalar from text
493    if !has_attrs && !has_children {
494        if text.is_empty() {
495            // Empty element is an empty object (for unit structs)
496            events.push(ParseEvent::StructStart(ContainerKind::Element));
497            events.push(ParseEvent::StructEnd);
498        } else {
499            emit_scalar_from_text(text, events);
500        }
501        return;
502    }
503
504    // Case 2: No attributes, multiple children with same name - emit as array
505    if !has_attrs && has_children && text.is_empty() && elem.children.len() > 1 {
506        let first = &elem.children[0].name;
507        if elem.children.iter().all(|child| &child.name == first) {
508            events.push(ParseEvent::SequenceStart(ContainerKind::Element));
509            for child in &elem.children {
510                emit_element_events(child, events);
511            }
512            events.push(ParseEvent::SequenceEnd);
513            return;
514        }
515    }
516
517    // Case 3: Has attributes or mixed children - emit as struct
518    events.push(ParseEvent::StructStart(ContainerKind::Element));
519
520    // Emit attributes as fields
521    for (qname, value) in &elem.attributes {
522        let mut key = FieldKey::new(
523            Cow::Owned(qname.local_name.clone()),
524            FieldLocationHint::Attribute,
525        );
526        if let Some(ns) = &qname.namespace {
527            key = key.with_namespace(Cow::Owned(ns.clone()));
528        }
529        events.push(ParseEvent::FieldKey(key));
530        // Attributes are always strings
531        events.push(ParseEvent::Scalar(ScalarValue::Str(Cow::Owned(
532            value.clone(),
533        ))));
534    }
535
536    // Group children by (local_name, namespace) to detect arrays
537    let mut grouped: BTreeMap<(&str, Option<&str>), Vec<&Element>> = BTreeMap::new();
538    for child in &elem.children {
539        let key = (
540            child.name.local_name.as_str(),
541            child.name.namespace.as_deref(),
542        );
543        grouped.entry(key).or_default().push(child);
544    }
545
546    // Emit children as fields
547    for ((local_name, namespace), children) in grouped {
548        let mut key = FieldKey::new(Cow::Owned(local_name.to_string()), FieldLocationHint::Child);
549        if let Some(ns) = namespace {
550            key = key.with_namespace(Cow::Owned(ns.to_string()));
551        }
552        events.push(ParseEvent::FieldKey(key));
553
554        if children.len() == 1 {
555            emit_element_events(children[0], events);
556        } else {
557            // Multiple children with same name -> array
558            events.push(ParseEvent::SequenceStart(ContainerKind::Element));
559            for child in children {
560                emit_element_events(child, events);
561            }
562            events.push(ParseEvent::SequenceEnd);
563        }
564    }
565
566    // Emit text content if present (mixed content)
567    if !text.is_empty() {
568        let key = FieldKey::new(Cow::Borrowed("_text"), FieldLocationHint::Text);
569        events.push(ParseEvent::FieldKey(key));
570        events.push(ParseEvent::Scalar(ScalarValue::Str(Cow::Owned(
571            text.to_string(),
572        ))));
573    }
574
575    events.push(ParseEvent::StructEnd);
576}
577
578/// Parse text and emit appropriate scalar event.
579fn emit_scalar_from_text<'de>(text: &str, events: &mut Vec<ParseEvent<'de>>) {
580    if text.eq_ignore_ascii_case("null") {
581        events.push(ParseEvent::Scalar(ScalarValue::Null));
582        return;
583    }
584    if let Ok(b) = text.parse::<bool>() {
585        events.push(ParseEvent::Scalar(ScalarValue::Bool(b)));
586        return;
587    }
588    if let Ok(i) = text.parse::<i64>() {
589        events.push(ParseEvent::Scalar(ScalarValue::I64(i)));
590        return;
591    }
592    if let Ok(u) = text.parse::<u64>() {
593        events.push(ParseEvent::Scalar(ScalarValue::U64(u)));
594        return;
595    }
596    // Try i128/u128 before f64 to avoid precision loss for large integers.
597    // Emit as string so the deserializer can use parse_from_str.
598    if text.parse::<i128>().is_ok() || text.parse::<u128>().is_ok() {
599        events.push(ParseEvent::Scalar(ScalarValue::Str(Cow::Owned(
600            text.to_string(),
601        ))));
602        return;
603    }
604    if let Ok(f) = text.parse::<f64>() {
605        events.push(ParseEvent::Scalar(ScalarValue::F64(f)));
606        return;
607    }
608    events.push(ParseEvent::Scalar(ScalarValue::Str(Cow::Owned(
609        text.to_string(),
610    ))));
611}