Skip to main content

supersigil_parser/
xml_parser.rs

1//! XML subset parser for `supersigil-xml` fence content.
2//!
3//! Parses a strict XML subset supporting:
4//! - `PascalCase` elements with double-quoted string attributes
5//! - Self-closing elements (`<Foo />`)
6//! - Nested elements
7//! - Text content between elements
8//! - Entity references: `&amp;`, `&lt;`, `&gt;`, `&quot;`
9//!
//! Rejects processing instructions, CDATA, DTD, namespaces, and unsupported
//! entity references. XML comments are accepted and silently skipped.
12//!
13//! Implemented on top of `quick-xml` for correctness and robustness.
14
15use std::path::Path;
16
17use quick_xml::Reader;
18use quick_xml::events::Event;
19use supersigil_core::ParseError;
20
21use crate::util::line_col;
22
23// ---------------------------------------------------------------------------
24// Public types
25// ---------------------------------------------------------------------------
26
/// A node produced by parsing the content of a `supersigil-xml` fence.
///
/// All offsets are byte offsets relative to the source file (the fence
/// offset passed to the parser is already added in).
///
/// Every field is a `String`, `usize`, or `Vec`, so equality is total and
/// the type derives `Eq` in addition to `PartialEq`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum XmlNode {
    /// An XML element with name, attributes, children, and source span.
    Element {
        /// Element tag name (e.g. `Criterion`).
        name: String,
        /// `(key, value)` attribute pairs in source order.
        attributes: Vec<(String, String)>,
        /// Child nodes (elements and/or text); empty for self-closing tags.
        children: Vec<XmlNode>,
        /// Byte offset of the opening `<` relative to the source file.
        offset: usize,
        /// Byte offset past the closing `>` relative to the source file.
        end_offset: usize,
    },
    /// A run of text content with entity references already resolved.
    Text {
        /// The decoded text content.
        content: String,
        /// Byte offset of the start of the text relative to the source file.
        offset: usize,
        /// Byte offset past the last byte of the raw source text (the
        /// original text/entity-reference run). This can differ from
        /// `offset + content.len()` when entity references are present,
        /// because the decoded text is shorter than the raw source.
        end_offset: usize,
    },
}
56
57// ---------------------------------------------------------------------------
58// Synthetic root tag
59// ---------------------------------------------------------------------------
60
/// Name of the wrapper element placed around the fence content so that
/// quick-xml is handed a single-rooted document even when the content has
/// several top-level elements. The leading underscores mean it can never be
/// a valid `PascalCase` supersigil element name.
const SYNTHETIC_ROOT: &str = "__root";
/// Opening tag of the synthetic root, prepended to the fence content.
const SYNTHETIC_OPEN: &str = "<__root>";
/// Closing tag of the synthetic root, appended to the fence content.
const SYNTHETIC_CLOSE: &str = "</__root>";
67
68// ---------------------------------------------------------------------------
69// Public API
70// ---------------------------------------------------------------------------
71
72/// Parse the content of a `supersigil-xml` fence into structured XML nodes.
73///
74/// `content` is the raw text between the fence delimiters.
75/// `fence_offset` is the byte offset of the fence content start within the
76/// source file, used to produce file-absolute offsets.
77/// `path` is the file path for error messages.
78///
79/// # Errors
80///
81/// Returns [`ParseError::XmlSyntaxError`] for any syntax or validation error.
82/// Line and column numbers in errors are **content-relative** (i.e. relative
83/// to the start of the fence content, not the file). The caller must adjust
84/// for the fence's position within the file if file-absolute positions are
85/// needed.
86pub fn parse_supersigil_xml(
87    content: &str,
88    fence_offset: usize,
89    path: &Path,
90) -> Result<Vec<XmlNode>, ParseError> {
91    // Reject unsupported constructs that quick-xml would silently handle.
92    reject_unsupported(content, path)?;
93
94    // Wrap in a synthetic root so quick-xml can handle multiple top-level
95    // elements (which is not valid XML on its own).
96    let wrapped = format!("{SYNTHETIC_OPEN}{content}{SYNTHETIC_CLOSE}");
97
98    let mut reader = Reader::from_str(&wrapped);
99    reader.config_mut().trim_text(false);
100
101    // The synthetic root open tag shifts all byte positions by its length.
102    // quick-xml uses u64 for buffer positions.
103    #[allow(clippy::cast_possible_truncation, reason = "SYNTHETIC_OPEN is 8 bytes")]
104    let root_tag_len: u64 = SYNTHETIC_OPEN.len() as u64;
105
106    // Skip the synthetic root's opening event.
107    loop {
108        match reader.read_event() {
109            Ok(Event::Start(ref e)) if e.name().as_ref() == SYNTHETIC_ROOT.as_bytes() => break,
110            Ok(Event::Eof) => break,
111            Err(e) => {
112                return Err(quick_xml_error_to_parse_error(&e, path));
113            }
114            _ => {}
115        }
116    }
117
118    // Parse children of the synthetic root (= top-level nodes).
119    let nodes = parse_children(
120        &mut reader,
121        SYNTHETIC_ROOT,
122        content,
123        fence_offset,
124        root_tag_len,
125        path,
126    )?;
127
128    Ok(nodes)
129}
130
131// ---------------------------------------------------------------------------
132// Pre-scan: reject unsupported constructs before quick-xml parsing
133// ---------------------------------------------------------------------------
134
135/// Scan the raw content for constructs that our XML subset forbids.
136///
137/// We do this before handing the input to quick-xml because quick-xml would
138/// either silently handle these (comments, CDATA) or produce cryptic errors.
139fn reject_unsupported(content: &str, path: &Path) -> Result<(), ParseError> {
140    let bytes = content.as_bytes();
141    let mut i = 0;
142    while i < bytes.len() {
143        if bytes[i] == b'<' {
144            if bytes[i..].starts_with(b"<?") {
145                return Err(make_error(
146                    content,
147                    i,
148                    path,
149                    "processing instructions (`<?...?>`) are not supported",
150                ));
151            }
152            if bytes[i..].starts_with(b"<![CDATA[") {
153                return Err(make_error(
154                    content,
155                    i,
156                    path,
157                    "CDATA sections (`<![CDATA[...]]>`) are not supported",
158                ));
159            }
160            if bytes[i..].starts_with(b"<!DOCTYPE") || bytes[i..].starts_with(b"<!doctype") {
161                return Err(make_error(
162                    content,
163                    i,
164                    path,
165                    "DTD declarations (`<!DOCTYPE ...>`) are not supported",
166                ));
167            }
168            // XML comments are allowed and silently skipped during parsing.
169        }
170        i += 1;
171    }
172    Ok(())
173}
174
175// ---------------------------------------------------------------------------
176// Recursive event-driven parser
177// ---------------------------------------------------------------------------
178
179/// Parse child nodes until the closing tag for `parent_name` is encountered.
180///
181/// Text and entity-reference events are accumulated into a single `XmlNode::Text`
182/// because quick-xml emits `Text` / `GeneralRef` / `Text` sequences for content
183/// like `a &lt; b`.
184#[allow(
185    clippy::too_many_lines,
186    reason = "event-loop structure is clearest as a single function"
187)]
188fn parse_children(
189    reader: &mut Reader<&[u8]>,
190    parent_name: &str,
191    content: &str,
192    fence_offset: usize,
193    root_tag_len: u64,
194    path: &Path,
195) -> Result<Vec<XmlNode>, ParseError> {
196    /// Flush accumulated text into the node list.
197    fn flush_text(
198        text_buf: &mut String,
199        text_start: &mut Option<usize>,
200        text_end: &mut Option<usize>,
201        nodes: &mut Vec<XmlNode>,
202        is_top_level: bool,
203    ) {
204        if text_buf.is_empty() {
205            return;
206        }
207        // At top level, skip whitespace-only text nodes.
208        if is_top_level && text_buf.trim().is_empty() {
209            text_buf.clear();
210            *text_start = None;
211            *text_end = None;
212            return;
213        }
214        let start = text_start.take().unwrap_or(0);
215        let end = text_end.take().unwrap_or(start);
216        nodes.push(XmlNode::Text {
217            content: std::mem::take(text_buf),
218            offset: start,
219            end_offset: end,
220        });
221    }
222
223    let mut nodes: Vec<XmlNode> = Vec::new();
224    // Accumulator for runs of Text + GeneralRef events.
225    let mut text_buf = String::new();
226    // Byte offset (file-absolute) of the first event in the current text run.
227    let mut text_start_offset: Option<usize> = None;
228    // Byte offset (file-absolute) past the last byte of the current text run.
229    let mut text_end_offset: Option<usize> = None;
230    let is_top_level = parent_name == SYNTHETIC_ROOT;
231
232    loop {
233        let event_pos = reader.buffer_position();
234
235        match reader.read_event() {
236            Ok(Event::Start(ref e)) => {
237                flush_text(
238                    &mut text_buf,
239                    &mut text_start_offset,
240                    &mut text_end_offset,
241                    &mut nodes,
242                    is_top_level,
243                );
244
245                let offset_in_content = content_offset(event_pos, root_tag_len);
246                let file_offset = fence_offset + offset_in_content;
247
248                let tag_name = decode_name(e.name().as_ref(), content, offset_in_content, path)?;
249                validate_element_name(&tag_name, content, offset_in_content, path)?;
250                let attributes = parse_attributes(e, content, offset_in_content, path)?;
251
252                let children =
253                    parse_children(reader, &tag_name, content, fence_offset, root_tag_len, path)?;
254
255                // After parse_children returns, reader is past the closing `>`.
256                let end_in_content = content_offset(reader.buffer_position(), root_tag_len);
257                let file_end_offset = fence_offset + end_in_content;
258
259                nodes.push(XmlNode::Element {
260                    name: tag_name,
261                    attributes,
262                    children,
263                    offset: file_offset,
264                    end_offset: file_end_offset,
265                });
266            }
267
268            Ok(Event::Empty(ref e)) => {
269                flush_text(
270                    &mut text_buf,
271                    &mut text_start_offset,
272                    &mut text_end_offset,
273                    &mut nodes,
274                    is_top_level,
275                );
276
277                let offset_in_content = content_offset(event_pos, root_tag_len);
278                let file_offset = fence_offset + offset_in_content;
279
280                let tag_name = decode_name(e.name().as_ref(), content, offset_in_content, path)?;
281                validate_element_name(&tag_name, content, offset_in_content, path)?;
282                let attributes = parse_attributes(e, content, offset_in_content, path)?;
283
284                // After reading Empty event, reader is past the closing `/>`.
285                let end_in_content = content_offset(reader.buffer_position(), root_tag_len);
286                let file_end_offset = fence_offset + end_in_content;
287
288                nodes.push(XmlNode::Element {
289                    name: tag_name,
290                    attributes,
291                    children: vec![],
292                    offset: file_offset,
293                    end_offset: file_end_offset,
294                });
295            }
296
297            Ok(Event::Text(ref e)) => {
298                // Accumulate raw text (no entity refs in here — those come as GeneralRef).
299                let raw = std::str::from_utf8(e.as_ref()).map_err(|_err| {
300                    let off = content_offset(event_pos, root_tag_len);
301                    make_error(content, off, path, "invalid UTF-8 in text content")
302                })?;
303                let off = content_offset(event_pos, root_tag_len);
304                if text_start_offset.is_none() {
305                    text_start_offset = Some(fence_offset + off);
306                }
307                // Text events have no entity expansion, so raw byte length
308                // equals the source byte length.
309                text_end_offset = Some(fence_offset + off + raw.len());
310                text_buf.push_str(raw);
311            }
312
313            Ok(Event::GeneralRef(ref e)) => {
314                // Entity reference: e.g. `&amp;` arrives as GeneralRef with content `amp`.
315                let entity_name = std::str::from_utf8(e.as_ref()).map_err(|_err| {
316                    let off = content_offset(event_pos, root_tag_len);
317                    make_error(content, off, path, "invalid UTF-8 in entity reference")
318                })?;
319                let off = content_offset(event_pos, root_tag_len);
320                if text_start_offset.is_none() {
321                    text_start_offset = Some(fence_offset + off);
322                }
323                // Raw source: `&` + entity_name + `;` — so raw length is
324                // entity_name.len() + 2.
325                text_end_offset = Some(fence_offset + off + entity_name.len() + 2);
326                let resolved = resolve_entity(entity_name, content, off, path)?;
327                text_buf.push_str(resolved);
328            }
329
330            Ok(Event::End(ref e)) => {
331                let name_bytes = e.name();
332                let end_name = std::str::from_utf8(name_bytes.as_ref()).unwrap_or("<invalid>");
333                if end_name == parent_name {
334                    flush_text(
335                        &mut text_buf,
336                        &mut text_start_offset,
337                        &mut text_end_offset,
338                        &mut nodes,
339                        is_top_level,
340                    );
341                    return Ok(nodes);
342                }
343                // Mismatch — provide our own message.
344                let offset_in_content = content_offset(event_pos, root_tag_len);
345                return Err(make_error(
346                    content,
347                    offset_in_content,
348                    path,
349                    &format!(
350                        "mismatched closing tag: expected `</{parent_name}>`, found `</{end_name}>`"
351                    ),
352                ));
353            }
354
355            Ok(Event::Eof) => {
356                flush_text(
357                    &mut text_buf,
358                    &mut text_start_offset,
359                    &mut text_end_offset,
360                    &mut nodes,
361                    is_top_level,
362                );
363                if is_top_level {
364                    return Ok(nodes);
365                }
366                return Err(make_error(
367                    content,
368                    content.len(),
369                    path,
370                    &format!("expected closing tag `</{parent_name}>`, found end of input"),
371                ));
372            }
373
374            // XML comments are silently skipped.
375            Ok(Event::Comment(_)) => {}
376
377            // Constructs rejected by pre-scan but caught here as a safety net.
378            Ok(Event::CData(_)) => {
379                let off = content_offset(event_pos, root_tag_len);
380                return Err(make_error(
381                    content,
382                    off,
383                    path,
384                    "CDATA sections (`<![CDATA[...]]>`) are not supported",
385                ));
386            }
387            Ok(Event::PI(_) | Event::Decl(_)) => {
388                let off = content_offset(event_pos, root_tag_len);
389                return Err(make_error(
390                    content,
391                    off,
392                    path,
393                    "processing instructions (`<?...?>`) are not supported",
394                ));
395            }
396            Ok(Event::DocType(_)) => {
397                let off = content_offset(event_pos, root_tag_len);
398                return Err(make_error(
399                    content,
400                    off,
401                    path,
402                    "DTD declarations (`<!DOCTYPE ...>`) are not supported",
403                ));
404            }
405
406            Err(e) => {
407                return Err(quick_xml_error_to_parse_error(&e, path));
408            }
409        }
410    }
411}
412
413// ---------------------------------------------------------------------------
414// Attribute parsing
415// ---------------------------------------------------------------------------
416
417/// Extract attributes from a quick-xml `BytesStart` event.
418fn parse_attributes(
419    event: &quick_xml::events::BytesStart<'_>,
420    content: &str,
421    offset_in_content: usize,
422    path: &Path,
423) -> Result<Vec<(String, String)>, ParseError> {
424    // First, validate that all attribute values are double-quoted by scanning
425    // the raw event bytes.
426    validate_attribute_quotes(event, content, offset_in_content, path)?;
427
428    let mut attrs = Vec::new();
429    for attr_result in event.attributes() {
430        let attr = attr_result.map_err(|e| {
431            let msg = format!("{e}");
432            make_error(content, offset_in_content, path, &msg)
433        })?;
434
435        let key = decode_name(attr.key.as_ref(), content, offset_in_content, path)?;
436
437        // Reject namespaced attributes.
438        if key.contains(':') {
439            return Err(make_error(
440                content,
441                offset_in_content,
442                path,
443                &format!("namespaced attribute `{key}` is not supported"),
444            ));
445        }
446
447        // Decode the value, resolving our supported entity subset.
448        let raw_value = std::str::from_utf8(attr.value.as_ref()).map_err(|_err| {
449            make_error(
450                content,
451                offset_in_content,
452                path,
453                "invalid UTF-8 in attribute value",
454            )
455        })?;
456        let value = resolve_entities_in_str(raw_value, content, offset_in_content, path)?;
457
458        attrs.push((key, value));
459    }
460    Ok(attrs)
461}
462
463/// Check that all attribute values use double quotes (not single quotes,
464/// not unquoted).
465///
466/// quick-xml accepts both single- and double-quoted attribute values, so we
467/// must inspect the raw tag bytes to enforce our subset requirement.
468fn validate_attribute_quotes(
469    event: &quick_xml::events::BytesStart<'_>,
470    content: &str,
471    offset_in_content: usize,
472    path: &Path,
473) -> Result<(), ParseError> {
474    let raw: &[u8] = event.as_ref(); // bytes after the tag name
475    let mut i = 0;
476    let mut in_double_quote = false;
477    while i < raw.len() {
478        if in_double_quote {
479            if raw[i] == b'"' {
480                in_double_quote = false;
481            }
482            i += 1;
483            continue;
484        }
485        if raw[i] == b'"' {
486            in_double_quote = true;
487            i += 1;
488            continue;
489        }
490        if raw[i] == b'=' {
491            // Skip whitespace after `=`.
492            i += 1;
493            while i < raw.len() && raw[i].is_ascii_whitespace() {
494                i += 1;
495            }
496            if i < raw.len() && raw[i] == b'\'' {
497                return Err(make_error(
498                    content,
499                    offset_in_content,
500                    path,
501                    "attribute values must be double-quoted",
502                ));
503            }
504            // Don't advance `i` here — let the main loop handle the
505            // opening `"` so that double-quote tracking activates.
506            continue;
507        }
508        i += 1;
509    }
510    Ok(())
511}
512
513// ---------------------------------------------------------------------------
514// Entity resolution
515// ---------------------------------------------------------------------------
516
517/// Resolve a single entity name to its replacement character.
518fn resolve_entity(
519    name: &str,
520    content: &str,
521    offset_in_content: usize,
522    path: &Path,
523) -> Result<&'static str, ParseError> {
524    match name {
525        "amp" => Ok("&"),
526        "lt" => Ok("<"),
527        "gt" => Ok(">"),
528        "quot" => Ok("\""),
529        _ => Err(make_error(
530            content,
531            offset_in_content,
532            path,
533            &format!("unsupported entity reference `&{name};`"),
534        )),
535    }
536}
537
538/// Resolve entity references in a string (used for attribute values where
539/// quick-xml delivers the raw encoded text).
540fn resolve_entities_in_str(
541    text: &str,
542    content: &str,
543    offset_in_content: usize,
544    path: &Path,
545) -> Result<String, ParseError> {
546    if !text.contains('&') {
547        return Ok(text.to_owned());
548    }
549
550    let mut result = String::with_capacity(text.len());
551    let mut rest = text;
552
553    while let Some(amp_pos) = rest.find('&') {
554        result.push_str(&rest[..amp_pos]);
555        rest = &rest[amp_pos + 1..];
556        if let Some(semi_pos) = rest.find(';') {
557            let entity_name = &rest[..semi_pos];
558            let resolved = resolve_entity(entity_name, content, offset_in_content, path)?;
559            result.push_str(resolved);
560            rest = &rest[semi_pos + 1..];
561        } else {
562            return Err(make_error(
563                content,
564                offset_in_content,
565                path,
566                "unterminated entity reference (missing `;`)",
567            ));
568        }
569    }
570    result.push_str(rest);
571    Ok(result)
572}
573
574// ---------------------------------------------------------------------------
575// Name decoding
576// ---------------------------------------------------------------------------
577
578/// Decode a raw byte slice to a `String`, producing a parse error for invalid
579/// UTF-8.
580fn decode_name(
581    raw: &[u8],
582    content: &str,
583    offset_in_content: usize,
584    path: &Path,
585) -> Result<String, ParseError> {
586    std::str::from_utf8(raw)
587        .map(str::to_owned)
588        .map_err(|_err| make_error(content, offset_in_content, path, "invalid UTF-8 in name"))
589}
590
591// ---------------------------------------------------------------------------
592// Validation helpers
593// ---------------------------------------------------------------------------
594
595/// Validate that an element name is `PascalCase` (starts with uppercase ASCII)
596/// and does not contain namespace prefixes.
597fn validate_element_name(
598    name: &str,
599    content: &str,
600    offset_in_content: usize,
601    path: &Path,
602) -> Result<(), ParseError> {
603    if name.contains(':') {
604        return Err(make_error(
605            content,
606            offset_in_content,
607            path,
608            &format!("namespaced element `{name}` is not supported"),
609        ));
610    }
611    Ok(())
612}
613
614// ---------------------------------------------------------------------------
615// Error helpers
616// ---------------------------------------------------------------------------
617
/// Translate a quick-xml buffer position (`u64`, measured in the wrapped
/// input) into a `usize` offset into the fence content by removing the
/// length of the synthetic root's opening tag.
///
/// XML fence content is always small (well under 4 GiB), so the narrowing
/// cast on 32-bit targets is not a concern in practice.
#[allow(
    clippy::cast_possible_truncation,
    reason = "XML fence content is always small"
)]
fn content_offset(event_pos: u64, root_tag_len: u64) -> usize {
    let relative = event_pos - root_tag_len;
    relative as usize
}
630
631/// Build a `ParseError::XmlSyntaxError` from a byte position in the content.
632fn make_error(content: &str, offset_in_content: usize, path: &Path, message: &str) -> ParseError {
633    let (line, column) = line_col(content, offset_in_content);
634    ParseError::XmlSyntaxError {
635        path: path.to_path_buf(),
636        line,
637        column,
638        message: message.to_owned(),
639    }
640}
641
642/// Convert a `quick_xml::Error` into a `ParseError::XmlSyntaxError`.
643///
644/// The line 1, column 1 fallback is deliberate: quick-xml does not expose
645/// byte positions for structural errors, so we cannot compute a more precise
646/// location.
647fn quick_xml_error_to_parse_error(err: &quick_xml::Error, path: &Path) -> ParseError {
648    let message = match err {
649        quick_xml::Error::IllFormed(ill) => match ill {
650            quick_xml::errors::IllFormedError::MismatchedEndTag { expected, found } => {
651                if expected == SYNTHETIC_ROOT {
652                    // Closing tag at the top level with no matching open tag.
653                    format!("unexpected closing tag `</{found}>` at top level")
654                } else {
655                    format!("mismatched closing tag: expected `</{expected}>`, found `</{found}>`")
656                }
657            }
658            quick_xml::errors::IllFormedError::UnmatchedEndTag(name) => {
659                format!("unexpected closing tag `</{name}>` at top level")
660            }
661            quick_xml::errors::IllFormedError::MissingEndTag(name) => {
662                format!("expected closing tag `</{name}>`, found end of input")
663            }
664            quick_xml::errors::IllFormedError::UnclosedReference => {
665                "unterminated entity reference (missing `;`)".to_owned()
666            }
667            other => format!("{other}"),
668        },
669        other => format!("{other}"),
670    };
671
672    ParseError::XmlSyntaxError {
673        path: path.to_path_buf(),
674        line: 1,
675        column: 1,
676        message,
677    }
678}
679
680// ---------------------------------------------------------------------------
681// Tests
682// ---------------------------------------------------------------------------
683
684#[cfg(test)]
685#[allow(
686    clippy::match_wildcard_for_single_variants,
687    clippy::single_char_pattern,
688    reason = "test assertions are clearer with wildcards and string patterns"
689)]
690mod tests {
691    use super::*;
692
693    fn parse(content: &str) -> Result<Vec<XmlNode>, ParseError> {
694        parse_supersigil_xml(content, 0, Path::new("test.md"))
695    }
696
697    fn parse_with_offset(content: &str, offset: usize) -> Result<Vec<XmlNode>, ParseError> {
698        parse_supersigil_xml(content, offset, Path::new("test.md"))
699    }
700
701    // -- Valid fragments ---------------------------------------------------
702
703    #[test]
704    fn empty_input() {
705        let nodes = parse("").unwrap();
706        assert!(nodes.is_empty());
707    }
708
709    #[test]
710    fn whitespace_only_input() {
711        let nodes = parse("  \n  \n  ").unwrap();
712        assert!(nodes.is_empty());
713    }
714
715    #[test]
716    fn single_self_closing_element() {
717        let nodes = parse(r#"<Spec id="s1" />"#).unwrap();
718        assert_eq!(nodes.len(), 1);
719        match &nodes[0] {
720            XmlNode::Element {
721                name,
722                attributes,
723                children,
724                offset,
725                ..
726            } => {
727                assert_eq!(name, "Spec");
728                assert_eq!(attributes, &[("id".to_owned(), "s1".to_owned())]);
729                assert!(children.is_empty());
730                assert_eq!(*offset, 0);
731            }
732            _ => panic!("expected Element"),
733        }
734    }
735
736    #[test]
737    fn self_closing_no_space_before_slash() {
738        let nodes = parse(r#"<Spec id="s1"/>"#).unwrap();
739        assert_eq!(nodes.len(), 1);
740        if let XmlNode::Element { name, .. } = &nodes[0] {
741            assert_eq!(name, "Spec");
742        } else {
743            panic!("expected Element");
744        }
745    }
746
747    #[test]
748    fn element_with_text_content() {
749        let nodes = parse("<Title>Hello World</Title>").unwrap();
750        assert_eq!(nodes.len(), 1);
751        match &nodes[0] {
752            XmlNode::Element { name, children, .. } => {
753                assert_eq!(name, "Title");
754                assert_eq!(children.len(), 1);
755                assert!(
756                    matches!(&children[0], XmlNode::Text { content, .. } if content == "Hello World")
757                );
758            }
759            _ => panic!("expected Element"),
760        }
761    }
762
763    #[test]
764    fn element_with_no_attributes() {
765        let nodes = parse("<Container></Container>").unwrap();
766        assert_eq!(nodes.len(), 1);
767        match &nodes[0] {
768            XmlNode::Element {
769                name, attributes, ..
770            } => {
771                assert_eq!(name, "Container");
772                assert!(attributes.is_empty());
773            }
774            _ => panic!("expected Element"),
775        }
776    }
777
778    // -- Nested elements ---------------------------------------------------
779
780    #[test]
781    fn nested_elements() {
782        let input = r#"<Parent id="p1"><Child id="c1" /></Parent>"#;
783        let nodes = parse(input).unwrap();
784        assert_eq!(nodes.len(), 1);
785        match &nodes[0] {
786            XmlNode::Element { name, children, .. } => {
787                assert_eq!(name, "Parent");
788                assert_eq!(children.len(), 1);
789                match &children[0] {
790                    XmlNode::Element {
791                        name, attributes, ..
792                    } => {
793                        assert_eq!(name, "Child");
794                        assert_eq!(attributes, &[("id".to_owned(), "c1".to_owned())]);
795                    }
796                    _ => panic!("expected nested Element"),
797                }
798            }
799            _ => panic!("expected Element"),
800        }
801    }
802
803    #[test]
804    fn deeply_nested_elements() {
805        let input = "<A><B><C>deep</C></B></A>";
806        let nodes = parse(input).unwrap();
807        assert_eq!(nodes.len(), 1);
808        // A -> B -> C -> Text("deep")
809        let a = &nodes[0];
810        if let XmlNode::Element { children, .. } = a {
811            let b = &children[0];
812            if let XmlNode::Element { children, .. } = b {
813                let c = &children[0];
814                if let XmlNode::Element { children, .. } = c {
815                    assert!(
816                        matches!(&children[0], XmlNode::Text { content, .. } if content == "deep")
817                    );
818                } else {
819                    panic!("expected C element");
820                }
821            } else {
822                panic!("expected B element");
823            }
824        } else {
825            panic!("expected A element");
826        }
827    }
828
829    #[test]
830    fn mixed_children_text_and_elements() {
831        let input = "<Parent>before<Child />after</Parent>";
832        let nodes = parse(input).unwrap();
833        assert_eq!(nodes.len(), 1);
834        if let XmlNode::Element { children, .. } = &nodes[0] {
835            assert_eq!(children.len(), 3);
836            assert!(matches!(&children[0], XmlNode::Text { content, .. } if content == "before"));
837            assert!(matches!(&children[1], XmlNode::Element { name, .. } if name == "Child"));
838            assert!(matches!(&children[2], XmlNode::Text { content, .. } if content == "after"));
839        } else {
840            panic!("expected Element");
841        }
842    }
843
844    // -- Multiple top-level elements ---------------------------------------
845
846    #[test]
847    fn multiple_top_level_elements() {
848        let input = r#"<A id="1" />
849<B id="2" />"#;
850        let nodes = parse(input).unwrap();
851        assert_eq!(nodes.len(), 2);
852        if let XmlNode::Element { name, .. } = &nodes[0] {
853            assert_eq!(name, "A");
854        }
855        if let XmlNode::Element { name, .. } = &nodes[1] {
856            assert_eq!(name, "B");
857        }
858    }
859
860    // -- Attribute parsing -------------------------------------------------
861
862    #[test]
863    fn multiple_attributes() {
864        let input = r#"<Criterion id="c1" strategy="tag" />"#;
865        let nodes = parse(input).unwrap();
866        if let XmlNode::Element { attributes, .. } = &nodes[0] {
867            assert_eq!(
868                attributes,
869                &[
870                    ("id".to_owned(), "c1".to_owned()),
871                    ("strategy".to_owned(), "tag".to_owned()),
872                ]
873            );
874        } else {
875            panic!("expected Element");
876        }
877    }
878
879    #[test]
880    fn attribute_with_entity_in_value() {
881        let input = r#"<Spec desc="a &amp; b" />"#;
882        let nodes = parse(input).unwrap();
883        if let XmlNode::Element { attributes, .. } = &nodes[0] {
884            assert_eq!(attributes[0].1, "a & b");
885        } else {
886            panic!("expected Element");
887        }
888    }
889
890    #[test]
891    fn all_supported_entities_in_attribute() {
892        let input = r#"<Spec val="&amp;&lt;&gt;&quot;" />"#;
893        let nodes = parse(input).unwrap();
894        if let XmlNode::Element { attributes, .. } = &nodes[0] {
895            assert_eq!(attributes[0].1, "&<>\"");
896        } else {
897            panic!("expected Element");
898        }
899    }
900
901    // -- Entity references in text content ---------------------------------
902
903    #[test]
904    fn entity_references_in_text() {
905        let input = "<Note>a &lt; b &amp; c &gt; d &quot;e&quot;</Note>";
906        let nodes = parse(input).unwrap();
907        if let XmlNode::Element { children, .. } = &nodes[0] {
908            assert!(
909                matches!(&children[0], XmlNode::Text { content, .. } if content == r#"a < b & c > d "e""#)
910            );
911        } else {
912            panic!("expected Element");
913        }
914    }
915
916    // -- Position offsetting -----------------------------------------------
917
918    #[test]
919    fn offset_applied_to_elements() {
920        let fence_offset = 100;
921        let input = r#"<Spec id="s1" />"#;
922        let nodes = parse_with_offset(input, fence_offset).unwrap();
923        if let XmlNode::Element { offset, .. } = &nodes[0] {
924            assert_eq!(*offset, 100);
925        } else {
926            panic!("expected Element");
927        }
928    }
929
930    #[test]
931    fn offset_applied_to_nested_element() {
932        let fence_offset = 50;
933        // "<A>" takes 3 bytes, so <B> starts at position 3.
934        let input = "<A><B /></A>";
935        let nodes = parse_with_offset(input, fence_offset).unwrap();
936        if let XmlNode::Element {
937            offset, children, ..
938        } = &nodes[0]
939        {
940            assert_eq!(*offset, 50); // A at position 0 + 50
941            if let XmlNode::Element { offset, .. } = &children[0] {
942                assert_eq!(*offset, 53); // B at position 3 + 50
943            } else {
944                panic!("expected nested Element");
945            }
946        } else {
947            panic!("expected Element");
948        }
949    }
950
951    #[test]
952    fn offset_with_multiple_top_level_elements() {
953        let fence_offset = 200;
954        let input = "<A />\n<B />";
955        let nodes = parse_with_offset(input, fence_offset).unwrap();
956        assert_eq!(nodes.len(), 2);
957        if let XmlNode::Element { offset, .. } = &nodes[0] {
958            assert_eq!(*offset, 200); // A at byte 0
959        }
960        if let XmlNode::Element { offset, .. } = &nodes[1] {
961            assert_eq!(*offset, 206); // B at byte 6 (after "<A />\n")
962        }
963    }
964
965    // -- Error cases: unclosed tags ----------------------------------------
966
967    #[test]
968    fn unclosed_element() {
969        let err = parse("<Spec>content").unwrap_err();
970        let msg = err.to_string();
971        assert!(msg.contains("closing tag"), "got: {msg}");
972        assert!(msg.contains("Spec"), "got: {msg}");
973    }
974
975    #[test]
976    fn mismatched_closing_tag() {
977        let err = parse("<A>text</B>").unwrap_err();
978        let msg = err.to_string();
979        assert!(msg.contains("mismatched"), "got: {msg}");
980        assert!(msg.contains("A"), "got: {msg}");
981        assert!(msg.contains("B"), "got: {msg}");
982    }
983
984    // -- Error cases: invalid attributes -----------------------------------
985
986    #[test]
987    fn single_quoted_attribute_value() {
988        let err = parse("<Spec id='s1' />").unwrap_err();
989        let msg = err.to_string();
990        assert!(msg.contains("double-quoted"), "got: {msg}");
991    }
992
993    #[test]
994    fn single_quotes_inside_double_quoted_attribute_value() {
995        // Single quotes inside a double-quoted attribute value are valid XML.
996        let result = parse(r#"<Spec val="a='b'" />"#);
997        assert!(result.is_ok(), "got: {}", result.unwrap_err());
998    }
999
1000    #[test]
1001    fn missing_attribute_value() {
1002        let err = parse("<Spec id />").unwrap_err();
1003        let msg = err.to_string();
1004        assert!(msg.contains("="), "got: {msg}");
1005    }
1006
1007    // -- Error cases: unsupported XML features -----------------------------
1008
1009    #[test]
1010    fn processing_instruction_rejected() {
1011        let err = parse("<?xml version=\"1.0\"?>").unwrap_err();
1012        let msg = err.to_string();
1013        assert!(msg.contains("processing instruction"), "got: {msg}");
1014    }
1015
1016    #[test]
1017    fn cdata_rejected() {
1018        let err = parse("<![CDATA[foo]]>").unwrap_err();
1019        let msg = err.to_string();
1020        assert!(msg.contains("CDATA"), "got: {msg}");
1021    }
1022
1023    #[test]
1024    fn doctype_rejected() {
1025        let err = parse("<!DOCTYPE html>").unwrap_err();
1026        let msg = err.to_string();
1027        assert!(msg.contains("DTD") || msg.contains("DOCTYPE"), "got: {msg}");
1028    }
1029
1030    #[test]
1031    fn comment_ignored() {
1032        let nodes = parse("<!-- comment -->").unwrap();
1033        assert!(nodes.is_empty(), "comments should produce no nodes");
1034    }
1035
1036    #[test]
1037    fn comment_between_elements_ignored() {
1038        let nodes = parse(r#"<Task id="t-1" status="draft">text</Task><!-- skip --><Task id="t-2" status="draft">more</Task>"#).unwrap();
1039        assert_eq!(
1040            nodes.len(),
1041            2,
1042            "should parse both elements, ignoring comment"
1043        );
1044    }
1045
1046    #[test]
1047    fn comment_inside_element_ignored() {
1048        let nodes = parse(r#"<AcceptanceCriteria><!-- placeholder --><Criterion id="c-1">desc</Criterion></AcceptanceCriteria>"#).unwrap();
1049        assert_eq!(nodes.len(), 1);
1050        if let XmlNode::Element { children, .. } = &nodes[0] {
1051            assert_eq!(children.len(), 1, "comment should not appear as child");
1052        } else {
1053            panic!("expected element");
1054        }
1055    }
1056
1057    #[test]
1058    fn namespace_in_element_rejected() {
1059        // The parser treats `:` as not part of a name, so `foo:Bar` would
1060        // parse `foo` as the name and then fail on `:`. That error is
1061        // acceptable — the point is it doesn't silently succeed.
1062        let err = parse("<foo:Bar />").unwrap_err();
1063        assert!(
1064            err.to_string().contains("test.md"),
1065            "error should include path"
1066        );
1067    }
1068
1069    #[test]
1070    fn unsupported_entity_rejected() {
1071        let err = parse("<Spec>&apos;</Spec>").unwrap_err();
1072        let msg = err.to_string();
1073        assert!(msg.contains("unsupported entity"), "got: {msg}");
1074    }
1075
1076    #[test]
1077    fn unterminated_entity_rejected() {
1078        let err = parse("<Spec>&amp</Spec>").unwrap_err();
1079        let msg = err.to_string();
1080        assert!(msg.contains("unterminated entity"), "got: {msg}");
1081    }
1082
1083    // -- Lowercase elements are parsed successfully -------------------------
1084
1085    #[test]
1086    fn lowercase_element_name_parsed_successfully() {
1087        let nodes = parse("<spec />").unwrap();
1088        assert_eq!(nodes.len(), 1);
1089        match &nodes[0] {
1090            XmlNode::Element { name, .. } => assert_eq!(name, "spec"),
1091            _ => panic!("expected Element"),
1092        }
1093    }
1094
1095    #[test]
1096    fn lowercase_element_inside_pascal_case_element() {
1097        let nodes = parse(r#"<Criterion id="c1">Use <em>fast</em> path</Criterion>"#).unwrap();
1098        assert_eq!(nodes.len(), 1);
1099        match &nodes[0] {
1100            XmlNode::Element { name, children, .. } => {
1101                assert_eq!(name, "Criterion");
1102                // Children: Text("Use "), Element(em), Text(" path")
1103                assert_eq!(children.len(), 3);
1104                assert!(matches!(&children[0], XmlNode::Text { content, .. } if content == "Use "));
1105                match &children[1] {
1106                    XmlNode::Element { name, children, .. } => {
1107                        assert_eq!(name, "em");
1108                        assert_eq!(children.len(), 1);
1109                        assert!(
1110                            matches!(&children[0], XmlNode::Text { content, .. } if content == "fast")
1111                        );
1112                    }
1113                    _ => panic!("expected em Element"),
1114                }
1115                assert!(
1116                    matches!(&children[2], XmlNode::Text { content, .. } if content == " path")
1117                );
1118            }
1119            _ => panic!("expected Criterion Element"),
1120        }
1121    }
1122
1123    // -- Error position information ----------------------------------------
1124
1125    #[test]
1126    fn error_includes_line_and_column() {
1127        // Error on line 2, at the start of the line (column 1).
1128        // Use a namespaced element (still rejected) to trigger an error.
1129        let err = parse("<A>\n<ns:B /></A>").unwrap_err();
1130        if let ParseError::XmlSyntaxError { line, column, .. } = &err {
1131            assert_eq!(*line, 2);
1132            assert_eq!(*column, 1);
1133        } else {
1134            panic!("expected XmlSyntaxError");
1135        }
1136    }
1137
1138    #[test]
1139    fn error_includes_file_path() {
1140        let err = parse_supersigil_xml("<?xml?>", 0, Path::new("/foo/bar.md")).unwrap_err();
1141        let msg = err.to_string();
1142        assert!(msg.contains("/foo/bar.md"), "got: {msg}");
1143    }
1144
1145    // -- Closing tag at top level ------------------------------------------
1146
1147    #[test]
1148    fn closing_tag_at_top_level_rejected() {
1149        let err = parse("</Orphan>").unwrap_err();
1150        let msg = err.to_string();
1151        assert!(msg.contains("unexpected closing tag"), "got: {msg}");
1152    }
1153
1154    // -- Realistic example -------------------------------------------------
1155
1156    #[test]
1157    fn realistic_component_example() {
1158        let input = r#"<Criterion id="perf-latency" strategy="tag">
1159  P99 latency must be under 100ms for API requests.
1160</Criterion>
1161<VerifiedBy strategy="tag" tag="perf-latency" />"#;
1162        let nodes = parse(input).unwrap();
1163        assert_eq!(nodes.len(), 2);
1164
1165        // Criterion
1166        match &nodes[0] {
1167            XmlNode::Element {
1168                name,
1169                attributes,
1170                children,
1171                ..
1172            } => {
1173                assert_eq!(name, "Criterion");
1174                assert_eq!(attributes.len(), 2);
1175                assert_eq!(attributes[0], ("id".to_owned(), "perf-latency".to_owned()));
1176                assert_eq!(attributes[1], ("strategy".to_owned(), "tag".to_owned()));
1177                assert_eq!(children.len(), 1);
1178                if let XmlNode::Text { content, .. } = &children[0] {
1179                    assert!(content.contains("P99 latency"));
1180                } else {
1181                    panic!("expected Text child");
1182                }
1183            }
1184            _ => panic!("expected Element"),
1185        }
1186
1187        // VerifiedBy
1188        match &nodes[1] {
1189            XmlNode::Element {
1190                name,
1191                attributes,
1192                children,
1193                ..
1194            } => {
1195                assert_eq!(name, "VerifiedBy");
1196                assert_eq!(attributes.len(), 2);
1197                assert!(children.is_empty());
1198            }
1199            _ => panic!("expected Element"),
1200        }
1201    }
1202
1203    // -- UTF-8 text content ------------------------------------------------
1204
1205    #[test]
1206    fn utf8_text_content_preserved() {
1207        let input = "<Note>cafe\u{0301} \u{1F600}</Note>";
1208        let nodes = parse(input).unwrap();
1209        if let XmlNode::Element { children, .. } = &nodes[0] {
1210            if let XmlNode::Text { content: t, .. } = &children[0] {
1211                assert!(t.contains("cafe\u{0301}"));
1212                assert!(t.contains('\u{1F600}'));
1213            } else {
1214                panic!("expected Text");
1215            }
1216        } else {
1217            panic!("expected Element");
1218        }
1219    }
1220
1221    #[test]
1222    fn text_node_has_correct_offset() {
1223        // "<Title>" = 7 bytes, so text "Hello" starts at offset 7
1224        let input = "<Title>Hello</Title>";
1225        let nodes = parse(input).unwrap();
1226        if let XmlNode::Element { children, .. } = &nodes[0] {
1227            if let XmlNode::Text {
1228                content, offset, ..
1229            } = &children[0]
1230            {
1231                assert_eq!(content, "Hello");
1232                assert_eq!(*offset, 7, "text should start at byte 7 (after '<Title>')");
1233            } else {
1234                panic!("expected Text");
1235            }
1236        } else {
1237            panic!("expected Element");
1238        }
1239    }
1240
1241    #[test]
1242    fn text_node_offset_with_fence_offset() {
1243        let fence_offset = 100;
1244        let input = "<Title>Hello</Title>";
1245        let nodes = parse_with_offset(input, fence_offset).unwrap();
1246        if let XmlNode::Element { children, .. } = &nodes[0] {
1247            if let XmlNode::Text {
1248                content, offset, ..
1249            } = &children[0]
1250            {
1251                assert_eq!(content, "Hello");
1252                assert_eq!(*offset, 107, "text should be fence_offset + 7");
1253            } else {
1254                panic!("expected Text");
1255            }
1256        } else {
1257            panic!("expected Element");
1258        }
1259    }
1260
1261    #[test]
1262    fn text_node_end_offset_plain_text() {
1263        // "<Title>Hello</Title>" — text "Hello" starts at 7, ends at 12
1264        let input = "<Title>Hello</Title>";
1265        let nodes = parse(input).unwrap();
1266        if let XmlNode::Element { children, .. } = &nodes[0] {
1267            if let XmlNode::Text {
1268                content,
1269                offset,
1270                end_offset,
1271            } = &children[0]
1272            {
1273                assert_eq!(content, "Hello");
1274                assert_eq!(*offset, 7);
1275                assert_eq!(
1276                    *end_offset, 12,
1277                    "end_offset should be past the last byte of 'Hello'"
1278                );
1279                assert_eq!(
1280                    &input[*offset..*end_offset],
1281                    "Hello",
1282                    "offset..end_offset should span the raw text"
1283                );
1284            } else {
1285                panic!("expected Text");
1286            }
1287        } else {
1288            panic!("expected Element");
1289        }
1290    }
1291
1292    #[test]
1293    fn text_node_end_offset_with_entities() {
1294        // "a &lt; b" in raw source → decoded "a < b"
1295        // Raw: a(1) space(1) &lt;(4) space(1) b(1) = 8 bytes
1296        let input = "<T>a &lt; b</T>";
1297        let nodes = parse(input).unwrap();
1298        if let XmlNode::Element { children, .. } = &nodes[0] {
1299            if let XmlNode::Text {
1300                content,
1301                offset,
1302                end_offset,
1303            } = &children[0]
1304            {
1305                assert_eq!(content, "a < b", "content should be entity-decoded");
1306                assert_eq!(*offset, 3, "text starts after '<T>'");
1307                assert_eq!(
1308                    *end_offset, 11,
1309                    "end_offset should be past the last byte of 'a &lt; b' in raw source"
1310                );
1311                assert_eq!(
1312                    &input[*offset..*end_offset],
1313                    "a &lt; b",
1314                    "offset..end_offset should span the raw source text"
1315                );
1316                // Decoded length (5) < raw length (8)
1317                assert!(content.len() < (*end_offset - *offset));
1318            } else {
1319                panic!("expected Text");
1320            }
1321        } else {
1322            panic!("expected Element");
1323        }
1324    }
1325
1326    #[test]
1327    fn text_node_end_offset_with_fence_offset_and_entities() {
1328        let fence_offset = 50;
1329        let input = "<T>&amp;</T>";
1330        let nodes = parse_with_offset(input, fence_offset).unwrap();
1331        if let XmlNode::Element { children, .. } = &nodes[0] {
1332            if let XmlNode::Text {
1333                content,
1334                offset,
1335                end_offset,
1336            } = &children[0]
1337            {
1338                assert_eq!(content, "&", "decoded entity");
1339                assert_eq!(*offset, 53, "starts at fence_offset + 3 (after '<T>')");
1340                // &amp; = 5 bytes in raw source, starts at position 3 in XML content
1341                assert_eq!(
1342                    *end_offset, 58,
1343                    "end_offset = fence_offset + 3 + 5 (length of '&amp;')"
1344                );
1345            } else {
1346                panic!("expected Text");
1347            }
1348        } else {
1349            panic!("expected Element");
1350        }
1351    }
1352
1353    #[test]
1354    fn text_node_end_offset_multiple_entities() {
1355        // "&lt;&gt;" → decoded "<>" (2 chars), raw = 8 bytes
1356        let input = "<T>&lt;&gt;</T>";
1357        let nodes = parse(input).unwrap();
1358        if let XmlNode::Element { children, .. } = &nodes[0] {
1359            if let XmlNode::Text {
1360                content,
1361                offset,
1362                end_offset,
1363            } = &children[0]
1364            {
1365                assert_eq!(content, "<>");
1366                assert_eq!(*offset, 3);
1367                assert_eq!(*end_offset, 11, "past '&lt;&gt;' in raw source");
1368                assert_eq!(&input[*offset..*end_offset], "&lt;&gt;");
1369            } else {
1370                panic!("expected Text");
1371            }
1372        } else {
1373            panic!("expected Element");
1374        }
1375    }
1376
1377    // -- end_offset on Element ------------------------------------------------
1378
1379    #[test]
1380    fn self_closing_element_end_offset() {
1381        let input = r#"<Spec id="s1" />"#;
1382        let nodes = parse(input).unwrap();
1383        assert_eq!(nodes.len(), 1);
1384        match &nodes[0] {
1385            XmlNode::Element {
1386                name,
1387                offset,
1388                end_offset,
1389                ..
1390            } => {
1391                assert_eq!(name, "Spec");
1392                assert_eq!(*offset, 0);
1393                assert_eq!(*end_offset, input.len());
1394            }
1395            _ => panic!("expected Element"),
1396        }
1397    }
1398
1399    #[test]
1400    fn regular_element_end_offset() {
1401        let input = "<Title>Hello</Title>";
1402        let nodes = parse(input).unwrap();
1403        assert_eq!(nodes.len(), 1);
1404        match &nodes[0] {
1405            XmlNode::Element {
1406                name,
1407                offset,
1408                end_offset,
1409                ..
1410            } => {
1411                assert_eq!(name, "Title");
1412                assert_eq!(*offset, 0);
1413                assert_eq!(*end_offset, input.len());
1414            }
1415            _ => panic!("expected Element"),
1416        }
1417    }
1418
1419    #[test]
1420    fn nested_element_end_offsets() {
1421        let input = r#"<Parent><Child id="c1" /></Parent>"#;
1422        let nodes = parse(input).unwrap();
1423        match &nodes[0] {
1424            XmlNode::Element {
1425                end_offset,
1426                children,
1427                ..
1428            } => {
1429                assert_eq!(*end_offset, input.len());
1430                match &children[0] {
1431                    XmlNode::Element {
1432                        name,
1433                        offset,
1434                        end_offset,
1435                        ..
1436                    } => {
1437                        assert_eq!(name, "Child");
1438                        assert_eq!(*offset, 8);
1439                        // "<Child id="c1" />" ends at position 25
1440                        assert_eq!(*end_offset, 25);
1441                    }
1442                    _ => panic!("expected Element"),
1443                }
1444            }
1445            _ => panic!("expected Element"),
1446        }
1447    }
1448
1449    #[test]
1450    fn element_end_offset_with_fence_offset() {
1451        let input = r#"<Spec id="s1" />"#;
1452        let fence_offset = 100;
1453        let nodes = parse_with_offset(input, fence_offset).unwrap();
1454        match &nodes[0] {
1455            XmlNode::Element {
1456                offset, end_offset, ..
1457            } => {
1458                assert_eq!(*offset, 100);
1459                assert_eq!(*end_offset, 100 + input.len());
1460            }
1461            _ => panic!("expected Element"),
1462        }
1463    }
1464}