Skip to main content

supersigil_parser/
xml_parser.rs

1//! XML subset parser for `supersigil-xml` fence content.
2//!
3//! Parses a strict XML subset supporting:
4//! - `PascalCase` elements with double-quoted string attributes
5//! - Self-closing elements (`<Foo />`)
6//! - Nested elements
7//! - Text content between elements
8//! - Entity references: `&amp;`, `&lt;`, `&gt;`, `&quot;`
9//!
10//! Rejects processing instructions, CDATA, DTD, comments, namespaces, and
11//! unsupported entity references.
12//!
13//! Implemented on top of `quick-xml` for correctness and robustness.
14
15use std::path::Path;
16
17use quick_xml::Reader;
18use quick_xml::events::Event;
19use supersigil_core::ParseError;
20
21use crate::util::line_col;
22
23// ---------------------------------------------------------------------------
24// Public types
25// ---------------------------------------------------------------------------
26
/// A parsed XML node from a `supersigil-xml` fence.
///
/// All offsets are byte offsets computed as the caller-supplied fence offset
/// plus the node's position inside the fence content.
#[derive(Debug, Clone, PartialEq)]
pub enum XmlNode {
    /// An XML element with name, attributes, children, and source offsets.
    Element {
        /// Element tag name (e.g. `Criterion`).
        name: String,
        /// Ordered list of `(key, value)` attribute pairs, in source order,
        /// with entity references in values already resolved.
        attributes: Vec<(String, String)>,
        /// Child nodes (elements and/or text). Always empty for
        /// self-closing elements.
        children: Vec<XmlNode>,
        /// Byte offset of the opening `<` relative to the source file.
        offset: usize,
        /// Byte offset past the closing `>` relative to the source file
        /// (the `>` of the end tag, or of `/>` for self-closing elements).
        end_offset: usize,
    },
    /// Text content with supported entity references already resolved.
    ///
    /// A run of adjacent text and entity references is merged into a single
    /// node.
    Text {
        /// The decoded text content.
        content: String,
        /// Byte offset of the start of the text relative to the source file.
        offset: usize,
        /// Byte offset of the end of the raw source text (past the last
        /// byte of the original text/entity-ref run). This can differ
        /// from `offset + content.len()` when entity references are present
        /// because decoded text is shorter than the raw source.
        end_offset: usize,
    },
}
56
57// ---------------------------------------------------------------------------
58// Synthetic root tag
59// ---------------------------------------------------------------------------
60
/// Tag used to wrap multiple top-level elements so that quick-xml sees
/// well-formed XML.  The name starts with an underscore, so it intentionally
/// cannot be a valid `PascalCase` supersigil element.
const SYNTHETIC_ROOT: &str = "__root";
/// Opening tag of the synthetic root, prepended to the fence content before
/// parsing. Its byte length is subtracted from reader positions to recover
/// content-relative offsets.
const SYNTHETIC_OPEN: &str = "<__root>";
/// Closing tag of the synthetic root, appended to the fence content before
/// parsing.
const SYNTHETIC_CLOSE: &str = "</__root>";
67
68// ---------------------------------------------------------------------------
69// Public API
70// ---------------------------------------------------------------------------
71
72/// Parse the content of a `supersigil-xml` fence into structured XML nodes.
73///
74/// `content` is the raw text between the fence delimiters.
75/// `fence_offset` is the byte offset of the fence content start within the
76/// source file, used to produce file-absolute offsets.
77/// `path` is the file path for error messages.
78///
79/// # Errors
80///
81/// Returns [`ParseError::XmlSyntaxError`] for any syntax or validation error.
82/// Line and column numbers in errors are **content-relative** (i.e. relative
83/// to the start of the fence content, not the file). The caller must adjust
84/// for the fence's position within the file if file-absolute positions are
85/// needed.
86pub fn parse_supersigil_xml(
87    content: &str,
88    fence_offset: usize,
89    path: &Path,
90) -> Result<Vec<XmlNode>, ParseError> {
91    // Reject unsupported constructs that quick-xml would silently handle.
92    reject_unsupported(content, path)?;
93
94    // Wrap in a synthetic root so quick-xml can handle multiple top-level
95    // elements (which is not valid XML on its own).
96    let wrapped = format!("{SYNTHETIC_OPEN}{content}{SYNTHETIC_CLOSE}");
97
98    let mut reader = Reader::from_str(&wrapped);
99    reader.config_mut().trim_text(false);
100
101    // The synthetic root open tag shifts all byte positions by its length.
102    // quick-xml uses u64 for buffer positions.
103    #[allow(clippy::cast_possible_truncation, reason = "SYNTHETIC_OPEN is 8 bytes")]
104    let root_tag_len: u64 = SYNTHETIC_OPEN.len() as u64;
105
106    // Skip the synthetic root's opening event.
107    loop {
108        match reader.read_event() {
109            Ok(Event::Start(ref e)) if e.name().as_ref() == SYNTHETIC_ROOT.as_bytes() => break,
110            Ok(Event::Eof) => break,
111            Err(e) => {
112                return Err(quick_xml_error_to_parse_error(&e, path));
113            }
114            _ => {}
115        }
116    }
117
118    // Parse children of the synthetic root (= top-level nodes).
119    let nodes = parse_children(
120        &mut reader,
121        SYNTHETIC_ROOT,
122        content,
123        fence_offset,
124        root_tag_len,
125        path,
126    )?;
127
128    Ok(nodes)
129}
130
131// ---------------------------------------------------------------------------
132// Pre-scan: reject unsupported constructs before quick-xml parsing
133// ---------------------------------------------------------------------------
134
135/// Scan the raw content for constructs that our XML subset forbids.
136///
137/// We do this before handing the input to quick-xml because quick-xml would
138/// either silently handle these (comments, CDATA) or produce cryptic errors.
139fn reject_unsupported(content: &str, path: &Path) -> Result<(), ParseError> {
140    let bytes = content.as_bytes();
141    let mut i = 0;
142    while i < bytes.len() {
143        if bytes[i] == b'<' {
144            if bytes[i..].starts_with(b"<?") {
145                return Err(make_error(
146                    content,
147                    i,
148                    path,
149                    "processing instructions (`<?...?>`) are not supported",
150                ));
151            }
152            if bytes[i..].starts_with(b"<![CDATA[") {
153                return Err(make_error(
154                    content,
155                    i,
156                    path,
157                    "CDATA sections (`<![CDATA[...]]>`) are not supported",
158                ));
159            }
160            if bytes[i..].starts_with(b"<!DOCTYPE") || bytes[i..].starts_with(b"<!doctype") {
161                return Err(make_error(
162                    content,
163                    i,
164                    path,
165                    "DTD declarations (`<!DOCTYPE ...>`) are not supported",
166                ));
167            }
168            if bytes[i..].starts_with(b"<!--") {
169                return Err(make_error(
170                    content,
171                    i,
172                    path,
173                    "XML comments (`<!-- ... -->`) are not supported",
174                ));
175            }
176        }
177        i += 1;
178    }
179    Ok(())
180}
181
182// ---------------------------------------------------------------------------
183// Recursive event-driven parser
184// ---------------------------------------------------------------------------
185
/// Parse child nodes until the closing tag for `parent_name` is encountered.
///
/// Text and entity-reference events are accumulated into a single `XmlNode::Text`
/// because quick-xml emits `Text` / `GeneralRef` / `Text` sequences for content
/// like `a &lt; b`.
///
/// * `reader` — quick-xml reader positioned just after the parent's start tag.
/// * `parent_name` — tag name whose end tag terminates this call; the
///   synthetic root name marks the top level.
/// * `content` — the unwrapped fence content, used only for error positions.
/// * `fence_offset` — added to content offsets to form node offsets.
/// * `root_tag_len` — length of the synthetic opening tag, subtracted from
///   reader positions to obtain content offsets.
/// * `path` — file path recorded in errors.
///
/// # Errors
///
/// Returns an error for invalid names or attributes, unsupported entity
/// references or constructs, a mismatched closing tag, end of input inside a
/// non-top-level element, or any error reported by quick-xml.
#[allow(
    clippy::too_many_lines,
    reason = "event-loop structure is clearest as a single function"
)]
fn parse_children(
    reader: &mut Reader<&[u8]>,
    parent_name: &str,
    content: &str,
    fence_offset: usize,
    root_tag_len: u64,
    path: &Path,
) -> Result<Vec<XmlNode>, ParseError> {
    /// Flush accumulated text into the node list.
    ///
    /// Does nothing when the buffer is empty. Otherwise the buffer and both
    /// offsets are reset, and a `Text` node is pushed unless the text is
    /// whitespace-only at top level.
    fn flush_text(
        text_buf: &mut String,
        text_start: &mut Option<usize>,
        text_end: &mut Option<usize>,
        nodes: &mut Vec<XmlNode>,
        is_top_level: bool,
    ) {
        if text_buf.is_empty() {
            return;
        }
        // At top level, skip whitespace-only text nodes.
        if is_top_level && text_buf.trim().is_empty() {
            text_buf.clear();
            *text_start = None;
            *text_end = None;
            return;
        }
        // Fall back to 0 / `start` if no offsets were recorded for the run.
        let start = text_start.take().unwrap_or(0);
        let end = text_end.take().unwrap_or(start);
        nodes.push(XmlNode::Text {
            content: std::mem::take(text_buf),
            offset: start,
            end_offset: end,
        });
    }

    let mut nodes: Vec<XmlNode> = Vec::new();
    // Accumulator for runs of Text + GeneralRef events.
    let mut text_buf = String::new();
    // Byte offset (file-absolute) of the first event in the current text run.
    let mut text_start_offset: Option<usize> = None;
    // Byte offset (file-absolute) past the last byte of the current text run.
    let mut text_end_offset: Option<usize> = None;
    let is_top_level = parent_name == SYNTHETIC_ROOT;

    loop {
        // Capture the position BEFORE reading: it is the start of the next event.
        let event_pos = reader.buffer_position();

        match reader.read_event() {
            Ok(Event::Start(ref e)) => {
                // An element ends any pending text run.
                flush_text(
                    &mut text_buf,
                    &mut text_start_offset,
                    &mut text_end_offset,
                    &mut nodes,
                    is_top_level,
                );

                let offset_in_content = content_offset(event_pos, root_tag_len);
                let file_offset = fence_offset + offset_in_content;

                let tag_name = decode_name(e.name().as_ref(), content, offset_in_content, path)?;
                validate_element_name(&tag_name, content, offset_in_content, path)?;
                let attributes = parse_attributes(e, content, offset_in_content, path)?;

                // Recurse: consumes everything up to and including the
                // matching end tag.
                let children =
                    parse_children(reader, &tag_name, content, fence_offset, root_tag_len, path)?;

                // After parse_children returns, reader is past the closing `>`.
                let end_in_content = content_offset(reader.buffer_position(), root_tag_len);
                let file_end_offset = fence_offset + end_in_content;

                nodes.push(XmlNode::Element {
                    name: tag_name,
                    attributes,
                    children,
                    offset: file_offset,
                    end_offset: file_end_offset,
                });
            }

            Ok(Event::Empty(ref e)) => {
                // A self-closing element also ends any pending text run.
                flush_text(
                    &mut text_buf,
                    &mut text_start_offset,
                    &mut text_end_offset,
                    &mut nodes,
                    is_top_level,
                );

                let offset_in_content = content_offset(event_pos, root_tag_len);
                let file_offset = fence_offset + offset_in_content;

                let tag_name = decode_name(e.name().as_ref(), content, offset_in_content, path)?;
                validate_element_name(&tag_name, content, offset_in_content, path)?;
                let attributes = parse_attributes(e, content, offset_in_content, path)?;

                // After reading Empty event, reader is past the closing `/>`.
                let end_in_content = content_offset(reader.buffer_position(), root_tag_len);
                let file_end_offset = fence_offset + end_in_content;

                nodes.push(XmlNode::Element {
                    name: tag_name,
                    attributes,
                    children: vec![],
                    offset: file_offset,
                    end_offset: file_end_offset,
                });
            }

            Ok(Event::Text(ref e)) => {
                // Accumulate raw text (no entity refs in here — those come as GeneralRef).
                let raw = std::str::from_utf8(e.as_ref()).map_err(|_err| {
                    let off = content_offset(event_pos, root_tag_len);
                    make_error(content, off, path, "invalid UTF-8 in text content")
                })?;
                let off = content_offset(event_pos, root_tag_len);
                // Only the first event of a run fixes the run's start offset.
                if text_start_offset.is_none() {
                    text_start_offset = Some(fence_offset + off);
                }
                // Text events have no entity expansion, so raw byte length
                // equals the source byte length.
                text_end_offset = Some(fence_offset + off + raw.len());
                text_buf.push_str(raw);
            }

            Ok(Event::GeneralRef(ref e)) => {
                // Entity reference: e.g. `&amp;` arrives as GeneralRef with content `amp`.
                let entity_name = std::str::from_utf8(e.as_ref()).map_err(|_err| {
                    let off = content_offset(event_pos, root_tag_len);
                    make_error(content, off, path, "invalid UTF-8 in entity reference")
                })?;
                let off = content_offset(event_pos, root_tag_len);
                // Only the first event of a run fixes the run's start offset.
                if text_start_offset.is_none() {
                    text_start_offset = Some(fence_offset + off);
                }
                // Raw source: `&` + entity_name + `;` — so raw length is
                // entity_name.len() + 2.
                text_end_offset = Some(fence_offset + off + entity_name.len() + 2);
                // Unsupported entity names are rejected here.
                let resolved = resolve_entity(entity_name, content, off, path)?;
                text_buf.push_str(resolved);
            }

            Ok(Event::End(ref e)) => {
                let name_bytes = e.name();
                // A non-UTF-8 end name can never equal `parent_name`, so the
                // placeholder only ever shows up in the mismatch message.
                let end_name = std::str::from_utf8(name_bytes.as_ref()).unwrap_or("<invalid>");
                if end_name == parent_name {
                    // Matching end tag: emit trailing text and finish this level.
                    flush_text(
                        &mut text_buf,
                        &mut text_start_offset,
                        &mut text_end_offset,
                        &mut nodes,
                        is_top_level,
                    );
                    return Ok(nodes);
                }
                // Mismatch — provide our own message.
                let offset_in_content = content_offset(event_pos, root_tag_len);
                return Err(make_error(
                    content,
                    offset_in_content,
                    path,
                    &format!(
                        "mismatched closing tag: expected `</{parent_name}>`, found `</{end_name}>`"
                    ),
                ));
            }

            Ok(Event::Eof) => {
                flush_text(
                    &mut text_buf,
                    &mut text_start_offset,
                    &mut text_end_offset,
                    &mut nodes,
                    is_top_level,
                );
                // End of input is acceptable only at top level.
                if is_top_level {
                    return Ok(nodes);
                }
                // Inside an element it means the element was never closed;
                // the error is positioned at the end of the content.
                return Err(make_error(
                    content,
                    content.len(),
                    path,
                    &format!("expected closing tag `</{parent_name}>`, found end of input"),
                ));
            }

            // Constructs rejected by pre-scan but caught here as a safety net.
            Ok(Event::Comment(_)) => {
                let off = content_offset(event_pos, root_tag_len);
                return Err(make_error(
                    content,
                    off,
                    path,
                    "XML comments (`<!-- ... -->`) are not supported",
                ));
            }
            Ok(Event::CData(_)) => {
                let off = content_offset(event_pos, root_tag_len);
                return Err(make_error(
                    content,
                    off,
                    path,
                    "CDATA sections (`<![CDATA[...]]>`) are not supported",
                ));
            }
            // XML declarations are reported with the same message as
            // processing instructions.
            Ok(Event::PI(_) | Event::Decl(_)) => {
                let off = content_offset(event_pos, root_tag_len);
                return Err(make_error(
                    content,
                    off,
                    path,
                    "processing instructions (`<?...?>`) are not supported",
                ));
            }
            Ok(Event::DocType(_)) => {
                let off = content_offset(event_pos, root_tag_len);
                return Err(make_error(
                    content,
                    off,
                    path,
                    "DTD declarations (`<!DOCTYPE ...>`) are not supported",
                ));
            }

            // Errors raised by quick-xml itself are translated without a
            // precise position.
            Err(e) => {
                return Err(quick_xml_error_to_parse_error(&e, path));
            }
        }
    }
}
425
426// ---------------------------------------------------------------------------
427// Attribute parsing
428// ---------------------------------------------------------------------------
429
430/// Extract attributes from a quick-xml `BytesStart` event.
431fn parse_attributes(
432    event: &quick_xml::events::BytesStart<'_>,
433    content: &str,
434    offset_in_content: usize,
435    path: &Path,
436) -> Result<Vec<(String, String)>, ParseError> {
437    // First, validate that all attribute values are double-quoted by scanning
438    // the raw event bytes.
439    validate_attribute_quotes(event, content, offset_in_content, path)?;
440
441    let mut attrs = Vec::new();
442    for attr_result in event.attributes() {
443        let attr = attr_result.map_err(|e| {
444            let msg = format!("{e}");
445            make_error(content, offset_in_content, path, &msg)
446        })?;
447
448        let key = decode_name(attr.key.as_ref(), content, offset_in_content, path)?;
449
450        // Reject namespaced attributes.
451        if key.contains(':') {
452            return Err(make_error(
453                content,
454                offset_in_content,
455                path,
456                &format!("namespaced attribute `{key}` is not supported"),
457            ));
458        }
459
460        // Decode the value, resolving our supported entity subset.
461        let raw_value = std::str::from_utf8(attr.value.as_ref()).map_err(|_err| {
462            make_error(
463                content,
464                offset_in_content,
465                path,
466                "invalid UTF-8 in attribute value",
467            )
468        })?;
469        let value = resolve_entities_in_str(raw_value, content, offset_in_content, path)?;
470
471        attrs.push((key, value));
472    }
473    Ok(attrs)
474}
475
476/// Check that all attribute values use double quotes (not single quotes,
477/// not unquoted).
478///
479/// quick-xml accepts both single- and double-quoted attribute values, so we
480/// must inspect the raw tag bytes to enforce our subset requirement.
481fn validate_attribute_quotes(
482    event: &quick_xml::events::BytesStart<'_>,
483    content: &str,
484    offset_in_content: usize,
485    path: &Path,
486) -> Result<(), ParseError> {
487    let raw: &[u8] = event.as_ref(); // bytes after the tag name
488    let mut i = 0;
489    let mut in_double_quote = false;
490    while i < raw.len() {
491        if in_double_quote {
492            if raw[i] == b'"' {
493                in_double_quote = false;
494            }
495            i += 1;
496            continue;
497        }
498        if raw[i] == b'"' {
499            in_double_quote = true;
500            i += 1;
501            continue;
502        }
503        if raw[i] == b'=' {
504            // Skip whitespace after `=`.
505            i += 1;
506            while i < raw.len() && raw[i].is_ascii_whitespace() {
507                i += 1;
508            }
509            if i < raw.len() && raw[i] == b'\'' {
510                return Err(make_error(
511                    content,
512                    offset_in_content,
513                    path,
514                    "attribute values must be double-quoted",
515                ));
516            }
517            // Don't advance `i` here — let the main loop handle the
518            // opening `"` so that double-quote tracking activates.
519            continue;
520        }
521        i += 1;
522    }
523    Ok(())
524}
525
526// ---------------------------------------------------------------------------
527// Entity resolution
528// ---------------------------------------------------------------------------
529
530/// Resolve a single entity name to its replacement character.
531fn resolve_entity(
532    name: &str,
533    content: &str,
534    offset_in_content: usize,
535    path: &Path,
536) -> Result<&'static str, ParseError> {
537    match name {
538        "amp" => Ok("&"),
539        "lt" => Ok("<"),
540        "gt" => Ok(">"),
541        "quot" => Ok("\""),
542        _ => Err(make_error(
543            content,
544            offset_in_content,
545            path,
546            &format!("unsupported entity reference `&{name};`"),
547        )),
548    }
549}
550
551/// Resolve entity references in a string (used for attribute values where
552/// quick-xml delivers the raw encoded text).
553fn resolve_entities_in_str(
554    text: &str,
555    content: &str,
556    offset_in_content: usize,
557    path: &Path,
558) -> Result<String, ParseError> {
559    if !text.contains('&') {
560        return Ok(text.to_owned());
561    }
562
563    let mut result = String::with_capacity(text.len());
564    let mut rest = text;
565
566    while let Some(amp_pos) = rest.find('&') {
567        result.push_str(&rest[..amp_pos]);
568        rest = &rest[amp_pos + 1..];
569        if let Some(semi_pos) = rest.find(';') {
570            let entity_name = &rest[..semi_pos];
571            let resolved = resolve_entity(entity_name, content, offset_in_content, path)?;
572            result.push_str(resolved);
573            rest = &rest[semi_pos + 1..];
574        } else {
575            return Err(make_error(
576                content,
577                offset_in_content,
578                path,
579                "unterminated entity reference (missing `;`)",
580            ));
581        }
582    }
583    result.push_str(rest);
584    Ok(result)
585}
586
587// ---------------------------------------------------------------------------
588// Name decoding
589// ---------------------------------------------------------------------------
590
591/// Decode a raw byte slice to a `String`, producing a parse error for invalid
592/// UTF-8.
593fn decode_name(
594    raw: &[u8],
595    content: &str,
596    offset_in_content: usize,
597    path: &Path,
598) -> Result<String, ParseError> {
599    std::str::from_utf8(raw)
600        .map(str::to_owned)
601        .map_err(|_err| make_error(content, offset_in_content, path, "invalid UTF-8 in name"))
602}
603
604// ---------------------------------------------------------------------------
605// Validation helpers
606// ---------------------------------------------------------------------------
607
608/// Validate that an element name is `PascalCase` (starts with uppercase ASCII)
609/// and does not contain namespace prefixes.
610fn validate_element_name(
611    name: &str,
612    content: &str,
613    offset_in_content: usize,
614    path: &Path,
615) -> Result<(), ParseError> {
616    if name.contains(':') {
617        return Err(make_error(
618            content,
619            offset_in_content,
620            path,
621            &format!("namespaced element `{name}` is not supported"),
622        ));
623    }
624    Ok(())
625}
626
627// ---------------------------------------------------------------------------
628// Error helpers
629// ---------------------------------------------------------------------------
630
/// Translate a quick-xml buffer position (`u64`, measured in the wrapped
/// input) into a `usize` offset within the original fence content by removing
/// the length of the synthetic opening tag.
///
/// Fence content is always small (far below 4 GiB), so narrowing to `usize`
/// on 32-bit targets is not a practical concern.
#[allow(
    clippy::cast_possible_truncation,
    reason = "XML fence content is always small"
)]
fn content_offset(event_pos: u64, root_tag_len: u64) -> usize {
    let relative = event_pos - root_tag_len;
    relative as usize
}
643
644/// Build a `ParseError::XmlSyntaxError` from a byte position in the content.
645fn make_error(content: &str, offset_in_content: usize, path: &Path, message: &str) -> ParseError {
646    let (line, column) = line_col(content, offset_in_content);
647    ParseError::XmlSyntaxError {
648        path: path.to_path_buf(),
649        line,
650        column,
651        message: message.to_owned(),
652    }
653}
654
655/// Convert a `quick_xml::Error` into a `ParseError::XmlSyntaxError`.
656///
657/// The line 1, column 1 fallback is deliberate: quick-xml does not expose
658/// byte positions for structural errors, so we cannot compute a more precise
659/// location.
660fn quick_xml_error_to_parse_error(err: &quick_xml::Error, path: &Path) -> ParseError {
661    let message = match err {
662        quick_xml::Error::IllFormed(ill) => match ill {
663            quick_xml::errors::IllFormedError::MismatchedEndTag { expected, found } => {
664                if expected == SYNTHETIC_ROOT {
665                    // Closing tag at the top level with no matching open tag.
666                    format!("unexpected closing tag `</{found}>` at top level")
667                } else {
668                    format!("mismatched closing tag: expected `</{expected}>`, found `</{found}>`")
669                }
670            }
671            quick_xml::errors::IllFormedError::UnmatchedEndTag(name) => {
672                format!("unexpected closing tag `</{name}>` at top level")
673            }
674            quick_xml::errors::IllFormedError::MissingEndTag(name) => {
675                format!("expected closing tag `</{name}>`, found end of input")
676            }
677            quick_xml::errors::IllFormedError::UnclosedReference => {
678                "unterminated entity reference (missing `;`)".to_owned()
679            }
680            other => format!("{other}"),
681        },
682        other => format!("{other}"),
683    };
684
685    ParseError::XmlSyntaxError {
686        path: path.to_path_buf(),
687        line: 1,
688        column: 1,
689        message,
690    }
691}
692
693// ---------------------------------------------------------------------------
694// Tests
695// ---------------------------------------------------------------------------
696
697#[cfg(test)]
698#[allow(
699    clippy::match_wildcard_for_single_variants,
700    clippy::single_char_pattern,
701    reason = "test assertions are clearer with wildcards and string patterns"
702)]
703mod tests {
704    use super::*;
705
706    fn parse(content: &str) -> Result<Vec<XmlNode>, ParseError> {
707        parse_supersigil_xml(content, 0, Path::new("test.md"))
708    }
709
710    fn parse_with_offset(content: &str, offset: usize) -> Result<Vec<XmlNode>, ParseError> {
711        parse_supersigil_xml(content, offset, Path::new("test.md"))
712    }
713
714    // -- Valid fragments ---------------------------------------------------
715
716    #[test]
717    fn empty_input() {
718        let nodes = parse("").unwrap();
719        assert!(nodes.is_empty());
720    }
721
722    #[test]
723    fn whitespace_only_input() {
724        let nodes = parse("  \n  \n  ").unwrap();
725        assert!(nodes.is_empty());
726    }
727
728    #[test]
729    fn single_self_closing_element() {
730        let nodes = parse(r#"<Spec id="s1" />"#).unwrap();
731        assert_eq!(nodes.len(), 1);
732        match &nodes[0] {
733            XmlNode::Element {
734                name,
735                attributes,
736                children,
737                offset,
738                ..
739            } => {
740                assert_eq!(name, "Spec");
741                assert_eq!(attributes, &[("id".to_owned(), "s1".to_owned())]);
742                assert!(children.is_empty());
743                assert_eq!(*offset, 0);
744            }
745            _ => panic!("expected Element"),
746        }
747    }
748
749    #[test]
750    fn self_closing_no_space_before_slash() {
751        let nodes = parse(r#"<Spec id="s1"/>"#).unwrap();
752        assert_eq!(nodes.len(), 1);
753        if let XmlNode::Element { name, .. } = &nodes[0] {
754            assert_eq!(name, "Spec");
755        } else {
756            panic!("expected Element");
757        }
758    }
759
760    #[test]
761    fn element_with_text_content() {
762        let nodes = parse("<Title>Hello World</Title>").unwrap();
763        assert_eq!(nodes.len(), 1);
764        match &nodes[0] {
765            XmlNode::Element { name, children, .. } => {
766                assert_eq!(name, "Title");
767                assert_eq!(children.len(), 1);
768                assert!(
769                    matches!(&children[0], XmlNode::Text { content, .. } if content == "Hello World")
770                );
771            }
772            _ => panic!("expected Element"),
773        }
774    }
775
776    #[test]
777    fn element_with_no_attributes() {
778        let nodes = parse("<Container></Container>").unwrap();
779        assert_eq!(nodes.len(), 1);
780        match &nodes[0] {
781            XmlNode::Element {
782                name, attributes, ..
783            } => {
784                assert_eq!(name, "Container");
785                assert!(attributes.is_empty());
786            }
787            _ => panic!("expected Element"),
788        }
789    }
790
791    // -- Nested elements ---------------------------------------------------
792
793    #[test]
794    fn nested_elements() {
795        let input = r#"<Parent id="p1"><Child id="c1" /></Parent>"#;
796        let nodes = parse(input).unwrap();
797        assert_eq!(nodes.len(), 1);
798        match &nodes[0] {
799            XmlNode::Element { name, children, .. } => {
800                assert_eq!(name, "Parent");
801                assert_eq!(children.len(), 1);
802                match &children[0] {
803                    XmlNode::Element {
804                        name, attributes, ..
805                    } => {
806                        assert_eq!(name, "Child");
807                        assert_eq!(attributes, &[("id".to_owned(), "c1".to_owned())]);
808                    }
809                    _ => panic!("expected nested Element"),
810                }
811            }
812            _ => panic!("expected Element"),
813        }
814    }
815
816    #[test]
817    fn deeply_nested_elements() {
818        let input = "<A><B><C>deep</C></B></A>";
819        let nodes = parse(input).unwrap();
820        assert_eq!(nodes.len(), 1);
821        // A -> B -> C -> Text("deep")
822        let a = &nodes[0];
823        if let XmlNode::Element { children, .. } = a {
824            let b = &children[0];
825            if let XmlNode::Element { children, .. } = b {
826                let c = &children[0];
827                if let XmlNode::Element { children, .. } = c {
828                    assert!(
829                        matches!(&children[0], XmlNode::Text { content, .. } if content == "deep")
830                    );
831                } else {
832                    panic!("expected C element");
833                }
834            } else {
835                panic!("expected B element");
836            }
837        } else {
838            panic!("expected A element");
839        }
840    }
841
842    #[test]
843    fn mixed_children_text_and_elements() {
844        let input = "<Parent>before<Child />after</Parent>";
845        let nodes = parse(input).unwrap();
846        assert_eq!(nodes.len(), 1);
847        if let XmlNode::Element { children, .. } = &nodes[0] {
848            assert_eq!(children.len(), 3);
849            assert!(matches!(&children[0], XmlNode::Text { content, .. } if content == "before"));
850            assert!(matches!(&children[1], XmlNode::Element { name, .. } if name == "Child"));
851            assert!(matches!(&children[2], XmlNode::Text { content, .. } if content == "after"));
852        } else {
853            panic!("expected Element");
854        }
855    }
856
857    // -- Multiple top-level elements ---------------------------------------
858
859    #[test]
860    fn multiple_top_level_elements() {
861        let input = r#"<A id="1" />
862<B id="2" />"#;
863        let nodes = parse(input).unwrap();
864        assert_eq!(nodes.len(), 2);
865        if let XmlNode::Element { name, .. } = &nodes[0] {
866            assert_eq!(name, "A");
867        }
868        if let XmlNode::Element { name, .. } = &nodes[1] {
869            assert_eq!(name, "B");
870        }
871    }
872
873    // -- Attribute parsing -------------------------------------------------
874
875    #[test]
876    fn multiple_attributes() {
877        let input = r#"<Criterion id="c1" strategy="tag" />"#;
878        let nodes = parse(input).unwrap();
879        if let XmlNode::Element { attributes, .. } = &nodes[0] {
880            assert_eq!(
881                attributes,
882                &[
883                    ("id".to_owned(), "c1".to_owned()),
884                    ("strategy".to_owned(), "tag".to_owned()),
885                ]
886            );
887        } else {
888            panic!("expected Element");
889        }
890    }
891
892    #[test]
893    fn attribute_with_entity_in_value() {
894        let input = r#"<Spec desc="a &amp; b" />"#;
895        let nodes = parse(input).unwrap();
896        if let XmlNode::Element { attributes, .. } = &nodes[0] {
897            assert_eq!(attributes[0].1, "a & b");
898        } else {
899            panic!("expected Element");
900        }
901    }
902
903    #[test]
904    fn all_supported_entities_in_attribute() {
905        let input = r#"<Spec val="&amp;&lt;&gt;&quot;" />"#;
906        let nodes = parse(input).unwrap();
907        if let XmlNode::Element { attributes, .. } = &nodes[0] {
908            assert_eq!(attributes[0].1, "&<>\"");
909        } else {
910            panic!("expected Element");
911        }
912    }
913
914    // -- Entity references in text content ---------------------------------
915
916    #[test]
917    fn entity_references_in_text() {
918        let input = "<Note>a &lt; b &amp; c &gt; d &quot;e&quot;</Note>";
919        let nodes = parse(input).unwrap();
920        if let XmlNode::Element { children, .. } = &nodes[0] {
921            assert!(
922                matches!(&children[0], XmlNode::Text { content, .. } if content == r#"a < b & c > d "e""#)
923            );
924        } else {
925            panic!("expected Element");
926        }
927    }
928
929    // -- Position offsetting -----------------------------------------------
930
931    #[test]
932    fn offset_applied_to_elements() {
933        let fence_offset = 100;
934        let input = r#"<Spec id="s1" />"#;
935        let nodes = parse_with_offset(input, fence_offset).unwrap();
936        if let XmlNode::Element { offset, .. } = &nodes[0] {
937            assert_eq!(*offset, 100);
938        } else {
939            panic!("expected Element");
940        }
941    }
942
943    #[test]
944    fn offset_applied_to_nested_element() {
945        let fence_offset = 50;
946        // "<A>" takes 3 bytes, so <B> starts at position 3.
947        let input = "<A><B /></A>";
948        let nodes = parse_with_offset(input, fence_offset).unwrap();
949        if let XmlNode::Element {
950            offset, children, ..
951        } = &nodes[0]
952        {
953            assert_eq!(*offset, 50); // A at position 0 + 50
954            if let XmlNode::Element { offset, .. } = &children[0] {
955                assert_eq!(*offset, 53); // B at position 3 + 50
956            } else {
957                panic!("expected nested Element");
958            }
959        } else {
960            panic!("expected Element");
961        }
962    }
963
964    #[test]
965    fn offset_with_multiple_top_level_elements() {
966        let fence_offset = 200;
967        let input = "<A />\n<B />";
968        let nodes = parse_with_offset(input, fence_offset).unwrap();
969        assert_eq!(nodes.len(), 2);
970        if let XmlNode::Element { offset, .. } = &nodes[0] {
971            assert_eq!(*offset, 200); // A at byte 0
972        }
973        if let XmlNode::Element { offset, .. } = &nodes[1] {
974            assert_eq!(*offset, 206); // B at byte 6 (after "<A />\n")
975        }
976    }
977
978    // -- Error cases: unclosed tags ----------------------------------------
979
980    #[test]
981    fn unclosed_element() {
982        let err = parse("<Spec>content").unwrap_err();
983        let msg = err.to_string();
984        assert!(msg.contains("closing tag"), "got: {msg}");
985        assert!(msg.contains("Spec"), "got: {msg}");
986    }
987
988    #[test]
989    fn mismatched_closing_tag() {
990        let err = parse("<A>text</B>").unwrap_err();
991        let msg = err.to_string();
992        assert!(msg.contains("mismatched"), "got: {msg}");
993        assert!(msg.contains("A"), "got: {msg}");
994        assert!(msg.contains("B"), "got: {msg}");
995    }
996
997    // -- Error cases: invalid attributes -----------------------------------
998
999    #[test]
1000    fn single_quoted_attribute_value() {
1001        let err = parse("<Spec id='s1' />").unwrap_err();
1002        let msg = err.to_string();
1003        assert!(msg.contains("double-quoted"), "got: {msg}");
1004    }
1005
1006    #[test]
1007    fn single_quotes_inside_double_quoted_attribute_value() {
1008        // Single quotes inside a double-quoted attribute value are valid XML.
1009        let result = parse(r#"<Spec val="a='b'" />"#);
1010        assert!(result.is_ok(), "got: {}", result.unwrap_err());
1011    }
1012
1013    #[test]
1014    fn missing_attribute_value() {
1015        let err = parse("<Spec id />").unwrap_err();
1016        let msg = err.to_string();
1017        assert!(msg.contains("="), "got: {msg}");
1018    }
1019
1020    // -- Error cases: unsupported XML features -----------------------------
1021
1022    #[test]
1023    fn processing_instruction_rejected() {
1024        let err = parse("<?xml version=\"1.0\"?>").unwrap_err();
1025        let msg = err.to_string();
1026        assert!(msg.contains("processing instruction"), "got: {msg}");
1027    }
1028
1029    #[test]
1030    fn cdata_rejected() {
1031        let err = parse("<![CDATA[foo]]>").unwrap_err();
1032        let msg = err.to_string();
1033        assert!(msg.contains("CDATA"), "got: {msg}");
1034    }
1035
1036    #[test]
1037    fn doctype_rejected() {
1038        let err = parse("<!DOCTYPE html>").unwrap_err();
1039        let msg = err.to_string();
1040        assert!(msg.contains("DTD") || msg.contains("DOCTYPE"), "got: {msg}");
1041    }
1042
1043    #[test]
1044    fn comment_rejected() {
1045        let err = parse("<!-- comment -->").unwrap_err();
1046        let msg = err.to_string();
1047        assert!(msg.contains("comment"), "got: {msg}");
1048    }
1049
1050    #[test]
1051    fn namespace_in_element_rejected() {
1052        // The parser treats `:` as not part of a name, so `foo:Bar` would
1053        // parse `foo` as the name and then fail on `:`. That error is
1054        // acceptable — the point is it doesn't silently succeed.
1055        let err = parse("<foo:Bar />").unwrap_err();
1056        assert!(
1057            err.to_string().contains("test.md"),
1058            "error should include path"
1059        );
1060    }
1061
1062    #[test]
1063    fn unsupported_entity_rejected() {
1064        let err = parse("<Spec>&apos;</Spec>").unwrap_err();
1065        let msg = err.to_string();
1066        assert!(msg.contains("unsupported entity"), "got: {msg}");
1067    }
1068
1069    #[test]
1070    fn unterminated_entity_rejected() {
1071        let err = parse("<Spec>&amp</Spec>").unwrap_err();
1072        let msg = err.to_string();
1073        assert!(msg.contains("unterminated entity"), "got: {msg}");
1074    }
1075
1076    // -- Lowercase elements are parsed successfully -------------------------
1077
1078    #[test]
1079    fn lowercase_element_name_parsed_successfully() {
1080        let nodes = parse("<spec />").unwrap();
1081        assert_eq!(nodes.len(), 1);
1082        match &nodes[0] {
1083            XmlNode::Element { name, .. } => assert_eq!(name, "spec"),
1084            _ => panic!("expected Element"),
1085        }
1086    }
1087
1088    #[test]
1089    fn lowercase_element_inside_pascal_case_element() {
1090        let nodes = parse(r#"<Criterion id="c1">Use <em>fast</em> path</Criterion>"#).unwrap();
1091        assert_eq!(nodes.len(), 1);
1092        match &nodes[0] {
1093            XmlNode::Element { name, children, .. } => {
1094                assert_eq!(name, "Criterion");
1095                // Children: Text("Use "), Element(em), Text(" path")
1096                assert_eq!(children.len(), 3);
1097                assert!(matches!(&children[0], XmlNode::Text { content, .. } if content == "Use "));
1098                match &children[1] {
1099                    XmlNode::Element { name, children, .. } => {
1100                        assert_eq!(name, "em");
1101                        assert_eq!(children.len(), 1);
1102                        assert!(
1103                            matches!(&children[0], XmlNode::Text { content, .. } if content == "fast")
1104                        );
1105                    }
1106                    _ => panic!("expected em Element"),
1107                }
1108                assert!(
1109                    matches!(&children[2], XmlNode::Text { content, .. } if content == " path")
1110                );
1111            }
1112            _ => panic!("expected Criterion Element"),
1113        }
1114    }
1115
1116    // -- Error position information ----------------------------------------
1117
1118    #[test]
1119    fn error_includes_line_and_column() {
1120        // Error on line 2, at the start of the line (column 1).
1121        // Use a namespaced element (still rejected) to trigger an error.
1122        let err = parse("<A>\n<ns:B /></A>").unwrap_err();
1123        if let ParseError::XmlSyntaxError { line, column, .. } = &err {
1124            assert_eq!(*line, 2);
1125            assert_eq!(*column, 1);
1126        } else {
1127            panic!("expected XmlSyntaxError");
1128        }
1129    }
1130
1131    #[test]
1132    fn error_includes_file_path() {
1133        let err = parse_supersigil_xml("<?xml?>", 0, Path::new("/foo/bar.md")).unwrap_err();
1134        let msg = err.to_string();
1135        assert!(msg.contains("/foo/bar.md"), "got: {msg}");
1136    }
1137
1138    // -- Closing tag at top level ------------------------------------------
1139
1140    #[test]
1141    fn closing_tag_at_top_level_rejected() {
1142        let err = parse("</Orphan>").unwrap_err();
1143        let msg = err.to_string();
1144        assert!(msg.contains("unexpected closing tag"), "got: {msg}");
1145    }
1146
1147    // -- Realistic example -------------------------------------------------
1148
1149    #[test]
1150    fn realistic_component_example() {
1151        let input = r#"<Criterion id="perf-latency" strategy="tag">
1152  P99 latency must be under 100ms for API requests.
1153</Criterion>
1154<VerifiedBy strategy="tag" tag="perf-latency" />"#;
1155        let nodes = parse(input).unwrap();
1156        assert_eq!(nodes.len(), 2);
1157
1158        // Criterion
1159        match &nodes[0] {
1160            XmlNode::Element {
1161                name,
1162                attributes,
1163                children,
1164                ..
1165            } => {
1166                assert_eq!(name, "Criterion");
1167                assert_eq!(attributes.len(), 2);
1168                assert_eq!(attributes[0], ("id".to_owned(), "perf-latency".to_owned()));
1169                assert_eq!(attributes[1], ("strategy".to_owned(), "tag".to_owned()));
1170                assert_eq!(children.len(), 1);
1171                if let XmlNode::Text { content, .. } = &children[0] {
1172                    assert!(content.contains("P99 latency"));
1173                } else {
1174                    panic!("expected Text child");
1175                }
1176            }
1177            _ => panic!("expected Element"),
1178        }
1179
1180        // VerifiedBy
1181        match &nodes[1] {
1182            XmlNode::Element {
1183                name,
1184                attributes,
1185                children,
1186                ..
1187            } => {
1188                assert_eq!(name, "VerifiedBy");
1189                assert_eq!(attributes.len(), 2);
1190                assert!(children.is_empty());
1191            }
1192            _ => panic!("expected Element"),
1193        }
1194    }
1195
1196    // -- UTF-8 text content ------------------------------------------------
1197
1198    #[test]
1199    fn utf8_text_content_preserved() {
1200        let input = "<Note>cafe\u{0301} \u{1F600}</Note>";
1201        let nodes = parse(input).unwrap();
1202        if let XmlNode::Element { children, .. } = &nodes[0] {
1203            if let XmlNode::Text { content: t, .. } = &children[0] {
1204                assert!(t.contains("cafe\u{0301}"));
1205                assert!(t.contains('\u{1F600}'));
1206            } else {
1207                panic!("expected Text");
1208            }
1209        } else {
1210            panic!("expected Element");
1211        }
1212    }
1213
1214    #[test]
1215    fn text_node_has_correct_offset() {
1216        // "<Title>" = 7 bytes, so text "Hello" starts at offset 7
1217        let input = "<Title>Hello</Title>";
1218        let nodes = parse(input).unwrap();
1219        if let XmlNode::Element { children, .. } = &nodes[0] {
1220            if let XmlNode::Text {
1221                content, offset, ..
1222            } = &children[0]
1223            {
1224                assert_eq!(content, "Hello");
1225                assert_eq!(*offset, 7, "text should start at byte 7 (after '<Title>')");
1226            } else {
1227                panic!("expected Text");
1228            }
1229        } else {
1230            panic!("expected Element");
1231        }
1232    }
1233
1234    #[test]
1235    fn text_node_offset_with_fence_offset() {
1236        let fence_offset = 100;
1237        let input = "<Title>Hello</Title>";
1238        let nodes = parse_with_offset(input, fence_offset).unwrap();
1239        if let XmlNode::Element { children, .. } = &nodes[0] {
1240            if let XmlNode::Text {
1241                content, offset, ..
1242            } = &children[0]
1243            {
1244                assert_eq!(content, "Hello");
1245                assert_eq!(*offset, 107, "text should be fence_offset + 7");
1246            } else {
1247                panic!("expected Text");
1248            }
1249        } else {
1250            panic!("expected Element");
1251        }
1252    }
1253
1254    #[test]
1255    fn text_node_end_offset_plain_text() {
1256        // "<Title>Hello</Title>" — text "Hello" starts at 7, ends at 12
1257        let input = "<Title>Hello</Title>";
1258        let nodes = parse(input).unwrap();
1259        if let XmlNode::Element { children, .. } = &nodes[0] {
1260            if let XmlNode::Text {
1261                content,
1262                offset,
1263                end_offset,
1264            } = &children[0]
1265            {
1266                assert_eq!(content, "Hello");
1267                assert_eq!(*offset, 7);
1268                assert_eq!(
1269                    *end_offset, 12,
1270                    "end_offset should be past the last byte of 'Hello'"
1271                );
1272                assert_eq!(
1273                    &input[*offset..*end_offset],
1274                    "Hello",
1275                    "offset..end_offset should span the raw text"
1276                );
1277            } else {
1278                panic!("expected Text");
1279            }
1280        } else {
1281            panic!("expected Element");
1282        }
1283    }
1284
1285    #[test]
1286    fn text_node_end_offset_with_entities() {
1287        // "a &lt; b" in raw source → decoded "a < b"
1288        // Raw: a(1) space(1) &lt;(4) space(1) b(1) = 8 bytes
1289        let input = "<T>a &lt; b</T>";
1290        let nodes = parse(input).unwrap();
1291        if let XmlNode::Element { children, .. } = &nodes[0] {
1292            if let XmlNode::Text {
1293                content,
1294                offset,
1295                end_offset,
1296            } = &children[0]
1297            {
1298                assert_eq!(content, "a < b", "content should be entity-decoded");
1299                assert_eq!(*offset, 3, "text starts after '<T>'");
1300                assert_eq!(
1301                    *end_offset, 11,
1302                    "end_offset should be past the last byte of 'a &lt; b' in raw source"
1303                );
1304                assert_eq!(
1305                    &input[*offset..*end_offset],
1306                    "a &lt; b",
1307                    "offset..end_offset should span the raw source text"
1308                );
1309                // Decoded length (5) < raw length (8)
1310                assert!(content.len() < (*end_offset - *offset));
1311            } else {
1312                panic!("expected Text");
1313            }
1314        } else {
1315            panic!("expected Element");
1316        }
1317    }
1318
1319    #[test]
1320    fn text_node_end_offset_with_fence_offset_and_entities() {
1321        let fence_offset = 50;
1322        let input = "<T>&amp;</T>";
1323        let nodes = parse_with_offset(input, fence_offset).unwrap();
1324        if let XmlNode::Element { children, .. } = &nodes[0] {
1325            if let XmlNode::Text {
1326                content,
1327                offset,
1328                end_offset,
1329            } = &children[0]
1330            {
1331                assert_eq!(content, "&", "decoded entity");
1332                assert_eq!(*offset, 53, "starts at fence_offset + 3 (after '<T>')");
1333                // &amp; = 5 bytes in raw source, starts at position 3 in XML content
1334                assert_eq!(
1335                    *end_offset, 58,
1336                    "end_offset = fence_offset + 3 + 5 (length of '&amp;')"
1337                );
1338            } else {
1339                panic!("expected Text");
1340            }
1341        } else {
1342            panic!("expected Element");
1343        }
1344    }
1345
1346    #[test]
1347    fn text_node_end_offset_multiple_entities() {
1348        // "&lt;&gt;" → decoded "<>" (2 chars), raw = 8 bytes
1349        let input = "<T>&lt;&gt;</T>";
1350        let nodes = parse(input).unwrap();
1351        if let XmlNode::Element { children, .. } = &nodes[0] {
1352            if let XmlNode::Text {
1353                content,
1354                offset,
1355                end_offset,
1356            } = &children[0]
1357            {
1358                assert_eq!(content, "<>");
1359                assert_eq!(*offset, 3);
1360                assert_eq!(*end_offset, 11, "past '&lt;&gt;' in raw source");
1361                assert_eq!(&input[*offset..*end_offset], "&lt;&gt;");
1362            } else {
1363                panic!("expected Text");
1364            }
1365        } else {
1366            panic!("expected Element");
1367        }
1368    }
1369
1370    // -- end_offset on Element ------------------------------------------------
1371
1372    #[test]
1373    fn self_closing_element_end_offset() {
1374        let input = r#"<Spec id="s1" />"#;
1375        let nodes = parse(input).unwrap();
1376        assert_eq!(nodes.len(), 1);
1377        match &nodes[0] {
1378            XmlNode::Element {
1379                name,
1380                offset,
1381                end_offset,
1382                ..
1383            } => {
1384                assert_eq!(name, "Spec");
1385                assert_eq!(*offset, 0);
1386                assert_eq!(*end_offset, input.len());
1387            }
1388            _ => panic!("expected Element"),
1389        }
1390    }
1391
1392    #[test]
1393    fn regular_element_end_offset() {
1394        let input = "<Title>Hello</Title>";
1395        let nodes = parse(input).unwrap();
1396        assert_eq!(nodes.len(), 1);
1397        match &nodes[0] {
1398            XmlNode::Element {
1399                name,
1400                offset,
1401                end_offset,
1402                ..
1403            } => {
1404                assert_eq!(name, "Title");
1405                assert_eq!(*offset, 0);
1406                assert_eq!(*end_offset, input.len());
1407            }
1408            _ => panic!("expected Element"),
1409        }
1410    }
1411
1412    #[test]
1413    fn nested_element_end_offsets() {
1414        let input = r#"<Parent><Child id="c1" /></Parent>"#;
1415        let nodes = parse(input).unwrap();
1416        match &nodes[0] {
1417            XmlNode::Element {
1418                end_offset,
1419                children,
1420                ..
1421            } => {
1422                assert_eq!(*end_offset, input.len());
1423                match &children[0] {
1424                    XmlNode::Element {
1425                        name,
1426                        offset,
1427                        end_offset,
1428                        ..
1429                    } => {
1430                        assert_eq!(name, "Child");
1431                        assert_eq!(*offset, 8);
1432                        // "<Child id="c1" />" ends at position 25
1433                        assert_eq!(*end_offset, 25);
1434                    }
1435                    _ => panic!("expected Element"),
1436                }
1437            }
1438            _ => panic!("expected Element"),
1439        }
1440    }
1441
1442    #[test]
1443    fn element_end_offset_with_fence_offset() {
1444        let input = r#"<Spec id="s1" />"#;
1445        let fence_offset = 100;
1446        let nodes = parse_with_offset(input, fence_offset).unwrap();
1447        match &nodes[0] {
1448            XmlNode::Element {
1449                offset, end_offset, ..
1450            } => {
1451                assert_eq!(*offset, 100);
1452                assert_eq!(*end_offset, 100 + input.len());
1453            }
1454            _ => panic!("expected Element"),
1455        }
1456    }
1457}