1use std::path::Path;
16
17use quick_xml::Reader;
18use quick_xml::events::Event;
19use supersigil_core::ParseError;
20
21use crate::util::line_col;
22
/// A node of the tree produced by [`parse_supersigil_xml`].
///
/// All offsets are byte offsets: the caller-supplied `fence_offset` plus the
/// node's position inside the parsed content.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum XmlNode {
    /// An element, written either as `<Name ...>...</Name>` or `<Name ... />`.
    Element {
        /// Tag name exactly as written in the source.
        name: String,
        /// Attributes in source order as `(key, value)` pairs; values have
        /// their entity references resolved.
        attributes: Vec<(String, String)>,
        /// Child nodes in document order (empty for self-closing elements).
        children: Vec<XmlNode>,
        /// Offset of the `<` that opens the element.
        offset: usize,
        /// Offset just past the `>` that ends the element.
        end_offset: usize,
    },
    /// A run of character data with entity references already resolved.
    Text {
        /// Decoded text; may be shorter than the raw source span when the
        /// source contained entity references.
        content: String,
        /// Offset of the first raw source byte of the text.
        offset: usize,
        /// Offset just past the last raw source byte of the text.
        end_offset: usize,
    },
}
56
/// Name of the synthetic element wrapped around the fence content so that
/// several top-level nodes can be read as a single document.
const SYNTHETIC_ROOT: &str = "__root";
/// Opening tag of the synthetic root; prepended to the content before parsing.
/// Its byte length is subtracted from reader positions to recover content offsets.
const SYNTHETIC_OPEN: &str = "<__root>";
/// Closing tag of the synthetic root; appended to the content before parsing.
const SYNTHETIC_CLOSE: &str = "</__root>";
67
68pub fn parse_supersigil_xml(
87 content: &str,
88 fence_offset: usize,
89 path: &Path,
90) -> Result<Vec<XmlNode>, ParseError> {
91 reject_unsupported(content, path)?;
93
94 let wrapped = format!("{SYNTHETIC_OPEN}{content}{SYNTHETIC_CLOSE}");
97
98 let mut reader = Reader::from_str(&wrapped);
99 reader.config_mut().trim_text(false);
100
101 #[allow(clippy::cast_possible_truncation, reason = "SYNTHETIC_OPEN is 8 bytes")]
104 let root_tag_len: u64 = SYNTHETIC_OPEN.len() as u64;
105
106 loop {
108 match reader.read_event() {
109 Ok(Event::Start(ref e)) if e.name().as_ref() == SYNTHETIC_ROOT.as_bytes() => break,
110 Ok(Event::Eof) => break,
111 Err(e) => {
112 return Err(quick_xml_error_to_parse_error(&e, path));
113 }
114 _ => {}
115 }
116 }
117
118 let nodes = parse_children(
120 &mut reader,
121 SYNTHETIC_ROOT,
122 content,
123 fence_offset,
124 root_tag_len,
125 path,
126 )?;
127
128 Ok(nodes)
129}
130
131fn reject_unsupported(content: &str, path: &Path) -> Result<(), ParseError> {
140 let bytes = content.as_bytes();
141 let mut i = 0;
142 while i < bytes.len() {
143 if bytes[i] == b'<' {
144 if bytes[i..].starts_with(b"<?") {
145 return Err(make_error(
146 content,
147 i,
148 path,
149 "processing instructions (`<?...?>`) are not supported",
150 ));
151 }
152 if bytes[i..].starts_with(b"<![CDATA[") {
153 return Err(make_error(
154 content,
155 i,
156 path,
157 "CDATA sections (`<![CDATA[...]]>`) are not supported",
158 ));
159 }
160 if bytes[i..].starts_with(b"<!DOCTYPE") || bytes[i..].starts_with(b"<!doctype") {
161 return Err(make_error(
162 content,
163 i,
164 path,
165 "DTD declarations (`<!DOCTYPE ...>`) are not supported",
166 ));
167 }
168 }
170 i += 1;
171 }
172 Ok(())
173}
174
175#[allow(
185 clippy::too_many_lines,
186 reason = "event-loop structure is clearest as a single function"
187)]
188fn parse_children(
189 reader: &mut Reader<&[u8]>,
190 parent_name: &str,
191 content: &str,
192 fence_offset: usize,
193 root_tag_len: u64,
194 path: &Path,
195) -> Result<Vec<XmlNode>, ParseError> {
196 fn flush_text(
198 text_buf: &mut String,
199 text_start: &mut Option<usize>,
200 text_end: &mut Option<usize>,
201 nodes: &mut Vec<XmlNode>,
202 is_top_level: bool,
203 ) {
204 if text_buf.is_empty() {
205 return;
206 }
207 if is_top_level && text_buf.trim().is_empty() {
209 text_buf.clear();
210 *text_start = None;
211 *text_end = None;
212 return;
213 }
214 let start = text_start.take().unwrap_or(0);
215 let end = text_end.take().unwrap_or(start);
216 nodes.push(XmlNode::Text {
217 content: std::mem::take(text_buf),
218 offset: start,
219 end_offset: end,
220 });
221 }
222
223 let mut nodes: Vec<XmlNode> = Vec::new();
224 let mut text_buf = String::new();
226 let mut text_start_offset: Option<usize> = None;
228 let mut text_end_offset: Option<usize> = None;
230 let is_top_level = parent_name == SYNTHETIC_ROOT;
231
232 loop {
233 let event_pos = reader.buffer_position();
234
235 match reader.read_event() {
236 Ok(Event::Start(ref e)) => {
237 flush_text(
238 &mut text_buf,
239 &mut text_start_offset,
240 &mut text_end_offset,
241 &mut nodes,
242 is_top_level,
243 );
244
245 let offset_in_content = content_offset(event_pos, root_tag_len);
246 let file_offset = fence_offset + offset_in_content;
247
248 let tag_name = decode_name(e.name().as_ref(), content, offset_in_content, path)?;
249 validate_element_name(&tag_name, content, offset_in_content, path)?;
250 let attributes = parse_attributes(e, content, offset_in_content, path)?;
251
252 let children =
253 parse_children(reader, &tag_name, content, fence_offset, root_tag_len, path)?;
254
255 let end_in_content = content_offset(reader.buffer_position(), root_tag_len);
257 let file_end_offset = fence_offset + end_in_content;
258
259 nodes.push(XmlNode::Element {
260 name: tag_name,
261 attributes,
262 children,
263 offset: file_offset,
264 end_offset: file_end_offset,
265 });
266 }
267
268 Ok(Event::Empty(ref e)) => {
269 flush_text(
270 &mut text_buf,
271 &mut text_start_offset,
272 &mut text_end_offset,
273 &mut nodes,
274 is_top_level,
275 );
276
277 let offset_in_content = content_offset(event_pos, root_tag_len);
278 let file_offset = fence_offset + offset_in_content;
279
280 let tag_name = decode_name(e.name().as_ref(), content, offset_in_content, path)?;
281 validate_element_name(&tag_name, content, offset_in_content, path)?;
282 let attributes = parse_attributes(e, content, offset_in_content, path)?;
283
284 let end_in_content = content_offset(reader.buffer_position(), root_tag_len);
286 let file_end_offset = fence_offset + end_in_content;
287
288 nodes.push(XmlNode::Element {
289 name: tag_name,
290 attributes,
291 children: vec![],
292 offset: file_offset,
293 end_offset: file_end_offset,
294 });
295 }
296
297 Ok(Event::Text(ref e)) => {
298 let raw = std::str::from_utf8(e.as_ref()).map_err(|_err| {
300 let off = content_offset(event_pos, root_tag_len);
301 make_error(content, off, path, "invalid UTF-8 in text content")
302 })?;
303 let off = content_offset(event_pos, root_tag_len);
304 if text_start_offset.is_none() {
305 text_start_offset = Some(fence_offset + off);
306 }
307 text_end_offset = Some(fence_offset + off + raw.len());
310 text_buf.push_str(raw);
311 }
312
313 Ok(Event::GeneralRef(ref e)) => {
314 let entity_name = std::str::from_utf8(e.as_ref()).map_err(|_err| {
316 let off = content_offset(event_pos, root_tag_len);
317 make_error(content, off, path, "invalid UTF-8 in entity reference")
318 })?;
319 let off = content_offset(event_pos, root_tag_len);
320 if text_start_offset.is_none() {
321 text_start_offset = Some(fence_offset + off);
322 }
323 text_end_offset = Some(fence_offset + off + entity_name.len() + 2);
326 let resolved = resolve_entity(entity_name, content, off, path)?;
327 text_buf.push_str(resolved);
328 }
329
330 Ok(Event::End(ref e)) => {
331 let name_bytes = e.name();
332 let end_name = std::str::from_utf8(name_bytes.as_ref()).unwrap_or("<invalid>");
333 if end_name == parent_name {
334 flush_text(
335 &mut text_buf,
336 &mut text_start_offset,
337 &mut text_end_offset,
338 &mut nodes,
339 is_top_level,
340 );
341 return Ok(nodes);
342 }
343 let offset_in_content = content_offset(event_pos, root_tag_len);
345 return Err(make_error(
346 content,
347 offset_in_content,
348 path,
349 &format!(
350 "mismatched closing tag: expected `</{parent_name}>`, found `</{end_name}>`"
351 ),
352 ));
353 }
354
355 Ok(Event::Eof) => {
356 flush_text(
357 &mut text_buf,
358 &mut text_start_offset,
359 &mut text_end_offset,
360 &mut nodes,
361 is_top_level,
362 );
363 if is_top_level {
364 return Ok(nodes);
365 }
366 return Err(make_error(
367 content,
368 content.len(),
369 path,
370 &format!("expected closing tag `</{parent_name}>`, found end of input"),
371 ));
372 }
373
374 Ok(Event::Comment(_)) => {}
376
377 Ok(Event::CData(_)) => {
379 let off = content_offset(event_pos, root_tag_len);
380 return Err(make_error(
381 content,
382 off,
383 path,
384 "CDATA sections (`<![CDATA[...]]>`) are not supported",
385 ));
386 }
387 Ok(Event::PI(_) | Event::Decl(_)) => {
388 let off = content_offset(event_pos, root_tag_len);
389 return Err(make_error(
390 content,
391 off,
392 path,
393 "processing instructions (`<?...?>`) are not supported",
394 ));
395 }
396 Ok(Event::DocType(_)) => {
397 let off = content_offset(event_pos, root_tag_len);
398 return Err(make_error(
399 content,
400 off,
401 path,
402 "DTD declarations (`<!DOCTYPE ...>`) are not supported",
403 ));
404 }
405
406 Err(e) => {
407 return Err(quick_xml_error_to_parse_error(&e, path));
408 }
409 }
410 }
411}
412
413fn parse_attributes(
419 event: &quick_xml::events::BytesStart<'_>,
420 content: &str,
421 offset_in_content: usize,
422 path: &Path,
423) -> Result<Vec<(String, String)>, ParseError> {
424 validate_attribute_quotes(event, content, offset_in_content, path)?;
427
428 let mut attrs = Vec::new();
429 for attr_result in event.attributes() {
430 let attr = attr_result.map_err(|e| {
431 let msg = format!("{e}");
432 make_error(content, offset_in_content, path, &msg)
433 })?;
434
435 let key = decode_name(attr.key.as_ref(), content, offset_in_content, path)?;
436
437 if key.contains(':') {
439 return Err(make_error(
440 content,
441 offset_in_content,
442 path,
443 &format!("namespaced attribute `{key}` is not supported"),
444 ));
445 }
446
447 let raw_value = std::str::from_utf8(attr.value.as_ref()).map_err(|_err| {
449 make_error(
450 content,
451 offset_in_content,
452 path,
453 "invalid UTF-8 in attribute value",
454 )
455 })?;
456 let value = resolve_entities_in_str(raw_value, content, offset_in_content, path)?;
457
458 attrs.push((key, value));
459 }
460 Ok(attrs)
461}
462
463fn validate_attribute_quotes(
469 event: &quick_xml::events::BytesStart<'_>,
470 content: &str,
471 offset_in_content: usize,
472 path: &Path,
473) -> Result<(), ParseError> {
474 let raw: &[u8] = event.as_ref(); let mut i = 0;
476 let mut in_double_quote = false;
477 while i < raw.len() {
478 if in_double_quote {
479 if raw[i] == b'"' {
480 in_double_quote = false;
481 }
482 i += 1;
483 continue;
484 }
485 if raw[i] == b'"' {
486 in_double_quote = true;
487 i += 1;
488 continue;
489 }
490 if raw[i] == b'=' {
491 i += 1;
493 while i < raw.len() && raw[i].is_ascii_whitespace() {
494 i += 1;
495 }
496 if i < raw.len() && raw[i] == b'\'' {
497 return Err(make_error(
498 content,
499 offset_in_content,
500 path,
501 "attribute values must be double-quoted",
502 ));
503 }
504 continue;
507 }
508 i += 1;
509 }
510 Ok(())
511}
512
513fn resolve_entity(
519 name: &str,
520 content: &str,
521 offset_in_content: usize,
522 path: &Path,
523) -> Result<&'static str, ParseError> {
524 match name {
525 "amp" => Ok("&"),
526 "lt" => Ok("<"),
527 "gt" => Ok(">"),
528 "quot" => Ok("\""),
529 _ => Err(make_error(
530 content,
531 offset_in_content,
532 path,
533 &format!("unsupported entity reference `&{name};`"),
534 )),
535 }
536}
537
538fn resolve_entities_in_str(
541 text: &str,
542 content: &str,
543 offset_in_content: usize,
544 path: &Path,
545) -> Result<String, ParseError> {
546 if !text.contains('&') {
547 return Ok(text.to_owned());
548 }
549
550 let mut result = String::with_capacity(text.len());
551 let mut rest = text;
552
553 while let Some(amp_pos) = rest.find('&') {
554 result.push_str(&rest[..amp_pos]);
555 rest = &rest[amp_pos + 1..];
556 if let Some(semi_pos) = rest.find(';') {
557 let entity_name = &rest[..semi_pos];
558 let resolved = resolve_entity(entity_name, content, offset_in_content, path)?;
559 result.push_str(resolved);
560 rest = &rest[semi_pos + 1..];
561 } else {
562 return Err(make_error(
563 content,
564 offset_in_content,
565 path,
566 "unterminated entity reference (missing `;`)",
567 ));
568 }
569 }
570 result.push_str(rest);
571 Ok(result)
572}
573
574fn decode_name(
581 raw: &[u8],
582 content: &str,
583 offset_in_content: usize,
584 path: &Path,
585) -> Result<String, ParseError> {
586 std::str::from_utf8(raw)
587 .map(str::to_owned)
588 .map_err(|_err| make_error(content, offset_in_content, path, "invalid UTF-8 in name"))
589}
590
591fn validate_element_name(
598 name: &str,
599 content: &str,
600 offset_in_content: usize,
601 path: &Path,
602) -> Result<(), ParseError> {
603 if name.contains(':') {
604 return Err(make_error(
605 content,
606 offset_in_content,
607 path,
608 &format!("namespaced element `{name}` is not supported"),
609 ));
610 }
611 Ok(())
612}
613
/// Converts a reader position in the wrapped document into an offset into
/// the original content by removing the length of the synthetic opening tag.
///
/// `event_pos` must not be smaller than `root_tag_len`.
#[allow(
    clippy::cast_possible_truncation,
    reason = "XML fence content is always small"
)]
fn content_offset(event_pos: u64, root_tag_len: u64) -> usize {
    let relative = event_pos - root_tag_len;
    relative as usize
}
630
631fn make_error(content: &str, offset_in_content: usize, path: &Path, message: &str) -> ParseError {
633 let (line, column) = line_col(content, offset_in_content);
634 ParseError::XmlSyntaxError {
635 path: path.to_path_buf(),
636 line,
637 column,
638 message: message.to_owned(),
639 }
640}
641
642fn quick_xml_error_to_parse_error(err: &quick_xml::Error, path: &Path) -> ParseError {
648 let message = match err {
649 quick_xml::Error::IllFormed(ill) => match ill {
650 quick_xml::errors::IllFormedError::MismatchedEndTag { expected, found } => {
651 if expected == SYNTHETIC_ROOT {
652 format!("unexpected closing tag `</{found}>` at top level")
654 } else {
655 format!("mismatched closing tag: expected `</{expected}>`, found `</{found}>`")
656 }
657 }
658 quick_xml::errors::IllFormedError::UnmatchedEndTag(name) => {
659 format!("unexpected closing tag `</{name}>` at top level")
660 }
661 quick_xml::errors::IllFormedError::MissingEndTag(name) => {
662 format!("expected closing tag `</{name}>`, found end of input")
663 }
664 quick_xml::errors::IllFormedError::UnclosedReference => {
665 "unterminated entity reference (missing `;`)".to_owned()
666 }
667 other => format!("{other}"),
668 },
669 other => format!("{other}"),
670 };
671
672 ParseError::XmlSyntaxError {
673 path: path.to_path_buf(),
674 line: 1,
675 column: 1,
676 message,
677 }
678}
679
/// Unit tests for the supersigil XML parser.
///
/// Entity references in test inputs are written in their escaped form
/// (`&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`) because the parser works on
/// raw source text.
#[cfg(test)]
#[allow(
    clippy::match_wildcard_for_single_variants,
    clippy::single_char_pattern,
    reason = "test assertions are clearer with wildcards and string patterns"
)]
mod tests {
    use super::*;

    /// Parses `content` with a zero fence offset and a fixed test path.
    fn parse(content: &str) -> Result<Vec<XmlNode>, ParseError> {
        parse_supersigil_xml(content, 0, Path::new("test.md"))
    }

    /// Parses `content` with the given fence offset and a fixed test path.
    fn parse_with_offset(content: &str, offset: usize) -> Result<Vec<XmlNode>, ParseError> {
        parse_supersigil_xml(content, offset, Path::new("test.md"))
    }

    // --- Basic parsing ---------------------------------------------------

    #[test]
    fn empty_input() {
        let nodes = parse("").unwrap();
        assert!(nodes.is_empty());
    }

    #[test]
    fn whitespace_only_input() {
        let nodes = parse("  \n  \n  ").unwrap();
        assert!(nodes.is_empty());
    }

    #[test]
    fn single_self_closing_element() {
        let nodes = parse(r#"<Spec id="s1" />"#).unwrap();
        assert_eq!(nodes.len(), 1);
        match &nodes[0] {
            XmlNode::Element {
                name,
                attributes,
                children,
                offset,
                ..
            } => {
                assert_eq!(name, "Spec");
                assert_eq!(attributes, &[("id".to_owned(), "s1".to_owned())]);
                assert!(children.is_empty());
                assert_eq!(*offset, 0);
            }
            _ => panic!("expected Element"),
        }
    }

    #[test]
    fn self_closing_no_space_before_slash() {
        let nodes = parse(r#"<Spec id="s1"/>"#).unwrap();
        assert_eq!(nodes.len(), 1);
        if let XmlNode::Element { name, .. } = &nodes[0] {
            assert_eq!(name, "Spec");
        } else {
            panic!("expected Element");
        }
    }

    #[test]
    fn element_with_text_content() {
        let nodes = parse("<Title>Hello World</Title>").unwrap();
        assert_eq!(nodes.len(), 1);
        match &nodes[0] {
            XmlNode::Element { name, children, .. } => {
                assert_eq!(name, "Title");
                assert_eq!(children.len(), 1);
                assert!(
                    matches!(&children[0], XmlNode::Text { content, .. } if content == "Hello World")
                );
            }
            _ => panic!("expected Element"),
        }
    }

    #[test]
    fn element_with_no_attributes() {
        let nodes = parse("<Container></Container>").unwrap();
        assert_eq!(nodes.len(), 1);
        match &nodes[0] {
            XmlNode::Element {
                name, attributes, ..
            } => {
                assert_eq!(name, "Container");
                assert!(attributes.is_empty());
            }
            _ => panic!("expected Element"),
        }
    }

    // --- Nesting ---------------------------------------------------------

    #[test]
    fn nested_elements() {
        let input = r#"<Parent id="p1"><Child id="c1" /></Parent>"#;
        let nodes = parse(input).unwrap();
        assert_eq!(nodes.len(), 1);
        match &nodes[0] {
            XmlNode::Element { name, children, .. } => {
                assert_eq!(name, "Parent");
                assert_eq!(children.len(), 1);
                match &children[0] {
                    XmlNode::Element {
                        name, attributes, ..
                    } => {
                        assert_eq!(name, "Child");
                        assert_eq!(attributes, &[("id".to_owned(), "c1".to_owned())]);
                    }
                    _ => panic!("expected nested Element"),
                }
            }
            _ => panic!("expected Element"),
        }
    }

    #[test]
    fn deeply_nested_elements() {
        let input = "<A><B><C>deep</C></B></A>";
        let nodes = parse(input).unwrap();
        assert_eq!(nodes.len(), 1);
        let a = &nodes[0];
        if let XmlNode::Element { children, .. } = a {
            let b = &children[0];
            if let XmlNode::Element { children, .. } = b {
                let c = &children[0];
                if let XmlNode::Element { children, .. } = c {
                    assert!(
                        matches!(&children[0], XmlNode::Text { content, .. } if content == "deep")
                    );
                } else {
                    panic!("expected C element");
                }
            } else {
                panic!("expected B element");
            }
        } else {
            panic!("expected A element");
        }
    }

    #[test]
    fn mixed_children_text_and_elements() {
        let input = "<Parent>before<Child />after</Parent>";
        let nodes = parse(input).unwrap();
        assert_eq!(nodes.len(), 1);
        if let XmlNode::Element { children, .. } = &nodes[0] {
            assert_eq!(children.len(), 3);
            assert!(matches!(&children[0], XmlNode::Text { content, .. } if content == "before"));
            assert!(matches!(&children[1], XmlNode::Element { name, .. } if name == "Child"));
            assert!(matches!(&children[2], XmlNode::Text { content, .. } if content == "after"));
        } else {
            panic!("expected Element");
        }
    }

    // --- Multiple top-level elements -------------------------------------

    #[test]
    fn multiple_top_level_elements() {
        let input = r#"<A id="1" />
<B id="2" />"#;
        let nodes = parse(input).unwrap();
        assert_eq!(nodes.len(), 2);
        // Fail loudly if either node is not an element instead of skipping
        // the name assertion.
        if let XmlNode::Element { name, .. } = &nodes[0] {
            assert_eq!(name, "A");
        } else {
            panic!("expected Element");
        }
        if let XmlNode::Element { name, .. } = &nodes[1] {
            assert_eq!(name, "B");
        } else {
            panic!("expected Element");
        }
    }

    // --- Attributes ------------------------------------------------------

    #[test]
    fn multiple_attributes() {
        let input = r#"<Criterion id="c1" strategy="tag" />"#;
        let nodes = parse(input).unwrap();
        if let XmlNode::Element { attributes, .. } = &nodes[0] {
            assert_eq!(
                attributes,
                &[
                    ("id".to_owned(), "c1".to_owned()),
                    ("strategy".to_owned(), "tag".to_owned()),
                ]
            );
        } else {
            panic!("expected Element");
        }
    }

    #[test]
    fn attribute_with_entity_in_value() {
        let input = r#"<Spec desc="a &amp; b" />"#;
        let nodes = parse(input).unwrap();
        if let XmlNode::Element { attributes, .. } = &nodes[0] {
            assert_eq!(attributes[0].1, "a & b");
        } else {
            panic!("expected Element");
        }
    }

    #[test]
    fn all_supported_entities_in_attribute() {
        let input = r#"<Spec val="&amp;&lt;&gt;&quot;" />"#;
        let nodes = parse(input).unwrap();
        if let XmlNode::Element { attributes, .. } = &nodes[0] {
            assert_eq!(attributes[0].1, "&<>\"");
        } else {
            panic!("expected Element");
        }
    }

    // --- Entity references in text ---------------------------------------

    #[test]
    fn entity_references_in_text() {
        let input = "<Note>a &lt; b &amp; c &gt; d &quot;e&quot;</Note>";
        let nodes = parse(input).unwrap();
        if let XmlNode::Element { children, .. } = &nodes[0] {
            assert!(
                matches!(&children[0], XmlNode::Text { content, .. } if content == r#"a < b & c > d "e""#)
            );
        } else {
            panic!("expected Element");
        }
    }

    // --- Element start offsets -------------------------------------------

    #[test]
    fn offset_applied_to_elements() {
        let fence_offset = 100;
        let input = r#"<Spec id="s1" />"#;
        let nodes = parse_with_offset(input, fence_offset).unwrap();
        if let XmlNode::Element { offset, .. } = &nodes[0] {
            assert_eq!(*offset, 100);
        } else {
            panic!("expected Element");
        }
    }

    #[test]
    fn offset_applied_to_nested_element() {
        let fence_offset = 50;
        let input = "<A><B /></A>";
        let nodes = parse_with_offset(input, fence_offset).unwrap();
        if let XmlNode::Element {
            offset, children, ..
        } = &nodes[0]
        {
            // `<A>` starts at content byte 0.
            assert_eq!(*offset, 50);
            if let XmlNode::Element { offset, .. } = &children[0] {
                // `<B />` starts at content byte 3, right after `<A>`.
                assert_eq!(*offset, 53);
            } else {
                panic!("expected nested Element");
            }
        } else {
            panic!("expected Element");
        }
    }

    #[test]
    fn offset_with_multiple_top_level_elements() {
        let fence_offset = 200;
        let input = "<A />\n<B />";
        let nodes = parse_with_offset(input, fence_offset).unwrap();
        assert_eq!(nodes.len(), 2);
        if let XmlNode::Element { offset, .. } = &nodes[0] {
            // `<A />` starts at content byte 0.
            assert_eq!(*offset, 200);
        } else {
            panic!("expected Element");
        }
        if let XmlNode::Element { offset, .. } = &nodes[1] {
            // `<B />` starts at content byte 6: `<A />` (5 bytes) plus the newline.
            assert_eq!(*offset, 206);
        } else {
            panic!("expected Element");
        }
    }

    // --- Malformed structure ---------------------------------------------

    #[test]
    fn unclosed_element() {
        let err = parse("<Spec>content").unwrap_err();
        let msg = err.to_string();
        assert!(msg.contains("closing tag"), "got: {msg}");
        assert!(msg.contains("Spec"), "got: {msg}");
    }

    #[test]
    fn mismatched_closing_tag() {
        let err = parse("<A>text</B>").unwrap_err();
        let msg = err.to_string();
        assert!(msg.contains("mismatched"), "got: {msg}");
        assert!(msg.contains("A"), "got: {msg}");
        assert!(msg.contains("B"), "got: {msg}");
    }

    // --- Attribute quoting -----------------------------------------------

    #[test]
    fn single_quoted_attribute_value() {
        let err = parse("<Spec id='s1' />").unwrap_err();
        let msg = err.to_string();
        assert!(msg.contains("double-quoted"), "got: {msg}");
    }

    #[test]
    fn single_quotes_inside_double_quoted_attribute_value() {
        let result = parse(r#"<Spec val="a='b'" />"#);
        assert!(result.is_ok(), "got: {}", result.unwrap_err());
    }

    #[test]
    fn missing_attribute_value() {
        let err = parse("<Spec id />").unwrap_err();
        let msg = err.to_string();
        assert!(msg.contains("="), "got: {msg}");
    }

    // --- Unsupported constructs ------------------------------------------

    #[test]
    fn processing_instruction_rejected() {
        let err = parse("<?xml version=\"1.0\"?>").unwrap_err();
        let msg = err.to_string();
        assert!(msg.contains("processing instruction"), "got: {msg}");
    }

    #[test]
    fn cdata_rejected() {
        let err = parse("<![CDATA[foo]]>").unwrap_err();
        let msg = err.to_string();
        assert!(msg.contains("CDATA"), "got: {msg}");
    }

    #[test]
    fn doctype_rejected() {
        let err = parse("<!DOCTYPE html>").unwrap_err();
        let msg = err.to_string();
        assert!(msg.contains("DTD") || msg.contains("DOCTYPE"), "got: {msg}");
    }

    #[test]
    fn comment_ignored() {
        let nodes = parse("<!-- comment -->").unwrap();
        assert!(nodes.is_empty(), "comments should produce no nodes");
    }

    #[test]
    fn comment_between_elements_ignored() {
        let nodes = parse(r#"<Task id="t-1" status="draft">text</Task><!-- skip --><Task id="t-2" status="draft">more</Task>"#).unwrap();
        assert_eq!(
            nodes.len(),
            2,
            "should parse both elements, ignoring comment"
        );
    }

    #[test]
    fn comment_inside_element_ignored() {
        let nodes = parse(r#"<AcceptanceCriteria><!-- placeholder --><Criterion id="c-1">desc</Criterion></AcceptanceCriteria>"#).unwrap();
        assert_eq!(nodes.len(), 1);
        if let XmlNode::Element { children, .. } = &nodes[0] {
            assert_eq!(children.len(), 1, "comment should not appear as child");
        } else {
            panic!("expected element");
        }
    }

    #[test]
    fn namespace_in_element_rejected() {
        let err = parse("<foo:Bar />").unwrap_err();
        assert!(
            err.to_string().contains("test.md"),
            "error should include path"
        );
    }

    #[test]
    fn unsupported_entity_rejected() {
        // `&apos;` is well-formed XML but not one of the supported entities.
        let err = parse("<Spec>&apos;</Spec>").unwrap_err();
        let msg = err.to_string();
        assert!(msg.contains("unsupported entity"), "got: {msg}");
    }

    #[test]
    fn unterminated_entity_rejected() {
        // Entity name without the terminating `;`.
        let err = parse("<Spec>&amp</Spec>").unwrap_err();
        let msg = err.to_string();
        assert!(msg.contains("unterminated entity"), "got: {msg}");
    }

    // --- Element name casing ---------------------------------------------

    #[test]
    fn lowercase_element_name_parsed_successfully() {
        let nodes = parse("<spec />").unwrap();
        assert_eq!(nodes.len(), 1);
        match &nodes[0] {
            XmlNode::Element { name, .. } => assert_eq!(name, "spec"),
            _ => panic!("expected Element"),
        }
    }

    #[test]
    fn lowercase_element_inside_pascal_case_element() {
        let nodes = parse(r#"<Criterion id="c1">Use <em>fast</em> path</Criterion>"#).unwrap();
        assert_eq!(nodes.len(), 1);
        match &nodes[0] {
            XmlNode::Element { name, children, .. } => {
                assert_eq!(name, "Criterion");
                assert_eq!(children.len(), 3);
                assert!(matches!(&children[0], XmlNode::Text { content, .. } if content == "Use "));
                match &children[1] {
                    XmlNode::Element { name, children, .. } => {
                        assert_eq!(name, "em");
                        assert_eq!(children.len(), 1);
                        assert!(
                            matches!(&children[0], XmlNode::Text { content, .. } if content == "fast")
                        );
                    }
                    _ => panic!("expected em Element"),
                }
                assert!(
                    matches!(&children[2], XmlNode::Text { content, .. } if content == " path")
                );
            }
            _ => panic!("expected Criterion Element"),
        }
    }

    // --- Error position and path -----------------------------------------

    #[test]
    fn error_includes_line_and_column() {
        // The namespaced element starts at the beginning of the second line.
        let err = parse("<A>\n<ns:B /></A>").unwrap_err();
        if let ParseError::XmlSyntaxError { line, column, .. } = &err {
            assert_eq!(*line, 2);
            assert_eq!(*column, 1);
        } else {
            panic!("expected XmlSyntaxError");
        }
    }

    #[test]
    fn error_includes_file_path() {
        let err = parse_supersigil_xml("<?xml?>", 0, Path::new("/foo/bar.md")).unwrap_err();
        let msg = err.to_string();
        assert!(msg.contains("/foo/bar.md"), "got: {msg}");
    }

    // --- Stray closing tag -----------------------------------------------

    #[test]
    fn closing_tag_at_top_level_rejected() {
        let err = parse("</Orphan>").unwrap_err();
        let msg = err.to_string();
        assert!(msg.contains("unexpected closing tag"), "got: {msg}");
    }

    // --- Realistic input -------------------------------------------------

    #[test]
    fn realistic_component_example() {
        let input = r#"<Criterion id="perf-latency" strategy="tag">
  P99 latency must be under 100ms for API requests.
</Criterion>
<VerifiedBy strategy="tag" tag="perf-latency" />"#;
        let nodes = parse(input).unwrap();
        assert_eq!(nodes.len(), 2);

        match &nodes[0] {
            XmlNode::Element {
                name,
                attributes,
                children,
                ..
            } => {
                assert_eq!(name, "Criterion");
                assert_eq!(attributes.len(), 2);
                assert_eq!(attributes[0], ("id".to_owned(), "perf-latency".to_owned()));
                assert_eq!(attributes[1], ("strategy".to_owned(), "tag".to_owned()));
                assert_eq!(children.len(), 1);
                if let XmlNode::Text { content, .. } = &children[0] {
                    assert!(content.contains("P99 latency"));
                } else {
                    panic!("expected Text child");
                }
            }
            _ => panic!("expected Element"),
        }

        match &nodes[1] {
            XmlNode::Element {
                name,
                attributes,
                children,
                ..
            } => {
                assert_eq!(name, "VerifiedBy");
                assert_eq!(attributes.len(), 2);
                assert!(children.is_empty());
            }
            _ => panic!("expected Element"),
        }
    }

    // --- UTF-8 -----------------------------------------------------------

    #[test]
    fn utf8_text_content_preserved() {
        let input = "<Note>cafe\u{0301} \u{1F600}</Note>";
        let nodes = parse(input).unwrap();
        if let XmlNode::Element { children, .. } = &nodes[0] {
            if let XmlNode::Text { content: t, .. } = &children[0] {
                assert!(t.contains("cafe\u{0301}"));
                assert!(t.contains('\u{1F600}'));
            } else {
                panic!("expected Text");
            }
        } else {
            panic!("expected Element");
        }
    }

    // --- Text node offsets -----------------------------------------------

    #[test]
    fn text_node_has_correct_offset() {
        let input = "<Title>Hello</Title>";
        let nodes = parse(input).unwrap();
        if let XmlNode::Element { children, .. } = &nodes[0] {
            if let XmlNode::Text {
                content, offset, ..
            } = &children[0]
            {
                assert_eq!(content, "Hello");
                assert_eq!(*offset, 7, "text should start at byte 7 (after '<Title>')");
            } else {
                panic!("expected Text");
            }
        } else {
            panic!("expected Element");
        }
    }

    #[test]
    fn text_node_offset_with_fence_offset() {
        let fence_offset = 100;
        let input = "<Title>Hello</Title>";
        let nodes = parse_with_offset(input, fence_offset).unwrap();
        if let XmlNode::Element { children, .. } = &nodes[0] {
            if let XmlNode::Text {
                content, offset, ..
            } = &children[0]
            {
                assert_eq!(content, "Hello");
                assert_eq!(*offset, 107, "text should be fence_offset + 7");
            } else {
                panic!("expected Text");
            }
        } else {
            panic!("expected Element");
        }
    }

    #[test]
    fn text_node_end_offset_plain_text() {
        let input = "<Title>Hello</Title>";
        let nodes = parse(input).unwrap();
        if let XmlNode::Element { children, .. } = &nodes[0] {
            if let XmlNode::Text {
                content,
                offset,
                end_offset,
            } = &children[0]
            {
                assert_eq!(content, "Hello");
                assert_eq!(*offset, 7);
                assert_eq!(
                    *end_offset, 12,
                    "end_offset should be past the last byte of 'Hello'"
                );
                assert_eq!(
                    &input[*offset..*end_offset],
                    "Hello",
                    "offset..end_offset should span the raw text"
                );
            } else {
                panic!("expected Text");
            }
        } else {
            panic!("expected Element");
        }
    }

    #[test]
    fn text_node_end_offset_with_entities() {
        let input = "<T>a &lt; b</T>";
        let nodes = parse(input).unwrap();
        if let XmlNode::Element { children, .. } = &nodes[0] {
            if let XmlNode::Text {
                content,
                offset,
                end_offset,
            } = &children[0]
            {
                assert_eq!(content, "a < b", "content should be entity-decoded");
                assert_eq!(*offset, 3, "text starts after '<T>'");
                assert_eq!(
                    *end_offset, 11,
                    "end_offset should be past the last byte of 'a &lt; b' in raw source"
                );
                assert_eq!(
                    &input[*offset..*end_offset],
                    "a &lt; b",
                    "offset..end_offset should span the raw source text"
                );
                // The decoded content is shorter than the raw span it came from.
                assert!(content.len() < (*end_offset - *offset));
            } else {
                panic!("expected Text");
            }
        } else {
            panic!("expected Element");
        }
    }

    #[test]
    fn text_node_end_offset_with_fence_offset_and_entities() {
        let fence_offset = 50;
        let input = "<T>&amp;</T>";
        let nodes = parse_with_offset(input, fence_offset).unwrap();
        if let XmlNode::Element { children, .. } = &nodes[0] {
            if let XmlNode::Text {
                content,
                offset,
                end_offset,
            } = &children[0]
            {
                assert_eq!(content, "&", "decoded entity");
                assert_eq!(*offset, 53, "starts at fence_offset + 3 (after '<T>')");
                assert_eq!(
                    *end_offset, 58,
                    "end_offset = fence_offset + 3 + 5 (length of '&amp;')"
                );
            } else {
                panic!("expected Text");
            }
        } else {
            panic!("expected Element");
        }
    }

    #[test]
    fn text_node_end_offset_multiple_entities() {
        let input = "<T>&lt;&gt;</T>";
        let nodes = parse(input).unwrap();
        if let XmlNode::Element { children, .. } = &nodes[0] {
            if let XmlNode::Text {
                content,
                offset,
                end_offset,
            } = &children[0]
            {
                assert_eq!(content, "<>");
                assert_eq!(*offset, 3);
                assert_eq!(*end_offset, 11, "past '&lt;&gt;' in raw source");
                assert_eq!(&input[*offset..*end_offset], "&lt;&gt;");
            } else {
                panic!("expected Text");
            }
        } else {
            panic!("expected Element");
        }
    }

    // --- Element end offsets ---------------------------------------------

    #[test]
    fn self_closing_element_end_offset() {
        let input = r#"<Spec id="s1" />"#;
        let nodes = parse(input).unwrap();
        assert_eq!(nodes.len(), 1);
        match &nodes[0] {
            XmlNode::Element {
                name,
                offset,
                end_offset,
                ..
            } => {
                assert_eq!(name, "Spec");
                assert_eq!(*offset, 0);
                assert_eq!(*end_offset, input.len());
            }
            _ => panic!("expected Element"),
        }
    }

    #[test]
    fn regular_element_end_offset() {
        let input = "<Title>Hello</Title>";
        let nodes = parse(input).unwrap();
        assert_eq!(nodes.len(), 1);
        match &nodes[0] {
            XmlNode::Element {
                name,
                offset,
                end_offset,
                ..
            } => {
                assert_eq!(name, "Title");
                assert_eq!(*offset, 0);
                assert_eq!(*end_offset, input.len());
            }
            _ => panic!("expected Element"),
        }
    }

    #[test]
    fn nested_element_end_offsets() {
        let input = r#"<Parent><Child id="c1" /></Parent>"#;
        let nodes = parse(input).unwrap();
        match &nodes[0] {
            XmlNode::Element {
                end_offset,
                children,
                ..
            } => {
                assert_eq!(*end_offset, input.len());
                match &children[0] {
                    XmlNode::Element {
                        name,
                        offset,
                        end_offset,
                        ..
                    } => {
                        assert_eq!(name, "Child");
                        // `<Parent>` is 8 bytes; `<Child id="c1" />` is 17 bytes.
                        assert_eq!(*offset, 8);
                        assert_eq!(*end_offset, 25);
                    }
                    _ => panic!("expected Element"),
                }
            }
            _ => panic!("expected Element"),
        }
    }

    #[test]
    fn element_end_offset_with_fence_offset() {
        let input = r#"<Spec id="s1" />"#;
        let fence_offset = 100;
        let nodes = parse_with_offset(input, fence_offset).unwrap();
        match &nodes[0] {
            XmlNode::Element {
                offset, end_offset, ..
            } => {
                assert_eq!(*offset, 100);
                assert_eq!(*end_offset, 100 + input.len());
            }
            _ => panic!("expected Element"),
        }
    }
}