1use std::path::Path;
16
17use quick_xml::Reader;
18use quick_xml::events::Event;
19use supersigil_core::ParseError;
20
21use crate::util::line_col;
22
/// A node of the tree returned by [`parse_supersigil_xml`].
///
/// All offsets are byte positions: the position of the node inside the parsed
/// content plus the `fence_offset` supplied by the caller.
#[derive(Debug, Clone, PartialEq)]
pub enum XmlNode {
    /// An element, written either as `<Name ...>...</Name>` or as `<Name ... />`.
    Element {
        /// Tag name as written in the source.
        name: String,
        /// `(key, value)` pairs in source order; entity references in values are resolved.
        attributes: Vec<(String, String)>,
        /// Child nodes in document order; empty for self-closing elements.
        children: Vec<XmlNode>,
        /// Offset of the `<` that opens the element.
        offset: usize,
        /// Offset just past the `>` that ends the element.
        end_offset: usize,
    },
    /// A run of character data; adjacent text and entity references are merged into one node.
    Text {
        /// Text with entity references already replaced by the characters they stand for.
        content: String,
        /// Offset of the first raw source byte of the run.
        offset: usize,
        /// Offset just past the last raw source byte of the run (entities counted in raw form).
        end_offset: usize,
    },
}
56
/// Name of the synthetic element wrapped around the input so that several
/// top-level elements can be read as a single document.
const SYNTHETIC_ROOT: &str = "__root";
/// Opening tag of the synthetic wrapper; its byte length is subtracted from
/// reader positions to obtain offsets into the caller's content.
const SYNTHETIC_OPEN: &str = "<__root>";
/// Closing tag of the synthetic wrapper.
const SYNTHETIC_CLOSE: &str = "</__root>";
67
68pub fn parse_supersigil_xml(
87 content: &str,
88 fence_offset: usize,
89 path: &Path,
90) -> Result<Vec<XmlNode>, ParseError> {
91 reject_unsupported(content, path)?;
93
94 let wrapped = format!("{SYNTHETIC_OPEN}{content}{SYNTHETIC_CLOSE}");
97
98 let mut reader = Reader::from_str(&wrapped);
99 reader.config_mut().trim_text(false);
100
101 #[allow(clippy::cast_possible_truncation, reason = "SYNTHETIC_OPEN is 8 bytes")]
104 let root_tag_len: u64 = SYNTHETIC_OPEN.len() as u64;
105
106 loop {
108 match reader.read_event() {
109 Ok(Event::Start(ref e)) if e.name().as_ref() == SYNTHETIC_ROOT.as_bytes() => break,
110 Ok(Event::Eof) => break,
111 Err(e) => {
112 return Err(quick_xml_error_to_parse_error(&e, path));
113 }
114 _ => {}
115 }
116 }
117
118 let nodes = parse_children(
120 &mut reader,
121 SYNTHETIC_ROOT,
122 content,
123 fence_offset,
124 root_tag_len,
125 path,
126 )?;
127
128 Ok(nodes)
129}
130
131fn reject_unsupported(content: &str, path: &Path) -> Result<(), ParseError> {
140 let bytes = content.as_bytes();
141 let mut i = 0;
142 while i < bytes.len() {
143 if bytes[i] == b'<' {
144 if bytes[i..].starts_with(b"<?") {
145 return Err(make_error(
146 content,
147 i,
148 path,
149 "processing instructions (`<?...?>`) are not supported",
150 ));
151 }
152 if bytes[i..].starts_with(b"<![CDATA[") {
153 return Err(make_error(
154 content,
155 i,
156 path,
157 "CDATA sections (`<![CDATA[...]]>`) are not supported",
158 ));
159 }
160 if bytes[i..].starts_with(b"<!DOCTYPE") || bytes[i..].starts_with(b"<!doctype") {
161 return Err(make_error(
162 content,
163 i,
164 path,
165 "DTD declarations (`<!DOCTYPE ...>`) are not supported",
166 ));
167 }
168 if bytes[i..].starts_with(b"<!--") {
169 return Err(make_error(
170 content,
171 i,
172 path,
173 "XML comments (`<!-- ... -->`) are not supported",
174 ));
175 }
176 }
177 i += 1;
178 }
179 Ok(())
180}
181
182#[allow(
192 clippy::too_many_lines,
193 reason = "event-loop structure is clearest as a single function"
194)]
195fn parse_children(
196 reader: &mut Reader<&[u8]>,
197 parent_name: &str,
198 content: &str,
199 fence_offset: usize,
200 root_tag_len: u64,
201 path: &Path,
202) -> Result<Vec<XmlNode>, ParseError> {
203 fn flush_text(
205 text_buf: &mut String,
206 text_start: &mut Option<usize>,
207 text_end: &mut Option<usize>,
208 nodes: &mut Vec<XmlNode>,
209 is_top_level: bool,
210 ) {
211 if text_buf.is_empty() {
212 return;
213 }
214 if is_top_level && text_buf.trim().is_empty() {
216 text_buf.clear();
217 *text_start = None;
218 *text_end = None;
219 return;
220 }
221 let start = text_start.take().unwrap_or(0);
222 let end = text_end.take().unwrap_or(start);
223 nodes.push(XmlNode::Text {
224 content: std::mem::take(text_buf),
225 offset: start,
226 end_offset: end,
227 });
228 }
229
230 let mut nodes: Vec<XmlNode> = Vec::new();
231 let mut text_buf = String::new();
233 let mut text_start_offset: Option<usize> = None;
235 let mut text_end_offset: Option<usize> = None;
237 let is_top_level = parent_name == SYNTHETIC_ROOT;
238
239 loop {
240 let event_pos = reader.buffer_position();
241
242 match reader.read_event() {
243 Ok(Event::Start(ref e)) => {
244 flush_text(
245 &mut text_buf,
246 &mut text_start_offset,
247 &mut text_end_offset,
248 &mut nodes,
249 is_top_level,
250 );
251
252 let offset_in_content = content_offset(event_pos, root_tag_len);
253 let file_offset = fence_offset + offset_in_content;
254
255 let tag_name = decode_name(e.name().as_ref(), content, offset_in_content, path)?;
256 validate_element_name(&tag_name, content, offset_in_content, path)?;
257 let attributes = parse_attributes(e, content, offset_in_content, path)?;
258
259 let children =
260 parse_children(reader, &tag_name, content, fence_offset, root_tag_len, path)?;
261
262 let end_in_content = content_offset(reader.buffer_position(), root_tag_len);
264 let file_end_offset = fence_offset + end_in_content;
265
266 nodes.push(XmlNode::Element {
267 name: tag_name,
268 attributes,
269 children,
270 offset: file_offset,
271 end_offset: file_end_offset,
272 });
273 }
274
275 Ok(Event::Empty(ref e)) => {
276 flush_text(
277 &mut text_buf,
278 &mut text_start_offset,
279 &mut text_end_offset,
280 &mut nodes,
281 is_top_level,
282 );
283
284 let offset_in_content = content_offset(event_pos, root_tag_len);
285 let file_offset = fence_offset + offset_in_content;
286
287 let tag_name = decode_name(e.name().as_ref(), content, offset_in_content, path)?;
288 validate_element_name(&tag_name, content, offset_in_content, path)?;
289 let attributes = parse_attributes(e, content, offset_in_content, path)?;
290
291 let end_in_content = content_offset(reader.buffer_position(), root_tag_len);
293 let file_end_offset = fence_offset + end_in_content;
294
295 nodes.push(XmlNode::Element {
296 name: tag_name,
297 attributes,
298 children: vec![],
299 offset: file_offset,
300 end_offset: file_end_offset,
301 });
302 }
303
304 Ok(Event::Text(ref e)) => {
305 let raw = std::str::from_utf8(e.as_ref()).map_err(|_err| {
307 let off = content_offset(event_pos, root_tag_len);
308 make_error(content, off, path, "invalid UTF-8 in text content")
309 })?;
310 let off = content_offset(event_pos, root_tag_len);
311 if text_start_offset.is_none() {
312 text_start_offset = Some(fence_offset + off);
313 }
314 text_end_offset = Some(fence_offset + off + raw.len());
317 text_buf.push_str(raw);
318 }
319
320 Ok(Event::GeneralRef(ref e)) => {
321 let entity_name = std::str::from_utf8(e.as_ref()).map_err(|_err| {
323 let off = content_offset(event_pos, root_tag_len);
324 make_error(content, off, path, "invalid UTF-8 in entity reference")
325 })?;
326 let off = content_offset(event_pos, root_tag_len);
327 if text_start_offset.is_none() {
328 text_start_offset = Some(fence_offset + off);
329 }
330 text_end_offset = Some(fence_offset + off + entity_name.len() + 2);
333 let resolved = resolve_entity(entity_name, content, off, path)?;
334 text_buf.push_str(resolved);
335 }
336
337 Ok(Event::End(ref e)) => {
338 let name_bytes = e.name();
339 let end_name = std::str::from_utf8(name_bytes.as_ref()).unwrap_or("<invalid>");
340 if end_name == parent_name {
341 flush_text(
342 &mut text_buf,
343 &mut text_start_offset,
344 &mut text_end_offset,
345 &mut nodes,
346 is_top_level,
347 );
348 return Ok(nodes);
349 }
350 let offset_in_content = content_offset(event_pos, root_tag_len);
352 return Err(make_error(
353 content,
354 offset_in_content,
355 path,
356 &format!(
357 "mismatched closing tag: expected `</{parent_name}>`, found `</{end_name}>`"
358 ),
359 ));
360 }
361
362 Ok(Event::Eof) => {
363 flush_text(
364 &mut text_buf,
365 &mut text_start_offset,
366 &mut text_end_offset,
367 &mut nodes,
368 is_top_level,
369 );
370 if is_top_level {
371 return Ok(nodes);
372 }
373 return Err(make_error(
374 content,
375 content.len(),
376 path,
377 &format!("expected closing tag `</{parent_name}>`, found end of input"),
378 ));
379 }
380
381 Ok(Event::Comment(_)) => {
383 let off = content_offset(event_pos, root_tag_len);
384 return Err(make_error(
385 content,
386 off,
387 path,
388 "XML comments (`<!-- ... -->`) are not supported",
389 ));
390 }
391 Ok(Event::CData(_)) => {
392 let off = content_offset(event_pos, root_tag_len);
393 return Err(make_error(
394 content,
395 off,
396 path,
397 "CDATA sections (`<![CDATA[...]]>`) are not supported",
398 ));
399 }
400 Ok(Event::PI(_) | Event::Decl(_)) => {
401 let off = content_offset(event_pos, root_tag_len);
402 return Err(make_error(
403 content,
404 off,
405 path,
406 "processing instructions (`<?...?>`) are not supported",
407 ));
408 }
409 Ok(Event::DocType(_)) => {
410 let off = content_offset(event_pos, root_tag_len);
411 return Err(make_error(
412 content,
413 off,
414 path,
415 "DTD declarations (`<!DOCTYPE ...>`) are not supported",
416 ));
417 }
418
419 Err(e) => {
420 return Err(quick_xml_error_to_parse_error(&e, path));
421 }
422 }
423 }
424}
425
426fn parse_attributes(
432 event: &quick_xml::events::BytesStart<'_>,
433 content: &str,
434 offset_in_content: usize,
435 path: &Path,
436) -> Result<Vec<(String, String)>, ParseError> {
437 validate_attribute_quotes(event, content, offset_in_content, path)?;
440
441 let mut attrs = Vec::new();
442 for attr_result in event.attributes() {
443 let attr = attr_result.map_err(|e| {
444 let msg = format!("{e}");
445 make_error(content, offset_in_content, path, &msg)
446 })?;
447
448 let key = decode_name(attr.key.as_ref(), content, offset_in_content, path)?;
449
450 if key.contains(':') {
452 return Err(make_error(
453 content,
454 offset_in_content,
455 path,
456 &format!("namespaced attribute `{key}` is not supported"),
457 ));
458 }
459
460 let raw_value = std::str::from_utf8(attr.value.as_ref()).map_err(|_err| {
462 make_error(
463 content,
464 offset_in_content,
465 path,
466 "invalid UTF-8 in attribute value",
467 )
468 })?;
469 let value = resolve_entities_in_str(raw_value, content, offset_in_content, path)?;
470
471 attrs.push((key, value));
472 }
473 Ok(attrs)
474}
475
476fn validate_attribute_quotes(
482 event: &quick_xml::events::BytesStart<'_>,
483 content: &str,
484 offset_in_content: usize,
485 path: &Path,
486) -> Result<(), ParseError> {
487 let raw: &[u8] = event.as_ref(); let mut i = 0;
489 let mut in_double_quote = false;
490 while i < raw.len() {
491 if in_double_quote {
492 if raw[i] == b'"' {
493 in_double_quote = false;
494 }
495 i += 1;
496 continue;
497 }
498 if raw[i] == b'"' {
499 in_double_quote = true;
500 i += 1;
501 continue;
502 }
503 if raw[i] == b'=' {
504 i += 1;
506 while i < raw.len() && raw[i].is_ascii_whitespace() {
507 i += 1;
508 }
509 if i < raw.len() && raw[i] == b'\'' {
510 return Err(make_error(
511 content,
512 offset_in_content,
513 path,
514 "attribute values must be double-quoted",
515 ));
516 }
517 continue;
520 }
521 i += 1;
522 }
523 Ok(())
524}
525
526fn resolve_entity(
532 name: &str,
533 content: &str,
534 offset_in_content: usize,
535 path: &Path,
536) -> Result<&'static str, ParseError> {
537 match name {
538 "amp" => Ok("&"),
539 "lt" => Ok("<"),
540 "gt" => Ok(">"),
541 "quot" => Ok("\""),
542 _ => Err(make_error(
543 content,
544 offset_in_content,
545 path,
546 &format!("unsupported entity reference `&{name};`"),
547 )),
548 }
549}
550
551fn resolve_entities_in_str(
554 text: &str,
555 content: &str,
556 offset_in_content: usize,
557 path: &Path,
558) -> Result<String, ParseError> {
559 if !text.contains('&') {
560 return Ok(text.to_owned());
561 }
562
563 let mut result = String::with_capacity(text.len());
564 let mut rest = text;
565
566 while let Some(amp_pos) = rest.find('&') {
567 result.push_str(&rest[..amp_pos]);
568 rest = &rest[amp_pos + 1..];
569 if let Some(semi_pos) = rest.find(';') {
570 let entity_name = &rest[..semi_pos];
571 let resolved = resolve_entity(entity_name, content, offset_in_content, path)?;
572 result.push_str(resolved);
573 rest = &rest[semi_pos + 1..];
574 } else {
575 return Err(make_error(
576 content,
577 offset_in_content,
578 path,
579 "unterminated entity reference (missing `;`)",
580 ));
581 }
582 }
583 result.push_str(rest);
584 Ok(result)
585}
586
587fn decode_name(
594 raw: &[u8],
595 content: &str,
596 offset_in_content: usize,
597 path: &Path,
598) -> Result<String, ParseError> {
599 std::str::from_utf8(raw)
600 .map(str::to_owned)
601 .map_err(|_err| make_error(content, offset_in_content, path, "invalid UTF-8 in name"))
602}
603
604fn validate_element_name(
611 name: &str,
612 content: &str,
613 offset_in_content: usize,
614 path: &Path,
615) -> Result<(), ParseError> {
616 if name.contains(':') {
617 return Err(make_error(
618 content,
619 offset_in_content,
620 path,
621 &format!("namespaced element `{name}` is not supported"),
622 ));
623 }
624 Ok(())
625}
626
/// Converts a reader position in the wrapped document into a byte offset in
/// the caller's content by removing the length of the synthetic opening tag.
///
/// A position that lies before the content yields `0` instead of underflowing,
/// and a value that does not fit in `usize` saturates to `usize::MAX` instead
/// of being silently truncated.
fn content_offset(event_pos: u64, root_tag_len: u64) -> usize {
    usize::try_from(event_pos.saturating_sub(root_tag_len)).unwrap_or(usize::MAX)
}
643
644fn make_error(content: &str, offset_in_content: usize, path: &Path, message: &str) -> ParseError {
646 let (line, column) = line_col(content, offset_in_content);
647 ParseError::XmlSyntaxError {
648 path: path.to_path_buf(),
649 line,
650 column,
651 message: message.to_owned(),
652 }
653}
654
655fn quick_xml_error_to_parse_error(err: &quick_xml::Error, path: &Path) -> ParseError {
661 let message = match err {
662 quick_xml::Error::IllFormed(ill) => match ill {
663 quick_xml::errors::IllFormedError::MismatchedEndTag { expected, found } => {
664 if expected == SYNTHETIC_ROOT {
665 format!("unexpected closing tag `</{found}>` at top level")
667 } else {
668 format!("mismatched closing tag: expected `</{expected}>`, found `</{found}>`")
669 }
670 }
671 quick_xml::errors::IllFormedError::UnmatchedEndTag(name) => {
672 format!("unexpected closing tag `</{name}>` at top level")
673 }
674 quick_xml::errors::IllFormedError::MissingEndTag(name) => {
675 format!("expected closing tag `</{name}>`, found end of input")
676 }
677 quick_xml::errors::IllFormedError::UnclosedReference => {
678 "unterminated entity reference (missing `;`)".to_owned()
679 }
680 other => format!("{other}"),
681 },
682 other => format!("{other}"),
683 };
684
685 ParseError::XmlSyntaxError {
686 path: path.to_path_buf(),
687 line: 1,
688 column: 1,
689 message,
690 }
691}
692
693#[cfg(test)]
698#[allow(
699 clippy::match_wildcard_for_single_variants,
700 clippy::single_char_pattern,
701 reason = "test assertions are clearer with wildcards and string patterns"
702)]
703mod tests {
704 use super::*;
705
706 fn parse(content: &str) -> Result<Vec<XmlNode>, ParseError> {
707 parse_supersigil_xml(content, 0, Path::new("test.md"))
708 }
709
710 fn parse_with_offset(content: &str, offset: usize) -> Result<Vec<XmlNode>, ParseError> {
711 parse_supersigil_xml(content, offset, Path::new("test.md"))
712 }
713
714 #[test]
717 fn empty_input() {
718 let nodes = parse("").unwrap();
719 assert!(nodes.is_empty());
720 }
721
722 #[test]
723 fn whitespace_only_input() {
724 let nodes = parse(" \n \n ").unwrap();
725 assert!(nodes.is_empty());
726 }
727
728 #[test]
729 fn single_self_closing_element() {
730 let nodes = parse(r#"<Spec id="s1" />"#).unwrap();
731 assert_eq!(nodes.len(), 1);
732 match &nodes[0] {
733 XmlNode::Element {
734 name,
735 attributes,
736 children,
737 offset,
738 ..
739 } => {
740 assert_eq!(name, "Spec");
741 assert_eq!(attributes, &[("id".to_owned(), "s1".to_owned())]);
742 assert!(children.is_empty());
743 assert_eq!(*offset, 0);
744 }
745 _ => panic!("expected Element"),
746 }
747 }
748
749 #[test]
750 fn self_closing_no_space_before_slash() {
751 let nodes = parse(r#"<Spec id="s1"/>"#).unwrap();
752 assert_eq!(nodes.len(), 1);
753 if let XmlNode::Element { name, .. } = &nodes[0] {
754 assert_eq!(name, "Spec");
755 } else {
756 panic!("expected Element");
757 }
758 }
759
760 #[test]
761 fn element_with_text_content() {
762 let nodes = parse("<Title>Hello World</Title>").unwrap();
763 assert_eq!(nodes.len(), 1);
764 match &nodes[0] {
765 XmlNode::Element { name, children, .. } => {
766 assert_eq!(name, "Title");
767 assert_eq!(children.len(), 1);
768 assert!(
769 matches!(&children[0], XmlNode::Text { content, .. } if content == "Hello World")
770 );
771 }
772 _ => panic!("expected Element"),
773 }
774 }
775
776 #[test]
777 fn element_with_no_attributes() {
778 let nodes = parse("<Container></Container>").unwrap();
779 assert_eq!(nodes.len(), 1);
780 match &nodes[0] {
781 XmlNode::Element {
782 name, attributes, ..
783 } => {
784 assert_eq!(name, "Container");
785 assert!(attributes.is_empty());
786 }
787 _ => panic!("expected Element"),
788 }
789 }
790
791 #[test]
794 fn nested_elements() {
795 let input = r#"<Parent id="p1"><Child id="c1" /></Parent>"#;
796 let nodes = parse(input).unwrap();
797 assert_eq!(nodes.len(), 1);
798 match &nodes[0] {
799 XmlNode::Element { name, children, .. } => {
800 assert_eq!(name, "Parent");
801 assert_eq!(children.len(), 1);
802 match &children[0] {
803 XmlNode::Element {
804 name, attributes, ..
805 } => {
806 assert_eq!(name, "Child");
807 assert_eq!(attributes, &[("id".to_owned(), "c1".to_owned())]);
808 }
809 _ => panic!("expected nested Element"),
810 }
811 }
812 _ => panic!("expected Element"),
813 }
814 }
815
816 #[test]
817 fn deeply_nested_elements() {
818 let input = "<A><B><C>deep</C></B></A>";
819 let nodes = parse(input).unwrap();
820 assert_eq!(nodes.len(), 1);
821 let a = &nodes[0];
823 if let XmlNode::Element { children, .. } = a {
824 let b = &children[0];
825 if let XmlNode::Element { children, .. } = b {
826 let c = &children[0];
827 if let XmlNode::Element { children, .. } = c {
828 assert!(
829 matches!(&children[0], XmlNode::Text { content, .. } if content == "deep")
830 );
831 } else {
832 panic!("expected C element");
833 }
834 } else {
835 panic!("expected B element");
836 }
837 } else {
838 panic!("expected A element");
839 }
840 }
841
842 #[test]
843 fn mixed_children_text_and_elements() {
844 let input = "<Parent>before<Child />after</Parent>";
845 let nodes = parse(input).unwrap();
846 assert_eq!(nodes.len(), 1);
847 if let XmlNode::Element { children, .. } = &nodes[0] {
848 assert_eq!(children.len(), 3);
849 assert!(matches!(&children[0], XmlNode::Text { content, .. } if content == "before"));
850 assert!(matches!(&children[1], XmlNode::Element { name, .. } if name == "Child"));
851 assert!(matches!(&children[2], XmlNode::Text { content, .. } if content == "after"));
852 } else {
853 panic!("expected Element");
854 }
855 }
856
857 #[test]
860 fn multiple_top_level_elements() {
861 let input = r#"<A id="1" />
862<B id="2" />"#;
863 let nodes = parse(input).unwrap();
864 assert_eq!(nodes.len(), 2);
865 if let XmlNode::Element { name, .. } = &nodes[0] {
866 assert_eq!(name, "A");
867 }
868 if let XmlNode::Element { name, .. } = &nodes[1] {
869 assert_eq!(name, "B");
870 }
871 }
872
873 #[test]
876 fn multiple_attributes() {
877 let input = r#"<Criterion id="c1" strategy="tag" />"#;
878 let nodes = parse(input).unwrap();
879 if let XmlNode::Element { attributes, .. } = &nodes[0] {
880 assert_eq!(
881 attributes,
882 &[
883 ("id".to_owned(), "c1".to_owned()),
884 ("strategy".to_owned(), "tag".to_owned()),
885 ]
886 );
887 } else {
888 panic!("expected Element");
889 }
890 }
891
892 #[test]
893 fn attribute_with_entity_in_value() {
894 let input = r#"<Spec desc="a & b" />"#;
895 let nodes = parse(input).unwrap();
896 if let XmlNode::Element { attributes, .. } = &nodes[0] {
897 assert_eq!(attributes[0].1, "a & b");
898 } else {
899 panic!("expected Element");
900 }
901 }
902
903 #[test]
904 fn all_supported_entities_in_attribute() {
905 let input = r#"<Spec val="&<>"" />"#;
906 let nodes = parse(input).unwrap();
907 if let XmlNode::Element { attributes, .. } = &nodes[0] {
908 assert_eq!(attributes[0].1, "&<>\"");
909 } else {
910 panic!("expected Element");
911 }
912 }
913
914 #[test]
917 fn entity_references_in_text() {
918 let input = "<Note>a < b & c > d "e"</Note>";
919 let nodes = parse(input).unwrap();
920 if let XmlNode::Element { children, .. } = &nodes[0] {
921 assert!(
922 matches!(&children[0], XmlNode::Text { content, .. } if content == r#"a < b & c > d "e""#)
923 );
924 } else {
925 panic!("expected Element");
926 }
927 }
928
929 #[test]
932 fn offset_applied_to_elements() {
933 let fence_offset = 100;
934 let input = r#"<Spec id="s1" />"#;
935 let nodes = parse_with_offset(input, fence_offset).unwrap();
936 if let XmlNode::Element { offset, .. } = &nodes[0] {
937 assert_eq!(*offset, 100);
938 } else {
939 panic!("expected Element");
940 }
941 }
942
943 #[test]
944 fn offset_applied_to_nested_element() {
945 let fence_offset = 50;
946 let input = "<A><B /></A>";
948 let nodes = parse_with_offset(input, fence_offset).unwrap();
949 if let XmlNode::Element {
950 offset, children, ..
951 } = &nodes[0]
952 {
953 assert_eq!(*offset, 50); if let XmlNode::Element { offset, .. } = &children[0] {
955 assert_eq!(*offset, 53); } else {
957 panic!("expected nested Element");
958 }
959 } else {
960 panic!("expected Element");
961 }
962 }
963
964 #[test]
965 fn offset_with_multiple_top_level_elements() {
966 let fence_offset = 200;
967 let input = "<A />\n<B />";
968 let nodes = parse_with_offset(input, fence_offset).unwrap();
969 assert_eq!(nodes.len(), 2);
970 if let XmlNode::Element { offset, .. } = &nodes[0] {
971 assert_eq!(*offset, 200); }
973 if let XmlNode::Element { offset, .. } = &nodes[1] {
974 assert_eq!(*offset, 206); }
976 }
977
978 #[test]
981 fn unclosed_element() {
982 let err = parse("<Spec>content").unwrap_err();
983 let msg = err.to_string();
984 assert!(msg.contains("closing tag"), "got: {msg}");
985 assert!(msg.contains("Spec"), "got: {msg}");
986 }
987
988 #[test]
989 fn mismatched_closing_tag() {
990 let err = parse("<A>text</B>").unwrap_err();
991 let msg = err.to_string();
992 assert!(msg.contains("mismatched"), "got: {msg}");
993 assert!(msg.contains("A"), "got: {msg}");
994 assert!(msg.contains("B"), "got: {msg}");
995 }
996
997 #[test]
1000 fn single_quoted_attribute_value() {
1001 let err = parse("<Spec id='s1' />").unwrap_err();
1002 let msg = err.to_string();
1003 assert!(msg.contains("double-quoted"), "got: {msg}");
1004 }
1005
1006 #[test]
1007 fn single_quotes_inside_double_quoted_attribute_value() {
1008 let result = parse(r#"<Spec val="a='b'" />"#);
1010 assert!(result.is_ok(), "got: {}", result.unwrap_err());
1011 }
1012
1013 #[test]
1014 fn missing_attribute_value() {
1015 let err = parse("<Spec id />").unwrap_err();
1016 let msg = err.to_string();
1017 assert!(msg.contains("="), "got: {msg}");
1018 }
1019
1020 #[test]
1023 fn processing_instruction_rejected() {
1024 let err = parse("<?xml version=\"1.0\"?>").unwrap_err();
1025 let msg = err.to_string();
1026 assert!(msg.contains("processing instruction"), "got: {msg}");
1027 }
1028
1029 #[test]
1030 fn cdata_rejected() {
1031 let err = parse("<![CDATA[foo]]>").unwrap_err();
1032 let msg = err.to_string();
1033 assert!(msg.contains("CDATA"), "got: {msg}");
1034 }
1035
1036 #[test]
1037 fn doctype_rejected() {
1038 let err = parse("<!DOCTYPE html>").unwrap_err();
1039 let msg = err.to_string();
1040 assert!(msg.contains("DTD") || msg.contains("DOCTYPE"), "got: {msg}");
1041 }
1042
1043 #[test]
1044 fn comment_rejected() {
1045 let err = parse("<!-- comment -->").unwrap_err();
1046 let msg = err.to_string();
1047 assert!(msg.contains("comment"), "got: {msg}");
1048 }
1049
1050 #[test]
1051 fn namespace_in_element_rejected() {
1052 let err = parse("<foo:Bar />").unwrap_err();
1056 assert!(
1057 err.to_string().contains("test.md"),
1058 "error should include path"
1059 );
1060 }
1061
1062 #[test]
1063 fn unsupported_entity_rejected() {
1064 let err = parse("<Spec>'</Spec>").unwrap_err();
1065 let msg = err.to_string();
1066 assert!(msg.contains("unsupported entity"), "got: {msg}");
1067 }
1068
1069 #[test]
1070 fn unterminated_entity_rejected() {
1071 let err = parse("<Spec>&</Spec>").unwrap_err();
1072 let msg = err.to_string();
1073 assert!(msg.contains("unterminated entity"), "got: {msg}");
1074 }
1075
1076 #[test]
1079 fn lowercase_element_name_parsed_successfully() {
1080 let nodes = parse("<spec />").unwrap();
1081 assert_eq!(nodes.len(), 1);
1082 match &nodes[0] {
1083 XmlNode::Element { name, .. } => assert_eq!(name, "spec"),
1084 _ => panic!("expected Element"),
1085 }
1086 }
1087
1088 #[test]
1089 fn lowercase_element_inside_pascal_case_element() {
1090 let nodes = parse(r#"<Criterion id="c1">Use <em>fast</em> path</Criterion>"#).unwrap();
1091 assert_eq!(nodes.len(), 1);
1092 match &nodes[0] {
1093 XmlNode::Element { name, children, .. } => {
1094 assert_eq!(name, "Criterion");
1095 assert_eq!(children.len(), 3);
1097 assert!(matches!(&children[0], XmlNode::Text { content, .. } if content == "Use "));
1098 match &children[1] {
1099 XmlNode::Element { name, children, .. } => {
1100 assert_eq!(name, "em");
1101 assert_eq!(children.len(), 1);
1102 assert!(
1103 matches!(&children[0], XmlNode::Text { content, .. } if content == "fast")
1104 );
1105 }
1106 _ => panic!("expected em Element"),
1107 }
1108 assert!(
1109 matches!(&children[2], XmlNode::Text { content, .. } if content == " path")
1110 );
1111 }
1112 _ => panic!("expected Criterion Element"),
1113 }
1114 }
1115
1116 #[test]
1119 fn error_includes_line_and_column() {
1120 let err = parse("<A>\n<ns:B /></A>").unwrap_err();
1123 if let ParseError::XmlSyntaxError { line, column, .. } = &err {
1124 assert_eq!(*line, 2);
1125 assert_eq!(*column, 1);
1126 } else {
1127 panic!("expected XmlSyntaxError");
1128 }
1129 }
1130
1131 #[test]
1132 fn error_includes_file_path() {
1133 let err = parse_supersigil_xml("<?xml?>", 0, Path::new("/foo/bar.md")).unwrap_err();
1134 let msg = err.to_string();
1135 assert!(msg.contains("/foo/bar.md"), "got: {msg}");
1136 }
1137
1138 #[test]
1141 fn closing_tag_at_top_level_rejected() {
1142 let err = parse("</Orphan>").unwrap_err();
1143 let msg = err.to_string();
1144 assert!(msg.contains("unexpected closing tag"), "got: {msg}");
1145 }
1146
1147 #[test]
1150 fn realistic_component_example() {
1151 let input = r#"<Criterion id="perf-latency" strategy="tag">
1152 P99 latency must be under 100ms for API requests.
1153</Criterion>
1154<VerifiedBy strategy="tag" tag="perf-latency" />"#;
1155 let nodes = parse(input).unwrap();
1156 assert_eq!(nodes.len(), 2);
1157
1158 match &nodes[0] {
1160 XmlNode::Element {
1161 name,
1162 attributes,
1163 children,
1164 ..
1165 } => {
1166 assert_eq!(name, "Criterion");
1167 assert_eq!(attributes.len(), 2);
1168 assert_eq!(attributes[0], ("id".to_owned(), "perf-latency".to_owned()));
1169 assert_eq!(attributes[1], ("strategy".to_owned(), "tag".to_owned()));
1170 assert_eq!(children.len(), 1);
1171 if let XmlNode::Text { content, .. } = &children[0] {
1172 assert!(content.contains("P99 latency"));
1173 } else {
1174 panic!("expected Text child");
1175 }
1176 }
1177 _ => panic!("expected Element"),
1178 }
1179
1180 match &nodes[1] {
1182 XmlNode::Element {
1183 name,
1184 attributes,
1185 children,
1186 ..
1187 } => {
1188 assert_eq!(name, "VerifiedBy");
1189 assert_eq!(attributes.len(), 2);
1190 assert!(children.is_empty());
1191 }
1192 _ => panic!("expected Element"),
1193 }
1194 }
1195
1196 #[test]
1199 fn utf8_text_content_preserved() {
1200 let input = "<Note>cafe\u{0301} \u{1F600}</Note>";
1201 let nodes = parse(input).unwrap();
1202 if let XmlNode::Element { children, .. } = &nodes[0] {
1203 if let XmlNode::Text { content: t, .. } = &children[0] {
1204 assert!(t.contains("cafe\u{0301}"));
1205 assert!(t.contains('\u{1F600}'));
1206 } else {
1207 panic!("expected Text");
1208 }
1209 } else {
1210 panic!("expected Element");
1211 }
1212 }
1213
1214 #[test]
1215 fn text_node_has_correct_offset() {
1216 let input = "<Title>Hello</Title>";
1218 let nodes = parse(input).unwrap();
1219 if let XmlNode::Element { children, .. } = &nodes[0] {
1220 if let XmlNode::Text {
1221 content, offset, ..
1222 } = &children[0]
1223 {
1224 assert_eq!(content, "Hello");
1225 assert_eq!(*offset, 7, "text should start at byte 7 (after '<Title>')");
1226 } else {
1227 panic!("expected Text");
1228 }
1229 } else {
1230 panic!("expected Element");
1231 }
1232 }
1233
1234 #[test]
1235 fn text_node_offset_with_fence_offset() {
1236 let fence_offset = 100;
1237 let input = "<Title>Hello</Title>";
1238 let nodes = parse_with_offset(input, fence_offset).unwrap();
1239 if let XmlNode::Element { children, .. } = &nodes[0] {
1240 if let XmlNode::Text {
1241 content, offset, ..
1242 } = &children[0]
1243 {
1244 assert_eq!(content, "Hello");
1245 assert_eq!(*offset, 107, "text should be fence_offset + 7");
1246 } else {
1247 panic!("expected Text");
1248 }
1249 } else {
1250 panic!("expected Element");
1251 }
1252 }
1253
1254 #[test]
1255 fn text_node_end_offset_plain_text() {
1256 let input = "<Title>Hello</Title>";
1258 let nodes = parse(input).unwrap();
1259 if let XmlNode::Element { children, .. } = &nodes[0] {
1260 if let XmlNode::Text {
1261 content,
1262 offset,
1263 end_offset,
1264 } = &children[0]
1265 {
1266 assert_eq!(content, "Hello");
1267 assert_eq!(*offset, 7);
1268 assert_eq!(
1269 *end_offset, 12,
1270 "end_offset should be past the last byte of 'Hello'"
1271 );
1272 assert_eq!(
1273 &input[*offset..*end_offset],
1274 "Hello",
1275 "offset..end_offset should span the raw text"
1276 );
1277 } else {
1278 panic!("expected Text");
1279 }
1280 } else {
1281 panic!("expected Element");
1282 }
1283 }
1284
1285 #[test]
1286 fn text_node_end_offset_with_entities() {
1287 let input = "<T>a < b</T>";
1290 let nodes = parse(input).unwrap();
1291 if let XmlNode::Element { children, .. } = &nodes[0] {
1292 if let XmlNode::Text {
1293 content,
1294 offset,
1295 end_offset,
1296 } = &children[0]
1297 {
1298 assert_eq!(content, "a < b", "content should be entity-decoded");
1299 assert_eq!(*offset, 3, "text starts after '<T>'");
1300 assert_eq!(
1301 *end_offset, 11,
1302 "end_offset should be past the last byte of 'a < b' in raw source"
1303 );
1304 assert_eq!(
1305 &input[*offset..*end_offset],
1306 "a < b",
1307 "offset..end_offset should span the raw source text"
1308 );
1309 assert!(content.len() < (*end_offset - *offset));
1311 } else {
1312 panic!("expected Text");
1313 }
1314 } else {
1315 panic!("expected Element");
1316 }
1317 }
1318
1319 #[test]
1320 fn text_node_end_offset_with_fence_offset_and_entities() {
1321 let fence_offset = 50;
1322 let input = "<T>&</T>";
1323 let nodes = parse_with_offset(input, fence_offset).unwrap();
1324 if let XmlNode::Element { children, .. } = &nodes[0] {
1325 if let XmlNode::Text {
1326 content,
1327 offset,
1328 end_offset,
1329 } = &children[0]
1330 {
1331 assert_eq!(content, "&", "decoded entity");
1332 assert_eq!(*offset, 53, "starts at fence_offset + 3 (after '<T>')");
1333 assert_eq!(
1335 *end_offset, 58,
1336 "end_offset = fence_offset + 3 + 5 (length of '&')"
1337 );
1338 } else {
1339 panic!("expected Text");
1340 }
1341 } else {
1342 panic!("expected Element");
1343 }
1344 }
1345
1346 #[test]
1347 fn text_node_end_offset_multiple_entities() {
1348 let input = "<T><></T>";
1350 let nodes = parse(input).unwrap();
1351 if let XmlNode::Element { children, .. } = &nodes[0] {
1352 if let XmlNode::Text {
1353 content,
1354 offset,
1355 end_offset,
1356 } = &children[0]
1357 {
1358 assert_eq!(content, "<>");
1359 assert_eq!(*offset, 3);
1360 assert_eq!(*end_offset, 11, "past '<>' in raw source");
1361 assert_eq!(&input[*offset..*end_offset], "<>");
1362 } else {
1363 panic!("expected Text");
1364 }
1365 } else {
1366 panic!("expected Element");
1367 }
1368 }
1369
1370 #[test]
1373 fn self_closing_element_end_offset() {
1374 let input = r#"<Spec id="s1" />"#;
1375 let nodes = parse(input).unwrap();
1376 assert_eq!(nodes.len(), 1);
1377 match &nodes[0] {
1378 XmlNode::Element {
1379 name,
1380 offset,
1381 end_offset,
1382 ..
1383 } => {
1384 assert_eq!(name, "Spec");
1385 assert_eq!(*offset, 0);
1386 assert_eq!(*end_offset, input.len());
1387 }
1388 _ => panic!("expected Element"),
1389 }
1390 }
1391
1392 #[test]
1393 fn regular_element_end_offset() {
1394 let input = "<Title>Hello</Title>";
1395 let nodes = parse(input).unwrap();
1396 assert_eq!(nodes.len(), 1);
1397 match &nodes[0] {
1398 XmlNode::Element {
1399 name,
1400 offset,
1401 end_offset,
1402 ..
1403 } => {
1404 assert_eq!(name, "Title");
1405 assert_eq!(*offset, 0);
1406 assert_eq!(*end_offset, input.len());
1407 }
1408 _ => panic!("expected Element"),
1409 }
1410 }
1411
1412 #[test]
1413 fn nested_element_end_offsets() {
1414 let input = r#"<Parent><Child id="c1" /></Parent>"#;
1415 let nodes = parse(input).unwrap();
1416 match &nodes[0] {
1417 XmlNode::Element {
1418 end_offset,
1419 children,
1420 ..
1421 } => {
1422 assert_eq!(*end_offset, input.len());
1423 match &children[0] {
1424 XmlNode::Element {
1425 name,
1426 offset,
1427 end_offset,
1428 ..
1429 } => {
1430 assert_eq!(name, "Child");
1431 assert_eq!(*offset, 8);
1432 assert_eq!(*end_offset, 25);
1434 }
1435 _ => panic!("expected Element"),
1436 }
1437 }
1438 _ => panic!("expected Element"),
1439 }
1440 }
1441
1442 #[test]
1443 fn element_end_offset_with_fence_offset() {
1444 let input = r#"<Spec id="s1" />"#;
1445 let fence_offset = 100;
1446 let nodes = parse_with_offset(input, fence_offset).unwrap();
1447 match &nodes[0] {
1448 XmlNode::Element {
1449 offset, end_offset, ..
1450 } => {
1451 assert_eq!(*offset, 100);
1452 assert_eq!(*end_offset, 100 + input.len());
1453 }
1454 _ => panic!("expected Element"),
1455 }
1456 }
1457}