1use std::collections::{HashMap, HashSet};
22use std::fmt;
23
24use crate::error::{ParseError, SourceLocation};
25use crate::tree::{Document, NodeId, NodeKind};
26
27use super::{ValidationError, ValidationResult};
28
/// In-memory representation of a parsed DTD.
///
/// The maps provide lookup by name; `declarations` keeps the items the
/// parser recorded, in the order it encountered them, and is the list that
/// `serialize_dtd` walks.
#[derive(Debug, Clone, Default)]
pub struct Dtd {
    /// Element declarations, keyed by element name.
    pub elements: HashMap<String, ElementDecl>,
    /// Attribute declarations, keyed by the name of the owning element.
    pub attributes: HashMap<String, Vec<AttributeDecl>>,
    /// General entity declarations, keyed by entity name.
    pub entities: HashMap<String, EntityDecl>,
    /// Parameter entity declarations, keyed by entity name (without `%`).
    pub param_entities: HashMap<String, EntityDecl>,
    /// Notation declarations, keyed by notation name.
    pub notations: HashMap<String, NotationDecl>,
    /// Declarations, comments and processing instructions in recorded order.
    pub declarations: Vec<DtdDeclaration>,
}
51
/// One item of a DTD as stored in [`Dtd::declarations`].
#[derive(Debug, Clone)]
pub enum DtdDeclaration {
    /// An `<!ELEMENT ...>` declaration.
    Element(ElementDecl),
    /// A single attribute definition from an `<!ATTLIST ...>` declaration.
    Attlist(AttributeDecl),
    /// An `<!ENTITY ...>` declaration.
    Entity(EntityDecl),
    /// A `<!NOTATION ...>` declaration.
    Notation(NotationDecl),
    /// A comment; the string is the text between `<!--` and `-->`.
    Comment(String),
    /// A processing instruction: target and optional data.
    Pi(String, Option<String>),
}
69
/// An element declaration: `<!ELEMENT name content-model>`.
#[derive(Debug, Clone)]
pub struct ElementDecl {
    /// Name of the declared element.
    pub name: String,
    /// Content the element is allowed to have.
    pub content_model: ContentModel,
}
80
/// The content model of an element declaration.
#[derive(Debug, Clone, PartialEq)]
pub enum ContentModel {
    /// The `EMPTY` keyword.
    Empty,
    /// The `ANY` keyword.
    Any,
    /// Mixed content: `(#PCDATA)` when the list is empty, otherwise
    /// `#PCDATA` alternated with the listed element names.
    Mixed(Vec<String>),
    /// Element-only content described by a content specification.
    Children(ContentSpec),
}
102
/// A content particle: a name or group together with its occurrence
/// indicator.
#[derive(Debug, Clone, PartialEq)]
pub struct ContentSpec {
    /// What the particle is (name, sequence or choice).
    pub kind: ContentSpecKind,
    /// How often the particle may occur.
    pub occurrence: Occurrence,
}
116
/// The shape of a content particle.
#[derive(Debug, Clone, PartialEq)]
pub enum ContentSpecKind {
    /// A single element name.
    Name(String),
    /// A parenthesised sequence of particles (separated by `,`).
    Seq(Vec<ContentSpec>),
    /// A parenthesised choice between particles (separated by `|`).
    Choice(Vec<ContentSpec>),
}
127
/// Occurrence indicator attached to a content particle.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Occurrence {
    /// No indicator.
    Once,
    /// The `?` indicator.
    Optional,
    /// The `*` indicator.
    ZeroOrMore,
    /// The `+` indicator.
    OneOrMore,
}
142
/// A single attribute definition taken from an `<!ATTLIST ...>` declaration.
#[derive(Debug, Clone)]
pub struct AttributeDecl {
    /// Name of the element the attribute belongs to.
    pub element_name: String,
    /// Name of the attribute.
    pub attribute_name: String,
    /// Declared type of the attribute.
    pub attribute_type: AttributeType,
    /// Default declaration of the attribute.
    pub default: AttributeDefault,
}
157
/// Declared type of an attribute.
#[derive(Debug, Clone, PartialEq)]
pub enum AttributeType {
    /// `CDATA`
    CData,
    /// `ID`
    Id,
    /// `IDREF`
    IdRef,
    /// `IDREFS`
    IdRefs,
    /// `ENTITY`
    Entity,
    /// `ENTITIES`
    Entities,
    /// `NMTOKEN`
    NmToken,
    /// `NMTOKENS`
    NmTokens,
    /// `NOTATION (a | b | ...)` with the listed names.
    Notation(Vec<String>),
    /// `(a | b | ...)` enumeration with the listed tokens.
    Enumeration(Vec<String>),
}
184
/// Default declaration of an attribute.
#[derive(Debug, Clone, PartialEq)]
pub enum AttributeDefault {
    /// `#REQUIRED`
    Required,
    /// `#IMPLIED`
    Implied,
    /// `#FIXED "value"`
    Fixed(String),
    /// A plain default value, written as a quoted literal.
    Default(String),
}
199
/// An entity declaration.
#[derive(Debug, Clone)]
pub struct EntityDecl {
    /// Name of the entity.
    pub name: String,
    /// Whether the entity is internal (literal value) or external.
    pub kind: EntityKind,
}
210
/// Definition of an entity.
#[derive(Debug, Clone)]
pub enum EntityKind {
    /// Internal entity holding the quoted literal value from the declaration.
    Internal(String),
    /// External entity identified by a system and optional public identifier.
    External {
        /// The system identifier literal.
        system_id: String,
        /// The public identifier, present for `PUBLIC` declarations.
        public_id: Option<String>,
    },
}
225
/// A notation declaration: `<!NOTATION name SYSTEM|PUBLIC ...>`.
#[derive(Debug, Clone)]
pub struct NotationDecl {
    /// Name of the notation.
    pub name: String,
    /// System identifier, if one was given.
    pub system_id: Option<String>,
    /// Public identifier, if one was given.
    pub public_id: Option<String>,
}
238
239impl fmt::Display for ContentModel {
240 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
241 match self {
242 Self::Empty => write!(f, "EMPTY"),
243 Self::Any => write!(f, "ANY"),
244 Self::Mixed(names) => {
245 if names.is_empty() {
246 write!(f, "(#PCDATA)")
247 } else {
248 write!(f, "(#PCDATA|{})*", names.join("|"))
249 }
250 }
251 Self::Children(spec) => write!(f, "{spec}"),
252 }
253 }
254}
255
256impl fmt::Display for ContentSpec {
257 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
258 match &self.kind {
259 ContentSpecKind::Name(name) => write!(f, "{name}")?,
260 ContentSpecKind::Seq(items) => {
261 write!(f, "(")?;
262 for (i, item) in items.iter().enumerate() {
263 if i > 0 {
264 write!(f, " , ")?;
265 }
266 write!(f, "{item}")?;
267 }
268 write!(f, ")")?;
269 }
270 ContentSpecKind::Choice(items) => {
271 write!(f, "(")?;
272 for (i, item) in items.iter().enumerate() {
273 if i > 0 {
274 write!(f, " | ")?;
275 }
276 write!(f, "{item}")?;
277 }
278 write!(f, ")")?;
279 }
280 }
281 match self.occurrence {
282 Occurrence::Once => {}
283 Occurrence::Optional => write!(f, "?")?,
284 Occurrence::ZeroOrMore => write!(f, "*")?,
285 Occurrence::OneOrMore => write!(f, "+")?,
286 }
287 Ok(())
288 }
289}
290
291#[must_use]
301#[allow(clippy::too_many_lines)]
302pub fn serialize_dtd(dtd: &Dtd) -> String {
303 let mut out = String::new();
304 let mut last_was_comment = false;
305
306 for decl in &dtd.declarations {
307 if !last_was_comment {
312 out.push('\n');
313 }
314 match decl {
315 DtdDeclaration::Element(e) => {
316 out.push_str("<!ELEMENT ");
317 out.push_str(&e.name);
318 out.push(' ');
319 write_content_model(&mut out, &e.content_model);
320 out.push('>');
321 last_was_comment = false;
322 }
323 DtdDeclaration::Attlist(a) => {
324 out.push_str("<!ATTLIST ");
325 out.push_str(&a.element_name);
326 out.push(' ');
327 out.push_str(&a.attribute_name);
328 out.push(' ');
329 write_attribute_type(&mut out, &a.attribute_type);
330 out.push(' ');
331 write_attribute_default(&mut out, &a.default);
332 out.push('>');
333 last_was_comment = false;
334 }
335 DtdDeclaration::Entity(e) => {
336 out.push_str("<!ENTITY ");
337 out.push_str(&e.name);
338 match &e.kind {
339 EntityKind::Internal(value) => {
340 out.push(' ');
341 write_entity_value(&mut out, value);
342 }
343 EntityKind::External {
344 system_id,
345 public_id,
346 } => {
347 if let Some(pub_id) = public_id {
348 out.push_str(" PUBLIC \"");
349 out.push_str(pub_id);
350 out.push_str("\" \"");
351 out.push_str(system_id);
352 out.push('"');
353 } else {
354 out.push_str(" SYSTEM \"");
355 out.push_str(system_id);
356 out.push('"');
357 }
358 }
359 }
360 out.push('>');
361 last_was_comment = false;
362 }
363 DtdDeclaration::Notation(n) => {
364 out.push_str("<!NOTATION ");
365 out.push_str(&n.name);
366 match (&n.public_id, &n.system_id) {
367 (Some(pub_id), Some(sys_id)) => {
368 out.push_str(" PUBLIC \"");
369 out.push_str(pub_id);
370 out.push_str("\" \"");
371 out.push_str(sys_id);
372 out.push('"');
373 }
374 (Some(pub_id), None) => {
375 out.push_str(" PUBLIC \"");
376 out.push_str(pub_id);
377 out.push('"');
378 }
379 (None, Some(sys_id)) => {
380 out.push_str(" SYSTEM \"");
381 out.push_str(sys_id);
382 out.push('"');
383 }
384 (None, None) => {}
385 }
386 out.push('>');
387 last_was_comment = false;
388 }
389 DtdDeclaration::Comment(text) => {
390 out.push_str("<!--");
391 out.push_str(text);
392 out.push_str("-->");
393 last_was_comment = true;
394 }
395 DtdDeclaration::Pi(target, data) => {
396 out.push_str("<?");
397 out.push_str(target);
398 if let Some(d) = data {
399 out.push(' ');
400 out.push_str(d);
401 }
402 out.push_str("?>");
403 last_was_comment = false;
404 }
405 }
406 }
407
408 if !last_was_comment && !dtd.declarations.is_empty() {
410 out.push('\n');
411 }
412
413 out
414}
415
416fn write_content_model(out: &mut String, model: &ContentModel) {
418 match model {
419 ContentModel::Empty => out.push_str("EMPTY"),
420 ContentModel::Any => out.push_str("ANY"),
421 ContentModel::Mixed(names) => {
422 if names.is_empty() {
423 out.push_str("(#PCDATA)");
424 } else {
425 out.push_str("(#PCDATA");
426 for name in names {
427 out.push_str(" | ");
428 out.push_str(name);
429 }
430 out.push_str(")*");
431 }
432 }
433 ContentModel::Children(spec) => {
434 use std::fmt::Write;
435 let _ = write!(out, "{spec}");
436 }
437 }
438}
439
440fn write_attribute_type(out: &mut String, attr_type: &AttributeType) {
442 match attr_type {
443 AttributeType::CData => out.push_str("CDATA"),
444 AttributeType::Id => out.push_str("ID"),
445 AttributeType::IdRef => out.push_str("IDREF"),
446 AttributeType::IdRefs => out.push_str("IDREFS"),
447 AttributeType::Entity => out.push_str("ENTITY"),
448 AttributeType::Entities => out.push_str("ENTITIES"),
449 AttributeType::NmToken => out.push_str("NMTOKEN"),
450 AttributeType::NmTokens => out.push_str("NMTOKENS"),
451 AttributeType::Notation(values) | AttributeType::Enumeration(values) => {
452 if matches!(attr_type, AttributeType::Notation(_)) {
453 out.push_str("NOTATION ");
454 }
455 out.push('(');
456 for (i, v) in values.iter().enumerate() {
457 if i > 0 {
458 out.push_str(" | ");
459 }
460 out.push_str(v);
461 }
462 out.push(')');
463 }
464 }
465}
466
467fn write_attribute_default(out: &mut String, default: &AttributeDefault) {
469 match default {
470 AttributeDefault::Required => out.push_str("#REQUIRED"),
471 AttributeDefault::Implied => out.push_str("#IMPLIED"),
472 AttributeDefault::Fixed(value) => {
473 out.push_str("#FIXED \"");
474 out.push_str(value);
475 out.push('"');
476 }
477 AttributeDefault::Default(value) => {
478 out.push('"');
479 out.push_str(value);
480 out.push('"');
481 }
482 }
483}
484
485fn write_entity_value(out: &mut String, value: &str) {
492 let quote = if value.contains('"') && !value.contains('\'') {
495 '\''
496 } else {
497 '"'
498 };
499 out.push(quote);
500
501 let bytes = value.as_bytes();
502 let len = bytes.len();
503 let mut i = 0;
504
505 while i < len {
506 if bytes[i] == b'&' {
507 if let Some(ref_end) = find_reference_end(bytes, i) {
509 let ref_str = std::str::from_utf8(&bytes[i..=ref_end]).unwrap_or("&");
511 out.push_str(ref_str);
512 i = ref_end + 1;
513 } else {
514 out.push_str("&");
515 i += 1;
516 }
517 } else if bytes[i] == b'%' {
518 out.push_str("%");
519 i += 1;
520 } else if bytes[i] == quote as u8 {
521 if quote == '"' {
522 out.push_str(""");
523 } else {
524 out.push_str("'");
525 }
526 i += 1;
527 } else {
528 let ch = &value[i..];
530 if let Some(c) = ch.chars().next() {
531 out.push(c);
532 i += c.len_utf8();
533 } else {
534 i += 1;
535 }
536 }
537 }
538
539 out.push(quote);
540}
541
542fn find_reference_end(bytes: &[u8], start: usize) -> Option<usize> {
546 if start >= bytes.len() || bytes[start] != b'&' {
547 return None;
548 }
549 let mut i = start + 1;
550 if i >= bytes.len() {
551 return None;
552 }
553
554 if bytes[i] == b'#' {
555 i += 1;
557 if i >= bytes.len() {
558 return None;
559 }
560 if bytes[i] == b'x' {
561 i += 1;
562 let digit_start = i;
563 while i < bytes.len() && bytes[i].is_ascii_hexdigit() {
564 i += 1;
565 }
566 if i == digit_start || i >= bytes.len() || bytes[i] != b';' {
567 return None;
568 }
569 } else {
570 let digit_start = i;
571 while i < bytes.len() && bytes[i].is_ascii_digit() {
572 i += 1;
573 }
574 if i == digit_start || i >= bytes.len() || bytes[i] != b';' {
575 return None;
576 }
577 }
578 Some(i)
579 } else {
580 if !is_name_start_byte(bytes[i]) {
583 return None;
584 }
585 i += 1;
586 while i < bytes.len() && is_name_byte(bytes[i]) {
587 i += 1;
588 }
589 if i >= bytes.len() || bytes[i] != b';' {
590 return None;
591 }
592 Some(i)
593 }
594}
595
/// Returns `true` if `b` is an ASCII letter, `_` or `:`.
///
/// Only ASCII is considered; bytes of multi-byte characters are rejected.
fn is_name_start_byte(b: u8) -> bool {
    matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'_' | b':')
}
600
/// Returns `true` if `b` is an ASCII letter or digit, or one of `_ : - .`.
///
/// Only ASCII is considered; bytes of multi-byte characters are rejected.
fn is_name_byte(b: u8) -> bool {
    matches!(
        b,
        b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'_' | b':' | b'-' | b'.'
    )
}
605
606pub fn parse_dtd(input: &str) -> Result<Dtd, ParseError> {
628 let mut parser = DtdParser::new(input);
629 parser.parse()
630}
631
/// Cursor-based parser state for a single DTD input.
struct DtdParser<'a> {
    /// The input text as raw bytes.
    input: &'a [u8],
    /// Byte offset of the next unread byte in `input`.
    pos: usize,
    /// Line counter, starting at 1 (presumably used for error locations;
    /// the code that maintains it is outside this view).
    line: u32,
    /// Column counter, starting at 1 (same caveat as `line`).
    column: u32,
    /// The DTD being built up; moved out when parsing finishes.
    dtd: Dtd,
}
640
641impl<'a> DtdParser<'a> {
642 fn new(input: &'a str) -> Self {
643 Self {
644 input: input.as_bytes(),
645 pos: 0,
646 line: 1,
647 column: 1,
648 dtd: Dtd::default(),
649 }
650 }
651
652 fn parse(&mut self) -> Result<Dtd, ParseError> {
653 loop {
654 self.skip_whitespace();
655 if self.at_end() {
656 break;
657 }
658
659 if self.looking_at(b"<!--") {
660 self.parse_comment_decl()?;
661 } else if self.looking_at(b"<!ELEMENT") {
662 self.parse_element_decl()?;
663 } else if self.looking_at(b"<!ATTLIST") {
664 self.parse_attlist_decl()?;
665 } else if self.looking_at(b"<!ENTITY") {
666 self.parse_entity_decl()?;
667 } else if self.looking_at(b"<!NOTATION") {
668 self.parse_notation_decl()?;
669 } else if self.looking_at(b"<?") {
670 self.parse_pi_decl()?;
671 } else if self.peek() == Some(b'%') {
672 self.skip_pe_reference()?;
674 } else {
675 return Err(self.fatal(format!(
676 "unexpected character '{}' in DTD",
677 self.peek().map_or('?', |b| b as char)
678 )));
679 }
680 }
681
682 self.post_validate()?;
683
684 Ok(std::mem::take(&mut self.dtd))
685 }
686
687 fn post_validate(&self) -> Result<(), ParseError> {
693 for (name, decl) in &self.dtd.entities {
695 if let EntityKind::Internal(ref value) = decl.kind {
696 let mut visited = std::collections::HashSet::new();
697 visited.insert(name.clone());
698 self.check_entity_recursion(value, &mut visited)?;
699 }
700 }
701
702 for (name, decl) in &self.dtd.param_entities {
707 if let EntityKind::Internal(ref value) = decl.kind {
708 let expanded = expand_char_refs_only(value);
709 let mut visited = std::collections::HashSet::new();
710 visited.insert(name.clone());
711 self.check_pe_recursion(&expanded, &mut visited)?;
712 }
713 }
714
715 for (name, decl) in &self.dtd.entities {
720 if let EntityKind::Internal(ref value) = decl.kind {
721 self.validate_replacement_text(name, value)?;
722 }
723 }
724
725 self.validate_predefined_entities()?;
729
730 for attrs in self.dtd.attributes.values() {
737 for attr in attrs {
738 let (AttributeDefault::Default(default_value)
739 | AttributeDefault::Fixed(default_value)) = &attr.default
740 else {
741 continue;
742 };
743 self.validate_attr_default_entities(default_value)?;
744 }
745 }
746
747 Ok(())
748 }
749
    /// Checks redeclarations of the five predefined entities (`lt`, `gt`,
    /// `amp`, `apos`, `quot`).
    ///
    /// For each of those names that has been declared, the declaration must
    /// be an internal entity whose stored value equals one of the accepted
    /// spellings listed in the table; otherwise a fatal error is returned.
    /// Names that were not declared are ignored.
    fn validate_predefined_entities(&self) -> Result<(), ParseError> {
        // Each row: entity name, the character it stands for (unused here),
        // and the accepted declaration values. The first accepted value is
        // quoted in the error message as an example.
        //
        // NOTE(review): the accepted-value literals below look as if they
        // went through HTML-entity decoding in this copy of the file (e.g.
        // `"""` is not a valid Rust literal, and the rows contain repeated
        // identical entries). Confirm the table against the upstream source
        // before relying on it.
        let expected: &[(&str, &str, &[&str])] = &[
            ("lt", "<", &["<", "<", "<"]),
            ("gt", ">", &[">", ">", ">", ">"]),
            ("amp", "&", &["&", "&"]),
            ("apos", "'", &["'", "'", "'"]),
            ("quot", "\"", &["\"", """, """]),
        ];
        for &(name, _char_val, valid_refs) in expected {
            if let Some(decl) = self.dtd.entities.get(name) {
                match &decl.kind {
                    EntityKind::Internal(value) => {
                        // The value must match one accepted spelling exactly.
                        if !valid_refs.iter().any(|r| r == value) {
                            return Err(self.fatal(format!(
                                "predefined entity '{name}' must be declared as \
                                 a character reference (e.g., '{}')",
                                valid_refs[0]
                            )));
                        }
                    }
                    EntityKind::External { .. } => {
                        return Err(self.fatal(format!(
                            "predefined entity '{name}' must be an internal entity"
                        )));
                    }
                }
            }
        }
        Ok(())
    }
791
792 fn validate_replacement_text(&self, entity_name: &str, value: &str) -> Result<(), ParseError> {
800 if !value.contains("&#") {
802 return Ok(());
803 }
804
805 let replacement = Self::expand_char_refs_only(value);
807
808 let bytes = replacement.as_bytes();
811 let mut i = 0;
812 while i < bytes.len() {
813 if bytes[i] == b'&' {
814 i += 1;
815 if i >= bytes.len() {
816 return Err(self.fatal(format!(
817 "entity '{entity_name}' replacement text contains \
818 bare '&' at end of text"
819 )));
820 }
821 if bytes[i] == b'#' {
822 i += 1;
824 let has_digits = if i < bytes.len() && bytes[i] == b'x' {
825 i += 1;
826 let start = i;
827 while i < bytes.len() && bytes[i].is_ascii_hexdigit() {
828 i += 1;
829 }
830 i > start
831 } else {
832 let start = i;
833 while i < bytes.len() && bytes[i].is_ascii_digit() {
834 i += 1;
835 }
836 i > start
837 };
838 if !has_digits || i >= bytes.len() || bytes[i] != b';' {
839 return Err(self.fatal(format!(
840 "entity '{entity_name}' replacement text contains \
841 incomplete character reference"
842 )));
843 }
844 i += 1;
845 } else if bytes[i].is_ascii_alphabetic() || bytes[i] == b'_' || bytes[i] == b':' {
846 while i < bytes.len() && bytes[i] != b';' {
848 i += 1;
849 }
850 if i >= bytes.len() {
851 return Err(self.fatal(format!(
852 "entity '{entity_name}' replacement text contains \
853 incomplete entity reference"
854 )));
855 }
856 i += 1;
857 } else {
858 return Err(self.fatal(format!(
859 "entity '{entity_name}' replacement text contains \
860 bare '&' not followed by a valid reference"
861 )));
862 }
863 } else {
864 i += 1;
865 }
866 }
867 Ok(())
868 }
869
    /// Associated-function shim that forwards to the free function
    /// `expand_char_refs_only`, so it can be called as
    /// `Self::expand_char_refs_only`.
    fn expand_char_refs_only(value: &str) -> String {
        // Inside an `impl`, the bare name resolves to the module-level
        // function, not to this method, so this is not recursive.
        expand_char_refs_only(value)
    }
875
876 fn check_entity_recursion(
878 &self,
879 value: &str,
880 visited: &mut std::collections::HashSet<String>,
881 ) -> Result<(), ParseError> {
882 for ref_name in Self::extract_entity_refs(value) {
883 if visited.contains(ref_name) {
884 return Err(self.fatal(format!("recursive entity reference: '{ref_name}'")));
885 }
886 if let Some(decl) = self.dtd.entities.get(ref_name) {
887 if let EntityKind::Internal(ref inner_value) = decl.kind {
888 visited.insert(ref_name.to_string());
889 self.check_entity_recursion(inner_value, visited)?;
890 visited.remove(ref_name);
891 }
892 }
893 }
894 Ok(())
895 }
896
897 fn check_pe_recursion(
901 &self,
902 value: &str,
903 visited: &mut std::collections::HashSet<String>,
904 ) -> Result<(), ParseError> {
905 for ref_name in Self::extract_pe_refs(value) {
906 if visited.contains(&ref_name) {
907 return Err(self.fatal(format!(
908 "recursive parameter entity reference: '%{ref_name}'"
909 )));
910 }
911 if let Some(decl) = self.dtd.param_entities.get(&ref_name) {
912 if let EntityKind::Internal(ref inner_value) = decl.kind {
913 let expanded = expand_char_refs_only(inner_value);
914 visited.insert(ref_name.clone());
915 self.check_pe_recursion(&expanded, visited)?;
916 visited.remove(&ref_name);
917 }
918 }
919 }
920 Ok(())
921 }
922
923 fn extract_pe_refs(value: &str) -> Vec<String> {
925 let mut refs = Vec::new();
926 let bytes = value.as_bytes();
927 let mut i = 0;
928 while i < bytes.len() {
929 if bytes[i] == b'%' {
930 i += 1;
931 if i < bytes.len() && (bytes[i].is_ascii_alphabetic() || bytes[i] == b'_') {
932 let start = i;
933 while i < bytes.len() && bytes[i] != b';' && !bytes[i].is_ascii_whitespace() {
934 i += 1;
935 }
936 if i < bytes.len() && bytes[i] == b';' && i > start {
937 if let Ok(name) = std::str::from_utf8(&bytes[start..i]) {
938 refs.push(name.to_string());
939 }
940 i += 1;
941 }
942 }
943 } else {
944 i += 1;
945 }
946 }
947 refs
948 }
949
950 fn validate_attr_default_entities(&self, value: &str) -> Result<(), ParseError> {
955 for ref_name in Self::extract_entity_refs(value) {
956 if matches!(ref_name, "amp" | "lt" | "gt" | "apos" | "quot") {
958 continue;
959 }
960 match self.dtd.entities.get(ref_name) {
961 None => {
962 return Err(self.fatal(format!(
963 "undeclared entity '{ref_name}' referenced in \
964 attribute default value"
965 )));
966 }
967 Some(decl) => match &decl.kind {
968 EntityKind::External { .. } => {
969 return Err(self.fatal(format!(
970 "attribute default value must not reference \
971 external entity '{ref_name}'"
972 )));
973 }
974 EntityKind::Internal(ref text) => {
975 if text.contains('<') {
978 return Err(self.fatal(format!(
979 "entity '{ref_name}' contains '<' and cannot \
980 be used in attribute values"
981 )));
982 }
983 self.validate_attr_default_entities(text)?;
985 }
986 },
987 }
988 }
989 Ok(())
990 }
991
992 fn extract_entity_refs(value: &str) -> Vec<&str> {
997 let mut refs = Vec::new();
998 let bytes = value.as_bytes();
999 let mut i = 0;
1000 while i < bytes.len() {
1001 if bytes[i] == b'&' {
1002 i += 1;
1003 if i < bytes.len() && bytes[i] == b'#' {
1004 while i < bytes.len() && bytes[i] != b';' {
1006 i += 1;
1007 }
1008 if i < bytes.len() {
1009 i += 1;
1010 }
1011 } else {
1012 let start = i;
1014 while i < bytes.len() && bytes[i] != b';' && bytes[i] != b'&' {
1015 i += 1;
1016 }
1017 if i < bytes.len() && bytes[i] == b';' && i > start {
1018 if let Ok(name) = std::str::from_utf8(&bytes[start..i]) {
1019 refs.push(name);
1020 }
1021 i += 1;
1022 }
1023 }
1024 } else {
1025 i += 1;
1026 }
1027 }
1028 refs
1029 }
1030
1031 fn parse_element_decl(&mut self) -> Result<(), ParseError> {
1035 self.expect_str(b"<!ELEMENT")?;
1036 self.skip_whitespace_required()?;
1037 let name = self.parse_name()?;
1038 self.skip_whitespace_required()?;
1039 let content_model = self.parse_content_model()?;
1040 self.skip_whitespace();
1041 self.expect_byte(b'>')?;
1042
1043 let decl = ElementDecl {
1044 name: name.clone(),
1045 content_model,
1046 };
1047 self.dtd
1048 .declarations
1049 .push(DtdDeclaration::Element(decl.clone()));
1050 self.dtd.elements.insert(name, decl);
1051 Ok(())
1052 }
1053
1054 fn parse_content_model(&mut self) -> Result<ContentModel, ParseError> {
1055 if self.looking_at(b"EMPTY") {
1056 self.expect_str(b"EMPTY")?;
1057 return Ok(ContentModel::Empty);
1058 }
1059 if self.looking_at(b"ANY") {
1060 self.expect_str(b"ANY")?;
1061 return Ok(ContentModel::Any);
1062 }
1063
1064 self.expect_byte(b'(')?;
1066 self.skip_whitespace();
1067
1068 if self.looking_at(b"#PCDATA") {
1070 self.expect_str(b"#PCDATA")?;
1071 self.skip_whitespace();
1072
1073 let mut names = Vec::new();
1074
1075 if self.peek() == Some(b')') {
1076 self.advance(1);
1078 if self.peek() == Some(b'*') {
1080 self.advance(1);
1081 }
1082 return Ok(ContentModel::Mixed(names));
1083 }
1084
1085 while self.peek() == Some(b'|') {
1087 self.advance(1);
1088 self.skip_whitespace();
1089 let elem_name = self.parse_name()?;
1090 names.push(elem_name);
1091 self.skip_whitespace();
1092 }
1093
1094 self.expect_byte(b')')?;
1095 self.expect_byte(b'*')?;
1096
1097 return Ok(ContentModel::Mixed(names));
1098 }
1099
1100 let spec = self.parse_content_spec_group()?;
1102 Ok(ContentModel::Children(spec))
1103 }
1104
1105 fn parse_content_spec_group(&mut self) -> Result<ContentSpec, ParseError> {
1108 let mut first = self.parse_content_particle()?;
1109 self.skip_whitespace();
1110
1111 if self.peek() == Some(b',') {
1113 let mut items = vec![first];
1115 while self.peek() == Some(b',') {
1116 self.advance(1);
1117 self.skip_whitespace();
1118 let item = self.parse_content_particle()?;
1119 items.push(item);
1120 self.skip_whitespace();
1121 }
1122 self.expect_byte(b')')?;
1123 let occurrence = self.parse_occurrence();
1124 Ok(ContentSpec {
1125 kind: ContentSpecKind::Seq(items),
1126 occurrence,
1127 })
1128 } else if self.peek() == Some(b'|') {
1129 let mut items = vec![first];
1131 while self.peek() == Some(b'|') {
1132 self.advance(1);
1133 self.skip_whitespace();
1134 let item = self.parse_content_particle()?;
1135 items.push(item);
1136 self.skip_whitespace();
1137 }
1138 self.expect_byte(b')')?;
1139 let occurrence = self.parse_occurrence();
1140 Ok(ContentSpec {
1141 kind: ContentSpecKind::Choice(items),
1142 occurrence,
1143 })
1144 } else {
1145 self.expect_byte(b')')?;
1147 let group_occurrence = self.parse_occurrence();
1148
1149 if group_occurrence != Occurrence::Once {
1150 Ok(ContentSpec {
1152 kind: ContentSpecKind::Seq(vec![first]),
1153 occurrence: group_occurrence,
1154 })
1155 } else if first.occurrence != Occurrence::Once {
1156 let inner_occ = first.occurrence;
1160 first.occurrence = Occurrence::Once;
1161 Ok(ContentSpec {
1162 kind: ContentSpecKind::Seq(vec![first]),
1163 occurrence: inner_occ,
1164 })
1165 } else {
1166 Ok(first)
1168 }
1169 }
1170 }
1171
1172 fn parse_content_particle(&mut self) -> Result<ContentSpec, ParseError> {
1173 if self.peek() == Some(b'(') {
1174 self.advance(1);
1175 self.skip_whitespace();
1176 self.parse_content_spec_group()
1177 } else {
1178 let name = self.parse_name()?;
1179 let occurrence = self.parse_occurrence();
1180 Ok(ContentSpec {
1181 kind: ContentSpecKind::Name(name),
1182 occurrence,
1183 })
1184 }
1185 }
1186
1187 fn parse_occurrence(&mut self) -> Occurrence {
1188 match self.peek() {
1189 Some(b'?') => {
1190 self.advance(1);
1191 Occurrence::Optional
1192 }
1193 Some(b'*') => {
1194 self.advance(1);
1195 Occurrence::ZeroOrMore
1196 }
1197 Some(b'+') => {
1198 self.advance(1);
1199 Occurrence::OneOrMore
1200 }
1201 _ => Occurrence::Once,
1202 }
1203 }
1204
1205 fn parse_attlist_decl(&mut self) -> Result<(), ParseError> {
1209 self.expect_str(b"<!ATTLIST")?;
1210 self.skip_whitespace_required()?;
1211 let element_name = self.parse_name()?;
1212
1213 loop {
1214 self.skip_whitespace();
1215 if self.peek() == Some(b'>') {
1216 self.advance(1);
1217 break;
1218 }
1219
1220 let attribute_name = self.parse_name()?;
1221 self.skip_whitespace_required()?;
1222 let attribute_type = self.parse_attribute_type()?;
1223 self.skip_whitespace_required()?;
1224 let default = self.parse_attribute_default()?;
1225
1226 let decl = AttributeDecl {
1227 element_name: element_name.clone(),
1228 attribute_name,
1229 attribute_type,
1230 default,
1231 };
1232
1233 let attrs = self.dtd.attributes.entry(element_name.clone()).or_default();
1236 if !attrs
1237 .iter()
1238 .any(|a| a.attribute_name == decl.attribute_name)
1239 {
1240 self.dtd
1241 .declarations
1242 .push(DtdDeclaration::Attlist(decl.clone()));
1243 attrs.push(decl);
1244 }
1245 }
1246
1247 Ok(())
1248 }
1249
1250 fn parse_attribute_type(&mut self) -> Result<AttributeType, ParseError> {
1251 if self.looking_at(b"CDATA") {
1252 self.expect_str(b"CDATA")?;
1253 Ok(AttributeType::CData)
1254 } else if self.looking_at(b"IDREFS") {
1255 self.expect_str(b"IDREFS")?;
1256 Ok(AttributeType::IdRefs)
1257 } else if self.looking_at(b"IDREF") {
1258 self.expect_str(b"IDREF")?;
1259 Ok(AttributeType::IdRef)
1260 } else if self.looking_at(b"ID") {
1261 self.expect_str(b"ID")?;
1262 Ok(AttributeType::Id)
1263 } else if self.looking_at(b"ENTITIES") {
1264 self.expect_str(b"ENTITIES")?;
1265 Ok(AttributeType::Entities)
1266 } else if self.looking_at(b"ENTITY") {
1267 self.expect_str(b"ENTITY")?;
1268 Ok(AttributeType::Entity)
1269 } else if self.looking_at(b"NMTOKENS") {
1270 self.expect_str(b"NMTOKENS")?;
1271 Ok(AttributeType::NmTokens)
1272 } else if self.looking_at(b"NMTOKEN") {
1273 self.expect_str(b"NMTOKEN")?;
1274 Ok(AttributeType::NmToken)
1275 } else if self.looking_at(b"NOTATION") {
1276 self.expect_str(b"NOTATION")?;
1277 self.skip_whitespace_required()?;
1278 let values = self.parse_enumerated_values()?;
1279 Ok(AttributeType::Notation(values))
1280 } else if self.peek() == Some(b'(') {
1281 let values = self.parse_enumerated_values()?;
1282 Ok(AttributeType::Enumeration(values))
1283 } else {
1284 Err(self.fatal("expected attribute type"))
1285 }
1286 }
1287
1288 fn parse_enumerated_values(&mut self) -> Result<Vec<String>, ParseError> {
1289 self.expect_byte(b'(')?;
1290 self.skip_whitespace();
1291 let mut values = Vec::new();
1292
1293 let first = self.parse_nmtoken()?;
1294 values.push(first);
1295
1296 loop {
1297 self.skip_whitespace();
1298 if self.peek() == Some(b')') {
1299 self.advance(1);
1300 break;
1301 }
1302 self.expect_byte(b'|')?;
1303 self.skip_whitespace();
1304 let val = self.parse_nmtoken()?;
1305 values.push(val);
1306 }
1307
1308 Ok(values)
1309 }
1310
1311 fn parse_attribute_default(&mut self) -> Result<AttributeDefault, ParseError> {
1312 if self.looking_at(b"#REQUIRED") {
1313 self.expect_str(b"#REQUIRED")?;
1314 Ok(AttributeDefault::Required)
1315 } else if self.looking_at(b"#IMPLIED") {
1316 self.expect_str(b"#IMPLIED")?;
1317 Ok(AttributeDefault::Implied)
1318 } else if self.looking_at(b"#FIXED") {
1319 self.expect_str(b"#FIXED")?;
1320 self.skip_whitespace_required()?;
1321 let value = self.parse_quoted_value()?;
1322 self.validate_default_value(&value)?;
1323 Ok(AttributeDefault::Fixed(value))
1324 } else {
1325 let value = self.parse_quoted_value()?;
1326 self.validate_default_value(&value)?;
1327 Ok(AttributeDefault::Default(value))
1328 }
1329 }
1330
1331 #[allow(clippy::too_many_lines)]
1335 fn parse_entity_decl(&mut self) -> Result<(), ParseError> {
1336 self.expect_str(b"<!ENTITY")?;
1337 self.skip_whitespace_required()?;
1338
1339 if self.peek() == Some(b'%') {
1341 self.advance(1);
1342 self.skip_whitespace_required()?;
1343 let pe_name = self.parse_name()?;
1344 if pe_name.contains(':') {
1346 return Err(self.fatal(format!("entity name '{pe_name}' must not contain a colon")));
1347 }
1348 self.skip_whitespace_required()?;
1349
1350 let pe_kind = if self.peek() == Some(b'"') || self.peek() == Some(b'\'') {
1351 let value = self.parse_quoted_value()?;
1353 self.validate_entity_value(&value, true)?;
1354 Some(EntityKind::Internal(value))
1355 } else if self.looking_at(b"SYSTEM") {
1356 self.expect_str(b"SYSTEM")?;
1358 self.skip_whitespace_required()?;
1359 let system_id = self.parse_quoted_value()?;
1360 Some(EntityKind::External {
1361 system_id,
1362 public_id: None,
1363 })
1364 } else if self.looking_at(b"PUBLIC") {
1365 self.expect_str(b"PUBLIC")?;
1366 self.skip_whitespace_required()?;
1367 let public_id = self.parse_quoted_value()?;
1368 self.validate_public_id(&public_id)?;
1369 self.skip_whitespace_required()?;
1370 let system_id = self.parse_quoted_value()?;
1371 Some(EntityKind::External {
1372 system_id,
1373 public_id: Some(public_id),
1374 })
1375 } else {
1376 return Err(self.fatal("expected entity value or external ID"));
1377 };
1378
1379 self.skip_whitespace();
1380 if self.looking_at(b"NDATA") {
1382 return Err(self.fatal("NDATA annotation is not allowed on parameter entities"));
1383 }
1384 self.expect_byte(b'>')?;
1385
1386 if let Some(kind) = pe_kind {
1388 self.dtd
1389 .param_entities
1390 .entry(pe_name)
1391 .or_insert(EntityDecl {
1392 name: String::new(),
1393 kind,
1394 });
1395 }
1396 return Ok(());
1397 }
1398
1399 let name = self.parse_name()?;
1400 if name.contains(':') {
1402 return Err(self.fatal(format!("entity name '{name}' must not contain a colon")));
1403 }
1404 self.skip_whitespace_required()?;
1405
1406 let is_parameter_entity = false;
1407 let kind = if self.peek() == Some(b'"') || self.peek() == Some(b'\'') {
1408 let value = self.parse_quoted_value()?;
1410 self.validate_entity_value(&value, is_parameter_entity)?;
1411 EntityKind::Internal(value)
1412 } else if self.looking_at(b"SYSTEM") {
1413 self.expect_str(b"SYSTEM")?;
1414 self.skip_whitespace_required()?;
1415 let system_id = self.parse_quoted_value()?;
1416 EntityKind::External {
1417 system_id,
1418 public_id: None,
1419 }
1420 } else if self.looking_at(b"PUBLIC") {
1421 self.expect_str(b"PUBLIC")?;
1422 self.skip_whitespace_required()?;
1423 let public_id = self.parse_quoted_value()?;
1424 self.validate_public_id(&public_id)?;
1425 self.skip_whitespace_required()?;
1426 let system_id = self.parse_quoted_value()?;
1427 EntityKind::External {
1428 system_id,
1429 public_id: Some(public_id),
1430 }
1431 } else {
1432 return Err(self.fatal("expected entity value or external ID"));
1433 };
1434
1435 let had_ws = self.skip_whitespace();
1436
1437 if self.looking_at(b"NDATA") {
1439 if matches!(kind, EntityKind::Internal(_)) {
1441 return Err(self.fatal("NDATA annotation is not allowed on internal entities"));
1442 }
1443 if !had_ws {
1445 return Err(self.fatal("whitespace required before NDATA"));
1446 }
1447 self.expect_str(b"NDATA")?;
1448 self.skip_whitespace_required()?;
1449 let _notation_name = self.parse_name()?;
1450 self.skip_whitespace();
1451 }
1452
1453 self.expect_byte(b'>')?;
1454
1455 let decl = EntityDecl {
1458 name: name.clone(),
1459 kind,
1460 };
1461 self.dtd
1462 .declarations
1463 .push(DtdDeclaration::Entity(decl.clone()));
1464 self.dtd.entities.entry(name).or_insert(decl);
1465 Ok(())
1466 }
1467
1468 fn parse_notation_decl(&mut self) -> Result<(), ParseError> {
1472 self.expect_str(b"<!NOTATION")?;
1473 self.skip_whitespace_required()?;
1474 let name = self.parse_name()?;
1475 if name.contains(':') {
1477 return Err(self.fatal(format!("notation name '{name}' must not contain a colon")));
1478 }
1479 self.skip_whitespace_required()?;
1480
1481 let (system_id, public_id) = if self.looking_at(b"SYSTEM") {
1482 self.expect_str(b"SYSTEM")?;
1483 self.skip_whitespace_required()?;
1484 let sid = self.parse_quoted_value()?;
1485 (Some(sid), None)
1486 } else if self.looking_at(b"PUBLIC") {
1487 self.expect_str(b"PUBLIC")?;
1488 self.skip_whitespace_required()?;
1489 let pid = self.parse_quoted_value()?;
1490 self.validate_public_id(&pid)?;
1491 self.skip_whitespace();
1493 let sid = if self.peek() == Some(b'"') || self.peek() == Some(b'\'') {
1494 Some(self.parse_quoted_value()?)
1495 } else {
1496 None
1497 };
1498 (sid, Some(pid))
1499 } else {
1500 return Err(self.fatal("expected SYSTEM or PUBLIC in NOTATION declaration"));
1501 };
1502
1503 self.skip_whitespace();
1504 self.expect_byte(b'>')?;
1505
1506 let decl = NotationDecl {
1507 name: name.clone(),
1508 system_id,
1509 public_id,
1510 };
1511 self.dtd
1512 .declarations
1513 .push(DtdDeclaration::Notation(decl.clone()));
1514 self.dtd.notations.insert(name, decl);
1515 Ok(())
1516 }
1517
1518 fn parse_comment_decl(&mut self) -> Result<(), ParseError> {
1522 self.expect_str(b"<!--")?;
1523 let start = self.pos;
1524 loop {
1525 if self.at_end() {
1526 return Err(self.fatal("unexpected end of input in comment"));
1527 }
1528 if self.looking_at(b"-->") {
1529 let text = std::str::from_utf8(&self.input[start..self.pos])
1530 .unwrap_or("")
1531 .to_string();
1532 self.advance(3);
1533 self.dtd.declarations.push(DtdDeclaration::Comment(text));
1534 return Ok(());
1535 }
1536 self.advance(1);
1537 }
1538 }
1539
1540 fn parse_pi_decl(&mut self) -> Result<(), ParseError> {
1542 self.expect_str(b"<?")?;
1543
1544 let target = self.parse_name()?;
1546
1547 if target.eq_ignore_ascii_case("xml") {
1549 return Err(self.fatal("XML declaration is not allowed inside DTD"));
1550 }
1551
1552 if self.looking_at(b"?>") {
1554 self.advance(2);
1555 self.dtd.declarations.push(DtdDeclaration::Pi(target, None));
1556 return Ok(());
1557 }
1558
1559 let is_ws = self
1561 .peek()
1562 .is_some_and(|b| b == b' ' || b == b'\t' || b == b'\r' || b == b'\n');
1563 if !is_ws {
1564 return Err(self.fatal("space required between PI target and data"));
1565 }
1566
1567 let start = self.pos;
1568 loop {
1569 if self.at_end() {
1570 return Err(self.fatal("unexpected end of input in processing instruction"));
1571 }
1572 if self.looking_at(b"?>") {
1573 let data = std::str::from_utf8(&self.input[start..self.pos])
1574 .unwrap_or("")
1575 .trim()
1576 .to_string();
1577 self.advance(2);
1578 let data = if data.is_empty() { None } else { Some(data) };
1579 self.dtd.declarations.push(DtdDeclaration::Pi(target, data));
1580 return Ok(());
1581 }
1582 self.advance(1);
1583 }
1584 }
1585
1586 fn skip_pe_reference(&mut self) -> Result<(), ParseError> {
1587 self.expect_byte(b'%')?;
1588 let _name = self.parse_name()?;
1590 self.expect_byte(b';')?;
1591 Ok(())
1592 }
1593
1594 fn parse_name(&mut self) -> Result<String, ParseError> {
1597 if self.pos >= self.input.len() {
1598 return Err(self.fatal("expected name, found end of input"));
1599 }
1600
1601 let start = self.pos;
1602 let first = self.input[self.pos];
1603
1604 if is_ascii_name_start(first) {
1606 self.pos += 1;
1607 self.column += 1;
1608 while self.pos < self.input.len() && is_ascii_name_char(self.input[self.pos]) {
1609 self.pos += 1;
1610 self.column += 1;
1611 }
1612 if self.pos >= self.input.len() || self.input[self.pos] < 0x80 {
1613 let name = std::str::from_utf8(&self.input[start..self.pos])
1614 .map_err(|_| self.fatal("invalid UTF-8 in name"))?;
1615 return Ok(name.to_string());
1616 }
1617 } else {
1619 let ch = self
1620 .peek_char()
1621 .ok_or_else(|| self.fatal("expected name"))?;
1622 if !is_name_start_char(ch) {
1623 return Err(self.fatal(format!("invalid name start character: '{ch}'")));
1624 }
1625 self.advance_char(ch);
1626 }
1627
1628 while let Some(ch) = self.peek_char() {
1629 if is_name_char(ch) {
1630 self.advance_char(ch);
1631 } else {
1632 break;
1633 }
1634 }
1635
1636 let name = std::str::from_utf8(&self.input[start..self.pos])
1637 .map_err(|_| self.fatal("invalid UTF-8 in name"))?;
1638 Ok(name.to_string())
1639 }
1640
1641 fn parse_nmtoken(&mut self) -> Result<String, ParseError> {
1642 if self.pos >= self.input.len() {
1643 return Err(self.fatal("expected NMTOKEN, found end of input"));
1644 }
1645
1646 let start = self.pos;
1647 let first = self.input[self.pos];
1648
1649 if is_ascii_name_char(first) {
1651 self.pos += 1;
1652 self.column += 1;
1653 while self.pos < self.input.len() && is_ascii_name_char(self.input[self.pos]) {
1654 self.pos += 1;
1655 self.column += 1;
1656 }
1657 if self.pos >= self.input.len() || self.input[self.pos] < 0x80 {
1658 let token = std::str::from_utf8(&self.input[start..self.pos])
1659 .map_err(|_| self.fatal("invalid UTF-8 in NMTOKEN"))?;
1660 return Ok(token.to_string());
1661 }
1662 } else {
1664 let ch = self
1665 .peek_char()
1666 .ok_or_else(|| self.fatal("expected NMTOKEN"))?;
1667 if !is_name_char(ch) {
1668 return Err(self.fatal(format!("invalid NMTOKEN character: '{ch}'")));
1669 }
1670 self.advance_char(ch);
1671 }
1672
1673 while let Some(ch) = self.peek_char() {
1674 if is_name_char(ch) {
1675 self.advance_char(ch);
1676 } else {
1677 break;
1678 }
1679 }
1680
1681 let token = std::str::from_utf8(&self.input[start..self.pos])
1682 .map_err(|_| self.fatal("invalid UTF-8 in NMTOKEN"))?;
1683 Ok(token.to_string())
1684 }
1685
    #[allow(clippy::too_many_lines)]
    /// Checks the literal text of an entity value for well-formedness.
    ///
    /// The checks, in order:
    /// 1. every character must satisfy `is_xml_char`;
    /// 2. the value must not start with a text declaration (`<?xml`
    ///    followed by space, tab, `?`, or end of value);
    /// 3. every `&` must start a complete character reference
    ///    (`&#N;` / `&#xH;`) or entity reference (`&name;`);
    /// 4. `%` is rejected outright in general entities, and in parameter
    ///    entities when it is followed by an ASCII name-start byte.
    ///
    /// `is_parameter_entity` selects which `%` rule applies.
    ///
    /// # Errors
    ///
    /// Returns a fatal [`ParseError`] describing the first violation found.
    fn validate_entity_value(
        &self,
        value: &str,
        is_parameter_entity: bool,
    ) -> Result<(), ParseError> {
        // Literal characters must all be legal XML characters.
        for c in value.chars() {
            if !crate::parser::input::is_xml_char(c) {
                return Err(self.fatal(format!(
                    "invalid XML character U+{:04X} in entity value",
                    c as u32
                )));
            }
        }

        if value.starts_with("<?xml") {
            // Only reject when `<?xml` is the whole target, i.e. it is
            // followed by a separator, `?`, or nothing; `<?xml-foo` is a
            // different PI target and is allowed through.
            let after = value.as_bytes().get(5).copied();
            if after.map_or(true, |b| b == b' ' || b == b'\t' || b == b'?') {
                return Err(self.fatal("text declaration is not allowed in internal entity value"));
            }
        }

        // Byte-wise scan for references; `i` always indexes the next
        // unexamined byte.
        let bytes = value.as_bytes();
        let mut i = 0;
        while i < bytes.len() {
            match bytes[i] {
                b'&' => {
                    i += 1;
                    if i >= bytes.len() {
                        return Err(self.fatal("incomplete reference in entity value: '&' at end"));
                    }
                    if bytes[i] == b'#' {
                        // Character reference: decode the numeric value.
                        i += 1;
                        let char_val = if i < bytes.len() && bytes[i] == b'x' {
                            i += 1;
                            let hex_start = i;
                            if i >= bytes.len() || !bytes[i].is_ascii_hexdigit() {
                                return Err(
                                    self.fatal("malformed character reference in entity value")
                                );
                            }
                            while i < bytes.len() && bytes[i].is_ascii_hexdigit() {
                                i += 1;
                            }
                            let hex_str = std::str::from_utf8(&bytes[hex_start..i]).unwrap_or("");
                            // A value that overflows u32 becomes 0, which is
                            // then rejected below as an invalid XML character.
                            u32::from_str_radix(hex_str, 16).unwrap_or(0)
                        } else {
                            let dec_start = i;
                            if i >= bytes.len() || !bytes[i].is_ascii_digit() {
                                return Err(
                                    self.fatal("malformed character reference in entity value")
                                );
                            }
                            while i < bytes.len() && bytes[i].is_ascii_digit() {
                                i += 1;
                            }
                            let dec_str = std::str::from_utf8(&bytes[dec_start..i]).unwrap_or("");
                            // Same overflow-to-0 convention as the hex branch.
                            dec_str.parse::<u32>().unwrap_or(0)
                        };
                        if i >= bytes.len() || bytes[i] != b';' {
                            return Err(
                                self.fatal("incomplete character reference in entity value")
                            );
                        }
                        i += 1;
                        // The referenced code point must itself be a legal
                        // XML character.
                        if let Some(c) = char::from_u32(char_val) {
                            if !crate::parser::input::is_xml_char(c) {
                                return Err(self.fatal(format!(
                                    "character reference &#x{char_val:X}; refers to invalid XML character"
                                )));
                            }
                        } else {
                            return Err(self.fatal(format!(
                                "character reference value {char_val} is not a valid Unicode code point"
                            )));
                        }
                    } else {
                        // General entity reference: scan the name up to `;`,
                        // stopping early at `&` or whitespace.
                        let start = i;
                        while i < bytes.len()
                            && bytes[i] != b';'
                            && bytes[i] != b'&'
                            && !bytes[i].is_ascii_whitespace()
                        {
                            i += 1;
                        }
                        if i == start || i >= bytes.len() || bytes[i] != b';' {
                            return Err(self.fatal("malformed entity reference in entity value"));
                        }
                        let name_str = std::str::from_utf8(&bytes[start..i]).unwrap_or("");
                        // Only the first character of the name is validated.
                        if let Some(first_char) = name_str.chars().next() {
                            if !is_name_start_char(first_char) {
                                return Err(self.fatal(format!(
                                    "entity reference name must start with a letter or underscore, found '{first_char}'"
                                )));
                            }
                        }
                        i += 1;
                    }
                }
                b'%' if !is_parameter_entity => {
                    return Err(self.fatal("'%' not allowed in general entity value"));
                }
                b'%' if is_parameter_entity => {
                    i += 1;
                    // A `%` followed by an ASCII name-start byte is treated
                    // as a parameter-entity reference and rejected; any
                    // other `%` is skipped.
                    if i < bytes.len() {
                        let first = bytes[i];
                        if first.is_ascii_alphabetic() || first == b'_' || first == b':' {
                            return Err(self.fatal(
                                "parameter entity reference not allowed within \
                                 markup declaration in internal subset",
                            ));
                        }
                    }
                }
                _ => {
                    i += 1;
                }
            }
        }
        Ok(())
    }
1824
1825 fn validate_default_value(&self, value: &str) -> Result<(), ParseError> {
1831 let bytes = value.as_bytes();
1832 let mut i = 0;
1833 while i < bytes.len() {
1834 match bytes[i] {
1835 b'<' => {
1836 return Err(self.fatal("'<' not allowed in attribute default value"));
1837 }
1838 b'&' => {
1839 i += 1;
1840 if i < bytes.len() && bytes[i] == b'#' {
1841 i += 1;
1843 while i < bytes.len() && bytes[i] != b';' {
1844 i += 1;
1845 }
1846 if i < bytes.len() {
1847 i += 1;
1848 }
1849 } else {
1850 let start = i;
1852 while i < bytes.len() && bytes[i] != b';' {
1853 i += 1;
1854 }
1855 if i > start && i < bytes.len() {
1856 let name = std::str::from_utf8(&bytes[start..i]).unwrap_or("");
1857 let is_builtin = matches!(name, "amp" | "lt" | "gt" | "apos" | "quot");
1859 if !is_builtin && !self.dtd.entities.contains_key(name) {
1860 return Err(self.fatal(format!(
1861 "undeclared entity '{name}' in attribute default value"
1862 )));
1863 }
1864 }
1865 if i < bytes.len() {
1866 i += 1;
1867 }
1868 }
1869 }
1870 _ => {
1871 i += 1;
1872 }
1873 }
1874 }
1875 Ok(())
1876 }
1877
1878 fn validate_public_id(&self, pid: &str) -> Result<(), ParseError> {
1881 for c in pid.chars() {
1882 let valid = matches!(c,
1883 ' ' | '\r' | '\n' |
1884 'a'..='z' | 'A'..='Z' | '0'..='9' |
1885 '-' | '\'' | '(' | ')' | '+' | ',' | '.' | '/' | ':' |
1886 '=' | '?' | ';' | '!' | '*' | '#' | '@' | '$' | '_' | '%'
1887 );
1888 if !valid {
1889 return Err(self.fatal(format!(
1890 "invalid character in public ID: U+{:04X}",
1891 c as u32
1892 )));
1893 }
1894 }
1895 Ok(())
1896 }
1897
1898 fn parse_quoted_value(&mut self) -> Result<String, ParseError> {
1899 let quote = self.next_byte()?;
1900 if quote != b'"' && quote != b'\'' {
1901 return Err(self.fatal("expected quoted value"));
1902 }
1903 let start = self.pos;
1904 while !self.at_end() && self.peek() != Some(quote) {
1905 self.advance(1);
1906 }
1907 let value = std::str::from_utf8(&self.input[start..self.pos])
1908 .map_err(|_| self.fatal("invalid UTF-8 in quoted value"))?
1909 .to_string();
1910 if self.at_end() {
1911 return Err(self.fatal("unexpected end of input in quoted value"));
1912 }
1913 self.advance(1); Ok(value)
1915 }
1916
1917 fn location(&self) -> SourceLocation {
1920 SourceLocation {
1921 line: self.line,
1922 column: self.column,
1923 byte_offset: self.pos,
1924 }
1925 }
1926
1927 fn at_end(&self) -> bool {
1928 self.pos >= self.input.len()
1929 }
1930
1931 fn peek(&self) -> Option<u8> {
1932 self.input.get(self.pos).copied()
1933 }
1934
1935 fn peek_char(&self) -> Option<char> {
1936 if self.pos >= self.input.len() {
1937 return None;
1938 }
1939 let first = self.input[self.pos];
1940 if first < 0x80 {
1942 return Some(first as char);
1943 }
1944 let len = match first {
1946 0xC0..=0xDF => 2,
1947 0xE0..=0xEF => 3,
1948 0xF0..=0xF7 => 4,
1949 _ => return None,
1950 };
1951 let remaining = &self.input[self.pos..];
1952 if remaining.len() < len {
1953 return None;
1954 }
1955 std::str::from_utf8(&remaining[..len])
1956 .ok()
1957 .and_then(|s| s.chars().next())
1958 }
1959
1960 fn advance(&mut self, count: usize) {
1961 for _ in 0..count {
1962 if self.pos < self.input.len() {
1963 if self.input[self.pos] == b'\n' {
1964 self.line += 1;
1965 self.column = 1;
1966 } else {
1967 self.column += 1;
1968 }
1969 self.pos += 1;
1970 }
1971 }
1972 }
1973
1974 fn advance_char(&mut self, ch: char) {
1975 let len = ch.len_utf8();
1976 if ch == '\n' {
1977 self.line += 1;
1978 self.column = 1;
1979 } else {
1980 self.column += 1;
1981 }
1982 self.pos += len;
1983 }
1984
1985 fn next_byte(&mut self) -> Result<u8, ParseError> {
1986 if self.at_end() {
1987 return Err(self.fatal("unexpected end of input"));
1988 }
1989 let b = self.input[self.pos];
1990 self.advance(1);
1991 Ok(b)
1992 }
1993
1994 fn expect_byte(&mut self, expected: u8) -> Result<(), ParseError> {
1995 let b = self.next_byte()?;
1996 if b == expected {
1997 Ok(())
1998 } else {
1999 Err(self.fatal(format!(
2000 "expected '{}', found '{}'",
2001 expected as char, b as char
2002 )))
2003 }
2004 }
2005
2006 fn expect_str(&mut self, expected: &[u8]) -> Result<(), ParseError> {
2007 for &b in expected {
2008 self.expect_byte(b)?;
2009 }
2010 Ok(())
2011 }
2012
2013 fn looking_at(&self, s: &[u8]) -> bool {
2014 self.pos + s.len() <= self.input.len() && self.input[self.pos..].starts_with(s)
2015 }
2016
2017 fn skip_whitespace(&mut self) -> bool {
2018 let start = self.pos;
2019 while let Some(b) = self.peek() {
2020 if b == b' ' || b == b'\t' || b == b'\r' || b == b'\n' {
2021 self.advance(1);
2022 } else {
2023 break;
2024 }
2025 }
2026 self.pos > start
2027 }
2028
2029 fn skip_whitespace_required(&mut self) -> Result<(), ParseError> {
2030 if !self.skip_whitespace() {
2031 return Err(self.fatal("whitespace required"));
2032 }
2033 Ok(())
2034 }
2035
2036 fn fatal(&self, message: impl Into<String>) -> ParseError {
2037 ParseError {
2038 message: message.into(),
2039 location: self.location(),
2040 diagnostics: Vec::new(),
2041 }
2042 }
2043}
2044
/// Expands numeric character references (`&#N;` and `&#xH;`) in `value`,
/// leaving every other character, including general entity references such
/// as `&amp;`, untouched.
///
/// A reference is expanded only when it is complete and denotes a valid
/// Unicode scalar value. Malformed references (no digits, missing `;`,
/// numeric overflow, surrogate or out-of-range code points) are copied to
/// the output verbatim instead of being dropped or replaced by U+0000.
pub(crate) fn expand_char_refs_only(value: &str) -> String {
    /// Decodes the part of a character reference that follows `&#`.
    /// Returns the character and the number of bytes consumed (including
    /// the terminating `;`), or `None` when the reference is malformed.
    fn decode_after_marker(after: &str) -> Option<(char, usize)> {
        let (digits_src, radix, prefix_len) = match after.strip_prefix('x') {
            Some(hex) => (hex, 16, 1),
            None => (after, 10, 0),
        };
        let digit_len = digits_src
            .bytes()
            .take_while(|&b| char::from(b).is_digit(radix))
            .count();
        if digit_len == 0 || digits_src.as_bytes().get(digit_len) != Some(&b';') {
            return None;
        }
        let code = u32::from_str_radix(&digits_src[..digit_len], radix).ok()?;
        let ch = char::from_u32(code)?;
        Some((ch, prefix_len + digit_len + 1))
    }

    let mut result = String::with_capacity(value.len());
    let mut rest = value;
    while let Some(marker) = rest.find("&#") {
        result.push_str(&rest[..marker]);
        let after = &rest[marker + 2..];
        match decode_after_marker(after) {
            Some((ch, consumed)) => {
                result.push(ch);
                rest = &after[consumed..];
            }
            None => {
                // Keep the marker as-is and continue scanning after it, so a
                // later well-formed reference is still expanded.
                result.push_str("&#");
                rest = after;
            }
        }
    }
    result.push_str(rest);
    result
}
2093
/// Replaces each general entity reference in `text` with a single space.
///
/// A reference is recognised as `&`, then an ASCII letter, `_` or `:`, then
/// everything up to the next `;` (the bytes in between are not checked).
/// Character references (`&#...`) and a `&` followed by any other byte are
/// copied through. If no `;` follows, the remainder of the text is copied
/// unchanged.
pub(crate) fn replace_entity_refs(text: &str) -> String {
    let bytes = text.as_bytes();
    let mut out = String::with_capacity(text.len());
    let mut idx = 0;

    while idx < bytes.len() {
        let starts_general_ref =
            bytes[idx] == b'&' && bytes.get(idx + 1).is_some_and(|&next| next != b'#');

        if !starts_general_ref {
            // Ordinary character (or `&#`, or a trailing `&`): copy it.
            let ch = text[idx..].chars().next().unwrap_or('\u{FFFD}');
            out.push(ch);
            idx += ch.len_utf8();
            continue;
        }

        let name_start = idx + 1;
        let lead = bytes[name_start];
        if !(lead.is_ascii_alphabetic() || matches!(lead, b'_' | b':')) {
            // Not a reference after all; emit the `&` and rescan what follows.
            out.push('&');
            idx = name_start;
            continue;
        }

        match bytes[name_start..].iter().position(|&b| b == b';') {
            Some(offset) => {
                out.push(' ');
                idx = name_start + offset + 1;
            }
            None => {
                // Unterminated: keep the tail verbatim and finish.
                out.push_str(&text[idx..]);
                idx = bytes.len();
            }
        }
    }

    out
}
2138
/// Returns `true` for the ASCII bytes that may begin an XML name: letters,
/// `_` and `:`.
fn is_ascii_name_start(b: u8) -> bool {
    matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'_' | b':')
}
2146
/// Returns `true` for the ASCII bytes that may appear inside an XML name:
/// letters, digits, `_`, `:`, `-` and `.`.
fn is_ascii_name_char(b: u8) -> bool {
    matches!(
        b,
        b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z' | b'_' | b':' | b'-' | b'.'
    )
}
2150
/// Returns `true` when `c` may begin an XML name.
///
/// The accepted code-point ranges are listed in `RANGES` below (inclusive
/// bounds).
fn is_name_start_char(c: char) -> bool {
    const RANGES: [(u32, u32); 16] = [
        (0x3A, 0x3A), // ':'
        (0x41, 0x5A), // 'A'..='Z'
        (0x5F, 0x5F), // '_'
        (0x61, 0x7A), // 'a'..='z'
        (0xC0, 0xD6),
        (0xD8, 0xF6),
        (0xF8, 0x2FF),
        (0x370, 0x37D),
        (0x37F, 0x1FFF),
        (0x200C, 0x200D),
        (0x2070, 0x218F),
        (0x2C00, 0x2FEF),
        (0x3001, 0xD7FF),
        (0xF900, 0xFDCF),
        (0xFDF0, 0xFFFD),
        (0x10000, 0xEFFFF),
    ];

    let code_point = u32::from(c);
    RANGES
        .iter()
        .any(|&(low, high)| (low..=high).contains(&code_point))
}
2162
2163fn is_name_char(c: char) -> bool {
2164 is_name_start_char(c)
2165 || matches!(c,
2166 '-' | '.' | '0'..='9' | '\u{B7}' |
2167 '\u{300}'..='\u{36F}' | '\u{203F}'..='\u{2040}'
2168 )
2169}
2170
2171pub fn validate(doc: &mut Document, dtd: &Dtd) -> ValidationResult {
2204 let mut errors = Vec::new();
2205 let mut warnings = Vec::new();
2206 let mut id_values: HashSet<String> = HashSet::new();
2207 let mut idref_values: Vec<String> = Vec::new();
2208
2209 check_root_element(doc, dtd, &mut errors);
2211
2212 if let Some(root_elem) = doc.root_element() {
2214 validate_element_recursive(
2215 doc,
2216 dtd,
2217 root_elem,
2218 &mut errors,
2219 &mut warnings,
2220 &mut id_values,
2221 &mut idref_values,
2222 );
2223 }
2224
2225 for idref in &idref_values {
2227 if !id_values.contains(idref) {
2228 errors.push(ValidationError {
2229 message: format!("IDREF '{idref}' does not match any ID in the document"),
2230 line: None,
2231 column: None,
2232 });
2233 }
2234 }
2235
2236 let is_valid = errors.is_empty();
2237 ValidationResult {
2238 is_valid,
2239 errors,
2240 warnings,
2241 }
2242}
2243
2244fn check_root_element(doc: &Document, _dtd: &Dtd, errors: &mut Vec<ValidationError>) {
2246 let doctype_name = doc.children(doc.root()).find_map(|id| {
2248 if let NodeKind::DocumentType { ref name, .. } = doc.node(id).kind {
2249 Some(name.clone())
2250 } else {
2251 None
2252 }
2253 });
2254
2255 if let Some(ref expected_name) = doctype_name {
2256 if let Some(root_elem) = doc.root_element() {
2257 if let Some(actual_name) = doc.node_name(root_elem) {
2258 if actual_name != expected_name {
2259 errors.push(ValidationError {
2260 message: format!(
2261 "root element '{actual_name}' does not match \
2262 DOCTYPE name '{expected_name}'"
2263 ),
2264 line: None,
2265 column: None,
2266 });
2267 }
2268 }
2269 }
2270 }
2271}
2272
2273#[allow(clippy::too_many_arguments)]
2275fn validate_element_recursive(
2276 doc: &mut Document,
2277 dtd: &Dtd,
2278 node_id: NodeId,
2279 errors: &mut Vec<ValidationError>,
2280 warnings: &mut Vec<ValidationError>,
2281 id_values: &mut HashSet<String>,
2282 idref_values: &mut Vec<String>,
2283) {
2284 let elem_name = match doc.node_name(node_id) {
2285 Some(name) => name.to_string(),
2286 None => return,
2287 };
2288
2289 let has_element_decls = !dtd.elements.is_empty();
2291 if has_element_decls && !dtd.elements.contains_key(&elem_name) {
2292 errors.push(ValidationError {
2293 message: format!("element '{elem_name}' is not declared in the DTD"),
2294 line: None,
2295 column: None,
2296 });
2297 }
2298
2299 if let Some(elem_decl) = dtd.elements.get(&elem_name) {
2301 validate_content_model(doc, node_id, &elem_name, &elem_decl.content_model, errors);
2302 }
2303
2304 validate_attributes(
2306 doc,
2307 dtd,
2308 node_id,
2309 &elem_name,
2310 errors,
2311 warnings,
2312 id_values,
2313 idref_values,
2314 );
2315
2316 let child_ids: Vec<NodeId> = doc
2318 .children(node_id)
2319 .filter(|&child_id| matches!(doc.node(child_id).kind, NodeKind::Element { .. }))
2320 .collect();
2321
2322 for child_id in child_ids {
2324 validate_element_recursive(
2325 doc,
2326 dtd,
2327 child_id,
2328 errors,
2329 warnings,
2330 id_values,
2331 idref_values,
2332 );
2333 }
2334}
2335
2336fn validate_content_model(
2338 doc: &Document,
2339 node_id: NodeId,
2340 elem_name: &str,
2341 model: &ContentModel,
2342 errors: &mut Vec<ValidationError>,
2343) {
2344 match model {
2345 ContentModel::Empty => {
2346 let has_content = doc.children(node_id).any(|child| {
2348 matches!(
2349 doc.node(child).kind,
2350 NodeKind::Element { .. } | NodeKind::Text { .. } | NodeKind::CData { .. }
2351 )
2352 });
2353 if has_content {
2354 errors.push(ValidationError {
2355 message: format!(
2356 "element '{elem_name}' is declared EMPTY \
2357 but has content"
2358 ),
2359 line: None,
2360 column: None,
2361 });
2362 }
2363 }
2364 ContentModel::Any => {
2365 }
2367 ContentModel::Mixed(allowed_names) => {
2368 for child_id in doc.children(node_id) {
2370 if let NodeKind::Element { ref name, .. } = doc.node(child_id).kind {
2371 if !allowed_names.contains(name) {
2372 errors.push(ValidationError {
2373 message: format!(
2374 "element '{name}' is not allowed in mixed content \
2375 of '{elem_name}' (allowed: #PCDATA{})",
2376 if allowed_names.is_empty() {
2377 String::new()
2378 } else {
2379 format!("|{}", allowed_names.join("|"))
2380 }
2381 ),
2382 line: None,
2383 column: None,
2384 });
2385 }
2386 }
2387 }
2388 }
2389 ContentModel::Children(spec) => {
2390 let child_names: Vec<String> = doc
2392 .children(node_id)
2393 .filter_map(|child_id| {
2394 if let NodeKind::Element { ref name, .. } = doc.node(child_id).kind {
2395 Some(name.clone())
2396 } else {
2397 None
2398 }
2399 })
2400 .collect();
2401
2402 let has_text = doc.children(node_id).any(|child_id| {
2404 if let NodeKind::Text { ref content } = doc.node(child_id).kind {
2405 !content.trim().is_empty()
2406 } else {
2407 matches!(doc.node(child_id).kind, NodeKind::CData { .. })
2408 }
2409 });
2410
2411 if has_text {
2412 errors.push(ValidationError {
2413 message: format!(
2414 "element '{elem_name}' has element-only content model \
2415 but contains text"
2416 ),
2417 line: None,
2418 column: None,
2419 });
2420 }
2421
2422 let consumed = match_content_spec(spec, &child_names, 0);
2424 match consumed {
2425 Some(n) if n == child_names.len() => {
2426 }
2428 _ => {
2429 errors.push(ValidationError {
2430 message: format!(
2431 "element '{elem_name}' content does not match \
2432 declared content model {model}; \
2433 found children: [{}]",
2434 child_names.join(", ")
2435 ),
2436 line: None,
2437 column: None,
2438 });
2439 }
2440 }
2441 }
2442 }
2443}
2444
2445fn match_content_spec(spec: &ContentSpec, names: &[String], pos: usize) -> Option<usize> {
2450 match &spec.kind {
2451 ContentSpecKind::Name(expected) => match_with_occurrence(
2452 |all_names, p| {
2453 if p < all_names.len() && all_names[p] == *expected {
2454 Some(1)
2455 } else {
2456 None
2457 }
2458 },
2459 names,
2460 pos,
2461 spec.occurrence,
2462 ),
2463 ContentSpecKind::Seq(items) => match_with_occurrence(
2464 |all_names, p| {
2465 let mut current = p;
2466 for item in items {
2467 match match_content_spec(item, all_names, current) {
2468 Some(consumed) => current += consumed,
2469 None => return None,
2470 }
2471 }
2472 Some(current - p)
2473 },
2474 names,
2475 pos,
2476 spec.occurrence,
2477 ),
2478 ContentSpecKind::Choice(items) => match_with_occurrence(
2479 |all_names, p| {
2480 for item in items {
2481 if let Some(consumed) = match_content_spec(item, all_names, p) {
2482 return Some(consumed);
2483 }
2484 }
2485 None
2486 },
2487 names,
2488 pos,
2489 spec.occurrence,
2490 ),
2491 }
2492}
2493
2494fn match_with_occurrence(
2499 base_match: impl Fn(&[String], usize) -> Option<usize>,
2500 names: &[String],
2501 pos: usize,
2502 occurrence: Occurrence,
2503) -> Option<usize> {
2504 match occurrence {
2505 Occurrence::Once => base_match(names, pos),
2506 Occurrence::Optional => {
2507 Some(base_match(names, pos).unwrap_or(0))
2509 }
2510 Occurrence::ZeroOrMore | Occurrence::OneOrMore => {
2511 let mut total = 0;
2512 loop {
2513 match base_match(names, pos + total) {
2514 Some(0) | None => break, Some(n) => total += n,
2516 }
2517 }
2518 if occurrence == Occurrence::OneOrMore && total == 0 {
2520 None
2521 } else {
2522 Some(total)
2523 }
2524 }
2525 }
2526}
2527
2528#[allow(clippy::too_many_arguments)]
2530fn validate_attributes(
2531 doc: &mut Document,
2532 dtd: &Dtd,
2533 node_id: NodeId,
2534 elem_name: &str,
2535 errors: &mut Vec<ValidationError>,
2536 _warnings: &mut Vec<ValidationError>,
2537 id_values: &mut HashSet<String>,
2538 idref_values: &mut Vec<String>,
2539) {
2540 let attr_decls = dtd.attributes.get(elem_name);
2541 let actual_attrs = doc.attributes(node_id).to_vec();
2542
2543 if let Some(decls) = attr_decls {
2544 for decl in decls {
2546 let actual = actual_attrs.iter().find(|a| a.name == decl.attribute_name);
2547
2548 match (&decl.default, actual) {
2549 (AttributeDefault::Required, None) => {
2550 errors.push(ValidationError {
2551 message: format!(
2552 "required attribute '{}' missing on element '{elem_name}'",
2553 decl.attribute_name
2554 ),
2555 line: None,
2556 column: None,
2557 });
2558 }
2559 (AttributeDefault::Fixed(fixed_val), Some(attr)) => {
2560 if attr.value != *fixed_val {
2561 errors.push(ValidationError {
2562 message: format!(
2563 "attribute '{}' on element '{elem_name}' must have \
2564 fixed value '{fixed_val}', found '{}'",
2565 decl.attribute_name, attr.value
2566 ),
2567 line: None,
2568 column: None,
2569 });
2570 }
2571 }
2572 _ => {}
2573 }
2574
2575 if let Some(attr) = actual {
2577 validate_attribute_type(
2578 doc,
2579 node_id,
2580 &attr.value,
2581 &decl.attribute_type,
2582 &decl.attribute_name,
2583 elem_name,
2584 errors,
2585 id_values,
2586 idref_values,
2587 );
2588 }
2589 }
2590
2591 for attr in &actual_attrs {
2593 if attr.name == "xmlns" || attr.prefix.as_deref() == Some("xmlns") {
2594 continue;
2595 }
2596 let is_declared = decls.iter().any(|d| d.attribute_name == attr.name);
2597 if !is_declared {
2598 errors.push(ValidationError {
2599 message: format!(
2600 "attribute '{}' on element '{elem_name}' is not declared in the DTD",
2601 attr.name
2602 ),
2603 line: None,
2604 column: None,
2605 });
2606 }
2607 }
2608 }
2609}
2610
2611#[allow(clippy::too_many_arguments)]
2613fn validate_attribute_type(
2614 doc: &mut Document,
2615 node_id: NodeId,
2616 value: &str,
2617 attr_type: &AttributeType,
2618 attr_name: &str,
2619 elem_name: &str,
2620 errors: &mut Vec<ValidationError>,
2621 id_values: &mut HashSet<String>,
2622 idref_values: &mut Vec<String>,
2623) {
2624 match attr_type {
2625 AttributeType::CData => {
2626 }
2628 AttributeType::Id => {
2629 validate_id_value(doc, node_id, value, attr_name, elem_name, errors, id_values);
2630 }
2631 AttributeType::IdRef => {
2632 validate_idref_value(value, attr_name, elem_name, errors, idref_values);
2633 }
2634 AttributeType::IdRefs => {
2635 validate_idrefs_value(value, attr_name, elem_name, errors, idref_values);
2636 }
2637 AttributeType::NmToken => {
2638 validate_nmtoken_value(value, attr_name, elem_name, errors);
2639 }
2640 AttributeType::NmTokens => {
2641 validate_nmtokens_value(value, attr_name, elem_name, errors);
2642 }
2643 AttributeType::Enumeration(values) | AttributeType::Notation(values) => {
2644 validate_enumeration_value(value, values, attr_name, elem_name, errors);
2645 }
2646 AttributeType::Entity | AttributeType::Entities => {
2647 validate_entity_value(value, attr_type, attr_name, elem_name, errors);
2648 }
2649 }
2650}
2651
2652fn validate_id_value(
2657 doc: &mut Document,
2658 node_id: NodeId,
2659 value: &str,
2660 attr_name: &str,
2661 elem_name: &str,
2662 errors: &mut Vec<ValidationError>,
2663 id_values: &mut HashSet<String>,
2664) {
2665 if !is_valid_name(value) {
2666 errors.push(ValidationError {
2667 message: format!(
2668 "attribute '{attr_name}' on element '{elem_name}' \
2669 has invalid ID value '{value}' (not a valid XML Name)"
2670 ),
2671 line: None,
2672 column: None,
2673 });
2674 } else if !id_values.insert(value.to_string()) {
2675 errors.push(ValidationError {
2676 message: format!(
2677 "duplicate ID value '{value}' on attribute '{attr_name}' \
2678 of element '{elem_name}'"
2679 ),
2680 line: None,
2681 column: None,
2682 });
2683 } else {
2684 doc.set_id(value, node_id);
2685 }
2686}
2687
2688fn validate_idref_value(
2690 value: &str,
2691 attr_name: &str,
2692 elem_name: &str,
2693 errors: &mut Vec<ValidationError>,
2694 idref_values: &mut Vec<String>,
2695) {
2696 if is_valid_name(value) {
2697 idref_values.push(value.to_string());
2698 } else {
2699 errors.push(ValidationError {
2700 message: format!(
2701 "attribute '{attr_name}' on element '{elem_name}' \
2702 has invalid IDREF value '{value}'"
2703 ),
2704 line: None,
2705 column: None,
2706 });
2707 }
2708}
2709
2710fn validate_idrefs_value(
2712 value: &str,
2713 attr_name: &str,
2714 elem_name: &str,
2715 errors: &mut Vec<ValidationError>,
2716 idref_values: &mut Vec<String>,
2717) {
2718 for token in value.split_whitespace() {
2719 if is_valid_name(token) {
2720 idref_values.push(token.to_string());
2721 } else {
2722 errors.push(ValidationError {
2723 message: format!(
2724 "attribute '{attr_name}' on element '{elem_name}' \
2725 has invalid IDREFS token '{token}'"
2726 ),
2727 line: None,
2728 column: None,
2729 });
2730 }
2731 }
2732}
2733
2734fn validate_nmtoken_value(
2736 value: &str,
2737 attr_name: &str,
2738 elem_name: &str,
2739 errors: &mut Vec<ValidationError>,
2740) {
2741 if !is_valid_nmtoken(value) {
2742 errors.push(ValidationError {
2743 message: format!(
2744 "attribute '{attr_name}' on element '{elem_name}' \
2745 has invalid NMTOKEN value '{value}'"
2746 ),
2747 line: None,
2748 column: None,
2749 });
2750 }
2751}
2752
2753fn validate_nmtokens_value(
2755 value: &str,
2756 attr_name: &str,
2757 elem_name: &str,
2758 errors: &mut Vec<ValidationError>,
2759) {
2760 for token in value.split_whitespace() {
2761 if !is_valid_nmtoken(token) {
2762 errors.push(ValidationError {
2763 message: format!(
2764 "attribute '{attr_name}' on element '{elem_name}' \
2765 has invalid NMTOKENS token '{token}'"
2766 ),
2767 line: None,
2768 column: None,
2769 });
2770 }
2771 }
2772}
2773
2774fn validate_enumeration_value(
2776 value: &str,
2777 allowed: &[String],
2778 attr_name: &str,
2779 elem_name: &str,
2780 errors: &mut Vec<ValidationError>,
2781) {
2782 if !allowed.contains(&value.to_string()) {
2783 errors.push(ValidationError {
2784 message: format!(
2785 "attribute '{attr_name}' on element '{elem_name}' \
2786 has value '{value}' which is not in the allowed \
2787 values ({})",
2788 allowed.join("|")
2789 ),
2790 line: None,
2791 column: None,
2792 });
2793 }
2794}
2795
2796fn validate_entity_value(
2798 value: &str,
2799 attr_type: &AttributeType,
2800 attr_name: &str,
2801 elem_name: &str,
2802 errors: &mut Vec<ValidationError>,
2803) {
2804 let tokens: Vec<&str> = if matches!(attr_type, AttributeType::Entities) {
2807 value.split_whitespace().collect()
2808 } else {
2809 vec![value]
2810 };
2811 for token in tokens {
2812 if !is_valid_name(token) {
2813 errors.push(ValidationError {
2814 message: format!(
2815 "attribute '{attr_name}' on element '{elem_name}' \
2816 has invalid ENTITY/ENTITIES value '{token}'"
2817 ),
2818 line: None,
2819 column: None,
2820 });
2821 }
2822 }
2823}
2824
2825fn is_valid_name(s: &str) -> bool {
2827 let mut chars = s.chars();
2828 match chars.next() {
2829 Some(first) if is_name_start_char(first) => chars.all(is_name_char),
2830 _ => false,
2831 }
2832}
2833
2834fn is_valid_nmtoken(s: &str) -> bool {
2836 !s.is_empty() && s.chars().all(is_name_char)
2837}
2838
2839#[cfg(test)]
2844#[allow(clippy::unwrap_used)]
2845mod tests {
2846 use super::*;
2847 use pretty_assertions::assert_eq;
2848
2849 #[test]
2852 fn test_parse_element_empty() {
2853 let dtd = parse_dtd("<!ELEMENT br EMPTY>").unwrap();
2854 let decl = dtd.elements.get("br").unwrap();
2855 assert_eq!(decl.content_model, ContentModel::Empty);
2856 }
2857
2858 #[test]
2859 fn test_parse_element_any() {
2860 let dtd = parse_dtd("<!ELEMENT container ANY>").unwrap();
2861 let decl = dtd.elements.get("container").unwrap();
2862 assert_eq!(decl.content_model, ContentModel::Any);
2863 }
2864
2865 #[test]
2866 fn test_parse_element_pcdata() {
2867 let dtd = parse_dtd("<!ELEMENT title (#PCDATA)>").unwrap();
2868 let decl = dtd.elements.get("title").unwrap();
2869 assert_eq!(decl.content_model, ContentModel::Mixed(vec![]));
2870 }
2871
2872 #[test]
2873 fn test_parse_element_mixed_content() {
2874 let dtd = parse_dtd("<!ELEMENT p (#PCDATA|em|strong)*>").unwrap();
2875 let decl = dtd.elements.get("p").unwrap();
2876 assert_eq!(
2877 decl.content_model,
2878 ContentModel::Mixed(vec!["em".to_string(), "strong".to_string()])
2879 );
2880 }
2881
2882 #[test]
2883 fn test_parse_element_sequence() {
2884 let dtd = parse_dtd("<!ELEMENT book (title,author,year)>").unwrap();
2885 let decl = dtd.elements.get("book").unwrap();
2886 match &decl.content_model {
2887 ContentModel::Children(spec) => {
2888 assert_eq!(spec.occurrence, Occurrence::Once);
2889 match &spec.kind {
2890 ContentSpecKind::Seq(items) => {
2891 assert_eq!(items.len(), 3);
2892 assert_eq!(items[0].kind, ContentSpecKind::Name("title".to_string()));
2893 assert_eq!(items[1].kind, ContentSpecKind::Name("author".to_string()));
2894 assert_eq!(items[2].kind, ContentSpecKind::Name("year".to_string()));
2895 }
2896 other => panic!("expected Seq, got {other:?}"),
2897 }
2898 }
2899 other => panic!("expected Children, got {other:?}"),
2900 }
2901 }
2902
2903 #[test]
2904 fn test_parse_element_choice() {
2905 let dtd = parse_dtd("<!ELEMENT item (a|b|c)>").unwrap();
2906 let decl = dtd.elements.get("item").unwrap();
2907 match &decl.content_model {
2908 ContentModel::Children(spec) => match &spec.kind {
2909 ContentSpecKind::Choice(items) => {
2910 assert_eq!(items.len(), 3);
2911 }
2912 other => panic!("expected Choice, got {other:?}"),
2913 },
2914 other => panic!("expected Children, got {other:?}"),
2915 }
2916 }
2917
2918 #[test]
2919 fn test_parse_element_occurrence_indicators() {
2920 let dtd = parse_dtd("<!ELEMENT doc (head, body?, appendix*)>").unwrap();
2921 let decl = dtd.elements.get("doc").unwrap();
2922 match &decl.content_model {
2923 ContentModel::Children(spec) => match &spec.kind {
2924 ContentSpecKind::Seq(items) => {
2925 assert_eq!(items[0].occurrence, Occurrence::Once);
2926 assert_eq!(items[1].occurrence, Occurrence::Optional);
2927 assert_eq!(items[2].occurrence, Occurrence::ZeroOrMore);
2928 }
2929 other => panic!("expected Seq, got {other:?}"),
2930 },
2931 other => panic!("expected Children, got {other:?}"),
2932 }
2933 }
2934
2935 #[test]
2936 fn test_parse_element_nested_groups() {
2937 let dtd = parse_dtd("<!ELEMENT article ((title, author), body)>").unwrap();
2938 let decl = dtd.elements.get("article").unwrap();
2939 match &decl.content_model {
2940 ContentModel::Children(spec) => match &spec.kind {
2941 ContentSpecKind::Seq(items) => {
2942 assert_eq!(items.len(), 2);
2943 match &items[0].kind {
2945 ContentSpecKind::Seq(inner) => {
2946 assert_eq!(inner.len(), 2);
2947 }
2948 other => panic!("expected nested Seq, got {other:?}"),
2949 }
2950 }
2951 other => panic!("expected Seq, got {other:?}"),
2952 },
2953 other => panic!("expected Children, got {other:?}"),
2954 }
2955 }
2956
2957 #[test]
2958 fn test_parse_attlist_cdata() {
2959 let dtd = parse_dtd("<!ATTLIST img src CDATA #REQUIRED>").unwrap();
2960 let decls = dtd.attributes.get("img").unwrap();
2961 assert_eq!(decls.len(), 1);
2962 assert_eq!(decls[0].attribute_name, "src");
2963 assert_eq!(decls[0].attribute_type, AttributeType::CData);
2964 assert_eq!(decls[0].default, AttributeDefault::Required);
2965 }
2966
2967 #[test]
2968 fn test_parse_attlist_id() {
2969 let dtd = parse_dtd("<!ATTLIST div id ID #IMPLIED>").unwrap();
2970 let decls = dtd.attributes.get("div").unwrap();
2971 assert_eq!(decls[0].attribute_type, AttributeType::Id);
2972 assert_eq!(decls[0].default, AttributeDefault::Implied);
2973 }
2974
2975 #[test]
2976 fn test_parse_attlist_enumeration() {
2977 let dtd = parse_dtd("<!ATTLIST input type (text|password|submit) \"text\">").unwrap();
2978 let decls = dtd.attributes.get("input").unwrap();
2979 assert_eq!(
2980 decls[0].attribute_type,
2981 AttributeType::Enumeration(vec![
2982 "text".to_string(),
2983 "password".to_string(),
2984 "submit".to_string()
2985 ])
2986 );
2987 assert_eq!(
2988 decls[0].default,
2989 AttributeDefault::Default("text".to_string())
2990 );
2991 }
2992
2993 #[test]
2994 fn test_parse_attlist_fixed() {
2995 let dtd = parse_dtd("<!ATTLIST doc version CDATA #FIXED \"1.0\">").unwrap();
2996 let decls = dtd.attributes.get("doc").unwrap();
2997 assert_eq!(decls[0].default, AttributeDefault::Fixed("1.0".to_string()));
2998 }
2999
3000 #[test]
3001 fn test_parse_attlist_multiple_attrs() {
3002 let dtd =
3003 parse_dtd("<!ATTLIST person\n name CDATA #REQUIRED\n age NMTOKEN #IMPLIED>").unwrap();
3004 let decls = dtd.attributes.get("person").unwrap();
3005 assert_eq!(decls.len(), 2);
3006 assert_eq!(decls[0].attribute_name, "name");
3007 assert_eq!(decls[1].attribute_name, "age");
3008 assert_eq!(decls[1].attribute_type, AttributeType::NmToken);
3009 }
3010
3011 #[test]
3012 fn test_parse_entity_internal() {
3013 let dtd = parse_dtd("<!ENTITY copy \"©\">").unwrap();
3014 let ent = dtd.entities.get("copy").unwrap();
3015 match &ent.kind {
3016 EntityKind::Internal(value) => assert_eq!(value, "©"),
3017 EntityKind::External { .. } => panic!("expected Internal, got External"),
3018 }
3019 }
3020
3021 #[test]
3022 fn test_parse_entity_external() {
3023 let dtd = parse_dtd("<!ENTITY chapter SYSTEM \"chapter.xml\">").unwrap();
3024 let ent = dtd.entities.get("chapter").unwrap();
3025 match &ent.kind {
3026 EntityKind::External {
3027 system_id,
3028 public_id,
3029 } => {
3030 assert_eq!(system_id, "chapter.xml");
3031 assert_eq!(*public_id, None);
3032 }
3033 EntityKind::Internal(val) => panic!("expected External, got Internal({val})"),
3034 }
3035 }
3036
3037 #[test]
3038 fn test_parse_notation() {
3039 let dtd = parse_dtd("<!NOTATION png SYSTEM \"image/png\">").unwrap();
3040 let notation = dtd.notations.get("png").unwrap();
3041 assert_eq!(notation.system_id.as_deref(), Some("image/png"));
3042 }
3043
3044 #[test]
3045 fn test_parse_dtd_with_comments() {
3046 let dtd = parse_dtd(
3047 "<!-- element declarations -->\n\
3048 <!ELEMENT root (#PCDATA)>\n\
3049 <!-- end -->",
3050 )
3051 .unwrap();
3052 assert!(dtd.elements.contains_key("root"));
3053 }
3054
3055 #[test]
3056 fn test_parse_dtd_complex() {
3057 let input = "\
3058 <!ELEMENT doc (head, body)>\n\
3059 <!ELEMENT head (title)>\n\
3060 <!ELEMENT title (#PCDATA)>\n\
3061 <!ELEMENT body (p+)>\n\
3062 <!ELEMENT p (#PCDATA|em)*>\n\
3063 <!ELEMENT em (#PCDATA)>\n\
3064 <!ATTLIST doc version CDATA #FIXED \"1.0\">\n\
3065 <!ATTLIST p id ID #IMPLIED>\n\
3066 <!ENTITY copyright \"Copyright 2024\">\n";
3067 let dtd = parse_dtd(input).unwrap();
3068 assert_eq!(dtd.elements.len(), 6);
3069 assert!(dtd.attributes.contains_key("doc"));
3070 assert!(dtd.attributes.contains_key("p"));
3071 assert!(dtd.entities.contains_key("copyright"));
3072 }
3073
3074 fn make_doc(xml: &str) -> Document {
3077 Document::parse_str(xml).unwrap()
3078 }
3079
3080 #[test]
3081 fn test_validate_valid_document() {
3082 let dtd = parse_dtd("<!ELEMENT root (#PCDATA)>").unwrap();
3083 let mut doc = make_doc("<!DOCTYPE root><root>hello</root>");
3084 let result = validate(&mut doc, &dtd);
3085 assert!(result.is_valid, "errors: {:?}", result.errors);
3086 }
3087
3088 #[test]
3089 fn test_validate_root_name_mismatch() {
3090 let dtd = parse_dtd("<!ELEMENT root (#PCDATA)>").unwrap();
3091 let mut doc = make_doc("<!DOCTYPE root><other>text</other>");
3092 let result = validate(&mut doc, &dtd);
3093 assert!(!result.is_valid);
3094 assert!(
3095 result
3096 .errors
3097 .iter()
3098 .any(|e| e.message.contains("root element 'other'")
3099 && e.message.contains("does not match DOCTYPE name 'root'")),
3100 "errors: {:?}",
3101 result.errors
3102 );
3103 }
3104
3105 #[test]
3106 fn test_validate_empty_element() {
3107 let dtd = parse_dtd("<!ELEMENT br EMPTY>").unwrap();
3108 let mut doc = make_doc("<!DOCTYPE br><br/>");
3109 let result = validate(&mut doc, &dtd);
3110 assert!(result.is_valid, "errors: {:?}", result.errors);
3111 }
3112
3113 #[test]
3114 fn test_validate_empty_element_has_content() {
3115 let dtd = parse_dtd("<!ELEMENT br EMPTY>").unwrap();
3116 let mut doc = make_doc("<!DOCTYPE br><br>text</br>");
3117 let result = validate(&mut doc, &dtd);
3118 assert!(!result.is_valid);
3119 assert!(
3120 result
3121 .errors
3122 .iter()
3123 .any(|e| e.message.contains("EMPTY") && e.message.contains("has content")),
3124 "errors: {:?}",
3125 result.errors
3126 );
3127 }
3128
3129 #[test]
3130 fn test_validate_any_content() {
3131 let dtd = parse_dtd(
3132 "<!ELEMENT container ANY>\n\
3133 <!ELEMENT child (#PCDATA)>",
3134 )
3135 .unwrap();
3136 let mut doc = make_doc("<!DOCTYPE container><container><child>text</child></container>");
3137 let result = validate(&mut doc, &dtd);
3138 assert!(result.is_valid, "errors: {:?}", result.errors);
3139 }
3140
3141 #[test]
3142 fn test_validate_sequence_correct() {
3143 let dtd = parse_dtd(
3144 "<!ELEMENT book (title,author)>\n\
3145 <!ELEMENT title (#PCDATA)>\n\
3146 <!ELEMENT author (#PCDATA)>",
3147 )
3148 .unwrap();
3149 let mut doc = make_doc(
3150 "<!DOCTYPE book>\
3151 <book><title>XML</title><author>Jon</author></book>",
3152 );
3153 let result = validate(&mut doc, &dtd);
3154 assert!(result.is_valid, "errors: {:?}", result.errors);
3155 }
3156
3157 #[test]
3158 fn test_validate_sequence_wrong_order() {
3159 let dtd = parse_dtd(
3160 "<!ELEMENT book (title,author)>\n\
3161 <!ELEMENT title (#PCDATA)>\n\
3162 <!ELEMENT author (#PCDATA)>",
3163 )
3164 .unwrap();
3165 let mut doc = make_doc(
3166 "<!DOCTYPE book>\
3167 <book><author>Jon</author><title>XML</title></book>",
3168 );
3169 let result = validate(&mut doc, &dtd);
3170 assert!(!result.is_valid);
3171 assert!(
3172 result
3173 .errors
3174 .iter()
3175 .any(|e| e.message.contains("content does not match")),
3176 "errors: {:?}",
3177 result.errors
3178 );
3179 }
3180
3181 #[test]
3182 fn test_validate_required_attribute_missing() {
3183 let dtd = parse_dtd(
3184 "<!ELEMENT img EMPTY>\n\
3185 <!ATTLIST img src CDATA #REQUIRED>",
3186 )
3187 .unwrap();
3188 let mut doc = make_doc("<!DOCTYPE img><img/>");
3189 let result = validate(&mut doc, &dtd);
3190 assert!(!result.is_valid);
3191 assert!(
3192 result
3193 .errors
3194 .iter()
3195 .any(|e| e.message.contains("required attribute 'src'")),
3196 "errors: {:?}",
3197 result.errors
3198 );
3199 }
3200
3201 #[test]
3202 fn test_validate_required_attribute_present() {
3203 let dtd = parse_dtd(
3204 "<!ELEMENT img EMPTY>\n\
3205 <!ATTLIST img src CDATA #REQUIRED>",
3206 )
3207 .unwrap();
3208 let mut doc = make_doc("<!DOCTYPE img><img src=\"photo.jpg\"/>");
3209 let result = validate(&mut doc, &dtd);
3210 assert!(result.is_valid, "errors: {:?}", result.errors);
3211 }
3212
3213 #[test]
3214 fn test_validate_fixed_attribute_correct() {
3215 let dtd = parse_dtd(
3216 "<!ELEMENT doc (#PCDATA)>\n\
3217 <!ATTLIST doc version CDATA #FIXED \"1.0\">",
3218 )
3219 .unwrap();
3220 let mut doc = make_doc("<!DOCTYPE doc><doc version=\"1.0\">text</doc>");
3221 let result = validate(&mut doc, &dtd);
3222 assert!(result.is_valid, "errors: {:?}", result.errors);
3223 }
3224
3225 #[test]
3226 fn test_validate_fixed_attribute_wrong_value() {
3227 let dtd = parse_dtd(
3228 "<!ELEMENT doc (#PCDATA)>\n\
3229 <!ATTLIST doc version CDATA #FIXED \"1.0\">",
3230 )
3231 .unwrap();
3232 let mut doc = make_doc("<!DOCTYPE doc><doc version=\"2.0\">text</doc>");
3233 let result = validate(&mut doc, &dtd);
3234 assert!(!result.is_valid);
3235 assert!(
3236 result
3237 .errors
3238 .iter()
3239 .any(|e| e.message.contains("fixed value '1.0'")),
3240 "errors: {:?}",
3241 result.errors
3242 );
3243 }
3244
3245 #[test]
3246 fn test_validate_enumeration_valid() {
3247 let dtd = parse_dtd(
3248 "<!ELEMENT input EMPTY>\n\
3249 <!ATTLIST input type (text|password) #REQUIRED>",
3250 )
3251 .unwrap();
3252 let mut doc = make_doc("<!DOCTYPE input><input type=\"text\"/>");
3253 let result = validate(&mut doc, &dtd);
3254 assert!(result.is_valid, "errors: {:?}", result.errors);
3255 }
3256
3257 #[test]
3258 fn test_validate_enumeration_invalid() {
3259 let dtd = parse_dtd(
3260 "<!ELEMENT input EMPTY>\n\
3261 <!ATTLIST input type (text|password) #REQUIRED>",
3262 )
3263 .unwrap();
3264 let mut doc = make_doc("<!DOCTYPE input><input type=\"checkbox\"/>");
3265 let result = validate(&mut doc, &dtd);
3266 assert!(!result.is_valid);
3267 assert!(
3268 result
3269 .errors
3270 .iter()
3271 .any(|e| e.message.contains("not in the allowed values")),
3272 "errors: {:?}",
3273 result.errors
3274 );
3275 }
3276
3277 #[test]
3278 fn test_validate_duplicate_id() {
3279 let dtd = parse_dtd(
3280 "<!ELEMENT root (item, item)>\n\
3281 <!ELEMENT item (#PCDATA)>\n\
3282 <!ATTLIST item id ID #REQUIRED>",
3283 )
3284 .unwrap();
3285 let mut doc = make_doc(
3286 "<!DOCTYPE root>\
3287 <root>\
3288 <item id=\"a\">first</item>\
3289 <item id=\"a\">second</item>\
3290 </root>",
3291 );
3292 let result = validate(&mut doc, &dtd);
3293 assert!(!result.is_valid);
3294 assert!(
3295 result
3296 .errors
3297 .iter()
3298 .any(|e| e.message.contains("duplicate ID value 'a'")),
3299 "errors: {:?}",
3300 result.errors
3301 );
3302 }
3303
3304 #[test]
3305 fn test_validate_idref_valid() {
3306 let dtd = parse_dtd(
3307 "<!ELEMENT root (item, ref)>\n\
3308 <!ELEMENT item (#PCDATA)>\n\
3309 <!ELEMENT ref (#PCDATA)>\n\
3310 <!ATTLIST item id ID #REQUIRED>\n\
3311 <!ATTLIST ref target IDREF #REQUIRED>",
3312 )
3313 .unwrap();
3314 let mut doc = make_doc(
3315 "<!DOCTYPE root>\
3316 <root>\
3317 <item id=\"x\">item</item>\
3318 <ref target=\"x\">ref</ref>\
3319 </root>",
3320 );
3321 let result = validate(&mut doc, &dtd);
3322 assert!(result.is_valid, "errors: {:?}", result.errors);
3323 }
3324
3325 #[test]
3326 fn test_validate_idref_dangling() {
3327 let dtd = parse_dtd(
3328 "<!ELEMENT root (ref)>\n\
3329 <!ELEMENT ref (#PCDATA)>\n\
3330 <!ATTLIST ref target IDREF #REQUIRED>",
3331 )
3332 .unwrap();
3333 let mut doc = make_doc(
3334 "<!DOCTYPE root>\
3335 <root><ref target=\"nonexistent\">ref</ref></root>",
3336 );
3337 let result = validate(&mut doc, &dtd);
3338 assert!(!result.is_valid);
3339 assert!(
3340 result.errors.iter().any(|e| e
3341 .message
3342 .contains("IDREF 'nonexistent' does not match any ID")),
3343 "errors: {:?}",
3344 result.errors
3345 );
3346 }
3347
3348 #[test]
3349 fn test_validate_undeclared_element() {
3350 let dtd = parse_dtd("<!ELEMENT root (child)>\n<!ELEMENT child (#PCDATA)>").unwrap();
3351 let mut doc = make_doc("<!DOCTYPE root><root><unknown/></root>");
3352 let result = validate(&mut doc, &dtd);
3353 assert!(!result.is_valid);
3354 assert!(
3355 result
3356 .errors
3357 .iter()
3358 .any(|e| e.message.contains("element 'unknown' is not declared")),
3359 "errors: {:?}",
3360 result.errors
3361 );
3362 }
3363
3364 #[test]
3365 fn test_validate_undeclared_attribute() {
3366 let dtd = parse_dtd(
3367 "<!ELEMENT root (#PCDATA)>\n\
3368 <!ATTLIST root id ID #IMPLIED>",
3369 )
3370 .unwrap();
3371 let mut doc = make_doc("<!DOCTYPE root><root id=\"x\" bogus=\"y\">text</root>");
3372 let result = validate(&mut doc, &dtd);
3373 assert!(!result.is_valid);
3374 assert!(
3375 result
3376 .errors
3377 .iter()
3378 .any(|e| e.message.contains("attribute 'bogus'")
3379 && e.message.contains("not declared")),
3380 "errors: {:?}",
3381 result.errors
3382 );
3383 }
3384
3385 #[test]
3386 fn test_validate_mixed_content_valid() {
3387 let dtd = parse_dtd(
3388 "<!ELEMENT p (#PCDATA|em|strong)*>\n\
3389 <!ELEMENT em (#PCDATA)>\n\
3390 <!ELEMENT strong (#PCDATA)>",
3391 )
3392 .unwrap();
3393 let mut doc = make_doc(
3394 "<!DOCTYPE p>\
3395 <p>Hello <em>world</em> and <strong>friends</strong></p>",
3396 );
3397 let result = validate(&mut doc, &dtd);
3398 assert!(result.is_valid, "errors: {:?}", result.errors);
3399 }
3400
3401 #[test]
3402 fn test_validate_mixed_content_invalid_child() {
3403 let dtd = parse_dtd(
3404 "<!ELEMENT p (#PCDATA|em)*>\n\
3405 <!ELEMENT em (#PCDATA)>\n\
3406 <!ELEMENT b (#PCDATA)>",
3407 )
3408 .unwrap();
3409 let mut doc = make_doc(
3410 "<!DOCTYPE p>\
3411 <p>Hello <b>world</b></p>",
3412 );
3413 let result = validate(&mut doc, &dtd);
3414 assert!(!result.is_valid);
3415 assert!(
3416 result
3417 .errors
3418 .iter()
3419 .any(|e| e.message.contains("'b' is not allowed in mixed content")),
3420 "errors: {:?}",
3421 result.errors
3422 );
3423 }
3424
3425 #[test]
3426 fn test_validate_choice_correct() {
3427 let dtd = parse_dtd(
3428 "<!ELEMENT item (a|b)>\n\
3429 <!ELEMENT a (#PCDATA)>\n\
3430 <!ELEMENT b (#PCDATA)>",
3431 )
3432 .unwrap();
3433 let mut doc = make_doc("<!DOCTYPE item><item><b>hello</b></item>");
3434 let result = validate(&mut doc, &dtd);
3435 assert!(result.is_valid, "errors: {:?}", result.errors);
3436 }
3437
3438 #[test]
3439 fn test_validate_one_or_more() {
3440 let dtd = parse_dtd("<!ELEMENT list (item+)>\n<!ELEMENT item (#PCDATA)>").unwrap();
3441
3442 let mut doc = make_doc("<!DOCTYPE list><list><item>a</item></list>");
3444 assert!(validate(&mut doc, &dtd).is_valid);
3445
3446 let mut doc = make_doc("<!DOCTYPE list><list><item>a</item><item>b</item></list>");
3448 assert!(validate(&mut doc, &dtd).is_valid);
3449
3450 let mut doc = make_doc("<!DOCTYPE list><list></list>");
3452 assert!(!validate(&mut doc, &dtd).is_valid);
3453 }
3454
3455 #[test]
3456 fn test_validate_zero_or_more() {
3457 let dtd = parse_dtd("<!ELEMENT list (item*)>\n<!ELEMENT item (#PCDATA)>").unwrap();
3458
3459 let mut doc = make_doc("<!DOCTYPE list><list></list>");
3461 assert!(validate(&mut doc, &dtd).is_valid);
3462
3463 let mut doc = make_doc("<!DOCTYPE list><list><item>a</item><item>b</item></list>");
3465 assert!(validate(&mut doc, &dtd).is_valid);
3466 }
3467
3468 #[test]
3469 fn test_validate_optional_element() {
3470 let dtd = parse_dtd(
3471 "<!ELEMENT doc (title, subtitle?)>\n\
3472 <!ELEMENT title (#PCDATA)>\n\
3473 <!ELEMENT subtitle (#PCDATA)>",
3474 )
3475 .unwrap();
3476
3477 let mut doc = make_doc(
3479 "<!DOCTYPE doc>\
3480 <doc><title>T</title><subtitle>S</subtitle></doc>",
3481 );
3482 assert!(validate(&mut doc, &dtd).is_valid);
3483
3484 let mut doc = make_doc("<!DOCTYPE doc><doc><title>T</title></doc>");
3486 assert!(validate(&mut doc, &dtd).is_valid);
3487 }
3488
3489 #[test]
3490 fn test_content_model_display() {
3491 assert_eq!(ContentModel::Empty.to_string(), "EMPTY");
3492 assert_eq!(ContentModel::Any.to_string(), "ANY");
3493 assert_eq!(ContentModel::Mixed(vec![]).to_string(), "(#PCDATA)");
3494 assert_eq!(
3495 ContentModel::Mixed(vec!["a".to_string(), "b".to_string()]).to_string(),
3496 "(#PCDATA|a|b)*"
3497 );
3498
3499 let spec = ContentSpec {
3500 kind: ContentSpecKind::Seq(vec![
3501 ContentSpec {
3502 kind: ContentSpecKind::Name("a".to_string()),
3503 occurrence: Occurrence::Once,
3504 },
3505 ContentSpec {
3506 kind: ContentSpecKind::Name("b".to_string()),
3507 occurrence: Occurrence::ZeroOrMore,
3508 },
3509 ]),
3510 occurrence: Occurrence::Once,
3511 };
3512 assert_eq!(ContentModel::Children(spec).to_string(), "(a , b*)");
3513 }
3514
3515 #[test]
3516 fn test_parse_attlist_idref_idrefs() {
3517 let dtd = parse_dtd(
3518 "<!ATTLIST link target IDREF #REQUIRED>\n\
3519 <!ATTLIST group members IDREFS #REQUIRED>",
3520 )
3521 .unwrap();
3522 let link_decls = dtd.attributes.get("link").unwrap();
3523 assert_eq!(link_decls[0].attribute_type, AttributeType::IdRef);
3524 let group_decls = dtd.attributes.get("group").unwrap();
3525 assert_eq!(group_decls[0].attribute_type, AttributeType::IdRefs);
3526 }
3527
3528 #[test]
3529 fn test_validate_element_content_with_text() {
3530 let dtd = parse_dtd("<!ELEMENT book (title)>\n<!ELEMENT title (#PCDATA)>").unwrap();
3531 let mut doc = make_doc("<!DOCTYPE book><book>stray text<title>T</title></book>");
3532 let result = validate(&mut doc, &dtd);
3533 assert!(!result.is_valid);
3534 assert!(
3535 result
3536 .errors
3537 .iter()
3538 .any(|e| e.message.contains("element-only content model")
3539 && e.message.contains("contains text")),
3540 "errors: {:?}",
3541 result.errors
3542 );
3543 }
3544
3545 #[test]
3546 fn test_parse_entity_public() {
3547 let dtd = parse_dtd("<!ENTITY logo PUBLIC \"-//LOGO//\" \"logo.png\">").unwrap();
3548 let ent = dtd.entities.get("logo").unwrap();
3549 match &ent.kind {
3550 EntityKind::External {
3551 system_id,
3552 public_id,
3553 } => {
3554 assert_eq!(system_id, "logo.png");
3555 assert_eq!(public_id.as_deref(), Some("-//LOGO//"));
3556 }
3557 EntityKind::Internal(val) => panic!("expected External, got Internal({val})"),
3558 }
3559 }
3560
3561 #[test]
3562 fn test_parse_notation_public() {
3563 let dtd = parse_dtd("<!NOTATION gif PUBLIC \"-//GIF//\">").unwrap();
3564 let notation = dtd.notations.get("gif").unwrap();
3565 assert_eq!(notation.public_id.as_deref(), Some("-//GIF//"));
3566 assert_eq!(notation.system_id, None);
3567 }
3568
3569 #[test]
3570 fn test_parse_parameter_entity_skipped() {
3571 let dtd = parse_dtd(
3573 "<!ENTITY % common \"(#PCDATA)\">\n\
3574 <!ELEMENT root (#PCDATA)>",
3575 )
3576 .unwrap();
3577 assert!(dtd.elements.contains_key("root"));
3578 }
3579
3580 #[test]
3581 fn test_validate_nmtoken_attribute() {
3582 let dtd = parse_dtd(
3583 "<!ELEMENT root (#PCDATA)>\n\
3584 <!ATTLIST root token NMTOKEN #REQUIRED>",
3585 )
3586 .unwrap();
3587
3588 let mut doc = make_doc("<!DOCTYPE root><root token=\"abc-123\">text</root>");
3590 assert!(validate(&mut doc, &dtd).is_valid);
3591
3592 let mut doc = make_doc("<!DOCTYPE root><root token=\"abc 123\">text</root>");
3594 let result = validate(&mut doc, &dtd);
3595 assert!(!result.is_valid);
3596 assert!(
3597 result
3598 .errors
3599 .iter()
3600 .any(|e| e.message.contains("invalid NMTOKEN")),
3601 "errors: {:?}",
3602 result.errors
3603 );
3604 }
3605
3606 #[test]
3607 fn test_validate_populates_id_map() {
3608 let dtd = parse_dtd(
3609 "<!ELEMENT root (item*)>\n\
3610 <!ELEMENT item (#PCDATA)>\n\
3611 <!ATTLIST item id ID #REQUIRED>",
3612 )
3613 .unwrap();
3614 let mut doc =
3615 make_doc(r#"<!DOCTYPE root><root><item id="a">A</item><item id="b">B</item></root>"#);
3616 let result = validate(&mut doc, &dtd);
3617 assert!(result.is_valid, "errors: {:?}", result.errors);
3618
3619 let item_a = doc.element_by_id("a");
3621 assert!(item_a.is_some(), "expected to find element with id='a'");
3622 let item_b = doc.element_by_id("b");
3623 assert!(item_b.is_some(), "expected to find element with id='b'");
3624 assert_eq!(doc.element_by_id("c"), None);
3625
3626 assert_eq!(doc.node_name(item_a.unwrap()), Some("item"));
3628 assert_eq!(doc.node_name(item_b.unwrap()), Some("item"));
3629 }
3630}