Skip to main content

xmloxide/validation/
dtd.rs

1//! DTD (Document Type Definition) data model, parser, and validator.
2//!
3//! This module implements DTD processing as defined in XML 1.0 (Fifth Edition)
4//! sections 2.8, 3.2, 3.3, 3.4, and 4.2. It provides:
5//!
6//! - A data model for DTD declarations (elements, attributes, entities, notations)
7//! - A parser that processes DTD internal subset content
8//! - A validator that checks document conformance against a parsed DTD
9//!
10//! # Content Model Matching
11//!
12//! The validator implements deterministic content model matching for:
13//! - `EMPTY`: element must have no element or text children
14//! - `ANY`: any content is allowed
15//! - Mixed content `(#PCDATA|a|b)*`: text and listed elements in any order
16//! - Element content with sequences `(a,b,c)`, choices `(a|b|c)`, and
17//!   occurrence indicators `?`, `*`, `+`
18//!
19//! See XML 1.0 section 3.2 for the full content model specification.
20
21use std::collections::{HashMap, HashSet};
22use std::fmt;
23
24use crate::error::{ParseError, SourceLocation};
25use crate::tree::{Document, NodeId, NodeKind};
26
27use super::{ValidationError, ValidationResult};
28
29// ---------------------------------------------------------------------------
30// DTD Data Model
31// ---------------------------------------------------------------------------
32
/// A parsed DTD containing all declarations from the internal subset.
///
/// This is the result of [`parse_dtd`] and serves as input to [`validate`].
///
/// The maps give keyed access to individual declarations, while
/// `declarations` keeps them as an ordered list, which is what
/// [`serialize_dtd`] walks when writing the DTD back out.
#[derive(Debug, Clone, Default)]
pub struct Dtd {
    /// Element declarations, keyed by element name.
    pub elements: HashMap<String, ElementDecl>,
    /// Attribute declarations, keyed by a single name — presumably the
    /// owning element's name (confirm against the ATTLIST parser). Each
    /// value lists the attribute declarations stored under that key.
    pub attributes: HashMap<String, Vec<AttributeDecl>>,
    /// General entity declarations, keyed by entity name.
    pub entities: HashMap<String, EntityDecl>,
    /// Parameter entity declarations, keyed by entity name.
    pub param_entities: HashMap<String, EntityDecl>,
    /// Notation declarations, keyed by notation name.
    pub notations: HashMap<String, NotationDecl>,
    /// Ordered list of all declarations (preserving source order and comments).
    pub declarations: Vec<DtdDeclaration>,
}
51
/// A single DTD declaration, preserving source order for re-serialization.
///
/// Besides the four declaration kinds, comments and processing instructions
/// are kept so that [`serialize_dtd`] can write them back out.
#[derive(Debug, Clone)]
pub enum DtdDeclaration {
    /// An element declaration.
    Element(ElementDecl),
    /// A single attribute declaration (one per attribute, even if the source
    /// used a multi-attribute ATTLIST).
    Attlist(AttributeDecl),
    /// A general entity declaration.
    Entity(EntityDecl),
    /// A notation declaration.
    Notation(NotationDecl),
    /// A comment; the text is written between `<!--` and `-->` verbatim.
    Comment(String),
    /// A processing instruction: the target, then the optional data that
    /// follows it.
    Pi(String, Option<String>),
}
69
/// An element declaration from `<!ELEMENT name content-model>`.
///
/// Pairs the declared element name with the content its children must
/// satisfy.
///
/// See XML 1.0 section 3.2.
#[derive(Debug, Clone)]
pub struct ElementDecl {
    /// The element name.
    pub name: String,
    /// The declared content model.
    pub content_model: ContentModel,
}
80
/// The content model for an element declaration.
///
/// See XML 1.0 section 3.2 for the grammar:
/// - `contentspec ::= 'EMPTY' | 'ANY' | Mixed | children`
///
/// Each variant corresponds to one alternative of that production.
#[derive(Debug, Clone, PartialEq)]
pub enum ContentModel {
    /// The element must have no children (no elements, no text).
    /// Declared as `<!ELEMENT name EMPTY>`.
    Empty,
    /// Any content is allowed.
    /// Declared as `<!ELEMENT name ANY>`.
    Any,
    /// Mixed content: text and optionally listed elements in any order.
    /// Declared as `<!ELEMENT name (#PCDATA)>` or `<!ELEMENT name (#PCDATA|a|b)*>`.
    ///
    /// The `Vec<String>` contains the allowed element names (empty for `#PCDATA` only).
    Mixed(Vec<String>),
    /// Element-only content following a content spec pattern.
    /// Declared as `<!ELEMENT name (a,b,c)>` etc.
    Children(ContentSpec),
}
102
/// A content specification for element-only content models.
///
/// Represents the recursive structure of `(a,b)`, `(a|b)`, etc.
/// with occurrence indicators: every particle is a `kind` (a name or a
/// nested group) plus the `occurrence` indicator attached to it.
///
/// See XML 1.0 section 3.2.1 and 3.2.2.
#[derive(Debug, Clone, PartialEq)]
pub struct ContentSpec {
    /// The content particle kind.
    pub kind: ContentSpecKind,
    /// How many times this particle may occur.
    pub occurrence: Occurrence,
}
116
/// The kind of a content specification particle.
///
/// Groups hold further [`ContentSpec`] values, which is what makes the
/// content model recursive.
#[derive(Debug, Clone, PartialEq)]
pub enum ContentSpecKind {
    /// A single named element, e.g., `a`.
    Name(String),
    /// A sequence of particles, e.g., `(a, b, c)`.
    Seq(Vec<ContentSpec>),
    /// A choice among particles, e.g., `(a | b | c)`.
    Choice(Vec<ContentSpec>),
}
127
/// Occurrence indicator for a content particle.
///
/// `Once` stands for the absence of an indicator; the other variants map
/// one-to-one onto the suffix characters.
///
/// See XML 1.0 section 3.2.1: `'?' | '*' | '+'`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Occurrence {
    /// Exactly once (no indicator).
    Once,
    /// Zero or one time (`?`).
    Optional,
    /// Zero or more times (`*`).
    ZeroOrMore,
    /// One or more times (`+`).
    OneOrMore,
}
142
/// An attribute declaration from `<!ATTLIST element-name attr-name type default>`.
///
/// One value describes exactly one attribute; the four fields mirror the
/// four parts of the declaration shown above.
///
/// See XML 1.0 section 3.3.
#[derive(Debug, Clone)]
pub struct AttributeDecl {
    /// The element this attribute belongs to.
    pub element_name: String,
    /// The attribute name.
    pub attribute_name: String,
    /// The attribute type.
    pub attribute_type: AttributeType,
    /// The default value specification.
    pub default: AttributeDefault,
}
157
/// The type of an attribute as declared in `<!ATTLIST>`.
///
/// The keyword variants carry no data; `Notation` and `Enumeration` carry
/// the list of values allowed for the attribute.
///
/// See XML 1.0 section 3.3.1.
#[derive(Debug, Clone, PartialEq)]
pub enum AttributeType {
    /// Character data (`CDATA`).
    CData,
    /// A unique identifier (`ID`).
    Id,
    /// A reference to an ID (`IDREF`).
    IdRef,
    /// Space-separated list of ID references (`IDREFS`).
    IdRefs,
    /// An entity name (`ENTITY`).
    Entity,
    /// Space-separated list of entity names (`ENTITIES`).
    Entities,
    /// A name token (`NMTOKEN`).
    NmToken,
    /// Space-separated list of name tokens (`NMTOKENS`).
    NmTokens,
    /// A notation type with allowed notation names (`NOTATION (a|b|c)`).
    Notation(Vec<String>),
    /// An enumeration of allowed values (`(a|b|c)`).
    Enumeration(Vec<String>),
}
184
/// The default value specification for an attribute.
///
/// `Fixed` and `Default` carry the declared value without its surrounding
/// quotes; the serializer adds the quotes back when writing.
///
/// See XML 1.0 section 3.3.2.
#[derive(Debug, Clone, PartialEq)]
pub enum AttributeDefault {
    /// The attribute is required (`#REQUIRED`).
    Required,
    /// The attribute is optional with no default (`#IMPLIED`).
    Implied,
    /// The attribute has a fixed value (`#FIXED "value"`).
    Fixed(String),
    /// The attribute has a default value (`"value"`).
    Default(String),
}
199
/// An entity declaration.
///
/// Used for general entities, and also as the value type of the parameter
/// entity map in [`Dtd`].
///
/// See XML 1.0 section 4.2.
#[derive(Debug, Clone)]
pub struct EntityDecl {
    /// The entity name.
    pub name: String,
    /// The entity's value, either internal or external.
    pub kind: EntityKind,
}
210
/// Whether an entity is internal (has a literal value) or external
/// (references an external resource).
///
/// An external entity always has a system identifier; the public
/// identifier is optional.
#[derive(Debug, Clone)]
pub enum EntityKind {
    /// Internal entity with a literal replacement text.
    Internal(String),
    /// External entity identified by a system URI and optional public ID.
    External {
        /// The SYSTEM identifier (URI).
        system_id: String,
        /// The PUBLIC identifier, if any.
        public_id: Option<String>,
    },
}
225
/// A notation declaration from `<!NOTATION name ...>`.
///
/// Both identifiers are optional here; the serializer writes whichever
/// combination is present.
///
/// See XML 1.0 section 4.7.
#[derive(Debug, Clone)]
pub struct NotationDecl {
    /// The notation name.
    pub name: String,
    /// The SYSTEM identifier, if any.
    pub system_id: Option<String>,
    /// The PUBLIC identifier, if any.
    pub public_id: Option<String>,
}
238
239impl fmt::Display for ContentModel {
240    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
241        match self {
242            Self::Empty => write!(f, "EMPTY"),
243            Self::Any => write!(f, "ANY"),
244            Self::Mixed(names) => {
245                if names.is_empty() {
246                    write!(f, "(#PCDATA)")
247                } else {
248                    write!(f, "(#PCDATA|{})*", names.join("|"))
249                }
250            }
251            Self::Children(spec) => write!(f, "{spec}"),
252        }
253    }
254}
255
256impl fmt::Display for ContentSpec {
257    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
258        match &self.kind {
259            ContentSpecKind::Name(name) => write!(f, "{name}")?,
260            ContentSpecKind::Seq(items) => {
261                write!(f, "(")?;
262                for (i, item) in items.iter().enumerate() {
263                    if i > 0 {
264                        write!(f, " , ")?;
265                    }
266                    write!(f, "{item}")?;
267                }
268                write!(f, ")")?;
269            }
270            ContentSpecKind::Choice(items) => {
271                write!(f, "(")?;
272                for (i, item) in items.iter().enumerate() {
273                    if i > 0 {
274                        write!(f, " | ")?;
275                    }
276                    write!(f, "{item}")?;
277                }
278                write!(f, ")")?;
279            }
280        }
281        match self.occurrence {
282            Occurrence::Once => {}
283            Occurrence::Optional => write!(f, "?")?,
284            Occurrence::ZeroOrMore => write!(f, "*")?,
285            Occurrence::OneOrMore => write!(f, "+")?,
286        }
287        Ok(())
288    }
289}
290
291// ---------------------------------------------------------------------------
292// DTD Serializer
293// ---------------------------------------------------------------------------
294
295/// Serializes a parsed DTD's declarations into the internal subset format
296/// used by libxml2.
297///
298/// Each declaration appears on its own line. The output does NOT include
299/// the surrounding `[` and `]>` — the caller adds those.
300#[must_use]
301#[allow(clippy::too_many_lines)]
302pub fn serialize_dtd(dtd: &Dtd) -> String {
303    let mut out = String::new();
304    let mut last_was_comment = false;
305
306    for decl in &dtd.declarations {
307        // Don't add a newline before declarations that immediately follow
308        // a comment — the comment text already contains any needed whitespace.
309        // libxml2 concatenates the comment closing `-->` and the next
310        // declaration on the same line.
311        if !last_was_comment {
312            out.push('\n');
313        }
314        match decl {
315            DtdDeclaration::Element(e) => {
316                out.push_str("<!ELEMENT ");
317                out.push_str(&e.name);
318                out.push(' ');
319                write_content_model(&mut out, &e.content_model);
320                out.push('>');
321                last_was_comment = false;
322            }
323            DtdDeclaration::Attlist(a) => {
324                out.push_str("<!ATTLIST ");
325                out.push_str(&a.element_name);
326                out.push(' ');
327                out.push_str(&a.attribute_name);
328                out.push(' ');
329                write_attribute_type(&mut out, &a.attribute_type);
330                out.push(' ');
331                write_attribute_default(&mut out, &a.default);
332                out.push('>');
333                last_was_comment = false;
334            }
335            DtdDeclaration::Entity(e) => {
336                out.push_str("<!ENTITY ");
337                out.push_str(&e.name);
338                match &e.kind {
339                    EntityKind::Internal(value) => {
340                        out.push(' ');
341                        write_entity_value(&mut out, value);
342                    }
343                    EntityKind::External {
344                        system_id,
345                        public_id,
346                    } => {
347                        if let Some(pub_id) = public_id {
348                            out.push_str(" PUBLIC \"");
349                            out.push_str(pub_id);
350                            out.push_str("\" \"");
351                            out.push_str(system_id);
352                            out.push('"');
353                        } else {
354                            out.push_str(" SYSTEM \"");
355                            out.push_str(system_id);
356                            out.push('"');
357                        }
358                    }
359                }
360                out.push('>');
361                last_was_comment = false;
362            }
363            DtdDeclaration::Notation(n) => {
364                out.push_str("<!NOTATION ");
365                out.push_str(&n.name);
366                match (&n.public_id, &n.system_id) {
367                    (Some(pub_id), Some(sys_id)) => {
368                        out.push_str(" PUBLIC \"");
369                        out.push_str(pub_id);
370                        out.push_str("\" \"");
371                        out.push_str(sys_id);
372                        out.push('"');
373                    }
374                    (Some(pub_id), None) => {
375                        out.push_str(" PUBLIC \"");
376                        out.push_str(pub_id);
377                        out.push('"');
378                    }
379                    (None, Some(sys_id)) => {
380                        out.push_str(" SYSTEM \"");
381                        out.push_str(sys_id);
382                        out.push('"');
383                    }
384                    (None, None) => {}
385                }
386                out.push('>');
387                last_was_comment = false;
388            }
389            DtdDeclaration::Comment(text) => {
390                out.push_str("<!--");
391                out.push_str(text);
392                out.push_str("-->");
393                last_was_comment = true;
394            }
395            DtdDeclaration::Pi(target, data) => {
396                out.push_str("<?");
397                out.push_str(target);
398                if let Some(d) = data {
399                    out.push(' ');
400                    out.push_str(d);
401                }
402                out.push_str("?>");
403                last_was_comment = false;
404            }
405        }
406    }
407
408    // libxml2 adds a newline before ]> unless the last item was a comment.
409    if !last_was_comment && !dtd.declarations.is_empty() {
410        out.push('\n');
411    }
412
413    out
414}
415
416/// Writes a content model in libxml2's format.
417fn write_content_model(out: &mut String, model: &ContentModel) {
418    match model {
419        ContentModel::Empty => out.push_str("EMPTY"),
420        ContentModel::Any => out.push_str("ANY"),
421        ContentModel::Mixed(names) => {
422            if names.is_empty() {
423                out.push_str("(#PCDATA)");
424            } else {
425                out.push_str("(#PCDATA");
426                for name in names {
427                    out.push_str(" | ");
428                    out.push_str(name);
429                }
430                out.push_str(")*");
431            }
432        }
433        ContentModel::Children(spec) => {
434            use std::fmt::Write;
435            let _ = write!(out, "{spec}");
436        }
437    }
438}
439
440/// Writes an attribute type in libxml2's format.
441fn write_attribute_type(out: &mut String, attr_type: &AttributeType) {
442    match attr_type {
443        AttributeType::CData => out.push_str("CDATA"),
444        AttributeType::Id => out.push_str("ID"),
445        AttributeType::IdRef => out.push_str("IDREF"),
446        AttributeType::IdRefs => out.push_str("IDREFS"),
447        AttributeType::Entity => out.push_str("ENTITY"),
448        AttributeType::Entities => out.push_str("ENTITIES"),
449        AttributeType::NmToken => out.push_str("NMTOKEN"),
450        AttributeType::NmTokens => out.push_str("NMTOKENS"),
451        AttributeType::Notation(values) | AttributeType::Enumeration(values) => {
452            if matches!(attr_type, AttributeType::Notation(_)) {
453                out.push_str("NOTATION ");
454            }
455            out.push('(');
456            for (i, v) in values.iter().enumerate() {
457                if i > 0 {
458                    out.push_str(" | ");
459                }
460                out.push_str(v);
461            }
462            out.push(')');
463        }
464    }
465}
466
467/// Writes an attribute default in libxml2's format.
468fn write_attribute_default(out: &mut String, default: &AttributeDefault) {
469    match default {
470        AttributeDefault::Required => out.push_str("#REQUIRED"),
471        AttributeDefault::Implied => out.push_str("#IMPLIED"),
472        AttributeDefault::Fixed(value) => {
473            out.push_str("#FIXED \"");
474            out.push_str(value);
475            out.push('"');
476        }
477        AttributeDefault::Default(value) => {
478            out.push('"');
479            out.push_str(value);
480            out.push('"');
481        }
482    }
483}
484
485/// Escapes an entity value for DTD serialization.
486///
487/// Entity references (`&name;`) and character references (`&#...;`) within
488/// the value are preserved as-is (matching libxml2 behavior). Only standalone
489/// `&` characters are escaped. The quote character is chosen to minimize
490/// escaping: single quotes when the value contains double quotes.
491fn write_entity_value(out: &mut String, value: &str) {
492    // Choose quote character: use single quotes if value contains double quotes
493    // but not single quotes (avoids escaping). Otherwise use double quotes.
494    let quote = if value.contains('"') && !value.contains('\'') {
495        '\''
496    } else {
497        '"'
498    };
499    out.push(quote);
500
501    let bytes = value.as_bytes();
502    let len = bytes.len();
503    let mut i = 0;
504
505    while i < len {
506        if bytes[i] == b'&' {
507            // Check if this is a valid entity or character reference — if so, pass through.
508            if let Some(ref_end) = find_reference_end(bytes, i) {
509                // Copy the reference as-is
510                let ref_str = std::str::from_utf8(&bytes[i..=ref_end]).unwrap_or("&amp;");
511                out.push_str(ref_str);
512                i = ref_end + 1;
513            } else {
514                out.push_str("&amp;");
515                i += 1;
516            }
517        } else if bytes[i] == b'%' {
518            out.push_str("&#37;");
519            i += 1;
520        } else if bytes[i] == quote as u8 {
521            if quote == '"' {
522                out.push_str("&quot;");
523            } else {
524                out.push_str("&apos;");
525            }
526            i += 1;
527        } else {
528            // Push the char (may be multi-byte UTF-8)
529            let ch = &value[i..];
530            if let Some(c) = ch.chars().next() {
531                out.push(c);
532                i += c.len_utf8();
533            } else {
534                i += 1;
535            }
536        }
537    }
538
539    out.push(quote);
540}
541
542/// Finds the end position (inclusive, the `;`) of an entity or character
543/// reference starting at `start` in `bytes`. Returns `None` if the `&` at
544/// `start` is not the beginning of a valid reference.
545fn find_reference_end(bytes: &[u8], start: usize) -> Option<usize> {
546    if start >= bytes.len() || bytes[start] != b'&' {
547        return None;
548    }
549    let mut i = start + 1;
550    if i >= bytes.len() {
551        return None;
552    }
553
554    if bytes[i] == b'#' {
555        // Character reference: &#digits; or &#xhexdigits;
556        i += 1;
557        if i >= bytes.len() {
558            return None;
559        }
560        if bytes[i] == b'x' {
561            i += 1;
562            let digit_start = i;
563            while i < bytes.len() && bytes[i].is_ascii_hexdigit() {
564                i += 1;
565            }
566            if i == digit_start || i >= bytes.len() || bytes[i] != b';' {
567                return None;
568            }
569        } else {
570            let digit_start = i;
571            while i < bytes.len() && bytes[i].is_ascii_digit() {
572                i += 1;
573            }
574            if i == digit_start || i >= bytes.len() || bytes[i] != b';' {
575                return None;
576            }
577        }
578        Some(i)
579    } else {
580        // Named entity reference: &name;
581        // Name must start with a name start char (letter or _)
582        if !is_name_start_byte(bytes[i]) {
583            return None;
584        }
585        i += 1;
586        while i < bytes.len() && is_name_byte(bytes[i]) {
587            i += 1;
588        }
589        if i >= bytes.len() || bytes[i] != b';' {
590            return None;
591        }
592        Some(i)
593    }
594}
595
/// Reports whether `b` may begin an XML name, considering ASCII only:
/// a letter, `_`, or `:`.
fn is_name_start_byte(b: u8) -> bool {
    matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'_' | b':')
}
600
/// Reports whether `b` may appear inside an XML name, considering ASCII
/// only: a letter, a digit, `_`, `:`, `-`, or `.`.
fn is_name_byte(b: u8) -> bool {
    matches!(
        b,
        b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'_' | b':' | b'-' | b'.'
    )
}
605
606// ---------------------------------------------------------------------------
607// DTD Parser
608// ---------------------------------------------------------------------------
609
610/// Parses a DTD internal subset string into a [`Dtd`] data structure.
611///
612/// The input should be the content from inside `<!DOCTYPE root [ ... ]>`,
613/// i.e., just the internal subset without the surrounding brackets.
614///
615/// # Errors
616///
617/// Returns a `ParseError` if the DTD content is malformed.
618///
619/// # Examples
620///
621/// ```
622/// use xmloxide::validation::dtd::parse_dtd;
623///
624/// let dtd = parse_dtd("<!ELEMENT root (#PCDATA)>").unwrap();
625/// assert!(dtd.elements.contains_key("root"));
626/// ```
627pub fn parse_dtd(input: &str) -> Result<Dtd, ParseError> {
628    let mut parser = DtdParser::new(input);
629    parser.parse()
630}
631
/// Internal DTD parser state.
///
/// Created by `DtdParser::new` for a single input string and consumed by
/// one call to `parse`, which hands the accumulated [`Dtd`] back to the
/// caller.
struct DtdParser<'a> {
    /// The raw bytes of the input string being parsed.
    input: &'a [u8],
    /// Current position in `input`; starts at 0 (presumably a byte offset —
    /// confirm against the cursor helpers).
    pos: usize,
    /// Current line; starts at 1 (presumably used for error locations).
    line: u32,
    /// Current column; starts at 1 (presumably used for error locations).
    column: u32,
    /// The DTD being built up as declarations are parsed.
    dtd: Dtd,
}
640
641impl<'a> DtdParser<'a> {
642    fn new(input: &'a str) -> Self {
643        Self {
644            input: input.as_bytes(),
645            pos: 0,
646            line: 1,
647            column: 1,
648            dtd: Dtd::default(),
649        }
650    }
651
652    fn parse(&mut self) -> Result<Dtd, ParseError> {
653        loop {
654            self.skip_whitespace();
655            if self.at_end() {
656                break;
657            }
658
659            if self.looking_at(b"<!--") {
660                self.parse_comment_decl()?;
661            } else if self.looking_at(b"<!ELEMENT") {
662                self.parse_element_decl()?;
663            } else if self.looking_at(b"<!ATTLIST") {
664                self.parse_attlist_decl()?;
665            } else if self.looking_at(b"<!ENTITY") {
666                self.parse_entity_decl()?;
667            } else if self.looking_at(b"<!NOTATION") {
668                self.parse_notation_decl()?;
669            } else if self.looking_at(b"<?") {
670                self.parse_pi_decl()?;
671            } else if self.peek() == Some(b'%') {
672                // Parameter entity reference — skip it since we don't expand
673                self.skip_pe_reference()?;
674            } else {
675                return Err(self.fatal(format!(
676                    "unexpected character '{}' in DTD",
677                    self.peek().map_or('?', |b| b as char)
678                )));
679            }
680        }
681
682        self.post_validate()?;
683
684        Ok(std::mem::take(&mut self.dtd))
685    }
686
687    /// Post-parse validation checks that require the complete entity map.
688    ///
689    /// Detects entity recursion (WFC: No Recursion), validates that entity
690    /// references in attribute defaults refer to internal parsed entities,
691    /// and checks for `<` in entity replacement text used in attributes.
692    fn post_validate(&self) -> Result<(), ParseError> {
693        // Check for entity recursion (WFC: No Recursion, XML 1.0 §4.1)
694        for (name, decl) in &self.dtd.entities {
695            if let EntityKind::Internal(ref value) = decl.kind {
696                let mut visited = std::collections::HashSet::new();
697                visited.insert(name.clone());
698                self.check_entity_recursion(value, &mut visited)?;
699            }
700        }
701
702        // Check for parameter entity recursion (WFC: No Recursion, XML 1.0 §4.1).
703        // PE values may contain encoded PE references via &#37; (which is '%').
704        // After char ref expansion, if %name; appears in its own value, that's
705        // direct or indirect recursion.
706        for (name, decl) in &self.dtd.param_entities {
707            if let EntityKind::Internal(ref value) = decl.kind {
708                let expanded = expand_char_refs_only(value);
709                let mut visited = std::collections::HashSet::new();
710                visited.insert(name.clone());
711                self.check_pe_recursion(&expanded, &mut visited)?;
712            }
713        }
714
715        // Validate entity replacement text after character reference
716        // expansion (XML 1.0 §4.5). Character references in entity
717        // values are expanded at declaration time. The resulting
718        // replacement text must be well-formed when re-parsed.
719        for (name, decl) in &self.dtd.entities {
720            if let EntityKind::Internal(ref value) = decl.kind {
721                self.validate_replacement_text(name, value)?;
722            }
723        }
724
725        // Validate predefined entity redeclarations (XML 1.0 §4.6).
726        // If lt, gt, amp, apos, or quot are declared, their replacement
727        // text must be a character reference to the respective character.
728        self.validate_predefined_entities()?;
729
730        // Note: content production validation (XML 1.0 §4.3.2) is
731        // performed at entity expansion time in the XML parser, not
732        // here, because it only applies to entities that are actually
733        // referenced in the document.
734
735        // Validate entity references in ATTLIST defaults
736        for attrs in self.dtd.attributes.values() {
737            for attr in attrs {
738                let (AttributeDefault::Default(default_value)
739                | AttributeDefault::Fixed(default_value)) = &attr.default
740                else {
741                    continue;
742                };
743                self.validate_attr_default_entities(default_value)?;
744            }
745        }
746
747        Ok(())
748    }
749
750    /// Validates that predefined entity redeclarations (lt, gt, amp, apos,
751    /// quot) use the correct character reference as replacement text.
752    ///
753    /// Per XML 1.0 §4.6: "If the entities lt or amp are declared, they MUST
754    /// be declared as internal entities whose replacement text is a character
755    /// reference to the respective character."
756    fn validate_predefined_entities(&self) -> Result<(), ParseError> {
757        // Per XML 1.0 §4.6: lt and amp MUST use character references.
758        // gt, apos, and quot may use either the literal character or a
759        // character reference.
760        let expected: &[(&str, &str, &[&str])] = &[
761            ("lt", "<", &["&#60;", "&#x3C;", "&#x3c;"]),
762            ("gt", ">", &[">", "&#62;", "&#x3E;", "&#x3e;"]),
763            ("amp", "&", &["&#38;", "&#x26;"]),
764            ("apos", "'", &["'", "&#39;", "&#x27;"]),
765            ("quot", "\"", &["\"", "&#34;", "&#x22;"]),
766        ];
767        for &(name, _char_val, valid_refs) in expected {
768            if let Some(decl) = self.dtd.entities.get(name) {
769                match &decl.kind {
770                    EntityKind::Internal(value) => {
771                        // Check if the value is a valid character reference
772                        // for this predefined entity.
773                        if !valid_refs.iter().any(|r| r == value) {
774                            return Err(self.fatal(format!(
775                                "predefined entity '{name}' must be declared as \
776                                 a character reference (e.g., '{}')",
777                                valid_refs[0]
778                            )));
779                        }
780                    }
781                    EntityKind::External { .. } => {
782                        return Err(self.fatal(format!(
783                            "predefined entity '{name}' must be an internal entity"
784                        )));
785                    }
786                }
787            }
788        }
789        Ok(())
790    }
791
792    /// Validates entity replacement text after character reference
793    /// expansion per XML 1.0 §4.5.
794    ///
795    /// Expands only character references in the entity value (not entity
796    /// references), then checks the resulting replacement text for basic
797    /// well-formedness: bare `&` characters from `&#38;` expansion that
798    /// don't form valid references are rejected.
799    fn validate_replacement_text(&self, entity_name: &str, value: &str) -> Result<(), ParseError> {
800        // Only check values that contain character references
801        if !value.contains("&#") {
802            return Ok(());
803        }
804
805        // Build replacement text by expanding only character references
806        let replacement = Self::expand_char_refs_only(value);
807
808        // Check for bare '&' in the replacement text that don't form
809        // valid entity or character references
810        let bytes = replacement.as_bytes();
811        let mut i = 0;
812        while i < bytes.len() {
813            if bytes[i] == b'&' {
814                i += 1;
815                if i >= bytes.len() {
816                    return Err(self.fatal(format!(
817                        "entity '{entity_name}' replacement text contains \
818                         bare '&' at end of text"
819                    )));
820                }
821                if bytes[i] == b'#' {
822                    // Character reference — check it's complete
823                    i += 1;
824                    let has_digits = if i < bytes.len() && bytes[i] == b'x' {
825                        i += 1;
826                        let start = i;
827                        while i < bytes.len() && bytes[i].is_ascii_hexdigit() {
828                            i += 1;
829                        }
830                        i > start
831                    } else {
832                        let start = i;
833                        while i < bytes.len() && bytes[i].is_ascii_digit() {
834                            i += 1;
835                        }
836                        i > start
837                    };
838                    if !has_digits || i >= bytes.len() || bytes[i] != b';' {
839                        return Err(self.fatal(format!(
840                            "entity '{entity_name}' replacement text contains \
841                             incomplete character reference"
842                        )));
843                    }
844                    i += 1;
845                } else if bytes[i].is_ascii_alphabetic() || bytes[i] == b'_' || bytes[i] == b':' {
846                    // Entity reference — skip name
847                    while i < bytes.len() && bytes[i] != b';' {
848                        i += 1;
849                    }
850                    if i >= bytes.len() {
851                        return Err(self.fatal(format!(
852                            "entity '{entity_name}' replacement text contains \
853                             incomplete entity reference"
854                        )));
855                    }
856                    i += 1;
857                } else {
858                    return Err(self.fatal(format!(
859                        "entity '{entity_name}' replacement text contains \
860                         bare '&' not followed by a valid reference"
861                    )));
862                }
863            } else {
864                i += 1;
865            }
866        }
867        Ok(())
868    }
869
870    /// Expands only the character references in `value`, leaving entity references
871    /// as-is, and returns the result; delegates to the free function of the same name.
872    fn expand_char_refs_only(value: &str) -> String {
873        expand_char_refs_only(value)
874    }
875
876    /// Recursively checks for entity reference cycles.
877    fn check_entity_recursion(
878        &self,
879        value: &str,
880        visited: &mut std::collections::HashSet<String>,
881    ) -> Result<(), ParseError> {
882        for ref_name in Self::extract_entity_refs(value) {
883            if visited.contains(ref_name) {
884                return Err(self.fatal(format!("recursive entity reference: '{ref_name}'")));
885            }
886            if let Some(decl) = self.dtd.entities.get(ref_name) {
887                if let EntityKind::Internal(ref inner_value) = decl.kind {
888                    visited.insert(ref_name.to_string());
889                    self.check_entity_recursion(inner_value, visited)?;
890                    visited.remove(ref_name);
891                }
892            }
893        }
894        Ok(())
895    }
896
897    /// Recursively checks for parameter entity reference cycles.
898    ///
899    /// Examines the char-ref-expanded replacement text for `%name;` patterns.
900    fn check_pe_recursion(
901        &self,
902        value: &str,
903        visited: &mut std::collections::HashSet<String>,
904    ) -> Result<(), ParseError> {
905        for ref_name in Self::extract_pe_refs(value) {
906            if visited.contains(&ref_name) {
907                return Err(self.fatal(format!(
908                    "recursive parameter entity reference: '%{ref_name}'"
909                )));
910            }
911            if let Some(decl) = self.dtd.param_entities.get(&ref_name) {
912                if let EntityKind::Internal(ref inner_value) = decl.kind {
913                    let expanded = expand_char_refs_only(inner_value);
914                    visited.insert(ref_name.clone());
915                    self.check_pe_recursion(&expanded, visited)?;
916                    visited.remove(&ref_name);
917                }
918            }
919        }
920        Ok(())
921    }
922
923    /// Extracts parameter entity reference names (`%name;`) from a string.
924    fn extract_pe_refs(value: &str) -> Vec<String> {
925        let mut refs = Vec::new();
926        let bytes = value.as_bytes();
927        let mut i = 0;
928        while i < bytes.len() {
929            if bytes[i] == b'%' {
930                i += 1;
931                if i < bytes.len() && (bytes[i].is_ascii_alphabetic() || bytes[i] == b'_') {
932                    let start = i;
933                    while i < bytes.len() && bytes[i] != b';' && !bytes[i].is_ascii_whitespace() {
934                        i += 1;
935                    }
936                    if i < bytes.len() && bytes[i] == b';' && i > start {
937                        if let Ok(name) = std::str::from_utf8(&bytes[start..i]) {
938                            refs.push(name.to_string());
939                        }
940                        i += 1;
941                    }
942                }
943            } else {
944                i += 1;
945            }
946        }
947        refs
948    }
949
950    /// Validates entity references in attribute default values.
951    ///
952    /// Checks WFC: No External Entity References (§3.1) and
953    /// WFC: No `<` in Attribute Values for entity replacement text.
954    fn validate_attr_default_entities(&self, value: &str) -> Result<(), ParseError> {
955        for ref_name in Self::extract_entity_refs(value) {
956            // Built-in entities are always fine
957            if matches!(ref_name, "amp" | "lt" | "gt" | "apos" | "quot") {
958                continue;
959            }
960            match self.dtd.entities.get(ref_name) {
961                None => {
962                    return Err(self.fatal(format!(
963                        "undeclared entity '{ref_name}' referenced in \
964                         attribute default value"
965                    )));
966                }
967                Some(decl) => match &decl.kind {
968                    EntityKind::External { .. } => {
969                        return Err(self.fatal(format!(
970                            "attribute default value must not reference \
971                             external entity '{ref_name}'"
972                        )));
973                    }
974                    EntityKind::Internal(ref text) => {
975                        // Check for '<' in replacement text (WFC: No < in
976                        // Attribute Values, XML 1.0 §3.1)
977                        if text.contains('<') {
978                            return Err(self.fatal(format!(
979                                "entity '{ref_name}' contains '<' and cannot \
980                                 be used in attribute values"
981                            )));
982                        }
983                        // Recursively check referenced entities
984                        self.validate_attr_default_entities(text)?;
985                    }
986                },
987            }
988        }
989        Ok(())
990    }
991
992    /// Extracts entity reference names from a string value.
993    ///
994    /// Returns an iterator over entity names found in `&name;` patterns,
995    /// excluding character references (`&#...;`).
996    fn extract_entity_refs(value: &str) -> Vec<&str> {
997        let mut refs = Vec::new();
998        let bytes = value.as_bytes();
999        let mut i = 0;
1000        while i < bytes.len() {
1001            if bytes[i] == b'&' {
1002                i += 1;
1003                if i < bytes.len() && bytes[i] == b'#' {
1004                    // Character reference — skip
1005                    while i < bytes.len() && bytes[i] != b';' {
1006                        i += 1;
1007                    }
1008                    if i < bytes.len() {
1009                        i += 1;
1010                    }
1011                } else {
1012                    // Entity reference
1013                    let start = i;
1014                    while i < bytes.len() && bytes[i] != b';' && bytes[i] != b'&' {
1015                        i += 1;
1016                    }
1017                    if i < bytes.len() && bytes[i] == b';' && i > start {
1018                        if let Ok(name) = std::str::from_utf8(&bytes[start..i]) {
1019                            refs.push(name);
1020                        }
1021                        i += 1;
1022                    }
1023                }
1024            } else {
1025                i += 1;
1026            }
1027        }
1028        refs
1029    }
1030
1031    // --- ELEMENT declaration ---
1032    // See XML 1.0 §3.2: [45] elementdecl
1033
1034    fn parse_element_decl(&mut self) -> Result<(), ParseError> {
1035        self.expect_str(b"<!ELEMENT")?;
1036        self.skip_whitespace_required()?;
1037        let name = self.parse_name()?;
1038        self.skip_whitespace_required()?;
1039        let content_model = self.parse_content_model()?;
1040        self.skip_whitespace();
1041        self.expect_byte(b'>')?;
1042
1043        let decl = ElementDecl {
1044            name: name.clone(),
1045            content_model,
1046        };
1047        self.dtd
1048            .declarations
1049            .push(DtdDeclaration::Element(decl.clone()));
1050        self.dtd.elements.insert(name, decl);
1051        Ok(())
1052    }
1053
1054    fn parse_content_model(&mut self) -> Result<ContentModel, ParseError> {
1055        if self.looking_at(b"EMPTY") {
1056            self.expect_str(b"EMPTY")?;
1057            return Ok(ContentModel::Empty);
1058        }
1059        if self.looking_at(b"ANY") {
1060            self.expect_str(b"ANY")?;
1061            return Ok(ContentModel::Any);
1062        }
1063
1064        // Must be Mixed or Children, both start with '('
1065        self.expect_byte(b'(')?;
1066        self.skip_whitespace();
1067
1068        // Check for mixed content: (#PCDATA ...)
1069        if self.looking_at(b"#PCDATA") {
1070            self.expect_str(b"#PCDATA")?;
1071            self.skip_whitespace();
1072
1073            let mut names = Vec::new();
1074
1075            if self.peek() == Some(b')') {
1076                // (#PCDATA) — text only
1077                self.advance(1);
1078                // Optional '*' after (#PCDATA) — some DTDs write (#PCDATA)*
1079                if self.peek() == Some(b'*') {
1080                    self.advance(1);
1081                }
1082                return Ok(ContentModel::Mixed(names));
1083            }
1084
1085            // (#PCDATA|a|b)*
1086            while self.peek() == Some(b'|') {
1087                self.advance(1);
1088                self.skip_whitespace();
1089                let elem_name = self.parse_name()?;
1090                names.push(elem_name);
1091                self.skip_whitespace();
1092            }
1093
1094            self.expect_byte(b')')?;
1095            self.expect_byte(b'*')?;
1096
1097            return Ok(ContentModel::Mixed(names));
1098        }
1099
1100        // Element-only content: parse as a content spec group
1101        let spec = self.parse_content_spec_group()?;
1102        Ok(ContentModel::Children(spec))
1103    }
1104
1105    /// Parses a content spec starting after the opening '(' has been consumed
1106    /// and the first item is NOT `#PCDATA`.
1107    fn parse_content_spec_group(&mut self) -> Result<ContentSpec, ParseError> {
1108        let mut first = self.parse_content_particle()?;
1109        self.skip_whitespace();
1110
1111        // Determine if this is a sequence (,) or choice (|)
1112        if self.peek() == Some(b',') {
1113            // Sequence
1114            let mut items = vec![first];
1115            while self.peek() == Some(b',') {
1116                self.advance(1);
1117                self.skip_whitespace();
1118                let item = self.parse_content_particle()?;
1119                items.push(item);
1120                self.skip_whitespace();
1121            }
1122            self.expect_byte(b')')?;
1123            let occurrence = self.parse_occurrence();
1124            Ok(ContentSpec {
1125                kind: ContentSpecKind::Seq(items),
1126                occurrence,
1127            })
1128        } else if self.peek() == Some(b'|') {
1129            // Choice
1130            let mut items = vec![first];
1131            while self.peek() == Some(b'|') {
1132                self.advance(1);
1133                self.skip_whitespace();
1134                let item = self.parse_content_particle()?;
1135                items.push(item);
1136                self.skip_whitespace();
1137            }
1138            self.expect_byte(b')')?;
1139            let occurrence = self.parse_occurrence();
1140            Ok(ContentSpec {
1141                kind: ContentSpecKind::Choice(items),
1142                occurrence,
1143            })
1144        } else {
1145            // Single item group: (item)?/*
1146            self.expect_byte(b')')?;
1147            let group_occurrence = self.parse_occurrence();
1148
1149            if group_occurrence != Occurrence::Once {
1150                // Group has occurrence: (X)+ → wrap in Seq
1151                Ok(ContentSpec {
1152                    kind: ContentSpecKind::Seq(vec![first]),
1153                    occurrence: group_occurrence,
1154                })
1155            } else if first.occurrence != Occurrence::Once {
1156                // Inner particle has occurrence but group doesn't.
1157                // libxml2 normalizes (X+) → (X)+ by moving occurrence
1158                // to the outer group.
1159                let inner_occ = first.occurrence;
1160                first.occurrence = Occurrence::Once;
1161                Ok(ContentSpec {
1162                    kind: ContentSpecKind::Seq(vec![first]),
1163                    occurrence: inner_occ,
1164                })
1165            } else {
1166                // No occurrence on either — unwrap the group
1167                Ok(first)
1168            }
1169        }
1170    }
1171
1172    fn parse_content_particle(&mut self) -> Result<ContentSpec, ParseError> {
1173        if self.peek() == Some(b'(') {
1174            self.advance(1);
1175            self.skip_whitespace();
1176            self.parse_content_spec_group()
1177        } else {
1178            let name = self.parse_name()?;
1179            let occurrence = self.parse_occurrence();
1180            Ok(ContentSpec {
1181                kind: ContentSpecKind::Name(name),
1182                occurrence,
1183            })
1184        }
1185    }
1186
1187    fn parse_occurrence(&mut self) -> Occurrence {
1188        match self.peek() {
1189            Some(b'?') => {
1190                self.advance(1);
1191                Occurrence::Optional
1192            }
1193            Some(b'*') => {
1194                self.advance(1);
1195                Occurrence::ZeroOrMore
1196            }
1197            Some(b'+') => {
1198                self.advance(1);
1199                Occurrence::OneOrMore
1200            }
1201            _ => Occurrence::Once,
1202        }
1203    }
1204
1205    // --- ATTLIST declaration ---
1206    // See XML 1.0 §3.3: [52] AttlistDecl
1207
1208    fn parse_attlist_decl(&mut self) -> Result<(), ParseError> {
1209        self.expect_str(b"<!ATTLIST")?;
1210        self.skip_whitespace_required()?;
1211        let element_name = self.parse_name()?;
1212
1213        loop {
1214            self.skip_whitespace();
1215            if self.peek() == Some(b'>') {
1216                self.advance(1);
1217                break;
1218            }
1219
1220            let attribute_name = self.parse_name()?;
1221            self.skip_whitespace_required()?;
1222            let attribute_type = self.parse_attribute_type()?;
1223            self.skip_whitespace_required()?;
1224            let default = self.parse_attribute_default()?;
1225
1226            let decl = AttributeDecl {
1227                element_name: element_name.clone(),
1228                attribute_name,
1229                attribute_type,
1230                default,
1231            };
1232
1233            // Per XML 1.0 §3.3, the first attribute declaration is binding;
1234            // subsequent declarations for the same attribute are ignored.
1235            let attrs = self.dtd.attributes.entry(element_name.clone()).or_default();
1236            if !attrs
1237                .iter()
1238                .any(|a| a.attribute_name == decl.attribute_name)
1239            {
1240                self.dtd
1241                    .declarations
1242                    .push(DtdDeclaration::Attlist(decl.clone()));
1243                attrs.push(decl);
1244            }
1245        }
1246
1247        Ok(())
1248    }
1249
1250    fn parse_attribute_type(&mut self) -> Result<AttributeType, ParseError> {
1251        if self.looking_at(b"CDATA") {
1252            self.expect_str(b"CDATA")?;
1253            Ok(AttributeType::CData)
1254        } else if self.looking_at(b"IDREFS") {
1255            self.expect_str(b"IDREFS")?;
1256            Ok(AttributeType::IdRefs)
1257        } else if self.looking_at(b"IDREF") {
1258            self.expect_str(b"IDREF")?;
1259            Ok(AttributeType::IdRef)
1260        } else if self.looking_at(b"ID") {
1261            self.expect_str(b"ID")?;
1262            Ok(AttributeType::Id)
1263        } else if self.looking_at(b"ENTITIES") {
1264            self.expect_str(b"ENTITIES")?;
1265            Ok(AttributeType::Entities)
1266        } else if self.looking_at(b"ENTITY") {
1267            self.expect_str(b"ENTITY")?;
1268            Ok(AttributeType::Entity)
1269        } else if self.looking_at(b"NMTOKENS") {
1270            self.expect_str(b"NMTOKENS")?;
1271            Ok(AttributeType::NmTokens)
1272        } else if self.looking_at(b"NMTOKEN") {
1273            self.expect_str(b"NMTOKEN")?;
1274            Ok(AttributeType::NmToken)
1275        } else if self.looking_at(b"NOTATION") {
1276            self.expect_str(b"NOTATION")?;
1277            self.skip_whitespace_required()?;
1278            let values = self.parse_enumerated_values()?;
1279            Ok(AttributeType::Notation(values))
1280        } else if self.peek() == Some(b'(') {
1281            let values = self.parse_enumerated_values()?;
1282            Ok(AttributeType::Enumeration(values))
1283        } else {
1284            Err(self.fatal("expected attribute type"))
1285        }
1286    }
1287
1288    fn parse_enumerated_values(&mut self) -> Result<Vec<String>, ParseError> {
1289        self.expect_byte(b'(')?;
1290        self.skip_whitespace();
1291        let mut values = Vec::new();
1292
1293        let first = self.parse_nmtoken()?;
1294        values.push(first);
1295
1296        loop {
1297            self.skip_whitespace();
1298            if self.peek() == Some(b')') {
1299                self.advance(1);
1300                break;
1301            }
1302            self.expect_byte(b'|')?;
1303            self.skip_whitespace();
1304            let val = self.parse_nmtoken()?;
1305            values.push(val);
1306        }
1307
1308        Ok(values)
1309    }
1310
1311    fn parse_attribute_default(&mut self) -> Result<AttributeDefault, ParseError> {
1312        if self.looking_at(b"#REQUIRED") {
1313            self.expect_str(b"#REQUIRED")?;
1314            Ok(AttributeDefault::Required)
1315        } else if self.looking_at(b"#IMPLIED") {
1316            self.expect_str(b"#IMPLIED")?;
1317            Ok(AttributeDefault::Implied)
1318        } else if self.looking_at(b"#FIXED") {
1319            self.expect_str(b"#FIXED")?;
1320            self.skip_whitespace_required()?;
1321            let value = self.parse_quoted_value()?;
1322            self.validate_default_value(&value)?;
1323            Ok(AttributeDefault::Fixed(value))
1324        } else {
1325            let value = self.parse_quoted_value()?;
1326            self.validate_default_value(&value)?;
1327            Ok(AttributeDefault::Default(value))
1328        }
1329    }
1330
1331    // --- ENTITY declaration ---
1332    // See XML 1.0 §4.2: [70] EntityDecl
1333
1334    #[allow(clippy::too_many_lines)]
1335    fn parse_entity_decl(&mut self) -> Result<(), ParseError> {
1336        self.expect_str(b"<!ENTITY")?;
1337        self.skip_whitespace_required()?;
1338
1339        // Parameter entities (% name)
1340        if self.peek() == Some(b'%') {
1341            self.advance(1);
1342            self.skip_whitespace_required()?;
1343            let pe_name = self.parse_name()?;
1344            // Namespaces in XML 1.0: entity names must be NCNames (no colons).
1345            if pe_name.contains(':') {
1346                return Err(self.fatal(format!("entity name '{pe_name}' must not contain a colon")));
1347            }
1348            self.skip_whitespace_required()?;
1349
1350            let pe_kind = if self.peek() == Some(b'"') || self.peek() == Some(b'\'') {
1351                // Internal PE — parse and validate the value
1352                let value = self.parse_quoted_value()?;
1353                self.validate_entity_value(&value, true)?;
1354                Some(EntityKind::Internal(value))
1355            } else if self.looking_at(b"SYSTEM") {
1356                // External PE — parse external ID
1357                self.expect_str(b"SYSTEM")?;
1358                self.skip_whitespace_required()?;
1359                let system_id = self.parse_quoted_value()?;
1360                Some(EntityKind::External {
1361                    system_id,
1362                    public_id: None,
1363                })
1364            } else if self.looking_at(b"PUBLIC") {
1365                self.expect_str(b"PUBLIC")?;
1366                self.skip_whitespace_required()?;
1367                let public_id = self.parse_quoted_value()?;
1368                self.validate_public_id(&public_id)?;
1369                self.skip_whitespace_required()?;
1370                let system_id = self.parse_quoted_value()?;
1371                Some(EntityKind::External {
1372                    system_id,
1373                    public_id: Some(public_id),
1374                })
1375            } else {
1376                return Err(self.fatal("expected entity value or external ID"));
1377            };
1378
1379            self.skip_whitespace();
1380            // Reject NDATA on parameter entities (XML 1.0 §4.2.2)
1381            if self.looking_at(b"NDATA") {
1382                return Err(self.fatal("NDATA annotation is not allowed on parameter entities"));
1383            }
1384            self.expect_byte(b'>')?;
1385
1386            // Store PE declaration (first declaration wins per XML 1.0 §4.2)
1387            if let Some(kind) = pe_kind {
1388                self.dtd
1389                    .param_entities
1390                    .entry(pe_name)
1391                    .or_insert(EntityDecl {
1392                        name: String::new(),
1393                        kind,
1394                    });
1395            }
1396            return Ok(());
1397        }
1398
1399        let name = self.parse_name()?;
1400        // Namespaces in XML 1.0: entity names must be NCNames (no colons).
1401        if name.contains(':') {
1402            return Err(self.fatal(format!("entity name '{name}' must not contain a colon")));
1403        }
1404        self.skip_whitespace_required()?;
1405
1406        let is_parameter_entity = false;
1407        let kind = if self.peek() == Some(b'"') || self.peek() == Some(b'\'') {
1408            // Internal entity
1409            let value = self.parse_quoted_value()?;
1410            self.validate_entity_value(&value, is_parameter_entity)?;
1411            EntityKind::Internal(value)
1412        } else if self.looking_at(b"SYSTEM") {
1413            self.expect_str(b"SYSTEM")?;
1414            self.skip_whitespace_required()?;
1415            let system_id = self.parse_quoted_value()?;
1416            EntityKind::External {
1417                system_id,
1418                public_id: None,
1419            }
1420        } else if self.looking_at(b"PUBLIC") {
1421            self.expect_str(b"PUBLIC")?;
1422            self.skip_whitespace_required()?;
1423            let public_id = self.parse_quoted_value()?;
1424            self.validate_public_id(&public_id)?;
1425            self.skip_whitespace_required()?;
1426            let system_id = self.parse_quoted_value()?;
1427            EntityKind::External {
1428                system_id,
1429                public_id: Some(public_id),
1430            }
1431        } else {
1432            return Err(self.fatal("expected entity value or external ID"));
1433        };
1434
1435        let had_ws = self.skip_whitespace();
1436
1437        // Handle optional NDATA for unparsed external entities (XML 1.0 §4.2.2)
1438        if self.looking_at(b"NDATA") {
1439            // NDATA is only allowed on external entities
1440            if matches!(kind, EntityKind::Internal(_)) {
1441                return Err(self.fatal("NDATA annotation is not allowed on internal entities"));
1442            }
1443            // Whitespace is required before NDATA (XML 1.0 §4.2.2)
1444            if !had_ws {
1445                return Err(self.fatal("whitespace required before NDATA"));
1446            }
1447            self.expect_str(b"NDATA")?;
1448            self.skip_whitespace_required()?;
1449            let _notation_name = self.parse_name()?;
1450            self.skip_whitespace();
1451        }
1452
1453        self.expect_byte(b'>')?;
1454
1455        // Per XML 1.0 §4.2, the first entity declaration is binding;
1456        // subsequent declarations of the same entity are ignored.
1457        let decl = EntityDecl {
1458            name: name.clone(),
1459            kind,
1460        };
1461        self.dtd
1462            .declarations
1463            .push(DtdDeclaration::Entity(decl.clone()));
1464        self.dtd.entities.entry(name).or_insert(decl);
1465        Ok(())
1466    }
1467
1468    // --- NOTATION declaration ---
1469    // See XML 1.0 §4.7: [82] NotationDecl
1470
1471    fn parse_notation_decl(&mut self) -> Result<(), ParseError> {
1472        self.expect_str(b"<!NOTATION")?;
1473        self.skip_whitespace_required()?;
1474        let name = self.parse_name()?;
1475        // Namespaces in XML 1.0: notation names must be NCNames (no colons).
1476        if name.contains(':') {
1477            return Err(self.fatal(format!("notation name '{name}' must not contain a colon")));
1478        }
1479        self.skip_whitespace_required()?;
1480
1481        let (system_id, public_id) = if self.looking_at(b"SYSTEM") {
1482            self.expect_str(b"SYSTEM")?;
1483            self.skip_whitespace_required()?;
1484            let sid = self.parse_quoted_value()?;
1485            (Some(sid), None)
1486        } else if self.looking_at(b"PUBLIC") {
1487            self.expect_str(b"PUBLIC")?;
1488            self.skip_whitespace_required()?;
1489            let pid = self.parse_quoted_value()?;
1490            self.validate_public_id(&pid)?;
1491            // System ID is optional for notations with PUBLIC
1492            self.skip_whitespace();
1493            let sid = if self.peek() == Some(b'"') || self.peek() == Some(b'\'') {
1494                Some(self.parse_quoted_value()?)
1495            } else {
1496                None
1497            };
1498            (sid, Some(pid))
1499        } else {
1500            return Err(self.fatal("expected SYSTEM or PUBLIC in NOTATION declaration"));
1501        };
1502
1503        self.skip_whitespace();
1504        self.expect_byte(b'>')?;
1505
1506        let decl = NotationDecl {
1507            name: name.clone(),
1508            system_id,
1509            public_id,
1510        };
1511        self.dtd
1512            .declarations
1513            .push(DtdDeclaration::Notation(decl.clone()));
1514        self.dtd.notations.insert(name, decl);
1515        Ok(())
1516    }
1517
1518    // --- Comment, PI, and PE-reference helpers ---
1519
1520    /// Parses a comment and stores it as a `DtdDeclaration::Comment`.
1521    fn parse_comment_decl(&mut self) -> Result<(), ParseError> {
1522        self.expect_str(b"<!--")?;
1523        let start = self.pos;
1524        loop {
1525            if self.at_end() {
1526                return Err(self.fatal("unexpected end of input in comment"));
1527            }
1528            if self.looking_at(b"-->") {
1529                let text = std::str::from_utf8(&self.input[start..self.pos])
1530                    .unwrap_or("")
1531                    .to_string();
1532                self.advance(3);
1533                self.dtd.declarations.push(DtdDeclaration::Comment(text));
1534                return Ok(());
1535            }
1536            self.advance(1);
1537        }
1538    }
1539
1540    /// Parses a processing instruction and stores it as a `DtdDeclaration::Pi`.
1541    fn parse_pi_decl(&mut self) -> Result<(), ParseError> {
1542        self.expect_str(b"<?")?;
1543
1544        // Parse and validate the PI target name (XML 1.0 §2.6)
1545        let target = self.parse_name()?;
1546
1547        // Reject <?xml ...?> inside DTD (XML 1.0 §2.8)
1548        if target.eq_ignore_ascii_case("xml") {
1549            return Err(self.fatal("XML declaration is not allowed inside DTD"));
1550        }
1551
1552        // If we're immediately at ?>, no data — that's fine
1553        if self.looking_at(b"?>") {
1554            self.advance(2);
1555            self.dtd.declarations.push(DtdDeclaration::Pi(target, None));
1556            return Ok(());
1557        }
1558
1559        // If there's data, whitespace is required between target and data
1560        let is_ws = self
1561            .peek()
1562            .is_some_and(|b| b == b' ' || b == b'\t' || b == b'\r' || b == b'\n');
1563        if !is_ws {
1564            return Err(self.fatal("space required between PI target and data"));
1565        }
1566
1567        let start = self.pos;
1568        loop {
1569            if self.at_end() {
1570                return Err(self.fatal("unexpected end of input in processing instruction"));
1571            }
1572            if self.looking_at(b"?>") {
1573                let data = std::str::from_utf8(&self.input[start..self.pos])
1574                    .unwrap_or("")
1575                    .trim()
1576                    .to_string();
1577                self.advance(2);
1578                let data = if data.is_empty() { None } else { Some(data) };
1579                self.dtd.declarations.push(DtdDeclaration::Pi(target, data));
1580                return Ok(());
1581            }
1582            self.advance(1);
1583        }
1584    }
1585
1586    fn skip_pe_reference(&mut self) -> Result<(), ParseError> {
1587        self.expect_byte(b'%')?;
1588        // Read the name
1589        let _name = self.parse_name()?;
1590        self.expect_byte(b';')?;
1591        Ok(())
1592    }
1593
1594    // --- Name / token parsing ---
1595
1596    fn parse_name(&mut self) -> Result<String, ParseError> {
1597        if self.pos >= self.input.len() {
1598            return Err(self.fatal("expected name, found end of input"));
1599        }
1600
1601        let start = self.pos;
1602        let first = self.input[self.pos];
1603
1604        // ASCII fast path
1605        if is_ascii_name_start(first) {
1606            self.pos += 1;
1607            self.column += 1;
1608            while self.pos < self.input.len() && is_ascii_name_char(self.input[self.pos]) {
1609                self.pos += 1;
1610                self.column += 1;
1611            }
1612            if self.pos >= self.input.len() || self.input[self.pos] < 0x80 {
1613                let name = std::str::from_utf8(&self.input[start..self.pos])
1614                    .map_err(|_| self.fatal("invalid UTF-8 in name"))?;
1615                return Ok(name.to_string());
1616            }
1617            // Fall through to slow path for non-ASCII continuation
1618        } else {
1619            let ch = self
1620                .peek_char()
1621                .ok_or_else(|| self.fatal("expected name"))?;
1622            if !is_name_start_char(ch) {
1623                return Err(self.fatal(format!("invalid name start character: '{ch}'")));
1624            }
1625            self.advance_char(ch);
1626        }
1627
1628        while let Some(ch) = self.peek_char() {
1629            if is_name_char(ch) {
1630                self.advance_char(ch);
1631            } else {
1632                break;
1633            }
1634        }
1635
1636        let name = std::str::from_utf8(&self.input[start..self.pos])
1637            .map_err(|_| self.fatal("invalid UTF-8 in name"))?;
1638        Ok(name.to_string())
1639    }
1640
1641    fn parse_nmtoken(&mut self) -> Result<String, ParseError> {
1642        if self.pos >= self.input.len() {
1643            return Err(self.fatal("expected NMTOKEN, found end of input"));
1644        }
1645
1646        let start = self.pos;
1647        let first = self.input[self.pos];
1648
1649        // ASCII fast path
1650        if is_ascii_name_char(first) {
1651            self.pos += 1;
1652            self.column += 1;
1653            while self.pos < self.input.len() && is_ascii_name_char(self.input[self.pos]) {
1654                self.pos += 1;
1655                self.column += 1;
1656            }
1657            if self.pos >= self.input.len() || self.input[self.pos] < 0x80 {
1658                let token = std::str::from_utf8(&self.input[start..self.pos])
1659                    .map_err(|_| self.fatal("invalid UTF-8 in NMTOKEN"))?;
1660                return Ok(token.to_string());
1661            }
1662            // Fall through to slow path
1663        } else {
1664            let ch = self
1665                .peek_char()
1666                .ok_or_else(|| self.fatal("expected NMTOKEN"))?;
1667            if !is_name_char(ch) {
1668                return Err(self.fatal(format!("invalid NMTOKEN character: '{ch}'")));
1669            }
1670            self.advance_char(ch);
1671        }
1672
1673        while let Some(ch) = self.peek_char() {
1674            if is_name_char(ch) {
1675                self.advance_char(ch);
1676            } else {
1677                break;
1678            }
1679        }
1680
1681        let token = std::str::from_utf8(&self.input[start..self.pos])
1682            .map_err(|_| self.fatal("invalid UTF-8 in NMTOKEN"))?;
1683        Ok(token.to_string())
1684    }
1685
1686    /// Validates an entity value per XML 1.0 §4.3.2 `EntityValue` production.
1687    ///
1688    /// Checks that `&` is only used in valid entity/character references,
1689    /// and that `%` is not present in general entity values.
1690    #[allow(clippy::too_many_lines)]
1691    fn validate_entity_value(
1692        &self,
1693        value: &str,
1694        is_parameter_entity: bool,
1695    ) -> Result<(), ParseError> {
1696        // First validate all characters are valid XML chars.
1697        for c in value.chars() {
1698            if !crate::parser::input::is_xml_char(c) {
1699                return Err(self.fatal(format!(
1700                    "invalid XML character U+{:04X} in entity value",
1701                    c as u32
1702                )));
1703            }
1704        }
1705
1706        // Text declarations (<?xml ...?>) are forbidden in internal
1707        // entities (XML 1.0 §4.3.1). They may only appear at the start
1708        // of external parsed entities.
1709        if value.starts_with("<?xml") {
1710            let after = value.as_bytes().get(5).copied();
1711            if after.map_or(true, |b| b == b' ' || b == b'\t' || b == b'?') {
1712                return Err(self.fatal("text declaration is not allowed in internal entity value"));
1713            }
1714        }
1715
1716        let bytes = value.as_bytes();
1717        let mut i = 0;
1718        while i < bytes.len() {
1719            match bytes[i] {
1720                b'&' => {
1721                    // Must be a valid reference: &name; or &#N; or &#xH;
1722                    i += 1;
1723                    if i >= bytes.len() {
1724                        return Err(self.fatal("incomplete reference in entity value: '&' at end"));
1725                    }
1726                    if bytes[i] == b'#' {
1727                        // Character reference — parse and validate
1728                        i += 1;
1729                        let char_val = if i < bytes.len() && bytes[i] == b'x' {
1730                            i += 1;
1731                            let hex_start = i;
1732                            if i >= bytes.len() || !bytes[i].is_ascii_hexdigit() {
1733                                return Err(
1734                                    self.fatal("malformed character reference in entity value")
1735                                );
1736                            }
1737                            while i < bytes.len() && bytes[i].is_ascii_hexdigit() {
1738                                i += 1;
1739                            }
1740                            let hex_str = std::str::from_utf8(&bytes[hex_start..i]).unwrap_or("");
1741                            u32::from_str_radix(hex_str, 16).unwrap_or(0)
1742                        } else {
1743                            let dec_start = i;
1744                            if i >= bytes.len() || !bytes[i].is_ascii_digit() {
1745                                return Err(
1746                                    self.fatal("malformed character reference in entity value")
1747                                );
1748                            }
1749                            while i < bytes.len() && bytes[i].is_ascii_digit() {
1750                                i += 1;
1751                            }
1752                            let dec_str = std::str::from_utf8(&bytes[dec_start..i]).unwrap_or("");
1753                            dec_str.parse::<u32>().unwrap_or(0)
1754                        };
1755                        if i >= bytes.len() || bytes[i] != b';' {
1756                            return Err(
1757                                self.fatal("incomplete character reference in entity value")
1758                            );
1759                        }
1760                        i += 1;
1761                        // Validate the referenced character is a valid XML char
1762                        if let Some(c) = char::from_u32(char_val) {
1763                            if !crate::parser::input::is_xml_char(c) {
1764                                return Err(self.fatal(format!(
1765                                    "character reference &#x{char_val:X}; refers to invalid XML character"
1766                                )));
1767                            }
1768                        } else {
1769                            return Err(self.fatal(format!(
1770                                "character reference value {char_val} is not a valid Unicode code point"
1771                            )));
1772                        }
1773                    } else {
1774                        // Entity reference — must be Name followed by ';'
1775                        let start = i;
1776                        while i < bytes.len()
1777                            && bytes[i] != b';'
1778                            && bytes[i] != b'&'
1779                            && !bytes[i].is_ascii_whitespace()
1780                        {
1781                            i += 1;
1782                        }
1783                        if i == start || i >= bytes.len() || bytes[i] != b';' {
1784                            return Err(self.fatal("malformed entity reference in entity value"));
1785                        }
1786                        // Validate the entity name starts with a NameStartChar
1787                        let name_str = std::str::from_utf8(&bytes[start..i]).unwrap_or("");
1788                        if let Some(first_char) = name_str.chars().next() {
1789                            if !is_name_start_char(first_char) {
1790                                return Err(self.fatal(format!(
1791                                    "entity reference name must start with a letter or underscore, found '{first_char}'"
1792                                )));
1793                            }
1794                        }
1795                        i += 1;
1796                    }
1797                }
1798                b'%' if !is_parameter_entity => {
1799                    // '%' is not allowed in general entity values (XML 1.0 §4.3.2)
1800                    return Err(self.fatal("'%' not allowed in general entity value"));
1801                }
1802                b'%' if is_parameter_entity => {
1803                    // WFC: PEs in Internal Subset — PE references MUST NOT
1804                    // occur within markup declarations in the internal subset
1805                    // (XML 1.0 §2.8).
1806                    i += 1;
1807                    if i < bytes.len() {
1808                        let first = bytes[i];
1809                        if first.is_ascii_alphabetic() || first == b'_' || first == b':' {
1810                            return Err(self.fatal(
1811                                "parameter entity reference not allowed within \
1812                                 markup declaration in internal subset",
1813                            ));
1814                        }
1815                    }
1816                }
1817                _ => {
1818                    i += 1;
1819                }
1820            }
1821        }
1822        Ok(())
1823    }
1824
1825    /// Validates an attribute default value per XML 1.0 §3.3.2.
1826    ///
1827    /// Checks that entity references within the default value refer to
1828    /// entities that have already been declared (WFC: Entity Declared).
1829    /// Also rejects `<` in default values (WFC: No `<` in Attribute Values).
1830    fn validate_default_value(&self, value: &str) -> Result<(), ParseError> {
1831        let bytes = value.as_bytes();
1832        let mut i = 0;
1833        while i < bytes.len() {
1834            match bytes[i] {
1835                b'<' => {
1836                    return Err(self.fatal("'<' not allowed in attribute default value"));
1837                }
1838                b'&' => {
1839                    i += 1;
1840                    if i < bytes.len() && bytes[i] == b'#' {
1841                        // Character reference — skip over it
1842                        i += 1;
1843                        while i < bytes.len() && bytes[i] != b';' {
1844                            i += 1;
1845                        }
1846                        if i < bytes.len() {
1847                            i += 1;
1848                        }
1849                    } else {
1850                        // Entity reference — extract name and check declaration
1851                        let start = i;
1852                        while i < bytes.len() && bytes[i] != b';' {
1853                            i += 1;
1854                        }
1855                        if i > start && i < bytes.len() {
1856                            let name = std::str::from_utf8(&bytes[start..i]).unwrap_or("");
1857                            // Built-in entities are always available
1858                            let is_builtin = matches!(name, "amp" | "lt" | "gt" | "apos" | "quot");
1859                            if !is_builtin && !self.dtd.entities.contains_key(name) {
1860                                return Err(self.fatal(format!(
1861                                    "undeclared entity '{name}' in attribute default value"
1862                                )));
1863                            }
1864                        }
1865                        if i < bytes.len() {
1866                            i += 1;
1867                        }
1868                    }
1869                }
1870                _ => {
1871                    i += 1;
1872                }
1873            }
1874        }
1875        Ok(())
1876    }
1877
1878    /// Validates that a public ID string contains only valid `PubidChar`s
1879    /// per XML 1.0 §2.3 `[13]`.
1880    fn validate_public_id(&self, pid: &str) -> Result<(), ParseError> {
1881        for c in pid.chars() {
1882            let valid = matches!(c,
1883                ' ' | '\r' | '\n' |
1884                'a'..='z' | 'A'..='Z' | '0'..='9' |
1885                '-' | '\'' | '(' | ')' | '+' | ',' | '.' | '/' | ':' |
1886                '=' | '?' | ';' | '!' | '*' | '#' | '@' | '$' | '_' | '%'
1887            );
1888            if !valid {
1889                return Err(self.fatal(format!(
1890                    "invalid character in public ID: U+{:04X}",
1891                    c as u32
1892                )));
1893            }
1894        }
1895        Ok(())
1896    }
1897
1898    fn parse_quoted_value(&mut self) -> Result<String, ParseError> {
1899        let quote = self.next_byte()?;
1900        if quote != b'"' && quote != b'\'' {
1901            return Err(self.fatal("expected quoted value"));
1902        }
1903        let start = self.pos;
1904        while !self.at_end() && self.peek() != Some(quote) {
1905            self.advance(1);
1906        }
1907        let value = std::str::from_utf8(&self.input[start..self.pos])
1908            .map_err(|_| self.fatal("invalid UTF-8 in quoted value"))?
1909            .to_string();
1910        if self.at_end() {
1911            return Err(self.fatal("unexpected end of input in quoted value"));
1912        }
1913        self.advance(1); // consume closing quote
1914        Ok(value)
1915    }
1916
1917    // --- Low-level input helpers ---
1918
1919    fn location(&self) -> SourceLocation {
1920        SourceLocation {
1921            line: self.line,
1922            column: self.column,
1923            byte_offset: self.pos,
1924        }
1925    }
1926
1927    fn at_end(&self) -> bool {
1928        self.pos >= self.input.len()
1929    }
1930
1931    fn peek(&self) -> Option<u8> {
1932        self.input.get(self.pos).copied()
1933    }
1934
1935    fn peek_char(&self) -> Option<char> {
1936        if self.pos >= self.input.len() {
1937            return None;
1938        }
1939        let first = self.input[self.pos];
1940        // Fast path: ASCII
1941        if first < 0x80 {
1942            return Some(first as char);
1943        }
1944        // Slow path: multi-byte UTF-8 — decode only the needed bytes
1945        let len = match first {
1946            0xC0..=0xDF => 2,
1947            0xE0..=0xEF => 3,
1948            0xF0..=0xF7 => 4,
1949            _ => return None,
1950        };
1951        let remaining = &self.input[self.pos..];
1952        if remaining.len() < len {
1953            return None;
1954        }
1955        std::str::from_utf8(&remaining[..len])
1956            .ok()
1957            .and_then(|s| s.chars().next())
1958    }
1959
1960    fn advance(&mut self, count: usize) {
1961        for _ in 0..count {
1962            if self.pos < self.input.len() {
1963                if self.input[self.pos] == b'\n' {
1964                    self.line += 1;
1965                    self.column = 1;
1966                } else {
1967                    self.column += 1;
1968                }
1969                self.pos += 1;
1970            }
1971        }
1972    }
1973
1974    fn advance_char(&mut self, ch: char) {
1975        let len = ch.len_utf8();
1976        if ch == '\n' {
1977            self.line += 1;
1978            self.column = 1;
1979        } else {
1980            self.column += 1;
1981        }
1982        self.pos += len;
1983    }
1984
1985    fn next_byte(&mut self) -> Result<u8, ParseError> {
1986        if self.at_end() {
1987            return Err(self.fatal("unexpected end of input"));
1988        }
1989        let b = self.input[self.pos];
1990        self.advance(1);
1991        Ok(b)
1992    }
1993
1994    fn expect_byte(&mut self, expected: u8) -> Result<(), ParseError> {
1995        let b = self.next_byte()?;
1996        if b == expected {
1997            Ok(())
1998        } else {
1999            Err(self.fatal(format!(
2000                "expected '{}', found '{}'",
2001                expected as char, b as char
2002            )))
2003        }
2004    }
2005
2006    fn expect_str(&mut self, expected: &[u8]) -> Result<(), ParseError> {
2007        for &b in expected {
2008            self.expect_byte(b)?;
2009        }
2010        Ok(())
2011    }
2012
2013    fn looking_at(&self, s: &[u8]) -> bool {
2014        self.pos + s.len() <= self.input.len() && self.input[self.pos..].starts_with(s)
2015    }
2016
2017    fn skip_whitespace(&mut self) -> bool {
2018        let start = self.pos;
2019        while let Some(b) = self.peek() {
2020            if b == b' ' || b == b'\t' || b == b'\r' || b == b'\n' {
2021                self.advance(1);
2022            } else {
2023                break;
2024            }
2025        }
2026        self.pos > start
2027    }
2028
2029    fn skip_whitespace_required(&mut self) -> Result<(), ParseError> {
2030        if !self.skip_whitespace() {
2031            return Err(self.fatal("whitespace required"));
2032        }
2033        Ok(())
2034    }
2035
2036    fn fatal(&self, message: impl Into<String>) -> ParseError {
2037        ParseError {
2038            message: message.into(),
2039            location: self.location(),
2040            diagnostics: Vec::new(),
2041        }
2042    }
2043}
2044
2045// ---------------------------------------------------------------------------
2046// Entity value helper functions (used by DTD parser and XML parser)
2047// ---------------------------------------------------------------------------
2048
2049/// Expands only character references in a string, leaving entity references
2050/// as-is. Returns the expanded text.
2051///
2052/// Used to compute the replacement text of an internal entity per XML 1.0
2053/// §4.5: character references are expanded at declaration time, while
2054/// entity references are left for expansion at reference time.
2055pub(crate) fn expand_char_refs_only(value: &str) -> String {
2056    let bytes = value.as_bytes();
2057    let mut result = String::with_capacity(value.len());
2058    let mut i = 0;
2059    while i < bytes.len() {
2060        if bytes[i] == b'&' && i + 1 < bytes.len() && bytes[i + 1] == b'#' {
2061            i += 2;
2062            let char_val = if i < bytes.len() && bytes[i] == b'x' {
2063                i += 1;
2064                let start = i;
2065                while i < bytes.len() && bytes[i].is_ascii_hexdigit() {
2066                    i += 1;
2067                }
2068                let hex = std::str::from_utf8(&bytes[start..i]).unwrap_or("0");
2069                u32::from_str_radix(hex, 16).unwrap_or(0)
2070            } else {
2071                let start = i;
2072                while i < bytes.len() && bytes[i].is_ascii_digit() {
2073                    i += 1;
2074                }
2075                let dec = std::str::from_utf8(&bytes[start..i]).unwrap_or("0");
2076                dec.parse::<u32>().unwrap_or(0)
2077            };
2078            if i < bytes.len() && bytes[i] == b';' {
2079                i += 1;
2080            }
2081            if let Some(ch) = char::from_u32(char_val) {
2082                result.push(ch);
2083            }
2084        } else {
2085            // Copy one complete UTF-8 character
2086            let ch = value[i..].chars().next().unwrap_or('\u{FFFD}');
2087            result.push(ch);
2088            i += ch.len_utf8();
2089        }
2090    }
2091    result
2092}
2093
/// Replaces entity references (`&name;`) with spaces, leaving character
/// references (`&#...;`) and other text unchanged. Correctly handles
/// multi-byte UTF-8 characters.
///
/// Used to sanitize entity replacement text before fragment parsing so
/// that entity references (which are valid `Reference` productions in
/// content) don't cause undeclared-entity errors.
///
/// Only text shaped like a reference is replaced: `&`, a name-start
/// character, further name characters, then `;` immediately after the name.
/// A `&` followed by a name that is not directly closed by `;` is copied
/// through unchanged, so unrelated text up to some later `;` is never
/// swallowed. ASCII characters are classified exactly; every non-ASCII
/// character is treated as a name character without further checks.
pub(crate) fn replace_entity_refs(text: &str) -> String {
    /// ASCII name-start characters, or any byte of a multi-byte sequence.
    fn is_name_start_byte(b: u8) -> bool {
        !b.is_ascii() || b.is_ascii_alphabetic() || matches!(b, b'_' | b':')
    }
    /// ASCII name characters, or any byte of a multi-byte sequence.
    fn is_name_byte(b: u8) -> bool {
        !b.is_ascii() || b.is_ascii_alphanumeric() || matches!(b, b'_' | b':' | b'-' | b'.')
    }

    let bytes = text.as_bytes();
    let mut result = String::with_capacity(text.len());
    let mut i = 0;
    while i < bytes.len() {
        let starts_name = bytes[i] == b'&'
            && bytes.get(i + 1).map_or(false, |&b| is_name_start_byte(b));
        if starts_name {
            // The name ends at the first ASCII byte that cannot be part of a
            // name (or at the end of input), which is always a char boundary.
            let name_end = bytes[i + 1..]
                .iter()
                .position(|&b| !is_name_byte(b))
                .map_or(bytes.len(), |offset| i + 1 + offset);
            if bytes.get(name_end) == Some(&b';') {
                // Complete entity reference — replace with space
                result.push(' ');
                i = name_end + 1;
            } else {
                // Not a reference after all — keep the original text
                result.push_str(&text[i..name_end]);
                i = name_end;
            }
        } else {
            // Copy one complete UTF-8 character
            let ch = text[i..].chars().next().unwrap_or('\u{FFFD}');
            result.push(ch);
            i += ch.len_utf8();
        }
    }
    result
}
2138
2139// ---------------------------------------------------------------------------
2140// XML Name character classes (shared with parser/xml.rs)
2141// ---------------------------------------------------------------------------
2142
/// Returns `true` for the ASCII bytes that may begin a name:
/// letters, `_`, and `:`.
fn is_ascii_name_start(b: u8) -> bool {
    matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'_' | b':')
}
2146
/// Returns `true` for the ASCII bytes that may appear inside a name:
/// letters, digits, `_`, `:`, `-`, and `.`.
fn is_ascii_name_char(b: u8) -> bool {
    matches!(
        b,
        b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z' | b'_' | b':' | b'-' | b'.'
    )
}
2150
/// Returns `true` when `c` falls in one of the code-point ranges accepted
/// as the first character of a name.
fn is_name_start_char(c: char) -> bool {
    // Inclusive (first, last) code-point ranges.
    const RANGES: &[(u32, u32)] = &[
        (0x3A, 0x3A), // ':'
        (0x41, 0x5A), // 'A'..='Z'
        (0x5F, 0x5F), // '_'
        (0x61, 0x7A), // 'a'..='z'
        (0xC0, 0xD6),
        (0xD8, 0xF6),
        (0xF8, 0x2FF),
        (0x370, 0x37D),
        (0x37F, 0x1FFF),
        (0x200C, 0x200D),
        (0x2070, 0x218F),
        (0x2C00, 0x2FEF),
        (0x3001, 0xD7FF),
        (0xF900, 0xFDCF),
        (0xFDF0, 0xFFFD),
        (0x10000, 0xEFFFF),
    ];

    let code = u32::from(c);
    RANGES
        .iter()
        .any(|&(first, last)| first <= code && code <= last)
}
2162
2163fn is_name_char(c: char) -> bool {
2164    is_name_start_char(c)
2165        || matches!(c,
2166            '-' | '.' | '0'..='9' | '\u{B7}' |
2167            '\u{300}'..='\u{36F}' | '\u{203F}'..='\u{2040}'
2168        )
2169}
2170
2171// ---------------------------------------------------------------------------
2172// DTD Validator
2173// ---------------------------------------------------------------------------
2174
2175/// Validates a document against a DTD.
2176///
2177/// Checks that the document conforms to the element declarations, attribute
2178/// declarations, and other constraints in the DTD. Returns a
2179/// [`ValidationResult`] with any errors and warnings.
2180///
2181/// # Checks Performed
2182///
2183/// - Root element name matches DOCTYPE declaration
2184/// - Element content matches declared content models
2185/// - Required attributes are present
2186/// - Attribute values match their declared types (ID uniqueness, IDREF targets,
2187///   enumeration values)
2188/// - No undeclared elements (when the DTD declares elements)
2189/// - No undeclared attributes (when the DTD declares attributes for that element)
2190/// - `#FIXED` attribute values match the declared value
2191///
2192/// # Examples
2193///
2194/// ```
2195/// use xmloxide::Document;
2196/// use xmloxide::validation::dtd::{parse_dtd, validate};
2197///
2198/// let dtd = parse_dtd("<!ELEMENT root (#PCDATA)>").unwrap();
2199/// let mut doc = Document::parse_str("<!DOCTYPE root><root>hello</root>").unwrap();
2200/// let result = validate(&mut doc, &dtd);
2201/// assert!(result.is_valid);
2202/// ```
2203pub fn validate(doc: &mut Document, dtd: &Dtd) -> ValidationResult {
2204    let mut errors = Vec::new();
2205    let mut warnings = Vec::new();
2206    let mut id_values: HashSet<String> = HashSet::new();
2207    let mut idref_values: Vec<String> = Vec::new();
2208
2209    // Check root element name against DOCTYPE
2210    check_root_element(doc, dtd, &mut errors);
2211
2212    // Walk all element nodes and validate
2213    if let Some(root_elem) = doc.root_element() {
2214        validate_element_recursive(
2215            doc,
2216            dtd,
2217            root_elem,
2218            &mut errors,
2219            &mut warnings,
2220            &mut id_values,
2221            &mut idref_values,
2222        );
2223    }
2224
2225    // Check that all IDREF values point to existing IDs
2226    for idref in &idref_values {
2227        if !id_values.contains(idref) {
2228            errors.push(ValidationError {
2229                message: format!("IDREF '{idref}' does not match any ID in the document"),
2230                line: None,
2231                column: None,
2232            });
2233        }
2234    }
2235
2236    let is_valid = errors.is_empty();
2237    ValidationResult {
2238        is_valid,
2239        errors,
2240        warnings,
2241    }
2242}
2243
2244/// Checks that the root element name matches the DOCTYPE name.
2245fn check_root_element(doc: &Document, _dtd: &Dtd, errors: &mut Vec<ValidationError>) {
2246    // Find the DOCTYPE node to get the declared root name
2247    let doctype_name = doc.children(doc.root()).find_map(|id| {
2248        if let NodeKind::DocumentType { ref name, .. } = doc.node(id).kind {
2249            Some(name.clone())
2250        } else {
2251            None
2252        }
2253    });
2254
2255    if let Some(ref expected_name) = doctype_name {
2256        if let Some(root_elem) = doc.root_element() {
2257            if let Some(actual_name) = doc.node_name(root_elem) {
2258                if actual_name != expected_name {
2259                    errors.push(ValidationError {
2260                        message: format!(
2261                            "root element '{actual_name}' does not match \
2262                             DOCTYPE name '{expected_name}'"
2263                        ),
2264                        line: None,
2265                        column: None,
2266                    });
2267                }
2268            }
2269        }
2270    }
2271}
2272
2273/// Recursively validates an element and its descendants.
2274#[allow(clippy::too_many_arguments)]
2275fn validate_element_recursive(
2276    doc: &mut Document,
2277    dtd: &Dtd,
2278    node_id: NodeId,
2279    errors: &mut Vec<ValidationError>,
2280    warnings: &mut Vec<ValidationError>,
2281    id_values: &mut HashSet<String>,
2282    idref_values: &mut Vec<String>,
2283) {
2284    let elem_name = match doc.node_name(node_id) {
2285        Some(name) => name.to_string(),
2286        None => return,
2287    };
2288
2289    // Check if element is declared
2290    let has_element_decls = !dtd.elements.is_empty();
2291    if has_element_decls && !dtd.elements.contains_key(&elem_name) {
2292        errors.push(ValidationError {
2293            message: format!("element '{elem_name}' is not declared in the DTD"),
2294            line: None,
2295            column: None,
2296        });
2297    }
2298
2299    // Check content model
2300    if let Some(elem_decl) = dtd.elements.get(&elem_name) {
2301        validate_content_model(doc, node_id, &elem_name, &elem_decl.content_model, errors);
2302    }
2303
2304    // Check attributes
2305    validate_attributes(
2306        doc,
2307        dtd,
2308        node_id,
2309        &elem_name,
2310        errors,
2311        warnings,
2312        id_values,
2313        idref_values,
2314    );
2315
2316    // Collect child element IDs first to avoid borrow conflicts
2317    let child_ids: Vec<NodeId> = doc
2318        .children(node_id)
2319        .filter(|&child_id| matches!(doc.node(child_id).kind, NodeKind::Element { .. }))
2320        .collect();
2321
2322    // Recurse into child elements
2323    for child_id in child_ids {
2324        validate_element_recursive(
2325            doc,
2326            dtd,
2327            child_id,
2328            errors,
2329            warnings,
2330            id_values,
2331            idref_values,
2332        );
2333    }
2334}
2335
2336/// Validates that an element's children match its declared content model.
2337fn validate_content_model(
2338    doc: &Document,
2339    node_id: NodeId,
2340    elem_name: &str,
2341    model: &ContentModel,
2342    errors: &mut Vec<ValidationError>,
2343) {
2344    match model {
2345        ContentModel::Empty => {
2346            // No children at all
2347            let has_content = doc.children(node_id).any(|child| {
2348                matches!(
2349                    doc.node(child).kind,
2350                    NodeKind::Element { .. } | NodeKind::Text { .. } | NodeKind::CData { .. }
2351                )
2352            });
2353            if has_content {
2354                errors.push(ValidationError {
2355                    message: format!(
2356                        "element '{elem_name}' is declared EMPTY \
2357                         but has content"
2358                    ),
2359                    line: None,
2360                    column: None,
2361                });
2362            }
2363        }
2364        ContentModel::Any => {
2365            // Anything is valid
2366        }
2367        ContentModel::Mixed(allowed_names) => {
2368            // Text is always allowed. Check that element children are in the allowed list.
2369            for child_id in doc.children(node_id) {
2370                if let NodeKind::Element { ref name, .. } = doc.node(child_id).kind {
2371                    if !allowed_names.contains(name) {
2372                        errors.push(ValidationError {
2373                            message: format!(
2374                                "element '{name}' is not allowed in mixed content \
2375                                 of '{elem_name}' (allowed: #PCDATA{})",
2376                                if allowed_names.is_empty() {
2377                                    String::new()
2378                                } else {
2379                                    format!("|{}", allowed_names.join("|"))
2380                                }
2381                            ),
2382                            line: None,
2383                            column: None,
2384                        });
2385                    }
2386                }
2387            }
2388        }
2389        ContentModel::Children(spec) => {
2390            // Collect element child names (ignore text, comments, PIs)
2391            let child_names: Vec<String> = doc
2392                .children(node_id)
2393                .filter_map(|child_id| {
2394                    if let NodeKind::Element { ref name, .. } = doc.node(child_id).kind {
2395                        Some(name.clone())
2396                    } else {
2397                        None
2398                    }
2399                })
2400                .collect();
2401
2402            // Check for text content in element-only content model
2403            let has_text = doc.children(node_id).any(|child_id| {
2404                if let NodeKind::Text { ref content } = doc.node(child_id).kind {
2405                    !content.trim().is_empty()
2406                } else {
2407                    matches!(doc.node(child_id).kind, NodeKind::CData { .. })
2408                }
2409            });
2410
2411            if has_text {
2412                errors.push(ValidationError {
2413                    message: format!(
2414                        "element '{elem_name}' has element-only content model \
2415                         but contains text"
2416                    ),
2417                    line: None,
2418                    column: None,
2419                });
2420            }
2421
2422            // Match the sequence of child element names against the content spec
2423            let consumed = match_content_spec(spec, &child_names, 0);
2424            match consumed {
2425                Some(n) if n == child_names.len() => {
2426                    // Perfect match
2427                }
2428                _ => {
2429                    errors.push(ValidationError {
2430                        message: format!(
2431                            "element '{elem_name}' content does not match \
2432                             declared content model {model}; \
2433                             found children: [{}]",
2434                            child_names.join(", ")
2435                        ),
2436                        line: None,
2437                        column: None,
2438                    });
2439                }
2440            }
2441        }
2442    }
2443}
2444
2445/// Matches a content spec against a slice of element names starting at `pos`.
2446///
2447/// Returns `Some(count)` if the spec matches, consuming `count` names from
2448/// position `pos`. Returns `None` if no match is possible.
2449fn match_content_spec(spec: &ContentSpec, names: &[String], pos: usize) -> Option<usize> {
2450    match &spec.kind {
2451        ContentSpecKind::Name(expected) => match_with_occurrence(
2452            |all_names, p| {
2453                if p < all_names.len() && all_names[p] == *expected {
2454                    Some(1)
2455                } else {
2456                    None
2457                }
2458            },
2459            names,
2460            pos,
2461            spec.occurrence,
2462        ),
2463        ContentSpecKind::Seq(items) => match_with_occurrence(
2464            |all_names, p| {
2465                let mut current = p;
2466                for item in items {
2467                    match match_content_spec(item, all_names, current) {
2468                        Some(consumed) => current += consumed,
2469                        None => return None,
2470                    }
2471                }
2472                Some(current - p)
2473            },
2474            names,
2475            pos,
2476            spec.occurrence,
2477        ),
2478        ContentSpecKind::Choice(items) => match_with_occurrence(
2479            |all_names, p| {
2480                for item in items {
2481                    if let Some(consumed) = match_content_spec(item, all_names, p) {
2482                        return Some(consumed);
2483                    }
2484                }
2485                None
2486            },
2487            names,
2488            pos,
2489            spec.occurrence,
2490        ),
2491    }
2492}
2493
2494/// Applies occurrence matching around a base matcher function.
2495///
2496/// The `base_match` function attempts a single match at a given position,
2497/// returning `Some(consumed)` on success.
2498fn match_with_occurrence(
2499    base_match: impl Fn(&[String], usize) -> Option<usize>,
2500    names: &[String],
2501    pos: usize,
2502    occurrence: Occurrence,
2503) -> Option<usize> {
2504    match occurrence {
2505        Occurrence::Once => base_match(names, pos),
2506        Occurrence::Optional => {
2507            // Try matching once; if it fails, succeed consuming 0
2508            Some(base_match(names, pos).unwrap_or(0))
2509        }
2510        Occurrence::ZeroOrMore | Occurrence::OneOrMore => {
2511            let mut total = 0;
2512            loop {
2513                match base_match(names, pos + total) {
2514                    Some(0) | None => break, // zero-width or no match
2515                    Some(n) => total += n,
2516                }
2517            }
2518            // OneOrMore requires at least one match
2519            if occurrence == Occurrence::OneOrMore && total == 0 {
2520                None
2521            } else {
2522                Some(total)
2523            }
2524        }
2525    }
2526}
2527
2528/// Validates attributes for an element against DTD attribute declarations.
2529#[allow(clippy::too_many_arguments)]
2530fn validate_attributes(
2531    doc: &mut Document,
2532    dtd: &Dtd,
2533    node_id: NodeId,
2534    elem_name: &str,
2535    errors: &mut Vec<ValidationError>,
2536    _warnings: &mut Vec<ValidationError>,
2537    id_values: &mut HashSet<String>,
2538    idref_values: &mut Vec<String>,
2539) {
2540    let attr_decls = dtd.attributes.get(elem_name);
2541    let actual_attrs = doc.attributes(node_id).to_vec();
2542
2543    if let Some(decls) = attr_decls {
2544        // Check each declared attribute
2545        for decl in decls {
2546            let actual = actual_attrs.iter().find(|a| a.name == decl.attribute_name);
2547
2548            match (&decl.default, actual) {
2549                (AttributeDefault::Required, None) => {
2550                    errors.push(ValidationError {
2551                        message: format!(
2552                            "required attribute '{}' missing on element '{elem_name}'",
2553                            decl.attribute_name
2554                        ),
2555                        line: None,
2556                        column: None,
2557                    });
2558                }
2559                (AttributeDefault::Fixed(fixed_val), Some(attr)) => {
2560                    if attr.value != *fixed_val {
2561                        errors.push(ValidationError {
2562                            message: format!(
2563                                "attribute '{}' on element '{elem_name}' must have \
2564                                 fixed value '{fixed_val}', found '{}'",
2565                                decl.attribute_name, attr.value
2566                            ),
2567                            line: None,
2568                            column: None,
2569                        });
2570                    }
2571                }
2572                _ => {}
2573            }
2574
2575            // Type checking for present attributes
2576            if let Some(attr) = actual {
2577                validate_attribute_type(
2578                    doc,
2579                    node_id,
2580                    &attr.value,
2581                    &decl.attribute_type,
2582                    &decl.attribute_name,
2583                    elem_name,
2584                    errors,
2585                    id_values,
2586                    idref_values,
2587                );
2588            }
2589        }
2590
2591        // Check for undeclared attributes (skip xmlns-related attributes)
2592        for attr in &actual_attrs {
2593            if attr.name == "xmlns" || attr.prefix.as_deref() == Some("xmlns") {
2594                continue;
2595            }
2596            let is_declared = decls.iter().any(|d| d.attribute_name == attr.name);
2597            if !is_declared {
2598                errors.push(ValidationError {
2599                    message: format!(
2600                        "attribute '{}' on element '{elem_name}' is not declared in the DTD",
2601                        attr.name
2602                    ),
2603                    line: None,
2604                    column: None,
2605                });
2606            }
2607        }
2608    }
2609}
2610
2611/// Validates an attribute value against its declared type.
2612#[allow(clippy::too_many_arguments)]
2613fn validate_attribute_type(
2614    doc: &mut Document,
2615    node_id: NodeId,
2616    value: &str,
2617    attr_type: &AttributeType,
2618    attr_name: &str,
2619    elem_name: &str,
2620    errors: &mut Vec<ValidationError>,
2621    id_values: &mut HashSet<String>,
2622    idref_values: &mut Vec<String>,
2623) {
2624    match attr_type {
2625        AttributeType::CData => {
2626            // Any string is valid CDATA
2627        }
2628        AttributeType::Id => {
2629            validate_id_value(doc, node_id, value, attr_name, elem_name, errors, id_values);
2630        }
2631        AttributeType::IdRef => {
2632            validate_idref_value(value, attr_name, elem_name, errors, idref_values);
2633        }
2634        AttributeType::IdRefs => {
2635            validate_idrefs_value(value, attr_name, elem_name, errors, idref_values);
2636        }
2637        AttributeType::NmToken => {
2638            validate_nmtoken_value(value, attr_name, elem_name, errors);
2639        }
2640        AttributeType::NmTokens => {
2641            validate_nmtokens_value(value, attr_name, elem_name, errors);
2642        }
2643        AttributeType::Enumeration(values) | AttributeType::Notation(values) => {
2644            validate_enumeration_value(value, values, attr_name, elem_name, errors);
2645        }
2646        AttributeType::Entity | AttributeType::Entities => {
2647            validate_entity_value(value, attr_type, attr_name, elem_name, errors);
2648        }
2649    }
2650}
2651
2652/// Validates an ID attribute value: must be a valid XML Name and unique.
2653///
2654/// On success, registers the ID in the document's `id_map` so it can be
2655/// looked up via [`Document::element_by_id`] and the `XPath` `id()` function.
2656fn validate_id_value(
2657    doc: &mut Document,
2658    node_id: NodeId,
2659    value: &str,
2660    attr_name: &str,
2661    elem_name: &str,
2662    errors: &mut Vec<ValidationError>,
2663    id_values: &mut HashSet<String>,
2664) {
2665    if !is_valid_name(value) {
2666        errors.push(ValidationError {
2667            message: format!(
2668                "attribute '{attr_name}' on element '{elem_name}' \
2669                 has invalid ID value '{value}' (not a valid XML Name)"
2670            ),
2671            line: None,
2672            column: None,
2673        });
2674    } else if !id_values.insert(value.to_string()) {
2675        errors.push(ValidationError {
2676            message: format!(
2677                "duplicate ID value '{value}' on attribute '{attr_name}' \
2678                 of element '{elem_name}'"
2679            ),
2680            line: None,
2681            column: None,
2682        });
2683    } else {
2684        doc.set_id(value, node_id);
2685    }
2686}
2687
2688/// Validates an IDREF attribute value.
2689fn validate_idref_value(
2690    value: &str,
2691    attr_name: &str,
2692    elem_name: &str,
2693    errors: &mut Vec<ValidationError>,
2694    idref_values: &mut Vec<String>,
2695) {
2696    if is_valid_name(value) {
2697        idref_values.push(value.to_string());
2698    } else {
2699        errors.push(ValidationError {
2700            message: format!(
2701                "attribute '{attr_name}' on element '{elem_name}' \
2702                 has invalid IDREF value '{value}'"
2703            ),
2704            line: None,
2705            column: None,
2706        });
2707    }
2708}
2709
2710/// Validates an IDREFS attribute value (space-separated list).
2711fn validate_idrefs_value(
2712    value: &str,
2713    attr_name: &str,
2714    elem_name: &str,
2715    errors: &mut Vec<ValidationError>,
2716    idref_values: &mut Vec<String>,
2717) {
2718    for token in value.split_whitespace() {
2719        if is_valid_name(token) {
2720            idref_values.push(token.to_string());
2721        } else {
2722            errors.push(ValidationError {
2723                message: format!(
2724                    "attribute '{attr_name}' on element '{elem_name}' \
2725                     has invalid IDREFS token '{token}'"
2726                ),
2727                line: None,
2728                column: None,
2729            });
2730        }
2731    }
2732}
2733
2734/// Validates a NMTOKEN attribute value.
2735fn validate_nmtoken_value(
2736    value: &str,
2737    attr_name: &str,
2738    elem_name: &str,
2739    errors: &mut Vec<ValidationError>,
2740) {
2741    if !is_valid_nmtoken(value) {
2742        errors.push(ValidationError {
2743            message: format!(
2744                "attribute '{attr_name}' on element '{elem_name}' \
2745                 has invalid NMTOKEN value '{value}'"
2746            ),
2747            line: None,
2748            column: None,
2749        });
2750    }
2751}
2752
2753/// Validates a NMTOKENS attribute value (space-separated list).
2754fn validate_nmtokens_value(
2755    value: &str,
2756    attr_name: &str,
2757    elem_name: &str,
2758    errors: &mut Vec<ValidationError>,
2759) {
2760    for token in value.split_whitespace() {
2761        if !is_valid_nmtoken(token) {
2762            errors.push(ValidationError {
2763                message: format!(
2764                    "attribute '{attr_name}' on element '{elem_name}' \
2765                     has invalid NMTOKENS token '{token}'"
2766                ),
2767                line: None,
2768                column: None,
2769            });
2770        }
2771    }
2772}
2773
2774/// Validates an enumeration or notation attribute value.
2775fn validate_enumeration_value(
2776    value: &str,
2777    allowed: &[String],
2778    attr_name: &str,
2779    elem_name: &str,
2780    errors: &mut Vec<ValidationError>,
2781) {
2782    if !allowed.contains(&value.to_string()) {
2783        errors.push(ValidationError {
2784            message: format!(
2785                "attribute '{attr_name}' on element '{elem_name}' \
2786                 has value '{value}' which is not in the allowed \
2787                 values ({})",
2788                allowed.join("|")
2789            ),
2790            line: None,
2791            column: None,
2792        });
2793    }
2794}
2795
2796/// Validates an ENTITY or ENTITIES attribute value.
2797fn validate_entity_value(
2798    value: &str,
2799    attr_type: &AttributeType,
2800    attr_name: &str,
2801    elem_name: &str,
2802    errors: &mut Vec<ValidationError>,
2803) {
2804    // Entity/Entities validation would require checking against
2805    // declared unparsed entities. For now we just check Name validity.
2806    let tokens: Vec<&str> = if matches!(attr_type, AttributeType::Entities) {
2807        value.split_whitespace().collect()
2808    } else {
2809        vec![value]
2810    };
2811    for token in tokens {
2812        if !is_valid_name(token) {
2813            errors.push(ValidationError {
2814                message: format!(
2815                    "attribute '{attr_name}' on element '{elem_name}' \
2816                     has invalid ENTITY/ENTITIES value '{token}'"
2817                ),
2818                line: None,
2819                column: None,
2820            });
2821        }
2822    }
2823}
2824
2825/// Checks if a string is a valid XML Name.
2826fn is_valid_name(s: &str) -> bool {
2827    let mut chars = s.chars();
2828    match chars.next() {
2829        Some(first) if is_name_start_char(first) => chars.all(is_name_char),
2830        _ => false,
2831    }
2832}
2833
2834/// Checks if a string is a valid NMTOKEN.
2835fn is_valid_nmtoken(s: &str) -> bool {
2836    !s.is_empty() && s.chars().all(is_name_char)
2837}
2838
2839// ---------------------------------------------------------------------------
2840// Tests
2841// ---------------------------------------------------------------------------
2842
2843#[cfg(test)]
2844#[allow(clippy::unwrap_used)]
2845mod tests {
2846    use super::*;
2847    use pretty_assertions::assert_eq;
2848
2849    // --- DTD Parsing Tests ---
2850
2851    #[test]
2852    fn test_parse_element_empty() {
2853        let dtd = parse_dtd("<!ELEMENT br EMPTY>").unwrap();
2854        let decl = dtd.elements.get("br").unwrap();
2855        assert_eq!(decl.content_model, ContentModel::Empty);
2856    }
2857
2858    #[test]
2859    fn test_parse_element_any() {
2860        let dtd = parse_dtd("<!ELEMENT container ANY>").unwrap();
2861        let decl = dtd.elements.get("container").unwrap();
2862        assert_eq!(decl.content_model, ContentModel::Any);
2863    }
2864
2865    #[test]
2866    fn test_parse_element_pcdata() {
2867        let dtd = parse_dtd("<!ELEMENT title (#PCDATA)>").unwrap();
2868        let decl = dtd.elements.get("title").unwrap();
2869        assert_eq!(decl.content_model, ContentModel::Mixed(vec![]));
2870    }
2871
2872    #[test]
2873    fn test_parse_element_mixed_content() {
2874        let dtd = parse_dtd("<!ELEMENT p (#PCDATA|em|strong)*>").unwrap();
2875        let decl = dtd.elements.get("p").unwrap();
2876        assert_eq!(
2877            decl.content_model,
2878            ContentModel::Mixed(vec!["em".to_string(), "strong".to_string()])
2879        );
2880    }
2881
2882    #[test]
2883    fn test_parse_element_sequence() {
2884        let dtd = parse_dtd("<!ELEMENT book (title,author,year)>").unwrap();
2885        let decl = dtd.elements.get("book").unwrap();
2886        match &decl.content_model {
2887            ContentModel::Children(spec) => {
2888                assert_eq!(spec.occurrence, Occurrence::Once);
2889                match &spec.kind {
2890                    ContentSpecKind::Seq(items) => {
2891                        assert_eq!(items.len(), 3);
2892                        assert_eq!(items[0].kind, ContentSpecKind::Name("title".to_string()));
2893                        assert_eq!(items[1].kind, ContentSpecKind::Name("author".to_string()));
2894                        assert_eq!(items[2].kind, ContentSpecKind::Name("year".to_string()));
2895                    }
2896                    other => panic!("expected Seq, got {other:?}"),
2897                }
2898            }
2899            other => panic!("expected Children, got {other:?}"),
2900        }
2901    }
2902
2903    #[test]
2904    fn test_parse_element_choice() {
2905        let dtd = parse_dtd("<!ELEMENT item (a|b|c)>").unwrap();
2906        let decl = dtd.elements.get("item").unwrap();
2907        match &decl.content_model {
2908            ContentModel::Children(spec) => match &spec.kind {
2909                ContentSpecKind::Choice(items) => {
2910                    assert_eq!(items.len(), 3);
2911                }
2912                other => panic!("expected Choice, got {other:?}"),
2913            },
2914            other => panic!("expected Children, got {other:?}"),
2915        }
2916    }
2917
2918    #[test]
2919    fn test_parse_element_occurrence_indicators() {
2920        let dtd = parse_dtd("<!ELEMENT doc (head, body?, appendix*)>").unwrap();
2921        let decl = dtd.elements.get("doc").unwrap();
2922        match &decl.content_model {
2923            ContentModel::Children(spec) => match &spec.kind {
2924                ContentSpecKind::Seq(items) => {
2925                    assert_eq!(items[0].occurrence, Occurrence::Once);
2926                    assert_eq!(items[1].occurrence, Occurrence::Optional);
2927                    assert_eq!(items[2].occurrence, Occurrence::ZeroOrMore);
2928                }
2929                other => panic!("expected Seq, got {other:?}"),
2930            },
2931            other => panic!("expected Children, got {other:?}"),
2932        }
2933    }
2934
2935    #[test]
2936    fn test_parse_element_nested_groups() {
2937        let dtd = parse_dtd("<!ELEMENT article ((title, author), body)>").unwrap();
2938        let decl = dtd.elements.get("article").unwrap();
2939        match &decl.content_model {
2940            ContentModel::Children(spec) => match &spec.kind {
2941                ContentSpecKind::Seq(items) => {
2942                    assert_eq!(items.len(), 2);
2943                    // First item is a nested sequence (title, author)
2944                    match &items[0].kind {
2945                        ContentSpecKind::Seq(inner) => {
2946                            assert_eq!(inner.len(), 2);
2947                        }
2948                        other => panic!("expected nested Seq, got {other:?}"),
2949                    }
2950                }
2951                other => panic!("expected Seq, got {other:?}"),
2952            },
2953            other => panic!("expected Children, got {other:?}"),
2954        }
2955    }
2956
2957    #[test]
2958    fn test_parse_attlist_cdata() {
2959        let dtd = parse_dtd("<!ATTLIST img src CDATA #REQUIRED>").unwrap();
2960        let decls = dtd.attributes.get("img").unwrap();
2961        assert_eq!(decls.len(), 1);
2962        assert_eq!(decls[0].attribute_name, "src");
2963        assert_eq!(decls[0].attribute_type, AttributeType::CData);
2964        assert_eq!(decls[0].default, AttributeDefault::Required);
2965    }
2966
2967    #[test]
2968    fn test_parse_attlist_id() {
2969        let dtd = parse_dtd("<!ATTLIST div id ID #IMPLIED>").unwrap();
2970        let decls = dtd.attributes.get("div").unwrap();
2971        assert_eq!(decls[0].attribute_type, AttributeType::Id);
2972        assert_eq!(decls[0].default, AttributeDefault::Implied);
2973    }
2974
2975    #[test]
2976    fn test_parse_attlist_enumeration() {
2977        let dtd = parse_dtd("<!ATTLIST input type (text|password|submit) \"text\">").unwrap();
2978        let decls = dtd.attributes.get("input").unwrap();
2979        assert_eq!(
2980            decls[0].attribute_type,
2981            AttributeType::Enumeration(vec![
2982                "text".to_string(),
2983                "password".to_string(),
2984                "submit".to_string()
2985            ])
2986        );
2987        assert_eq!(
2988            decls[0].default,
2989            AttributeDefault::Default("text".to_string())
2990        );
2991    }
2992
2993    #[test]
2994    fn test_parse_attlist_fixed() {
2995        let dtd = parse_dtd("<!ATTLIST doc version CDATA #FIXED \"1.0\">").unwrap();
2996        let decls = dtd.attributes.get("doc").unwrap();
2997        assert_eq!(decls[0].default, AttributeDefault::Fixed("1.0".to_string()));
2998    }
2999
3000    #[test]
3001    fn test_parse_attlist_multiple_attrs() {
3002        let dtd =
3003            parse_dtd("<!ATTLIST person\n  name CDATA #REQUIRED\n  age NMTOKEN #IMPLIED>").unwrap();
3004        let decls = dtd.attributes.get("person").unwrap();
3005        assert_eq!(decls.len(), 2);
3006        assert_eq!(decls[0].attribute_name, "name");
3007        assert_eq!(decls[1].attribute_name, "age");
3008        assert_eq!(decls[1].attribute_type, AttributeType::NmToken);
3009    }
3010
3011    #[test]
3012    fn test_parse_entity_internal() {
3013        let dtd = parse_dtd("<!ENTITY copy \"&#169;\">").unwrap();
3014        let ent = dtd.entities.get("copy").unwrap();
3015        match &ent.kind {
3016            EntityKind::Internal(value) => assert_eq!(value, "&#169;"),
3017            EntityKind::External { .. } => panic!("expected Internal, got External"),
3018        }
3019    }
3020
3021    #[test]
3022    fn test_parse_entity_external() {
3023        let dtd = parse_dtd("<!ENTITY chapter SYSTEM \"chapter.xml\">").unwrap();
3024        let ent = dtd.entities.get("chapter").unwrap();
3025        match &ent.kind {
3026            EntityKind::External {
3027                system_id,
3028                public_id,
3029            } => {
3030                assert_eq!(system_id, "chapter.xml");
3031                assert_eq!(*public_id, None);
3032            }
3033            EntityKind::Internal(val) => panic!("expected External, got Internal({val})"),
3034        }
3035    }
3036
3037    #[test]
3038    fn test_parse_notation() {
3039        let dtd = parse_dtd("<!NOTATION png SYSTEM \"image/png\">").unwrap();
3040        let notation = dtd.notations.get("png").unwrap();
3041        assert_eq!(notation.system_id.as_deref(), Some("image/png"));
3042    }
3043
3044    #[test]
3045    fn test_parse_dtd_with_comments() {
3046        let dtd = parse_dtd(
3047            "<!-- element declarations -->\n\
3048             <!ELEMENT root (#PCDATA)>\n\
3049             <!-- end -->",
3050        )
3051        .unwrap();
3052        assert!(dtd.elements.contains_key("root"));
3053    }
3054
3055    #[test]
3056    fn test_parse_dtd_complex() {
3057        let input = "\
3058            <!ELEMENT doc (head, body)>\n\
3059            <!ELEMENT head (title)>\n\
3060            <!ELEMENT title (#PCDATA)>\n\
3061            <!ELEMENT body (p+)>\n\
3062            <!ELEMENT p (#PCDATA|em)*>\n\
3063            <!ELEMENT em (#PCDATA)>\n\
3064            <!ATTLIST doc version CDATA #FIXED \"1.0\">\n\
3065            <!ATTLIST p id ID #IMPLIED>\n\
3066            <!ENTITY copyright \"Copyright 2024\">\n";
3067        let dtd = parse_dtd(input).unwrap();
3068        assert_eq!(dtd.elements.len(), 6);
3069        assert!(dtd.attributes.contains_key("doc"));
3070        assert!(dtd.attributes.contains_key("p"));
3071        assert!(dtd.entities.contains_key("copyright"));
3072    }
3073
3074    // --- Validation Tests ---
3075
3076    fn make_doc(xml: &str) -> Document {
3077        Document::parse_str(xml).unwrap()
3078    }
3079
3080    #[test]
3081    fn test_validate_valid_document() {
3082        let dtd = parse_dtd("<!ELEMENT root (#PCDATA)>").unwrap();
3083        let mut doc = make_doc("<!DOCTYPE root><root>hello</root>");
3084        let result = validate(&mut doc, &dtd);
3085        assert!(result.is_valid, "errors: {:?}", result.errors);
3086    }
3087
3088    #[test]
3089    fn test_validate_root_name_mismatch() {
3090        let dtd = parse_dtd("<!ELEMENT root (#PCDATA)>").unwrap();
3091        let mut doc = make_doc("<!DOCTYPE root><other>text</other>");
3092        let result = validate(&mut doc, &dtd);
3093        assert!(!result.is_valid);
3094        assert!(
3095            result
3096                .errors
3097                .iter()
3098                .any(|e| e.message.contains("root element 'other'")
3099                    && e.message.contains("does not match DOCTYPE name 'root'")),
3100            "errors: {:?}",
3101            result.errors
3102        );
3103    }
3104
3105    #[test]
3106    fn test_validate_empty_element() {
3107        let dtd = parse_dtd("<!ELEMENT br EMPTY>").unwrap();
3108        let mut doc = make_doc("<!DOCTYPE br><br/>");
3109        let result = validate(&mut doc, &dtd);
3110        assert!(result.is_valid, "errors: {:?}", result.errors);
3111    }
3112
3113    #[test]
3114    fn test_validate_empty_element_has_content() {
3115        let dtd = parse_dtd("<!ELEMENT br EMPTY>").unwrap();
3116        let mut doc = make_doc("<!DOCTYPE br><br>text</br>");
3117        let result = validate(&mut doc, &dtd);
3118        assert!(!result.is_valid);
3119        assert!(
3120            result
3121                .errors
3122                .iter()
3123                .any(|e| e.message.contains("EMPTY") && e.message.contains("has content")),
3124            "errors: {:?}",
3125            result.errors
3126        );
3127    }
3128
3129    #[test]
3130    fn test_validate_any_content() {
3131        let dtd = parse_dtd(
3132            "<!ELEMENT container ANY>\n\
3133             <!ELEMENT child (#PCDATA)>",
3134        )
3135        .unwrap();
3136        let mut doc = make_doc("<!DOCTYPE container><container><child>text</child></container>");
3137        let result = validate(&mut doc, &dtd);
3138        assert!(result.is_valid, "errors: {:?}", result.errors);
3139    }
3140
3141    #[test]
3142    fn test_validate_sequence_correct() {
3143        let dtd = parse_dtd(
3144            "<!ELEMENT book (title,author)>\n\
3145             <!ELEMENT title (#PCDATA)>\n\
3146             <!ELEMENT author (#PCDATA)>",
3147        )
3148        .unwrap();
3149        let mut doc = make_doc(
3150            "<!DOCTYPE book>\
3151             <book><title>XML</title><author>Jon</author></book>",
3152        );
3153        let result = validate(&mut doc, &dtd);
3154        assert!(result.is_valid, "errors: {:?}", result.errors);
3155    }
3156
3157    #[test]
3158    fn test_validate_sequence_wrong_order() {
3159        let dtd = parse_dtd(
3160            "<!ELEMENT book (title,author)>\n\
3161             <!ELEMENT title (#PCDATA)>\n\
3162             <!ELEMENT author (#PCDATA)>",
3163        )
3164        .unwrap();
3165        let mut doc = make_doc(
3166            "<!DOCTYPE book>\
3167             <book><author>Jon</author><title>XML</title></book>",
3168        );
3169        let result = validate(&mut doc, &dtd);
3170        assert!(!result.is_valid);
3171        assert!(
3172            result
3173                .errors
3174                .iter()
3175                .any(|e| e.message.contains("content does not match")),
3176            "errors: {:?}",
3177            result.errors
3178        );
3179    }
3180
3181    #[test]
3182    fn test_validate_required_attribute_missing() {
3183        let dtd = parse_dtd(
3184            "<!ELEMENT img EMPTY>\n\
3185             <!ATTLIST img src CDATA #REQUIRED>",
3186        )
3187        .unwrap();
3188        let mut doc = make_doc("<!DOCTYPE img><img/>");
3189        let result = validate(&mut doc, &dtd);
3190        assert!(!result.is_valid);
3191        assert!(
3192            result
3193                .errors
3194                .iter()
3195                .any(|e| e.message.contains("required attribute 'src'")),
3196            "errors: {:?}",
3197            result.errors
3198        );
3199    }
3200
3201    #[test]
3202    fn test_validate_required_attribute_present() {
3203        let dtd = parse_dtd(
3204            "<!ELEMENT img EMPTY>\n\
3205             <!ATTLIST img src CDATA #REQUIRED>",
3206        )
3207        .unwrap();
3208        let mut doc = make_doc("<!DOCTYPE img><img src=\"photo.jpg\"/>");
3209        let result = validate(&mut doc, &dtd);
3210        assert!(result.is_valid, "errors: {:?}", result.errors);
3211    }
3212
3213    #[test]
3214    fn test_validate_fixed_attribute_correct() {
3215        let dtd = parse_dtd(
3216            "<!ELEMENT doc (#PCDATA)>\n\
3217             <!ATTLIST doc version CDATA #FIXED \"1.0\">",
3218        )
3219        .unwrap();
3220        let mut doc = make_doc("<!DOCTYPE doc><doc version=\"1.0\">text</doc>");
3221        let result = validate(&mut doc, &dtd);
3222        assert!(result.is_valid, "errors: {:?}", result.errors);
3223    }
3224
3225    #[test]
3226    fn test_validate_fixed_attribute_wrong_value() {
3227        let dtd = parse_dtd(
3228            "<!ELEMENT doc (#PCDATA)>\n\
3229             <!ATTLIST doc version CDATA #FIXED \"1.0\">",
3230        )
3231        .unwrap();
3232        let mut doc = make_doc("<!DOCTYPE doc><doc version=\"2.0\">text</doc>");
3233        let result = validate(&mut doc, &dtd);
3234        assert!(!result.is_valid);
3235        assert!(
3236            result
3237                .errors
3238                .iter()
3239                .any(|e| e.message.contains("fixed value '1.0'")),
3240            "errors: {:?}",
3241            result.errors
3242        );
3243    }
3244
3245    #[test]
3246    fn test_validate_enumeration_valid() {
3247        let dtd = parse_dtd(
3248            "<!ELEMENT input EMPTY>\n\
3249             <!ATTLIST input type (text|password) #REQUIRED>",
3250        )
3251        .unwrap();
3252        let mut doc = make_doc("<!DOCTYPE input><input type=\"text\"/>");
3253        let result = validate(&mut doc, &dtd);
3254        assert!(result.is_valid, "errors: {:?}", result.errors);
3255    }
3256
/// An enumerated attribute set to a token outside the declared list is
/// reported as not being among the allowed values.
#[test]
fn test_validate_enumeration_invalid() {
    let subset = concat!(
        "<!ELEMENT input EMPTY>\n",
        "<!ATTLIST input type (text|password) #REQUIRED>",
    );
    let schema = parse_dtd(subset).unwrap();
    let mut document = make_doc(r#"<!DOCTYPE input><input type="checkbox"/>"#);

    let outcome = validate(&mut document, &schema);

    assert!(!outcome.is_valid);
    let rejected_token = outcome
        .errors
        .iter()
        .any(|err| err.message.contains("not in the allowed values"));
    assert!(rejected_token, "errors: {:?}", outcome.errors);
}
3276
/// Two elements carrying the same `ID` value must produce a duplicate-ID error.
#[test]
fn test_validate_duplicate_id() {
    let subset = concat!(
        "<!ELEMENT root (item, item)>\n",
        "<!ELEMENT item (#PCDATA)>\n",
        "<!ATTLIST item id ID #REQUIRED>",
    );
    let schema = parse_dtd(subset).unwrap();

    // Both items deliberately reuse the ID "a".
    let xml = concat!(
        "<!DOCTYPE root>",
        "<root>",
        r#"<item id="a">first</item>"#,
        r#"<item id="a">second</item>"#,
        "</root>",
    );
    let mut document = make_doc(xml);

    let outcome = validate(&mut document, &schema);

    assert!(!outcome.is_valid);
    let duplicate_reported = outcome
        .errors
        .iter()
        .any(|err| err.message.contains("duplicate ID value 'a'"));
    assert!(duplicate_reported, "errors: {:?}", outcome.errors);
}
3303
/// An `IDREF` whose value matches an `ID` present in the document validates cleanly.
#[test]
fn test_validate_idref_valid() {
    let subset = concat!(
        "<!ELEMENT root (item, ref)>\n",
        "<!ELEMENT item (#PCDATA)>\n",
        "<!ELEMENT ref (#PCDATA)>\n",
        "<!ATTLIST item id ID #REQUIRED>\n",
        "<!ATTLIST ref target IDREF #REQUIRED>",
    );
    let schema = parse_dtd(subset).unwrap();

    let xml = concat!(
        "<!DOCTYPE root>",
        "<root>",
        r#"<item id="x">item</item>"#,
        r#"<ref target="x">ref</ref>"#,
        "</root>",
    );
    let mut document = make_doc(xml);

    let outcome = validate(&mut document, &schema);

    assert!(outcome.is_valid, "errors: {:?}", outcome.errors);
}
3324
/// An `IDREF` that names no `ID` in the document must be reported as dangling.
#[test]
fn test_validate_idref_dangling() {
    let subset = concat!(
        "<!ELEMENT root (ref)>\n",
        "<!ELEMENT ref (#PCDATA)>\n",
        "<!ATTLIST ref target IDREF #REQUIRED>",
    );
    let schema = parse_dtd(subset).unwrap();

    let xml = concat!(
        "<!DOCTYPE root>",
        r#"<root><ref target="nonexistent">ref</ref></root>"#,
    );
    let mut document = make_doc(xml);

    let outcome = validate(&mut document, &schema);

    assert!(!outcome.is_valid);
    let dangling_reported = outcome.errors.iter().any(|err| {
        err.message
            .contains("IDREF 'nonexistent' does not match any ID")
    });
    assert!(dangling_reported, "errors: {:?}", outcome.errors);
}
3347
/// An element with no `<!ELEMENT>` declaration must be reported as undeclared.
#[test]
fn test_validate_undeclared_element() {
    let subset = "<!ELEMENT root (child)>\n<!ELEMENT child (#PCDATA)>";
    let schema = parse_dtd(subset).unwrap();
    let mut document = make_doc("<!DOCTYPE root><root><unknown/></root>");

    let outcome = validate(&mut document, &schema);

    assert!(!outcome.is_valid);
    let undeclared_reported = outcome
        .errors
        .iter()
        .any(|err| err.message.contains("element 'unknown' is not declared"));
    assert!(undeclared_reported, "errors: {:?}", outcome.errors);
}
3363
/// An attribute absent from the element's `<!ATTLIST>` must be reported as
/// not declared, even when a declared attribute is also present.
#[test]
fn test_validate_undeclared_attribute() {
    let subset = concat!(
        "<!ELEMENT root (#PCDATA)>\n",
        "<!ATTLIST root id ID #IMPLIED>",
    );
    let schema = parse_dtd(subset).unwrap();
    let mut document = make_doc(r#"<!DOCTYPE root><root id="x" bogus="y">text</root>"#);

    let outcome = validate(&mut document, &schema);

    assert!(!outcome.is_valid);
    // A single error message has to carry both fragments.
    let bogus_reported = outcome.errors.iter().any(|err| {
        let text = &err.message;
        text.contains("attribute 'bogus'") && text.contains("not declared")
    });
    assert!(bogus_reported, "errors: {:?}", outcome.errors);
}
3384
/// Text interleaved with elements listed in a mixed content model validates cleanly.
#[test]
fn test_validate_mixed_content_valid() {
    let subset = concat!(
        "<!ELEMENT p (#PCDATA|em|strong)*>\n",
        "<!ELEMENT em (#PCDATA)>\n",
        "<!ELEMENT strong (#PCDATA)>",
    );
    let schema = parse_dtd(subset).unwrap();

    let xml = concat!(
        "<!DOCTYPE p>",
        "<p>Hello <em>world</em> and <strong>friends</strong></p>",
    );
    let mut document = make_doc(xml);

    let outcome = validate(&mut document, &schema);

    assert!(outcome.is_valid, "errors: {:?}", outcome.errors);
}
3400
/// A declared element that is not listed in the parent's mixed content model
/// must be rejected inside that parent.
#[test]
fn test_validate_mixed_content_invalid_child() {
    let subset = concat!(
        "<!ELEMENT p (#PCDATA|em)*>\n",
        "<!ELEMENT em (#PCDATA)>\n",
        "<!ELEMENT b (#PCDATA)>",
    );
    let schema = parse_dtd(subset).unwrap();

    let xml = concat!("<!DOCTYPE p>", "<p>Hello <b>world</b></p>");
    let mut document = make_doc(xml);

    let outcome = validate(&mut document, &schema);

    assert!(!outcome.is_valid);
    let child_rejected = outcome
        .errors
        .iter()
        .any(|err| err.message.contains("'b' is not allowed in mixed content"));
    assert!(child_rejected, "errors: {:?}", outcome.errors);
}
3424
/// A choice model `(a|b)` accepts a document that supplies the `b` alternative.
#[test]
fn test_validate_choice_correct() {
    let subset = concat!(
        "<!ELEMENT item (a|b)>\n",
        "<!ELEMENT a (#PCDATA)>\n",
        "<!ELEMENT b (#PCDATA)>",
    );
    let schema = parse_dtd(subset).unwrap();
    let mut document = make_doc("<!DOCTYPE item><item><b>hello</b></item>");

    let outcome = validate(&mut document, &schema);

    assert!(outcome.is_valid, "errors: {:?}", outcome.errors);
}
3437
/// `(item+)` must accept one or several `item` children and reject an empty `list`.
///
/// Each case is validated against the same parsed DTD. On failure the
/// assertion prints the document and the collected validation errors, so a
/// regression can be diagnosed from the test output alone.
#[test]
fn test_validate_one_or_more() {
    let dtd = parse_dtd("<!ELEMENT list (item+)>\n<!ELEMENT item (#PCDATA)>").unwrap();

    // (document, expected validity)
    let cases = [
        // Valid: one item
        ("<!DOCTYPE list><list><item>a</item></list>", true),
        // Valid: multiple items
        (
            "<!DOCTYPE list><list><item>a</item><item>b</item></list>",
            true,
        ),
        // Invalid: zero items
        ("<!DOCTYPE list><list></list>", false),
    ];

    for (xml, expected_valid) in cases {
        let mut doc = make_doc(xml);
        let result = validate(&mut doc, &dtd);
        assert_eq!(
            result.is_valid, expected_valid,
            "document: {xml}\nerrors: {:?}",
            result.errors
        );
    }
}
3454
/// `(item*)` must accept both an empty `list` and one with several `item` children.
///
/// On failure the assertion prints the document and the collected validation
/// errors instead of a bare "assertion failed".
#[test]
fn test_validate_zero_or_more() {
    let dtd = parse_dtd("<!ELEMENT list (item*)>\n<!ELEMENT item (#PCDATA)>").unwrap();

    let valid_documents = [
        // Valid: zero items
        "<!DOCTYPE list><list></list>",
        // Valid: multiple items
        "<!DOCTYPE list><list><item>a</item><item>b</item></list>",
    ];

    for xml in valid_documents {
        let mut doc = make_doc(xml);
        let result = validate(&mut doc, &dtd);
        assert!(
            result.is_valid,
            "document: {xml}\nerrors: {:?}",
            result.errors
        );
    }
}
3467
/// `(title, subtitle?)` must accept documents with and without the optional
/// `subtitle` element.
///
/// On failure the assertion prints the document and the collected validation
/// errors instead of a bare "assertion failed".
#[test]
fn test_validate_optional_element() {
    let dtd = parse_dtd(
        "<!ELEMENT doc (title, subtitle?)>\n\
         <!ELEMENT title (#PCDATA)>\n\
         <!ELEMENT subtitle (#PCDATA)>",
    )
    .unwrap();

    let valid_documents = [
        // Valid: with optional
        "<!DOCTYPE doc><doc><title>T</title><subtitle>S</subtitle></doc>",
        // Valid: without optional
        "<!DOCTYPE doc><doc><title>T</title></doc>",
    ];

    for xml in valid_documents {
        let mut doc = make_doc(xml);
        let result = validate(&mut doc, &dtd);
        assert!(
            result.is_valid,
            "document: {xml}\nerrors: {:?}",
            result.errors
        );
    }
}
3488
/// Checks the `to_string()` rendering of each `ContentModel` form exercised
/// here: `EMPTY`, `ANY`, mixed content with and without names, and a sequence.
#[test]
fn test_content_model_display() {
    // Builds a particle that names a single element.
    let leaf = |name: &str, occurrence: Occurrence| ContentSpec {
        kind: ContentSpecKind::Name(name.to_string()),
        occurrence,
    };

    let mixed_names: Vec<String> = vec!["a".to_string(), "b".to_string()];
    let sequence = ContentSpec {
        kind: ContentSpecKind::Seq(vec![
            leaf("a", Occurrence::Once),
            leaf("b", Occurrence::ZeroOrMore),
        ]),
        occurrence: Occurrence::Once,
    };

    assert_eq!(ContentModel::Empty.to_string(), "EMPTY");
    assert_eq!(ContentModel::Any.to_string(), "ANY");
    assert_eq!(ContentModel::Mixed(Vec::new()).to_string(), "(#PCDATA)");
    assert_eq!(
        ContentModel::Mixed(mixed_names).to_string(),
        "(#PCDATA|a|b)*"
    );
    assert_eq!(ContentModel::Children(sequence).to_string(), "(a , b*)");
}
3514
/// `IDREF` and `IDREFS` attribute declarations parse to their own
/// `AttributeType` variants.
#[test]
fn test_parse_attlist_idref_idrefs() {
    let subset = concat!(
        "<!ATTLIST link target IDREF #REQUIRED>\n",
        "<!ATTLIST group members IDREFS #REQUIRED>",
    );
    let schema = parse_dtd(subset).unwrap();

    // (element name, expected type of its first declared attribute)
    let expectations = [
        ("link", AttributeType::IdRef),
        ("group", AttributeType::IdRefs),
    ];
    for (element, expected_type) in expectations {
        let declared = schema.attributes.get(element).unwrap();
        assert_eq!(declared[0].attribute_type, expected_type);
    }
}
3527
/// Character data inside an element declared with an element-only content
/// model must be reported.
#[test]
fn test_validate_element_content_with_text() {
    let subset = "<!ELEMENT book (title)>\n<!ELEMENT title (#PCDATA)>";
    let schema = parse_dtd(subset).unwrap();
    let mut document = make_doc("<!DOCTYPE book><book>stray text<title>T</title></book>");

    let outcome = validate(&mut document, &schema);

    assert!(!outcome.is_valid);
    // A single error message has to carry both fragments.
    let stray_text_reported = outcome.errors.iter().any(|err| {
        let text = &err.message;
        text.contains("element-only content model") && text.contains("contains text")
    });
    assert!(stray_text_reported, "errors: {:?}", outcome.errors);
}
3544
/// A general entity declared with `PUBLIC` parses as an external entity
/// carrying both the public and the system identifier.
#[test]
fn test_parse_entity_public() {
    let schema = parse_dtd(r#"<!ENTITY logo PUBLIC "-//LOGO//" "logo.png">"#).unwrap();
    let entity = schema.entities.get("logo").unwrap();

    // Pull the identifiers out first; any other kind fails the test.
    let (system_id, public_id) = match &entity.kind {
        EntityKind::External {
            system_id,
            public_id,
        } => (system_id, public_id),
        EntityKind::Internal(val) => panic!("expected External, got Internal({val})"),
    };

    assert_eq!(system_id, "logo.png");
    assert_eq!(public_id.as_deref(), Some("-//LOGO//"));
}
3560
/// A `PUBLIC`-only notation declaration records the public identifier and
/// leaves the system identifier unset.
#[test]
fn test_parse_notation_public() {
    let schema = parse_dtd(r#"<!NOTATION gif PUBLIC "-//GIF//">"#).unwrap();
    let gif = schema.notations.get("gif").unwrap();

    let declared_public_id = gif.public_id.as_deref();
    assert_eq!(declared_public_id, Some("-//GIF//"));
    assert_eq!(gif.system_id, None);
}
3568
/// A parameter-entity declaration ahead of an element declaration must not
/// cause a parse error or hide the element declaration that follows it.
///
/// Only the presence of the `root` element declaration is asserted; nothing
/// is checked about how the parameter entity itself is handled.
#[test]
fn test_parse_parameter_entity_skipped() {
    let subset = concat!(
        "<!ENTITY % common \"(#PCDATA)\">\n",
        "<!ELEMENT root (#PCDATA)>",
    );
    let schema = parse_dtd(subset).unwrap();

    let root_declared = schema.elements.contains_key("root");
    assert!(root_declared);
}
3579
/// An `NMTOKEN` attribute accepts a value of name characters and rejects a
/// value containing a space.
///
/// Both assertions on validity print the collected errors on failure, so an
/// unexpected result can be diagnosed from the test output.
#[test]
fn test_validate_nmtoken_attribute() {
    let dtd = parse_dtd(
        "<!ELEMENT root (#PCDATA)>\n\
         <!ATTLIST root token NMTOKEN #REQUIRED>",
    )
    .unwrap();

    // Valid NMTOKEN
    let mut doc = make_doc("<!DOCTYPE root><root token=\"abc-123\">text</root>");
    let result = validate(&mut doc, &dtd);
    assert!(result.is_valid, "errors: {:?}", result.errors);

    // Invalid NMTOKEN (spaces not allowed)
    let mut doc = make_doc("<!DOCTYPE root><root token=\"abc 123\">text</root>");
    let result = validate(&mut doc, &dtd);
    assert!(
        !result.is_valid,
        "expected 'abc 123' to be rejected; errors: {:?}",
        result.errors
    );
    assert!(
        result
            .errors
            .iter()
            .any(|e| e.message.contains("invalid NMTOKEN")),
        "errors: {:?}",
        result.errors
    );
}
3605
/// After a successful validation, `element_by_id` resolves every declared
/// `ID` value to its `item` element and returns `None` for an unknown ID.
#[test]
fn test_validate_populates_id_map() {
    let subset = concat!(
        "<!ELEMENT root (item*)>\n",
        "<!ELEMENT item (#PCDATA)>\n",
        "<!ATTLIST item id ID #REQUIRED>",
    );
    let schema = parse_dtd(subset).unwrap();
    let mut document =
        make_doc(r#"<!DOCTYPE root><root><item id="a">A</item><item id="b">B</item></root>"#);

    let outcome = validate(&mut document, &schema);
    assert!(outcome.is_valid, "errors: {:?}", outcome.errors);

    // Lookups by ID succeed for the two declared values only.
    let found_a = document.element_by_id("a");
    let found_b = document.element_by_id("b");
    assert!(found_a.is_some(), "expected to find element with id='a'");
    assert!(found_b.is_some(), "expected to find element with id='b'");
    assert_eq!(document.element_by_id("c"), None);

    // Each resolved node is an `item` element.
    for node in [found_a, found_b] {
        assert_eq!(document.node_name(node.unwrap()), Some("item"));
    }
}
3630}