html_inspector_html/
lib.rs

1use std::borrow::Cow;
2use std::collections::VecDeque;
3use std::sync::Arc;
4use std::sync::OnceLock;
5
6use html_inspector::{Attribute, EventSource, InputFormat, ParseEvent, Span, ValidatorError};
7use rustc_hash::FxHashMap;
8
9#[cfg(feature = "html5ever")]
10mod html5ever_rcdom;
11#[cfg(feature = "html5ever")]
12mod html5ever_source;
13mod named_entities;
14#[cfg(feature = "html5ever")]
15pub use html5ever_source::Html5EverEventSource;
16
/// Event source over an HTML or XHTML byte buffer that forwards to one of the
/// compiled-in scanner backends.
///
/// Values are normally created through the `from_bytes`, `from_shared_bytes`
/// or `from_str` constructors, which select the variant from the requested
/// `InputFormat` and the enabled crate features.
#[derive(Clone)]
pub enum HtmlEventSource {
    /// Backend implemented by [`SimpleHtmlEventSource`].
    Simple(SimpleHtmlEventSource),
    /// Backend re-exported from the `html5ever_source` module; this variant only
    /// exists when the `html5ever` feature is enabled.
    #[cfg(feature = "html5ever")]
    Html5Ever(Html5EverEventSource),
}
23
24impl HtmlEventSource {
25    pub fn from_bytes(
26        name: impl Into<String>,
27        format: InputFormat,
28        bytes: Vec<u8>,
29    ) -> Result<Self, ValidatorError> {
30        Self::from_shared_bytes(name, format, Arc::new(bytes))
31    }
32
33    pub fn from_shared_bytes(
34        name: impl Into<String>,
35        format: InputFormat,
36        bytes: Arc<Vec<u8>>,
37    ) -> Result<Self, ValidatorError> {
38        let name = name.into();
39
40        #[cfg(feature = "html5ever")]
41        if format == InputFormat::Html {
42            return Ok(HtmlEventSource::Html5Ever(
43                Html5EverEventSource::from_shared_bytes(name, bytes),
44            ));
45        }
46
47        Ok(HtmlEventSource::Simple(
48            SimpleHtmlEventSource::from_shared_bytes(name, format, bytes),
49        ))
50    }
51
52    pub fn from_str(
53        name: impl Into<String>,
54        format: InputFormat,
55        s: &str,
56    ) -> Result<Self, ValidatorError> {
57        Self::from_bytes(name, format, s.as_bytes().to_vec())
58    }
59}
60
#[cfg(test)]
mod html_event_source_tests {
    use super::*;

    /// Drains `src` until it yields a start tag named `wanted` and returns that event.
    ///
    /// Panics when the source reports an error or ends before such a tag is seen.
    fn first_start_tag(src: &mut HtmlEventSource, wanted: &str) -> ParseEvent {
        loop {
            let Some(event) = src.next_event().unwrap() else {
                panic!("did not find <{wanted}> StartTag");
            };
            if matches!(&event, ParseEvent::StartTag { name, .. } if name.as_str() == wanted) {
                return event;
            }
        }
    }

    /// Asserts that `event` is a start tag named `tag` carrying exactly one
    /// attribute, `attr`, whose value is `value`.
    fn assert_single_attr_start_tag(event: ParseEvent, tag: &str, attr: &str, value: &str) {
        let ParseEvent::StartTag { name, attrs, .. } = event else {
            panic!("expected a StartTag event");
        };
        assert_eq!(name, tag);
        assert_eq!(attrs.len(), 1);
        assert_eq!(attrs[0].name, attr);
        assert_eq!(attrs[0].value.as_deref(), Some(value));
    }

    #[test]
    fn html_event_source_selects_backend_by_feature_and_format() {
        let html = HtmlEventSource::from_str("t", InputFormat::Html, "<p>hi</p>").unwrap();
        // Which backend serves HTML depends on the `html5ever` feature.
        #[cfg(feature = "html5ever")]
        let html_backend_ok = matches!(html, HtmlEventSource::Html5Ever(_));
        #[cfg(not(feature = "html5ever"))]
        let html_backend_ok = matches!(html, HtmlEventSource::Simple(_));
        assert!(html_backend_ok);

        // XHTML is always served by the simple scanner.
        let xhtml = HtmlEventSource::from_str("t", InputFormat::Xhtml, "<p/>").unwrap();
        assert!(matches!(xhtml, HtmlEventSource::Simple(_)));
    }

    #[test]
    fn simple_scanner_normalizes_tag_and_attribute_names_only_for_html() {
        // HTML: tag and attribute names come back lowercased.
        let mut html =
            HtmlEventSource::from_str("t", InputFormat::Html, "<DiV CLass=foo></DiV>").unwrap();
        let event = first_start_tag(&mut html, "div");
        assert_single_attr_start_tag(event, "div", "class", "foo");

        // XHTML: names keep their original case.
        let mut xhtml =
            HtmlEventSource::from_str("t", InputFormat::Xhtml, "<DiV CLass=\"foo\"/>").unwrap();
        let event = first_start_tag(&mut xhtml, "DiV");
        assert_single_attr_start_tag(event, "DiV", "CLass", "foo");
    }

    #[test]
    fn simple_scanner_lowercases_ascii_in_non_ascii_attribute_names_for_html_only() {
        // HTML: only the ASCII part of the attribute name is lowercased.
        let mut html =
            HtmlEventSource::from_str("t", InputFormat::Html, "<div ❤A=foo></div>").unwrap();
        let event = first_start_tag(&mut html, "div");
        assert_single_attr_start_tag(event, "div", "❤a", "foo");

        // XHTML: the attribute name is left untouched.
        let mut xhtml =
            HtmlEventSource::from_str("t", InputFormat::Xhtml, "<div ❤A=\"foo\"/>").unwrap();
        let event = first_start_tag(&mut xhtml, "div");
        assert_single_attr_start_tag(event, "div", "❤A", "foo");
    }

    #[test]
    fn bytes_at_cursor_is_safe_at_eof() {
        let mut scanner = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<p>");
        assert!(scanner.bytes_at_cursor(b"<p"));

        // With the cursor at the very end only the empty needle can match.
        scanner.cursor = scanner.bytes.len();
        assert!(!scanner.bytes_at_cursor(b"<"));
        assert!(scanner.bytes_at_cursor(b""));
    }
}
151
152impl EventSource for HtmlEventSource {
153    fn source_name(&self) -> &str {
154        match self {
155            HtmlEventSource::Simple(s) => s.source_name(),
156            #[cfg(feature = "html5ever")]
157            HtmlEventSource::Html5Ever(s) => s.source_name(),
158        }
159    }
160
161    fn format(&self) -> InputFormat {
162        match self {
163            HtmlEventSource::Simple(s) => s.format(),
164            #[cfg(feature = "html5ever")]
165            HtmlEventSource::Html5Ever(s) => s.format(),
166        }
167    }
168
169    fn next_event(&mut self) -> Result<Option<ParseEvent>, ValidatorError> {
170        match self {
171            HtmlEventSource::Simple(s) => s.next_event(),
172            #[cfg(feature = "html5ever")]
173            HtmlEventSource::Html5Ever(s) => s.next_event(),
174        }
175    }
176}
177
/// Hand-written scanner that turns an HTML/XHTML byte buffer into `ParseEvent`s.
///
/// It tracks a byte cursor into the shared input together with the line and
/// column of that cursor, a stack of currently open elements, and a queue of
/// events that have been produced but not yet handed out.
#[derive(Clone)]
pub struct SimpleHtmlEventSource {
    /// Source name supplied to the constructor.
    name: String,
    /// Input format; for `InputFormat::Html` names are ASCII-lowercased.
    format: InputFormat,
    /// Shared input buffer being scanned.
    bytes: Arc<Vec<u8>>,
    /// Byte offset into `bytes` of the next unread byte.
    cursor: usize,
    /// Line of `cursor`; starts at 1 and grows by one per consumed `\n`.
    line: u32,
    /// Column of `cursor`; starts at 1, advances once per consumed byte and
    /// resets to 1 after a `\n`.
    col: u32,
    /// Names of the currently open elements, innermost last.
    open_elements: Vec<String>,
    /// Namespace of each entry in `open_elements` (pushed alongside it).
    open_namespaces: Vec<HtmlNamespace>,
    /// Events already produced by the scanner and waiting to be returned.
    pending: VecDeque<ParseEvent>,
    /// Set once the scanner has reached the end of input or stopped after an
    /// end-of-file error.
    finished: bool,
}
191
/// Namespace the scanner associates with an open element.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum HtmlNamespace {
    /// Ordinary HTML content.
    Html,
    /// Content introduced by an `svg` start tag.
    Svg,
    /// Content introduced by a `math` start tag.
    Math,
}
198
199impl SimpleHtmlEventSource {
200    pub fn from_bytes(name: impl Into<String>, format: InputFormat, bytes: Vec<u8>) -> Self {
201        Self::from_shared_bytes(name, format, Arc::new(bytes))
202    }
203
204    pub fn from_shared_bytes(
205        name: impl Into<String>,
206        format: InputFormat,
207        bytes: Arc<Vec<u8>>,
208    ) -> Self {
209        Self {
210            name: name.into(),
211            format,
212            bytes,
213            cursor: 0,
214            line: 1,
215            col: 1,
216            open_elements: Vec::new(),
217            open_namespaces: Vec::new(),
218            pending: VecDeque::new(),
219            finished: false,
220        }
221    }
222
223    pub fn from_str(name: impl Into<String>, format: InputFormat, s: &str) -> Self {
224        Self::from_bytes(name, format, s.as_bytes().to_vec())
225    }
226
227    fn bump_to(&mut self, new_cursor: usize) {
228        for &b in &self.bytes[self.cursor..new_cursor] {
229            if b == b'\n' {
230                self.line += 1;
231                self.col = 1;
232            } else {
233                self.col += 1;
234            }
235        }
236        self.cursor = new_cursor;
237    }
238
239    #[inline]
240    fn bytes_at_cursor(&self, needle: &[u8]) -> bool {
241        self.bytes
242            .get(self.cursor..)
243            .is_some_and(|tail| tail.starts_with(needle))
244    }
245
    /// Builds a `Span` from a byte range and the line/column of its start.
    ///
    /// Thin wrapper around `Span::new`; it does not read any scanner state.
    fn current_span(&self, start: usize, end: usize, start_line: u32, start_col: u32) -> Span {
        Span::new(start, end, start_line, start_col)
    }
249
250    fn emit_tokenizer_eof_after_lt(&mut self, start: usize, start_line: u32, start_col: u32) {
251        let end = self.bytes.len();
252        self.bump_to(end);
253        self.pending.push_back(ParseEvent::ParseError {
254            code: "html.tokenizer.eof_after_lt".to_string(),
255            message: "End of file after “<”.".to_string(),
256            span: Some(self.current_span(start, end, start_line, start_col)),
257        });
258        self.finished = true;
259    }
260
261    fn normalize_name(&self, s: impl Into<String>) -> String {
262        let mut out = s.into();
263        if self.format == InputFormat::Html {
264            out.make_ascii_lowercase();
265        }
266        out
267    }
268
269    fn current_text_mode_kind(&self) -> TextModeKind {
270        let (Some(name), Some(&HtmlNamespace::Html)) =
271            (self.open_elements.last(), self.open_namespaces.last())
272        else {
273            return TextModeKind::Data;
274        };
275        match name.as_str() {
276            "script" | "style" | "xmp" | "iframe" | "noembed" | "noframes" => TextModeKind::RawText,
277            "title" | "textarea" => TextModeKind::RcData,
278            "plaintext" => TextModeKind::Plaintext,
279            _ => TextModeKind::Data,
280        }
281    }
282
283    fn current_insertion_namespace(&self) -> HtmlNamespace {
284        let ns = self
285            .open_namespaces
286            .last()
287            .copied()
288            .unwrap_or(HtmlNamespace::Html);
289        if ns == HtmlNamespace::Svg
290            && self
291                .open_elements
292                .last()
293                .is_some_and(|name| matches!(name.as_str(), "foreignobject" | "desc" | "title"))
294        {
295            HtmlNamespace::Html
296        } else {
297            ns
298        }
299    }
300
301    fn namespace_for_start_tag(&self, name: &str) -> HtmlNamespace {
302        match self.current_insertion_namespace() {
303            HtmlNamespace::Html => match name {
304                "svg" => HtmlNamespace::Svg,
305                "math" => HtmlNamespace::Math,
306                _ => HtmlNamespace::Html,
307            },
308            ns => ns,
309        }
310    }
311
312    fn scan_next(&mut self) -> Result<(), ValidatorError> {
313        if self.finished {
314            return Ok(());
315        }
316
317        if self.cursor >= self.bytes.len() {
318            self.finished = true;
319            return Ok(());
320        }
321
322        // If we are in data state and the last character is a lone "<", report the tokenizer EOF error.
323        if self.bytes[self.cursor] == b'<' && self.cursor + 1 == self.bytes.len() {
324            let start = self.cursor;
325            let start_line = self.line;
326            let start_col = self.col;
327            self.emit_tokenizer_eof_after_lt(start, start_line, start_col);
328            return Ok(());
329        }
330
331        match self.current_text_mode_kind() {
332            TextModeKind::Data => {}
333            TextModeKind::Plaintext => {
334                let start = self.cursor;
335                let start_line = self.line;
336                let start_col = self.col;
337                let end = self.bytes.len();
338                self.bump_to(end);
339                let text = bytes_to_string_lossy(&self.bytes[start..end]);
340                self.pending.push_back(ParseEvent::Text {
341                    text,
342                    span: Some(self.current_span(start, end, start_line, start_col)),
343                });
344                self.finished = true;
345                return Ok(());
346            }
347            TextModeKind::RawText => {
348                return self.scan_rawtext(false);
349            }
350            TextModeKind::RcData => {
351                return self.scan_rawtext(true);
352            }
353        }
354
355        let next_lt = memchr(b'<', &self.bytes[self.cursor..]).map(|off| self.cursor + off);
356        if let Some(lt) = next_lt {
357            if lt > self.cursor {
358                let start = self.cursor;
359                let start_line = self.line;
360                let start_col = self.col;
361                self.bump_to(lt);
362                let raw = str_from_bytes_lossy(&self.bytes[start..lt]);
363                let (text, errs) = decode_char_refs_with_errors(
364                    self.format,
365                    raw.as_ref(),
366                    false,
367                    start,
368                    start_line,
369                    start_col,
370                );
371                self.pending.extend(errs);
372                self.pending.push_back(ParseEvent::Text {
373                    text,
374                    span: Some(self.current_span(start, lt, start_line, start_col)),
375                });
376                return Ok(());
377            }
378
379            // cursor at '<'
380            let start = self.cursor;
381            let start_line = self.line;
382            let start_col = self.col;
383
384            // Common bad sequences in data.
385            if self.format == InputFormat::Html && self.bytes_at_cursor(b"<>") {
386                // "<>" is not a tag.
387                self.bump_to(self.cursor + 2);
388                self.pending.push_back(ParseEvent::ParseError {
389                    code: "html.tokenizer.lt_gt".to_string(),
390                    message:
391                        "Saw “<>”. Probable causes: Unescaped “<” (escape as “&lt;”) or mistyped start tag."
392                            .to_string(),
393                    span: Some(self.current_span(start, start + 2, start_line, start_col)),
394                });
395                self.pending.push_back(ParseEvent::Text {
396                    text: "<>".to_string(),
397                    span: Some(self.current_span(start, start + 2, start_line, start_col)),
398                });
399                return Ok(());
400            }
401
402            // Comments: <!-- ... -->
403            if self.bytes_at_cursor(b"<!--") {
404                return self.scan_comment(start, start_line, start_col);
405            }
406
407            // CDATA: <![CDATA[ ... ]]>
408            if self.bytes_at_cursor(b"<![CDATA[") {
409                match self.format {
410                    InputFormat::Xhtml => return self.scan_cdata(start, start_line, start_col),
411                    InputFormat::Html => {
412                        if self.current_insertion_namespace() != HtmlNamespace::Html {
413                            return self.scan_cdata(start, start_line, start_col);
414                        }
415                    }
416                }
417            }
418
419            // Doctype: <!doctype ...>
420            if self.bytes_at_cursor(b"<!") {
421                let mut j = self.cursor + 2;
422                while j < self.bytes.len() && self.bytes[j].is_ascii_whitespace() {
423                    j += 1;
424                }
425                if starts_with_ascii_case_insensitive(&self.bytes[j..], b"doctype") {
426                    return self.scan_doctype(start, start_line, start_col);
427                }
428            }
429
430            // End tag: </...> (only if next char starts a tag name)
431            if self.bytes_at_cursor(b"</") {
432                let Some(b) = self.bytes.get(self.cursor + 2).copied() else {
433                    // "</" at EOF: align with VNU error text.
434                    self.emit_tokenizer_eof_after_lt(start, start_line, start_col);
435                    return Ok(());
436                };
437
438                if b.is_ascii_alphabetic() {
439                    return self.scan_end_tag(start, start_line, start_col);
440                }
441                if b == b'>' {
442                    // "</>"
443                    self.bump_to(self.cursor + 3);
444                    self.pending.push_back(ParseEvent::ParseError {
445                        code: "html.tokenizer.lt_slash_gt".to_string(),
446                        message:
447                            "Saw “</>”. Probable causes: Unescaped “<” (escape as “&lt;”) or mistyped end tag."
448                                .to_string(),
449                        span: Some(self.current_span(start, start + 3, start_line, start_col)),
450                    });
451                    self.pending.push_back(ParseEvent::Text {
452                        text: "</>".to_string(),
453                        span: Some(self.current_span(start, start + 3, start_line, start_col)),
454                    });
455                    return Ok(());
456                }
457                if b.is_ascii_whitespace() {
458                    // "</ garbage>"
459                    return self.scan_garbage_after_lt_slash(start, start_line, start_col);
460                }
461                // Not a real end tag; treat '<' as literal text and emit a coalesced text run.
462                return self.scan_text_run(start, start_line, start_col);
463            }
464
465            // Bogus comment: <? ... > or <! ... >
466            if self.bytes_at_cursor(b"<?") {
467                return match self.format {
468                    InputFormat::Xhtml => {
469                        self.scan_processing_instruction(start, start_line, start_col)
470                    }
471                    InputFormat::Html => self.scan_bogus_comment(start, start_line, start_col),
472                };
473            }
474            if self.bytes_at_cursor(b"<!") {
475                return self.scan_bogus_comment(start, start_line, start_col);
476            }
477
478            // Start tag: <...> (only if next char starts a tag name)
479            if let Some(&b) = self.bytes.get(self.cursor + 1) {
480                if b.is_ascii_alphabetic() {
481                    return self.scan_start_tag(start, start_line, start_col);
482                }
483                if self.format == InputFormat::Html
484                    && !b.is_ascii_whitespace()
485                    && !matches!(b, b'!' | b'/' | b'?')
486                {
487                    // "<1", "<\\", etc.
488                    self.pending.push_back(ParseEvent::ParseError {
489                        code: "html.tokenizer.bad_char_after_lt".to_string(),
490                        message:
491                            format!("Bad character “{}” after “<”. Probable cause: Unescaped “<”. Try escaping it as “&lt;”.", b as char),
492                        span: Some(self.current_span(start, start + 2, start_line, start_col)),
493                    });
494                    return self.scan_text_run(start, start_line, start_col);
495                }
496            }
497
498            // Not a tag start; treat '<' as literal text and emit a coalesced text run.
499            return self.scan_text_run(start, start_line, start_col);
500        }
501
502        // No more tags; emit remainder as text.
503        let start = self.cursor;
504        let start_line = self.line;
505        let start_col = self.col;
506        let end = self.bytes.len();
507        self.bump_to(end);
508        let raw = str_from_bytes_lossy(&self.bytes[start..end]);
509        let (text, errs) = decode_char_refs_with_errors(
510            self.format,
511            raw.as_ref(),
512            false,
513            start,
514            start_line,
515            start_col,
516        );
517        self.pending.extend(errs);
518        self.pending.push_back(ParseEvent::Text {
519            text,
520            span: Some(self.current_span(start, end, start_line, start_col)),
521        });
522        Ok(())
523    }
524
525    fn scan_comment(
526        &mut self,
527        start: usize,
528        start_line: u32,
529        start_col: u32,
530    ) -> Result<(), ValidatorError> {
531        let Some(end) = find_subslice(&self.bytes, self.cursor + 4, b"-->") else {
532            self.finished = true;
533            self.pending.push_back(ParseEvent::ParseError {
534                code: "html.tokenizer.eof_in_comment".to_string(),
535                message: "End of file inside comment.".to_string(),
536                span: Some(self.current_span(start, self.bytes.len(), start_line, start_col)),
537            });
538            return Ok(());
539        };
540
541        let comment_start = self.cursor + 4;
542        let comment_end = end;
543        let text = bytes_to_string_lossy(&self.bytes[comment_start..comment_end]);
544        if self.format == InputFormat::Html
545            && let Some(off) = text.find("<!--")
546        {
547            let err_start = comment_start + off;
548            let (line, col) = line_col_at_byte_offset(
549                self.bytes.as_ref(),
550                start,
551                start_line,
552                start_col,
553                err_start,
554            );
555            self.pending.push_back(ParseEvent::ParseError {
556                code: "html.tokenizer.nested_comment".to_string(),
557                message:
558                    "Saw “<!--” within a comment. Probable cause: Nested comment (not allowed)."
559                        .to_string(),
560                span: Some(Span::new(err_start, err_start + 4, line, col)),
561            });
562        }
563        let close_end = end + 3;
564        self.bump_to(close_end);
565        self.pending.push_back(ParseEvent::Comment {
566            text,
567            span: Some(self.current_span(start, close_end, start_line, start_col)),
568        });
569        Ok(())
570    }
571
572    fn scan_cdata(
573        &mut self,
574        start: usize,
575        start_line: u32,
576        start_col: u32,
577    ) -> Result<(), ValidatorError> {
578        // <![CDATA[ ... ]]>
579        let cdata_start = self.cursor + 9;
580        let Some(end) = find_subslice(&self.bytes, cdata_start, b"]]>") else {
581            self.finished = true;
582            self.pending.push_back(ParseEvent::ParseError {
583                code: "xml.cdata_eof".to_string(),
584                message: "Unterminated CDATA section.".to_string(),
585                span: Some(self.current_span(start, self.bytes.len(), start_line, start_col)),
586            });
587            return Ok(());
588        };
589
590        let text = bytes_to_string_lossy(&self.bytes[cdata_start..end]);
591        let close_end = end + 3;
592        self.bump_to(close_end);
593        self.pending.push_back(ParseEvent::Text {
594            text,
595            span: Some(self.current_span(start, close_end, start_line, start_col)),
596        });
597        Ok(())
598    }
599
600    fn scan_bogus_comment(
601        &mut self,
602        start: usize,
603        start_line: u32,
604        start_col: u32,
605    ) -> Result<(), ValidatorError> {
606        let is_processing_instruction = self.bytes_at_cursor(b"<?");
607        self.pending.push_back(ParseEvent::ParseError {
608            code: if is_processing_instruction {
609                "html.tokenizer.processing_instruction".to_string()
610            } else {
611                "html.tokenizer.bogus_comment".to_string()
612            },
613            message: if is_processing_instruction {
614                "Saw “<?”. Probable cause: Attempt to use an XML processing instruction in HTML. (XML processing instructions are not supported in HTML.)".to_string()
615            } else {
616                "Bogus comment.".to_string()
617            },
618            span: Some(self.current_span(start, start + 2, start_line, start_col)),
619        });
620        // Treat everything up to the next '>' (or EOF) as a comment payload.
621        let prefix_len = 2;
622        if let Some(off) = memchr(b'>', &self.bytes[self.cursor + prefix_len..]) {
623            let gt = self.cursor + prefix_len + off;
624            let text = bytes_to_string_lossy(&self.bytes[self.cursor + prefix_len..gt]);
625            let end = gt + 1;
626            self.bump_to(end);
627            self.pending.push_back(ParseEvent::Comment {
628                text,
629                span: Some(self.current_span(start, end, start_line, start_col)),
630            });
631            return Ok(());
632        }
633        // EOF: emit comment then finish.
634        let text = bytes_to_string_lossy(&self.bytes[self.cursor + prefix_len..]);
635        self.bump_to(self.bytes.len());
636        self.pending.push_back(ParseEvent::Comment {
637            text,
638            span: Some(self.current_span(start, self.bytes.len(), start_line, start_col)),
639        });
640        self.finished = true;
641        Ok(())
642    }
643
644    fn scan_processing_instruction(
645        &mut self,
646        start: usize,
647        start_line: u32,
648        start_col: u32,
649    ) -> Result<(), ValidatorError> {
650        // XML processing instruction: <?target data?>
651        let content_start = self.cursor + 2;
652        let Some(pi_end) = find_subslice(&self.bytes, content_start, b"?>") else {
653            self.finished = true;
654            self.pending.push_back(ParseEvent::ParseError {
655                code: "xml.pi_eof".to_string(),
656                message: "Unterminated processing instruction.".to_string(),
657                span: Some(self.current_span(start, self.bytes.len(), start_line, start_col)),
658            });
659            return Ok(());
660        };
661
662        let mut i = content_start;
663        while i < pi_end && self.bytes[i].is_ascii_whitespace() {
664            i += 1;
665        }
666        let target_start = i;
667        while i < pi_end && !self.bytes[i].is_ascii_whitespace() {
668            i += 1;
669        }
670        let target = bytes_to_string_lossy(&self.bytes[target_start..i]);
671        let data = str_from_bytes_lossy(&self.bytes[i..pi_end])
672            .trim()
673            .to_string();
674
675        let close_end = pi_end + 2;
676        self.bump_to(close_end);
677        self.pending.push_back(ParseEvent::ProcessingInstruction {
678            target,
679            data,
680            span: Some(self.current_span(start, close_end, start_line, start_col)),
681        });
682        Ok(())
683    }
684
685    fn scan_end_tag(
686        &mut self,
687        start: usize,
688        start_line: u32,
689        start_col: u32,
690    ) -> Result<(), ValidatorError> {
691        let Some(off) = memchr(b'>', &self.bytes[self.cursor + 2..]) else {
692            self.finished = true;
693            self.pending.push_back(ParseEvent::ParseError {
694                code: "html.tokenizer.eof_in_end_tag".to_string(),
695                message: "End of file seen when looking for tag name. Ignoring tag.".to_string(),
696                span: Some(self.current_span(start, self.bytes.len(), start_line, start_col)),
697            });
698            return Ok(());
699        };
700        let gt = self.cursor + 2 + off;
701
702        let raw_all = str_from_bytes_lossy(&self.bytes[self.cursor + 2..gt]);
703        let raw_trimmed = raw_all.trim();
704
705        let mut raw = raw_trimmed;
706        if raw.ends_with('/') {
707            self.pending.push_back(ParseEvent::ParseError {
708                code: "html.tokenizer.end_tag_stray_slash".to_string(),
709                message: "Stray “/” at the end of an end tag.".to_string(),
710                span: Some(self.current_span(start, gt + 1, start_line, start_col)),
711            });
712            raw = raw.trim_end_matches('/').trim_end();
713        }
714
715        let mut it = raw.split_whitespace();
716        let name_raw = it.next().unwrap_or("");
717        if it.next().is_some() {
718            self.pending.push_back(ParseEvent::ParseError {
719                code: "html.tokenizer.end_tag_with_attrs".to_string(),
720                message: "End tag had attributes.".to_string(),
721                span: Some(self.current_span(start, gt + 1, start_line, start_col)),
722            });
723        }
724
725        let name = self.normalize_name(name_raw);
726        let end = gt + 1;
727        self.bump_to(end);
728        self.pop_open_element(&name);
729        self.pending.push_back(ParseEvent::EndTag {
730            name,
731            span: Some(self.current_span(start, end, start_line, start_col)),
732        });
733        Ok(())
734    }
735
736    fn scan_start_tag(
737        &mut self,
738        start: usize,
739        start_line: u32,
740        start_col: u32,
741    ) -> Result<(), ValidatorError> {
742        let Some(gt) = find_tag_close(&self.bytes, self.cursor + 1) else {
743            let rest = &self.bytes[self.cursor + 1..];
744            let (code, message) = classify_start_tag_eof(rest);
745            self.finished = true;
746            self.pending.push_back(ParseEvent::ParseError {
747                code,
748                message,
749                span: Some(self.current_span(start, self.bytes.len(), start_line, start_col)),
750            });
751            self.bump_to(self.bytes.len());
752            return Ok(());
753        };
754
755        let inside = str_from_bytes_lossy(&self.bytes[self.cursor + 1..gt]);
756        let end = gt + 1;
757        let (name, attrs, self_closing, errs) =
758            parse_start_tag(self, inside.as_ref(), start, start_line, start_col, end)?;
759        self.pending.extend(errs);
760        self.bump_to(end);
761        let ns = self.namespace_for_start_tag(&name);
762        let pushes = match self.format {
763            InputFormat::Html => {
764                if ns == HtmlNamespace::Html {
765                    !html_inspector::is_void_html_element(&name)
766                } else {
767                    !self_closing
768                }
769            }
770            InputFormat::Xhtml => !self_closing,
771        };
772        if pushes {
773            self.open_elements.push(name.clone());
774            self.open_namespaces.push(ns);
775        }
776        self.pending.push_back(ParseEvent::StartTag {
777            name,
778            attrs,
779            self_closing,
780            span: Some(self.current_span(start, end, start_line, start_col)),
781        });
782        Ok(())
783    }
784
    /// Scans a doctype token and queues the resulting events on `self.pending`.
    ///
    /// The token extends from `self.cursor` to just past the first `>` that is not inside
    /// a single- or double-quoted run, or to the end of the input when no such `>` exists.
    /// Events are queued in this order: tokenizer-level `ParseError`s found while reading
    /// the name and the PUBLIC/SYSTEM identifiers, then at most one
    /// `html.parser.doctype.not_html5` error, then the `ParseEvent::Doctype` itself.
    ///
    /// `start`, `start_line` and `start_col` are the byte offset and line/column used as
    /// the origin for every span produced here.
    /// NOTE(review): the body mixes `start` and `self.cursor`; presumably they are equal
    /// on entry — confirm against the caller.
    ///
    /// Calls `self.bump_to(end)` once the token has been analysed and sets
    /// `self.finished` when the token reaches the end of the input. The visible code
    /// never returns `Err`.
    fn scan_doctype(
        &mut self,
        start: usize,
        start_line: u32,
        start_col: u32,
    ) -> Result<(), ValidatorError> {
        let all_bytes: &[u8] = &self.bytes;
        // Builds a span for an arbitrary byte range inside the token; the line/column are
        // derived by walking the bytes from the token origin up to `byte_start`.
        let mk_span = |byte_start: usize, byte_end: usize| {
            let (line, col) =
                line_col_at_byte_offset(all_bytes, start, start_line, start_col, byte_start);
            Span::new(byte_start, byte_end, line, col)
        };

        // Find the end of the doctype token: first '>' outside quotes.
        let end = {
            // Skips two bytes — presumably the leading `<!`; confirm against the caller.
            let mut pos = self.cursor + 2;
            let mut quote: Option<u8> = None;
            loop {
                let Some(&b) = self.bytes.get(pos) else {
                    // Ran off the input without a terminating '>'.
                    break self.bytes.len();
                };
                if let Some(q) = quote {
                    if b == q {
                        quote = None;
                    }
                } else if matches!(b, b'"' | b'\'') {
                    quote = Some(b);
                } else if b == b'>' {
                    // `end` is exclusive, so it points just past the '>'.
                    break pos + 1;
                }
                pos += 1;
            }
        };

        // Parse the contents for name/public/system (and emit Java/VNU-like parse errors).
        let bytes = all_bytes;
        let mut i = self.cursor + 2;

        // Consume optional whitespace and "doctype" keyword.
        while i < end && bytes[i].is_ascii_whitespace() {
            i += 1;
        }
        if starts_with_ascii_case_insensitive(&bytes[i..end], b"doctype") {
            i += "doctype".len();
        }

        // After the keyword, HTML expects whitespace before the name (even if the name is missing).
        if i >= end || !bytes[i].is_ascii_whitespace() {
            self.pending.push_back(ParseEvent::ParseError {
                code: "html.tokenizer.doctype.missing_space_before_name".to_string(),
                message: "Missing space before doctype name.".to_string(),
                span: Some(mk_span(start, start + 2)),
            });
        }

        while i < end && bytes[i].is_ascii_whitespace() {
            i += 1;
        }

        // The name is the run of bytes up to the next whitespace or '>'; it is `None`
        // when that run is empty.
        let name_start = i;
        while i < end && !bytes[i].is_ascii_whitespace() && bytes[i] != b'>' {
            i += 1;
        }
        let name = if name_start < i {
            Some(self.normalize_name(str_from_bytes_lossy(&bytes[name_start..i])))
        } else {
            None
        };

        while i < end && bytes[i].is_ascii_whitespace() {
            i += 1;
        }

        let mut public_id: Option<String> = None;
        let mut system_id: Option<String> = None;
        // These two flags suppress the conformance classification further down.
        let mut saw_syntax_error = false;
        let mut saw_bogus_doctype = false;

        // PUBLIC/SYSTEM identifiers.
        if i < end && bytes[i] != b'>' {
            if starts_with_ascii_case_insensitive(&bytes[i..end], b"public") {
                i += "public".len();
                // A quote directly after the keyword means the separating space is missing.
                if i < end && matches!(bytes[i], b'"' | b'\'') {
                    saw_syntax_error = true;
                    self.pending.push_back(ParseEvent::ParseError {
                        code: "html.tokenizer.doctype.no_space_after_public".to_string(),
                        message: "No space between the doctype “PUBLIC” keyword and the quote."
                            .to_string(),
                        span: Some(mk_span(start, start + 2)),
                    });
                }
                while i < end && bytes[i].is_ascii_whitespace() {
                    i += 1;
                }
                if i >= end || bytes[i] == b'>' {
                    saw_syntax_error = true;
                    self.pending.push_back(ParseEvent::ParseError {
                        code: "html.tokenizer.doctype.expected_public_id".to_string(),
                        message: "Expected a public identifier but the doctype ended.".to_string(),
                        span: Some(mk_span(start, start + 2)),
                    });
                } else if matches!(bytes[i], b'"' | b'\'') {
                    // Quoted public identifier: read up to the matching quote.
                    let q = bytes[i];
                    i += 1;
                    let id_start = i;
                    // Report a '>' inside the identifier only once.
                    let mut saw_gt = false;
                    while i < end && bytes[i] != q {
                        if bytes[i] == b'>' && !saw_gt {
                            saw_gt = true;
                            saw_syntax_error = true;
                            self.pending.push_back(ParseEvent::ParseError {
                                code: "html.tokenizer.doctype.gt_in_public_id".to_string(),
                                message: "“>” in public identifier.".to_string(),
                                span: Some(mk_span(i, i + 1)),
                            });
                        }
                        i += 1;
                    }
                    // Reaching `end` here means the closing quote was never found.
                    if i >= end {
                        saw_syntax_error = true;
                        self.pending.push_back(ParseEvent::ParseError {
                            code: "html.tokenizer.doctype.eof_in_public_id".to_string(),
                            message: "End of file inside public identifier.".to_string(),
                            span: Some(mk_span(start, end)),
                        });
                    }
                    public_id = Some(bytes_to_string_lossy(&bytes[id_start..i.min(end)]));
                    if i < end && bytes[i] == q {
                        i += 1;
                    }

                    // An optional system identifier may follow; remember whether any
                    // whitespace separated it from the public identifier.
                    let mut had_ws = false;
                    while i < end && bytes[i].is_ascii_whitespace() {
                        had_ws = true;
                        i += 1;
                    }
                    if i < end && (bytes[i] == b'"' || bytes[i] == b'\'') {
                        if !had_ws {
                            saw_syntax_error = true;
                            self.pending.push_back(ParseEvent::ParseError {
                                code: "html.tokenizer.doctype.no_space_between_public_system"
                                    .to_string(),
                                message:
                                    "No space between the doctype public and system identifiers."
                                        .to_string(),
                                span: Some(mk_span(i, i + 1)),
                            });
                        }
                        let q = bytes[i];
                        i += 1;
                        let id_start = i;
                        let mut saw_gt = false;
                        while i < end && bytes[i] != q {
                            if bytes[i] == b'>' && !saw_gt {
                                saw_gt = true;
                                saw_syntax_error = true;
                                self.pending.push_back(ParseEvent::ParseError {
                                    code: "html.tokenizer.doctype.gt_in_system_id".to_string(),
                                    message: "“>” in system identifier.".to_string(),
                                    span: Some(mk_span(i, i + 1)),
                                });
                            }
                            i += 1;
                        }
                        if i >= end {
                            saw_syntax_error = true;
                            self.pending.push_back(ParseEvent::ParseError {
                                code: "html.tokenizer.doctype.eof_in_system_id".to_string(),
                                message: "End of file inside system identifier.".to_string(),
                                span: Some(mk_span(start, end)),
                            });
                        }
                        system_id = Some(bytes_to_string_lossy(&bytes[id_start..i.min(end)]));
                        // If there's a closing quote within the token, it's already accounted for by `i`.
                    }
                }
            } else if starts_with_ascii_case_insensitive(&bytes[i..end], b"system") {
                // SYSTEM-only form: same checks as above, applied to the system identifier.
                i += "system".len();
                if i < end && (bytes[i] == b'"' || bytes[i] == b'\'') {
                    saw_syntax_error = true;
                    self.pending.push_back(ParseEvent::ParseError {
                        code: "html.tokenizer.doctype.no_space_after_system".to_string(),
                        message: "No space between the doctype “SYSTEM” keyword and the quote."
                            .to_string(),
                        span: Some(mk_span(start, start + 2)),
                    });
                }
                while i < end && bytes[i].is_ascii_whitespace() {
                    i += 1;
                }
                if i >= end || bytes[i] == b'>' {
                    saw_syntax_error = true;
                    self.pending.push_back(ParseEvent::ParseError {
                        code: "html.tokenizer.doctype.expected_system_id".to_string(),
                        message: "Expected a system identifier but the doctype ended.".to_string(),
                        span: Some(mk_span(start, start + 2)),
                    });
                } else if bytes[i] == b'"' || bytes[i] == b'\'' {
                    let q = bytes[i];
                    i += 1;
                    let id_start = i;
                    let mut saw_gt = false;
                    while i < end && bytes[i] != q {
                        if bytes[i] == b'>' && !saw_gt {
                            saw_gt = true;
                            saw_syntax_error = true;
                            self.pending.push_back(ParseEvent::ParseError {
                                code: "html.tokenizer.doctype.gt_in_system_id".to_string(),
                                message: "“>” in system identifier.".to_string(),
                                span: Some(mk_span(i, i + 1)),
                            });
                        }
                        i += 1;
                    }
                    if i >= end {
                        saw_syntax_error = true;
                        self.pending.push_back(ParseEvent::ParseError {
                            code: "html.tokenizer.doctype.eof_in_system_id".to_string(),
                            message: "End of file inside system identifier.".to_string(),
                            span: Some(mk_span(start, end)),
                        });
                    }
                    system_id = Some(bytes_to_string_lossy(&bytes[id_start..i.min(end)]));
                    // If there's a closing quote within the token, it's already accounted for by `i`.
                }
            } else {
                // Anything other than PUBLIC/SYSTEM after the name is reported as bogus.
                saw_bogus_doctype = true;
                self.pending.push_back(ParseEvent::ParseError {
                    code: "html.tokenizer.doctype.bogus".to_string(),
                    message: "Bogus doctype.".to_string(),
                    span: Some(mk_span(start, start + 2)),
                });
            }
        }

        // Doctype conformance classification (VNU-style).
        // Only doctypes that have a name and produced no syntax/bogus error above are
        // classified; the error is raised when the name is not "html" or any identifier
        // is present.
        if !saw_syntax_error
            && !saw_bogus_doctype
            && let Some(n) = name.as_deref()
        {
            let is_html = n.eq_ignore_ascii_case("html");
            if !is_html || public_id.is_some() || system_id.is_some() {
                let transitional_public =
                    public_id.as_deref() == Some("-//W3C//DTD HTML 4.01 Transitional//EN");
                let transitional_system =
                    system_id.as_deref() == Some("http://www.w3.org/TR/html4/loose.dtd");
                // Only this exact public/system pair gets the "almost standards"
                // wording; every other non-HTML5 doctype is called obsolete.
                let msg = if is_html && transitional_public && transitional_system {
                    "Almost standards mode doctype. Expected “<!DOCTYPE html>”."
                } else {
                    "Obsolete doctype. Expected “<!DOCTYPE html>”."
                };
                self.pending.push_back(ParseEvent::ParseError {
                    code: "html.parser.doctype.not_html5".to_string(),
                    message: msg.to_string(),
                    span: Some(mk_span(start, start + 2)),
                });
            }
        }

        self.bump_to(end);
        // A token that runs to the end of the input leaves nothing more to scan.
        if end == self.bytes.len() {
            self.finished = true;
        }
        self.pending.push_back(ParseEvent::Doctype {
            name,
            public_id,
            system_id,
            span: Some(self.current_span(start, end, start_line, start_col)),
        });
        Ok(())
    }
1056
1057    fn scan_rawtext(&mut self, decode: bool) -> Result<(), ValidatorError> {
1058        let start = self.cursor;
1059        let start_line = self.line;
1060        let start_col = self.col;
1061
1062        let end_tag = self.open_elements.last().map_or("", String::as_str);
1063        let lt = find_rawtext_end_tag(&self.bytes, self.cursor, end_tag, self.format);
1064        if let Some(lt) = lt {
1065            if lt > self.cursor {
1066                self.bump_to(lt);
1067                let raw = bytes_to_string_lossy(&self.bytes[start..lt]);
1068                let text = if decode {
1069                    decode_char_refs(self.format, raw, false)
1070                } else {
1071                    raw
1072                };
1073                self.pending.push_back(ParseEvent::Text {
1074                    text,
1075                    span: Some(self.current_span(start, lt, start_line, start_col)),
1076                });
1077                return Ok(());
1078            }
1079            // Already at an end tag; parse it now so the text mode can exit.
1080            return self.scan_end_tag(start, start_line, start_col);
1081        }
1082
1083        // No closing tag found; emit remainder as text and finish.
1084        let end = self.bytes.len();
1085        self.bump_to(end);
1086        let raw = bytes_to_string_lossy(&self.bytes[start..end]);
1087        let text = if decode {
1088            decode_char_refs(self.format, raw, false)
1089        } else {
1090            raw
1091        };
1092        self.pending.push_back(ParseEvent::Text {
1093            text,
1094            span: Some(self.current_span(start, end, start_line, start_col)),
1095        });
1096        self.finished = true;
1097        Ok(())
1098    }
1099
1100    fn pop_open_element(&mut self, name: &str) {
1101        let Some(pos) = (match self.format {
1102            InputFormat::Html => self
1103                .open_elements
1104                .iter()
1105                .rposition(|n| n.eq_ignore_ascii_case(name)),
1106            InputFormat::Xhtml => self.open_elements.iter().rposition(|n| n == name),
1107        }) else {
1108            return;
1109        };
1110        self.open_elements.truncate(pos);
1111        self.open_namespaces.truncate(pos);
1112    }
1113
1114    fn scan_text_run(
1115        &mut self,
1116        start: usize,
1117        start_line: u32,
1118        start_col: u32,
1119    ) -> Result<(), ValidatorError> {
1120        // Cursor is currently at a '<' that we decided is not markup. Emit a coalesced text run
1121        // up to (but not including) the next '<' or EOF.
1122        let next_lt = memchr(b'<', &self.bytes[self.cursor + 1..])
1123            .map_or(self.bytes.len(), |off| self.cursor + 1 + off);
1124        self.bump_to(next_lt);
1125        let raw = bytes_to_string_lossy(&self.bytes[start..next_lt]);
1126        let text = decode_char_refs(self.format, raw, false);
1127        self.pending.push_back(ParseEvent::Text {
1128            text,
1129            span: Some(self.current_span(start, next_lt, start_line, start_col)),
1130        });
1131        Ok(())
1132    }
1133
1134    fn scan_garbage_after_lt_slash(
1135        &mut self,
1136        start: usize,
1137        start_line: u32,
1138        start_col: u32,
1139    ) -> Result<(), ValidatorError> {
1140        // Consume up to the next '>' (or EOF) and emit as a bogus comment, but first report the parse error.
1141        self.pending.push_back(ParseEvent::ParseError {
1142            code: "html.tokenizer.garbage_after_lt_slash".to_string(),
1143            message: "Garbage after “</”.".to_string(),
1144            span: Some(self.current_span(start, start + 2, start_line, start_col)),
1145        });
1146        let Some(off) = memchr(b'>', &self.bytes[self.cursor + 2..]) else {
1147            self.finished = true;
1148            self.bump_to(self.bytes.len());
1149            return Ok(());
1150        };
1151        let gt = self.cursor + 2 + off;
1152
1153        let text = bytes_to_string_lossy(&self.bytes[self.cursor + 2..gt]);
1154        let end = gt + 1;
1155        self.bump_to(end);
1156        self.pending.push_back(ParseEvent::Comment {
1157            text,
1158            span: Some(self.current_span(start, end, start_line, start_col)),
1159        });
1160        Ok(())
1161    }
1162}
1163
1164impl EventSource for SimpleHtmlEventSource {
1165    fn source_name(&self) -> &str {
1166        &self.name
1167    }
1168
1169    fn format(&self) -> InputFormat {
1170        self.format
1171    }
1172
1173    fn next_event(&mut self) -> Result<Option<ParseEvent>, ValidatorError> {
1174        if self.pending.is_empty() && !self.finished {
1175            self.scan_next()?;
1176        }
1177        Ok(self.pending.pop_front())
1178    }
1179}
1180
/// Selector for the kind of text content being scanned.
///
/// NOTE(review): the variant names match the HTML tokenizer's data, RAWTEXT, RCDATA and
/// PLAINTEXT states; the code that produces and consumes these values is not visible in
/// this part of the file, so confirm the exact semantics of each variant there.
#[derive(Clone, Copy, Debug)]
enum TextModeKind {
    Data,
    RawText,
    RcData,
    Plaintext,
}
1188
/// Views `bytes` as UTF-8 text.
///
/// Valid input is borrowed as-is; otherwise an owned copy is returned in which every
/// invalid sequence has been replaced by U+FFFD.
fn str_from_bytes_lossy(bytes: &[u8]) -> Cow<'_, str> {
    std::str::from_utf8(bytes).map_or_else(|_| String::from_utf8_lossy(bytes), Cow::Borrowed)
}
1192
/// Copies `bytes` into an owned `String`, replacing every invalid UTF-8 sequence with
/// U+FFFD.
fn bytes_to_string_lossy(bytes: &[u8]) -> String {
    String::from_utf8_lossy(bytes).into_owned()
}
1196
/// Computes the line/column at byte offset `target`, given that offset `base_start`
/// is known to sit at (`base_line`, `base_col`).
///
/// Every byte in `base_start..target` is walked: a `\n` moves to the next line and
/// resets the column to 1, any other byte advances the column by one (columns are
/// therefore counted in bytes). `target` is clamped to `bytes.len()`. The base position
/// is returned unchanged when `target` does not lie beyond `base_start` or when
/// `base_start` is outside the buffer.
fn line_col_at_byte_offset(
    bytes: &[u8],
    base_start: usize,
    base_line: u32,
    base_col: u32,
    target: usize,
) -> (u32, u32) {
    let stop = target.min(bytes.len());
    // `get` yields `None` for an inverted range, which covers both a base beyond the
    // buffer and a target before the base; an empty range simply folds to the base.
    let Some(walked) = bytes.get(base_start..stop) else {
        return (base_line, base_col);
    };
    walked
        .iter()
        .fold((base_line, base_col), |(line, col), &byte| {
            if byte == b'\n' {
                (line + 1, 1)
            } else {
                (line, col + 1)
            }
        })
}
1223
/// Parses the text of a start tag into its name, attributes, self-closing flag and the
/// parse errors found along the way.
///
/// * `src` supplies the input format, name normalisation and the source bytes used for
///   span computation.
/// * `inside` is the tag text to parse. Attribute spans and precise character-reference
///   positions are only produced when `inside` is exactly as long as the source range
///   from `tag_start + 1` to `tag_end` (minus a trailing `>` if the source has one
///   there); otherwise attribute spans are `None` and reference errors are positioned at
///   the tag start.
/// * `tag_start` / `tag_end` are the byte offsets of the whole tag in `src.bytes`;
///   `tag_line` / `tag_col` are the line and column of `tag_start`.
///
/// Returns `(name, attrs, self_closing, errs)`. Most errors carry the span of the whole
/// tag. A duplicate attribute is reported but still included in `attrs`. The visible
/// code never returns `Err`.
fn parse_start_tag(
    src: &SimpleHtmlEventSource,
    inside: &str,
    tag_start: usize,
    tag_line: u32,
    tag_col: u32,
    tag_end: usize,
) -> Result<(String, Vec<Attribute>, bool, Vec<ParseEvent>), ValidatorError> {
    // Advances `i` past any ASCII whitespace.
    #[inline]
    fn skip_ws(bytes: &[u8], i: &mut usize) {
        while *i < bytes.len() && bytes[*i].is_ascii_whitespace() {
            *i += 1;
        }
    }

    // Appends a `ParseError` event built from the given code, message and span.
    #[inline]
    fn push_parse_error(
        errs: &mut Vec<ParseEvent>,
        span: Option<Span>,
        code: &'static str,
        message: impl Into<String>,
    ) {
        errs.push(ParseEvent::ParseError {
            code: code.to_string(),
            message: message.into(),
            span,
        });
    }

    let bytes = inside.as_bytes();
    let mut i = 0usize;

    // Work out whether offsets into `inside` can be translated to offsets into the
    // source: `inside` is taken to start one byte after `tag_start` and to end before a
    // trailing '>' when the source has one at `tag_end - 1`.
    let inside_base_start = tag_start.saturating_add(1);
    let has_gt = tag_end > 0
        && tag_end <= src.bytes.len()
        && src.bytes.get(tag_end - 1).copied() == Some(b'>');
    let inside_base_end = if has_gt {
        tag_end.saturating_sub(1)
    } else {
        tag_end
    };
    let can_map_to_source_bytes = inside_base_end <= src.bytes.len()
        && inside_base_end.saturating_sub(inside_base_start) == bytes.len();

    // Tag name.
    skip_ws(bytes, &mut i);
    let name_start = i;
    while i < bytes.len() && is_tag_name_char(bytes[i]) {
        i += 1;
    }
    let name_raw = &inside[name_start..i];
    let name = src.normalize_name(name_raw);

    let mut attrs: Vec<Attribute> = Vec::new();
    let mut errs: Vec<ParseEvent> = Vec::new();
    let mut self_closing = false;
    let tag_span = Some(Span::new(tag_start, tag_end, tag_line, tag_col));

    // One iteration per attribute (or per stray '=' / '<' that is skipped).
    while i < bytes.len() {
        skip_ws(bytes, &mut i);
        if i >= bytes.len() {
            break;
        }
        if bytes[i] == b'/' {
            // Self-closing marker only if it's the last non-whitespace char.
            let mut j = i + 1;
            skip_ws(bytes, &mut j);
            if j >= bytes.len() {
                self_closing = true;
                break;
            }
        }

        // Attribute name.
        if bytes[i] == b'=' {
            push_parse_error(
                &mut errs,
                tag_span,
                "html.tokenizer.equals_expecting_attr_name",
                "Saw “=” when expecting an attribute name. Probable cause: Attribute name missing.",
            );
            i += 1;
            continue;
        }
        if bytes[i] == b'<' {
            push_parse_error(
                &mut errs,
                tag_span,
                "html.tokenizer.lt_expecting_attr_name",
                "Saw “<” when expecting an attribute name. Probable cause: Missing “>” immediately before.",
            );
            i += 1;
            continue;
        }

        // The name runs up to whitespace, '=' or '/'.
        let attr_name_start = i;
        while i < bytes.len() && !bytes[i].is_ascii_whitespace() && bytes[i] != b'=' {
            if bytes[i] == b'/' {
                break;
            }
            i += 1;
        }
        if i == attr_name_start {
            // Nothing meaningful; stop parsing.
            break;
        }
        let attr_name_raw = &inside[attr_name_start..i];
        let attr_name = src.normalize_name(attr_name_raw);
        if attr_name_raw.contains('"') {
            push_parse_error(
                &mut errs,
                tag_span,
                "html.tokenizer.quote_in_attr_name",
                "Quote “\"” in attribute name. Probable cause: Matching quote missing somewhere earlier.",
            );
        }
        if attr_name_raw.contains('<') {
            push_parse_error(
                &mut errs,
                tag_span,
                "html.tokenizer.lt_in_attr_name",
                "“<” in attribute name. Probable cause: “>” missing immediately before.",
            );
        }
        // Duplicates are compared on the normalised name.
        if attrs.iter().any(|a| a.name == attr_name) {
            push_parse_error(
                &mut errs,
                tag_span,
                "html.tokenizer.duplicate_attribute",
                format!("Duplicate attribute “{attr_name}”."),
            );
        }

        skip_ws(bytes, &mut i);

        // `value` stays `None` for an attribute written without '='.
        let mut value: Option<String> = None;
        if i < bytes.len() && bytes[i] == b'=' {
            i += 1;
            skip_ws(bytes, &mut i);
            if i >= bytes.len() || bytes[i] == b'>' {
                push_parse_error(
                    &mut errs,
                    tag_span,
                    "html.tokenizer.attr_value_missing",
                    "Attribute value missing.",
                );
            }
            if i < bytes.len() && (bytes[i] == b'"' || bytes[i] == b'\'') {
                // Quoted value: everything up to the matching quote (or the end of
                // `inside` when the quote is never closed).
                let quote = bytes[i];
                i += 1;
                let value_start = i;
                while i < bytes.len() && bytes[i] != quote {
                    i += 1;
                }
                let raw = &inside[value_start..i];
                // Position of the value in the source, used to place errors found while
                // decoding character references; falls back to the tag start.
                let (base_start, base_line, base_col) = if can_map_to_source_bytes {
                    let base_start = inside_base_start + value_start;
                    let (base_line, base_col) = line_col_at_byte_offset(
                        src.bytes.as_ref(),
                        tag_start,
                        tag_line,
                        tag_col,
                        base_start,
                    );
                    (base_start, base_line, base_col)
                } else {
                    (tag_start, tag_line, tag_col)
                };
                let (decoded, decoded_errs) = decode_char_refs_with_errors(
                    src.format, raw, true, base_start, base_line, base_col,
                );
                errs.extend(decoded_errs);
                value = Some(decoded);
                if i < bytes.len() && bytes[i] == quote {
                    i += 1;
                }
                // No space between attributes.
                if i < bytes.len() && bytes[i].is_ascii_alphabetic() {
                    push_parse_error(
                        &mut errs,
                        tag_span,
                        "html.tokenizer.no_space_between_attrs",
                        "No space between attributes.",
                    );
                }
            } else {
                // Unquoted value: everything up to the next ASCII whitespace. When the
                // value is missing this yields an empty string.
                let value_start = i;
                while i < bytes.len() && !bytes[i].is_ascii_whitespace() {
                    i += 1;
                }
                let raw = &inside[value_start..i];
                if raw.starts_with('`') {
                    push_parse_error(
                        &mut errs,
                        tag_span,
                        "html.tokenizer.backtick_at_start_unquoted",
                        "“`” at the start of an unquoted attribute value. Probable cause: Using the wrong character as a quote.",
                    );
                } else if raw.contains('`') {
                    push_parse_error(
                        &mut errs,
                        tag_span,
                        "html.tokenizer.backtick_in_unquoted",
                        "“`” in an unquoted attribute value. Probable cause: Using the wrong character as a quote.",
                    );
                }
                if raw.starts_with('<') {
                    push_parse_error(
                        &mut errs,
                        tag_span,
                        "html.tokenizer.lt_at_start_unquoted",
                        "“<” at the start of an unquoted attribute value. Probable cause: Missing “>” immediately before.",
                    );
                } else if raw.contains('<') {
                    push_parse_error(
                        &mut errs,
                        tag_span,
                        "html.tokenizer.lt_in_unquoted",
                        "“<” in an unquoted attribute value. Probable cause: Missing “>” immediately before.",
                    );
                }
                if raw.starts_with('=') {
                    push_parse_error(
                        &mut errs,
                        tag_span,
                        "html.tokenizer.equals_at_start_unquoted",
                        "“=” at the start of an unquoted attribute value. Probable cause: Stray duplicate equals sign.",
                    );
                }
                if raw.contains('"') {
                    push_parse_error(
                        &mut errs,
                        tag_span,
                        "html.tokenizer.quote_in_unquoted",
                        "“\"” in an unquoted attribute value. Probable causes: Attributes running together or a URL query string in an unquoted attribute value.",
                    );
                }
                let (base_start, base_line, base_col) = if can_map_to_source_bytes {
                    let base_start = inside_base_start + value_start;
                    let (base_line, base_col) = line_col_at_byte_offset(
                        src.bytes.as_ref(),
                        tag_start,
                        tag_line,
                        tag_col,
                        base_start,
                    );
                    (base_start, base_line, base_col)
                } else {
                    (tag_start, tag_line, tag_col)
                };
                let (decoded, decoded_errs) = decode_char_refs_with_errors(
                    src.format, raw, true, base_start, base_line, base_col,
                );
                errs.extend(decoded_errs);
                value = Some(decoded);
            }
        }

        // The attribute span runs from the start of its name to the current position,
        // i.e. it includes the value and any closing quote.
        let span = if can_map_to_source_bytes {
            let base_start = inside_base_start + attr_name_start;
            let base_end = inside_base_start + i;
            let (line, col) = line_col_at_byte_offset(
                src.bytes.as_ref(),
                tag_start,
                tag_line,
                tag_col,
                base_start,
            );
            Some(Span::new(base_start, base_end, line, col))
        } else {
            None
        };
        attrs.push(Attribute {
            name: attr_name,
            value,
            span,
        });
    }

    // Slash not immediately followed by ">".
    // Detected as: the text ends with '/' only after trailing whitespace is trimmed.
    if src.format == InputFormat::Html {
        let trimmed = inside.trim_end();
        if trimmed.ends_with('/') && trimmed.len() != inside.len() {
            push_parse_error(
                &mut errs,
                tag_span,
                "html.tokenizer.slash_not_immediately_followed_by_gt",
                "A slash was not immediately followed by “>”.",
            );
        }
    }

    // Special-case: <image> start tag.
    if src.format == InputFormat::Html && name.eq_ignore_ascii_case("image") {
        push_parse_error(
            &mut errs,
            tag_span,
            "html.tokenizer.image_start_tag",
            "Saw a start tag “image”.",
        );
    }

    Ok((name, attrs, self_closing, errs))
}
1528
1529fn decode_char_refs(format: InputFormat, s: String, in_attribute: bool) -> String {
1530    if !s.contains('&') {
1531        return s;
1532    }
1533
1534    let bytes = s.as_bytes();
1535    let mut out = String::with_capacity(s.len());
1536    let mut i = 0usize;
1537    let mut last = 0usize;
1538    while i < bytes.len() {
1539        if bytes[i] != b'&' {
1540            i += 1;
1541            continue;
1542        }
1543
1544        out.push_str(&s[last..i]);
1545        let start = i;
1546        i += 1;
1547        if i >= bytes.len() {
1548            out.push('&');
1549            last = i;
1550            break;
1551        }
1552
1553        if bytes[i] == b'#' {
1554            let mut j = i + 1;
1555            let is_hex = j < bytes.len() && matches!(bytes[j], b'x' | b'X');
1556            if is_hex {
1557                j += 1;
1558            }
1559            let digits_start = j;
1560            if is_hex {
1561                while j < bytes.len() && bytes[j].is_ascii_hexdigit() {
1562                    j += 1;
1563                }
1564            } else {
1565                while j < bytes.len() && bytes[j].is_ascii_digit() {
1566                    j += 1;
1567                }
1568            }
1569            if digits_start == j {
1570                out.push('&');
1571                i = start + 1;
1572                last = i;
1573                continue;
1574            }
1575            let digits = &s[digits_start..j];
1576            let radix = if is_hex { 16 } else { 10 };
1577            let value = u32::from_str_radix(digits, radix).ok();
1578            if j < bytes.len() && bytes[j] == b';' {
1579                j += 1;
1580            }
1581            if let Some(cp) = value.and_then(valid_code_point) {
1582                out.push(cp);
1583            } else {
1584                out.push('\u{FFFD}');
1585            }
1586            i = j;
1587            last = i;
1588            continue;
1589        }
1590
1591        // Named reference: attempt longest match against the named-entity table.
1592        let mut j = i;
1593        let mut best: Option<(usize, &'static str)> = None;
1594        while j < bytes.len() {
1595            let b = bytes[j];
1596            if !(b.is_ascii_alphanumeric() || b == b';') {
1597                break;
1598            }
1599            j += 1;
1600            let cand = &s[i..j];
1601            if let Some(val) = resolve_named_ref(format, cand) {
1602                best = Some((j, val));
1603            }
1604            if b == b';' {
1605                break;
1606            }
1607        }
1608
1609        if let Some((end, val)) = best {
1610            let matched = &s[i..end];
1611            if in_attribute && !matched.ends_with(';') {
1612                let next = bytes.get(end).copied().unwrap_or(b' ');
1613                if next.is_ascii_alphanumeric() || next == b'=' {
1614                    out.push('&');
1615                    i = start + 1;
1616                    last = i;
1617                    continue;
1618                }
1619            }
1620            out.push_str(val);
1621            i = end;
1622            last = i;
1623            continue;
1624        }
1625
1626        // Not recognized: consume only '&'.
1627        out.push('&');
1628        i = start + 1;
1629        last = i;
1630    }
1631    out.push_str(&s[last..]);
1632    out
1633}
1634
1635fn decode_char_refs_with_errors(
1636    format: InputFormat,
1637    s: &str,
1638    in_attribute: bool,
1639    base_start: usize,
1640    base_line: u32,
1641    base_col: u32,
1642) -> (String, Vec<ParseEvent>) {
1643    if format != InputFormat::Html {
1644        return (s.to_string(), Vec::new());
1645    }
1646    let mut errs: Vec<ParseEvent> = Vec::new();
1647    let line_col_at =
1648        |byte_off: usize| line_col_at_byte_offset(s.as_bytes(), 0, base_line, base_col, byte_off);
1649    if let Some((byte_off, cp, byte_len)) = first_forbidden_code_point(s) {
1650        let (line, col) = line_col_at(byte_off);
1651        errs.push(ParseEvent::ParseError {
1652            code: "html.tokenizer.forbidden_code_point".to_string(),
1653            message: format!("Forbidden code point U+{:04x}.", cp),
1654            span: Some(Span::new(
1655                base_start + byte_off,
1656                base_start + byte_off + byte_len,
1657                line,
1658                col,
1659            )),
1660        });
1661    }
1662    if let Some((byte_off, byte_len)) = first_astral_noncharacter(s) {
1663        let (line, col) = line_col_at(byte_off);
1664        errs.push(ParseEvent::ParseError {
1665            code: "html.tokenizer.astral_noncharacter".to_string(),
1666            message: "Astral non-character.".to_string(),
1667            span: Some(Span::new(
1668                base_start + byte_off,
1669                base_start + byte_off + byte_len,
1670                line,
1671                col,
1672            )),
1673        });
1674    }
1675
1676    if !s.contains('&') {
1677        return (s.to_string(), errs);
1678    }
1679
1680    let bytes = s.as_bytes();
1681    let mut out = String::with_capacity(s.len());
1682    let mut i = 0usize;
1683    let mut last = 0usize;
1684    while i < bytes.len() {
1685        if bytes[i] != b'&' {
1686            i += 1;
1687            continue;
1688        }
1689
1690        out.push_str(&s[last..i]);
1691        let amp_off = i;
1692        i += 1;
1693        if i >= bytes.len() {
1694            out.push('&');
1695            last = i;
1696            break;
1697        }
1698
1699        if bytes[i] == b'#' {
1700            let mut j = i + 1;
1701            let is_hex = j < bytes.len() && matches!(bytes[j], b'x' | b'X');
1702            if is_hex {
1703                j += 1;
1704            }
1705            let digits_start = j;
1706            if is_hex {
1707                while j < bytes.len() && bytes[j].is_ascii_hexdigit() {
1708                    j += 1;
1709                }
1710            } else {
1711                while j < bytes.len() && bytes[j].is_ascii_digit() {
1712                    j += 1;
1713                }
1714            }
1715            if digits_start == j {
1716                let (line, col) = line_col_at(amp_off);
1717                errs.push(ParseEvent::ParseError {
1718                    code: "html.tokenizer.charref_no_digits".to_string(),
1719                    message: "No digits after “”.".to_string(),
1720                    span: Some(Span::new(
1721                        base_start + amp_off,
1722                        base_start + amp_off + 1,
1723                        line,
1724                        col,
1725                    )),
1726                });
1727                out.push('&');
1728                i = amp_off + 1;
1729                last = i;
1730                continue;
1731            }
1732            let digits = &s[digits_start..j];
1733            let radix = if is_hex { 16 } else { 10 };
1734            let value = u32::from_str_radix(digits, radix).unwrap_or(0);
1735            let had_semicolon = j < bytes.len() && bytes[j] == b';';
1736            if had_semicolon {
1737                j += 1;
1738            } else {
1739                let (line, col) = line_col_at(amp_off);
1740                errs.push(ParseEvent::ParseError {
1741                    code: "html.tokenizer.charref_no_semicolon".to_string(),
1742                    message: "Character reference was not terminated by a semicolon.".to_string(),
1743                    span: Some(Span::new(
1744                        base_start + amp_off,
1745                        base_start + amp_off + 1,
1746                        line,
1747                        col,
1748                    )),
1749                });
1750            }
1751
1752            let msg = classify_numeric_charref(value);
1753            if let Some((code, message)) = msg {
1754                let (line, col) = line_col_at(amp_off);
1755                errs.push(ParseEvent::ParseError {
1756                    code: code.to_string(),
1757                    message,
1758                    span: Some(Span::new(
1759                        base_start + amp_off,
1760                        base_start + amp_off + 1,
1761                        line,
1762                        col,
1763                    )),
1764                });
1765            }
1766
1767            if let Some(cp) = valid_code_point(value) {
1768                out.push(cp);
1769            } else {
1770                out.push('\u{FFFD}');
1771            }
1772            i = j;
1773            last = i;
1774            continue;
1775        }
1776
1777        // Named reference: attempt longest match against the named-entity table.
1778        let mut j = i;
1779        let mut best: Option<(usize, &'static str)> = None;
1780        while j < bytes.len() {
1781            let b = bytes[j];
1782            if !(b.is_ascii_alphanumeric() || b == b';') {
1783                break;
1784            }
1785            j += 1;
1786            let cand = &s[i..j];
1787            if let Some(val) = resolve_named_ref(format, cand) {
1788                best = Some((j, val));
1789            }
1790            if b == b';' {
1791                break;
1792            }
1793        }
1794
1795        if let Some((end, val)) = best {
1796            let matched = &s[i..end];
1797            if in_attribute && !matched.ends_with(';') {
1798                let next = bytes.get(end).copied().unwrap_or(b' ');
1799                if next.is_ascii_alphanumeric() || next == b'=' {
1800                    out.push('&');
1801                    i = amp_off + 1;
1802                    last = i;
1803                    continue;
1804                }
1805            }
1806            if !matched.ends_with(';') {
1807                let (line, col) = line_col_at(amp_off);
1808                errs.push(ParseEvent::ParseError {
1809                    code: "html.tokenizer.named_charref_no_semicolon".to_string(),
1810                    message: "Named character reference was not terminated by a semicolon. (Or “&” should have been escaped as “&amp;”.)".to_string(),
1811                    span: Some(Span::new(
1812                        base_start + amp_off,
1813                        base_start + amp_off + 1,
1814                        line,
1815                        col,
1816                    )),
1817                });
1818            }
1819            out.push_str(val);
1820            i = end;
1821            last = i;
1822            continue;
1823        }
1824
1825        // Not recognized: consume only '&'.
1826        out.push('&');
1827        i = amp_off + 1;
1828        last = i;
1829    }
1830    out.push_str(&s[last..]);
1831
1832    (out, errs)
1833}
1834
/// Maps the value of a numeric character reference to a diagnostic, if it warrants one.
///
/// Returns `Some((error_code, message))` for problematic values and `None` otherwise.
/// The arms are ordered: the first matching category wins.
fn classify_numeric_charref(cp: u32) -> Option<(&'static str, String)> {
    // True when the low 16 bits are FFFE or FFFF.
    let ends_in_fffe_or_ffff = (cp & 0xFFFE) == 0xFFFE;

    let diagnostic = match cp {
        0 => (
            "html.tokenizer.charref_zero",
            "Character reference expands to zero.".to_string(),
        ),
        _ if cp > 0x10FFFF => (
            "html.tokenizer.charref_outside_range",
            "Character reference outside the permissible Unicode range.".to_string(),
        ),
        0xD800..=0xDFFF => (
            "html.tokenizer.charref_surrogate",
            "Character reference expands to a surrogate.".to_string(),
        ),
        0x0D => (
            "html.tokenizer.charref_cr",
            "A numeric character reference expanded to carriage return.".to_string(),
        ),
        0x80..=0x9F => (
            "html.tokenizer.charref_c1_controls",
            "A numeric character reference expanded to the C1 controls range.".to_string(),
        ),
        _ if ends_in_fffe_or_ffff && cp > 0xFFFF => (
            "html.tokenizer.charref_astral_noncharacter",
            format!(
                "Character reference expands to an astral non-character (U+{:x}).",
                cp
            ),
        ),
        _ if ends_in_fffe_or_ffff => (
            "html.tokenizer.charref_noncharacter",
            format!(
                "Character reference expands to a non-character (U+{:04x}).",
                cp
            ),
        ),
        0xFDD0..=0xFDEF => (
            "html.tokenizer.charref_unassigned",
            "Character reference expands to a permanently unassigned code point.".to_string(),
        ),
        // C0 controls other than tab, line feed, form feed (carriage return was
        // handled above), plus DEL.
        0x01..=0x08 | 0x0B | 0x0E..=0x1F | 0x7F => (
            "html.tokenizer.charref_control",
            format!(
                "Character reference expands to a control character (U+{:04x}).",
                cp
            ),
        ),
        _ => return None,
    };
    Some(diagnostic)
}
1903
/// Locates the first forbidden code point in `s`.
///
/// Only U+000B (vertical tab) is checked. Returns `(byte_offset, code_point, byte_len)`
/// for the first occurrence, or `None` when there is none.
fn first_forbidden_code_point(s: &str) -> Option<(usize, u32, usize)> {
    // U+000B is a single ASCII byte in UTF-8, so a plain byte scan finds it.
    s.bytes()
        .position(|byte| byte == 0x0B)
        .map(|offset| (offset, 0x000B, 1))
}
1909
/// Finds the first character above U+FFFF whose low 16 bits are FFFE or FFFF.
///
/// Returns its `(byte_offset, utf8_byte_len)` within `s`, or `None` if absent.
fn first_astral_noncharacter(s: &str) -> Option<(usize, usize)> {
    s.char_indices().find_map(|(offset, ch)| {
        let scalar = u32::from(ch);
        let is_astral_noncharacter = scalar > 0xFFFF && (scalar & 0xFFFE) == 0xFFFE;
        is_astral_noncharacter.then(|| (offset, ch.len_utf8()))
    })
}
1919
1920fn resolve_named_ref(format: InputFormat, name: &str) -> Option<&'static str> {
1921    match (format, name) {
1922        (InputFormat::Html, _) => html_named_entity_map().get(name).copied(),
1923        (InputFormat::Xhtml, "lt;") => Some("<"),
1924        (InputFormat::Xhtml, "gt;") => Some(">"),
1925        (InputFormat::Xhtml, "amp;") => Some("&"),
1926        (InputFormat::Xhtml, "quot;") => Some("\""),
1927        (InputFormat::Xhtml, "apos;") => Some("'"),
1928        (InputFormat::Xhtml, _) => None,
1929    }
1930}
1931
#[cfg(test)]
mod resolve_named_ref_tests {
    use super::{InputFormat, resolve_named_ref};

    /// XHTML resolves the five predefined entities (with `;`) and nothing else.
    #[test]
    fn xhtml_supports_only_predefined_named_entities() {
        let predefined = [
            ("lt;", "<"),
            ("gt;", ">"),
            ("amp;", "&"),
            ("quot;", "\""),
            ("apos;", "'"),
        ];
        for &(name, expansion) in &predefined {
            assert_eq!(resolve_named_ref(InputFormat::Xhtml, name), Some(expansion));
        }

        // An HTML-only entity and a predefined name without its semicolon are rejected.
        for &rejected in &["copy;", "amp"] {
            assert_eq!(resolve_named_ref(InputFormat::Xhtml, rejected), None);
        }
    }
}
1948
/// Converts a numeric character reference value into a `char`.
///
/// Returns `None` for zero, for surrogates (U+D800..=U+DFFF) and for values above
/// U+10FFFF; every other value maps to its character.
fn valid_code_point(cp: u32) -> Option<char> {
    // `char::from_u32` already rejects surrogates and out-of-range values,
    // so only NUL needs to be filtered out on top of it.
    char::from_u32(cp).filter(|&ch| ch != '\0')
}
1957
/// Returns `true` for bytes accepted in a tag name: ASCII letters, ASCII digits,
/// `-` and `:`.
fn is_tag_name_char(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'-' | b':')
}
1961
/// Reports whether `haystack` begins with `needle`, ignoring ASCII case.
///
/// An empty `needle` always matches; a `needle` longer than `haystack` never does.
fn starts_with_ascii_case_insensitive(haystack: &[u8], needle: &[u8]) -> bool {
    match haystack.get(..needle.len()) {
        Some(prefix) => prefix.eq_ignore_ascii_case(needle),
        // `haystack` is shorter than `needle`.
        None => false,
    }
}
1965
/// Finds the index of the `>` that closes the tag being scanned.
///
/// Scanning starts at `from` in the tag-name state and runs a small attribute-aware
/// state machine, so that a `>` inside a quoted attribute value is not mistaken for
/// the end of the tag. Returns `None` if no closing `>` is found (including when
/// `from` is past the end of `bytes`).
fn find_tag_close(bytes: &[u8], from: usize) -> Option<usize> {
    #[derive(Clone, Copy, Debug)]
    enum State {
        TagName,
        BeforeAttrName,
        AttrName,
        AfterAttrName,
        BeforeAttrValue,
        AttrValueUnquoted,
        /// Inside a quoted value; carries the opening quote byte.
        AttrValueQuoted(u8),
    }

    let mut state = State::TagName;
    for (idx, &b) in bytes.iter().enumerate().skip(from) {
        // Outside a quoted value, any `>` ends the tag.
        if b == b'>' && !matches!(state, State::AttrValueQuoted(_)) {
            return Some(idx);
        }
        let is_space = b.is_ascii_whitespace();
        state = match state {
            // A solidus ends the tag name just like whitespace does, so attributes that
            // follow it directly (`<a/title=">">`) are tracked and their quoted `>` is
            // not taken as the end of the tag.
            State::TagName if is_space || b == b'/' => State::BeforeAttrName,
            State::TagName => State::TagName,

            // Whitespace and stray/self-closing slashes keep us between attributes.
            State::BeforeAttrName if is_space || b == b'/' => State::BeforeAttrName,
            State::BeforeAttrName => State::AttrName,

            State::AttrName if is_space => State::AfterAttrName,
            State::AttrName if b == b'=' => State::BeforeAttrValue,
            State::AttrName => State::AttrName,

            State::AfterAttrName if is_space => State::AfterAttrName,
            State::AfterAttrName if b == b'=' => State::BeforeAttrValue,
            // Anything else starts the next attribute name.
            State::AfterAttrName => State::AttrName,

            State::BeforeAttrValue if is_space => State::BeforeAttrValue,
            State::BeforeAttrValue if b == b'"' || b == b'\'' => State::AttrValueQuoted(b),
            State::BeforeAttrValue => State::AttrValueUnquoted,

            State::AttrValueUnquoted if is_space => State::BeforeAttrName,
            State::AttrValueUnquoted => State::AttrValueUnquoted,

            // Only the matching quote byte terminates a quoted value.
            State::AttrValueQuoted(quote) if b == quote => State::BeforeAttrName,
            State::AttrValueQuoted(quote) => State::AttrValueQuoted(quote),
        };
    }
    None
}
2040
2041fn find_rawtext_end_tag(
2042    bytes: &[u8],
2043    from: usize,
2044    end_tag: &str,
2045    format: InputFormat,
2046) -> Option<usize> {
2047    let end_bytes = end_tag.as_bytes();
2048    let mut i = from;
2049    while i < bytes.len() {
2050        let off = memchr(b'<', &bytes[i..])?;
2051        let lt = i + off;
2052        if bytes.get(lt + 1) != Some(&b'/') {
2053            i = lt + 1;
2054            continue;
2055        }
2056        let name_start = lt + 2;
2057        if name_start + end_bytes.len() > bytes.len() {
2058            return None;
2059        }
2060        let candidate = &bytes[name_start..name_start + end_bytes.len()];
2061        let matches = match format {
2062            InputFormat::Html => candidate.eq_ignore_ascii_case(end_bytes),
2063            InputFormat::Xhtml => candidate == end_bytes,
2064        };
2065        if !matches {
2066            i = lt + 1;
2067            continue;
2068        }
2069        let after = bytes
2070            .get(name_start + end_bytes.len())
2071            .copied()
2072            .unwrap_or(b'>');
2073        if after.is_ascii_whitespace() || after == b'>' || after == b'/' {
2074            return Some(lt);
2075        }
2076        i = lt + 1;
2077    }
2078    None
2079}
2080
/// Picks the `(code, message)` pair for a start tag cut off by end of input.
///
/// Best-effort: `rest` is scanned for quotes, and if a `'` or `"` is still open at the
/// end the "inside an attribute value" error is chosen; otherwise the "attribute name"
/// error is returned.
fn classify_start_tag_eof(rest: &[u8]) -> (String, String) {
    // Track the currently open quote byte, if any. Only the same byte closes it.
    let open_quote = rest.iter().fold(None::<u8>, |open, &b| match open {
        Some(quote) if quote == b => None,
        None if matches!(b, b'\'' | b'"') => Some(b),
        unchanged => unchanged,
    });

    let (code, message) = if open_quote.is_some() {
        (
            "html.tokenizer.eof_in_attr_value",
            "End of file reached when inside an attribute value. Ignoring tag.",
        )
    } else {
        (
            "html.tokenizer.eof_in_attr_name",
            "End of file occurred in an attribute name. Ignoring tag.",
        )
    };
    (code.to_string(), message.to_string())
}
2102
/// Returns the index of the first byte in `haystack` equal to `needle`, if any.
fn memchr(needle: u8, haystack: &[u8]) -> Option<usize> {
    haystack
        .iter()
        .enumerate()
        .find_map(|(idx, &byte)| (byte == needle).then_some(idx))
}
2106
/// Finds the first occurrence of `needle` in `haystack` at or after index `from`.
///
/// Returns the absolute index of the match. Returns `None` when there is no match or
/// when `from` is past the end of `haystack`. An empty `needle` matches immediately
/// at `from`.
fn find_subslice(haystack: &[u8], from: usize, needle: &[u8]) -> Option<usize> {
    // Checked slicing: an out-of-range `from` is "not found" rather than a panic.
    let tail = haystack.get(from..)?;
    // `windows(0)` panics, so the empty needle is handled explicitly.
    if needle.is_empty() {
        return Some(from);
    }
    tail.windows(needle.len())
        .position(|window| window == needle)
        .map(|offset| from + offset)
}
2113
2114fn html_named_entity_map() -> &'static FxHashMap<&'static str, &'static str> {
2115    static MAP: OnceLock<FxHashMap<&'static str, &'static str>> = OnceLock::new();
2116    MAP.get_or_init(|| {
2117        let mut map = FxHashMap::with_capacity_and_hasher(
2118            named_entities::HTML_NAMED_ENTITIES.len(),
2119            Default::default(),
2120        );
2121        map.extend(named_entities::HTML_NAMED_ENTITIES.iter().copied());
2122        map
2123    })
2124}
2125
#[cfg(test)]
mod entity_map_tests {
    use super::{html_named_entity_map, named_entities};

    /// The map holds every table entry and a few spot-checked expansions.
    #[test]
    fn html_named_entity_map_contains_expected_entries() {
        let entities = html_named_entity_map();
        // Equal lengths mean no table entry was lost to a duplicate key.
        assert_eq!(entities.len(), named_entities::HTML_NAMED_ENTITIES.len());

        let expected = [
            ("AMP;", "&"),
            ("NegativeMediumSpace;", "\u{200B}"),
            ("NegativeThickSpace;", "\u{200B}"),
            ("NegativeThinSpace;", "\u{200B}"),
            ("NegativeVeryThinSpace;", "\u{200B}"),
            ("NoBreak;", "\u{2060}"),
            ("ZeroWidthSpace;", "\u{200B}"),
            ("shy;", "\u{AD}"),
            ("shy", "\u{AD}"),
        ];
        for &(name, expansion) in &expected {
            assert_eq!(entities.get(name), Some(&expansion), "entity {}", name);
        }
    }
}
2145
2146#[cfg(test)]
2147mod tests {
2148    use super::*;
2149    use html_inspector::EventSource;
2150
2151    fn collect(mut src: SimpleHtmlEventSource) -> Vec<ParseEvent> {
2152        let mut out = Vec::new();
2153        while let Some(ev) = src.next_event().unwrap() {
2154            out.push(ev);
2155        }
2156        out
2157    }
2158
2159    #[test]
2160    fn classify_start_tag_eof_prefers_attr_value_error_when_in_quote() {
2161        let (code, _msg) = classify_start_tag_eof(br#" class="unterminated"#);
2162        assert_eq!(code, "html.tokenizer.eof_in_attr_value");
2163
2164        // Different quote bytes should not terminate the current quoted value.
2165        let (code, _msg) = classify_start_tag_eof(br#" class="has'single"#);
2166        assert_eq!(code, "html.tokenizer.eof_in_attr_value");
2167    }
2168
2169    #[test]
2170    fn classify_start_tag_eof_returns_attr_name_error_when_not_in_quote() {
2171        let (code, _msg) = classify_start_tag_eof(b" class=foo");
2172        assert_eq!(code, "html.tokenizer.eof_in_attr_name");
2173
2174        // Balanced quotes leave us outside an attribute value at EOF.
2175        let (code, _msg) = classify_start_tag_eof(br#" class="ok""#);
2176        assert_eq!(code, "html.tokenizer.eof_in_attr_name");
2177    }
2178
2179    #[test]
2180    fn bytes_at_cursor_is_false_when_out_of_bounds() {
2181        let mut src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<");
2182        assert!(src.bytes_at_cursor(b""));
2183        assert!(src.bytes_at_cursor(b"<"));
2184        assert!(!src.bytes_at_cursor(b"<>"));
2185        src.cursor = src.bytes.len();
2186        assert!(src.bytes_at_cursor(b""));
2187        assert!(!src.bytes_at_cursor(b"<"));
2188        src.cursor = src.bytes.len() + 1;
2189        assert!(!src.bytes_at_cursor(b""));
2190        assert!(!src.bytes_at_cursor(b"<"));
2191    }
2192
2193    #[test]
2194    fn valid_code_point_matches_html_scalar_value_constraints() {
2195        assert_eq!(valid_code_point(0), None);
2196        assert_eq!(valid_code_point(0xD800), None);
2197        assert_eq!(valid_code_point(0xDFFF), None);
2198        assert_eq!(valid_code_point(0x110000), None);
2199
2200        assert_eq!(valid_code_point(0x41), Some('A'));
2201        assert_eq!(
2202            valid_code_point(0x10FFFF),
2203            Some(char::from_u32(0x10FFFF).unwrap())
2204        );
2205    }
2206
2207    #[test]
2208    fn first_forbidden_code_point_finds_vertical_tab_by_byte_offset() {
2209        assert_eq!(first_forbidden_code_point("abc"), None);
2210
2211        let s = "❤\u{000B}x";
2212        let (idx, cp, len) = first_forbidden_code_point(s).unwrap();
2213        assert_eq!(idx, "❤".len());
2214        assert_eq!(cp, 0x000B);
2215        assert_eq!(len, 1);
2216    }
2217
2218    #[test]
2219    fn first_astral_noncharacter_finds_noncharacters_and_reports_byte_len() {
2220        assert_eq!(first_astral_noncharacter("abc"), None);
2221
2222        let ch = char::from_u32(0x1FFFE).unwrap();
2223        let s = format!("a{ch}b");
2224        let (idx, len) = first_astral_noncharacter(&s).unwrap();
2225        assert_eq!(idx, 1);
2226        assert_eq!(len, ch.len_utf8());
2227    }
2228
2229    #[test]
2230    fn normalize_name_lowercases_ascii_in_html_only() {
2231        let html = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "");
2232        assert_eq!(html.normalize_name("DiV❤"), "div❤");
2233        assert_eq!(html.normalize_name("div"), "div");
2234
2235        let xhtml = SimpleHtmlEventSource::from_str("t", InputFormat::Xhtml, "");
2236        assert_eq!(xhtml.normalize_name("DiV❤"), "DiV❤");
2237    }
2238
2239    #[test]
2240    fn normalize_name_accepts_cow_without_extra_allocation_for_owned() {
2241        let html = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "");
2242        assert_eq!(
2243            html.normalize_name(std::borrow::Cow::Borrowed("DiV")),
2244            html.normalize_name("DiV")
2245        );
2246        assert_eq!(
2247            html.normalize_name(std::borrow::Cow::Owned("DiV".to_string())),
2248            html.normalize_name("DiV")
2249        );
2250
2251        let s = "DiV".to_string();
2252        let ptr = s.as_ptr();
2253        let cap = s.capacity();
2254        let out = html.normalize_name(std::borrow::Cow::Owned(s));
2255        assert_eq!(out, "div");
2256        assert_eq!(out.as_ptr(), ptr);
2257        assert_eq!(out.capacity(), cap);
2258
2259        let s = "div".to_string();
2260        let ptr = s.as_ptr();
2261        let cap = s.capacity();
2262        let out = html.normalize_name(s);
2263        assert_eq!(out, "div");
2264        assert_eq!(out.as_ptr(), ptr);
2265        assert_eq!(out.capacity(), cap);
2266
2267        let xhtml = SimpleHtmlEventSource::from_str("t", InputFormat::Xhtml, "");
2268        assert_eq!(
2269            xhtml.normalize_name(std::borrow::Cow::Borrowed("DiV")),
2270            xhtml.normalize_name("DiV")
2271        );
2272        assert_eq!(
2273            xhtml.normalize_name(std::borrow::Cow::Owned("DiV".to_string())),
2274            xhtml.normalize_name("DiV")
2275        );
2276
2277        let s = "DiV".to_string();
2278        let ptr = s.as_ptr();
2279        let cap = s.capacity();
2280        let out = xhtml.normalize_name(std::borrow::Cow::Owned(s));
2281        assert_eq!(out, "DiV");
2282        assert_eq!(out.as_ptr(), ptr);
2283        assert_eq!(out.capacity(), cap);
2284    }
2285
2286    #[test]
2287    fn next_event_drains_pending_even_if_finished() {
2288        let mut src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "");
2289        src.finished = true;
2290        src.pending.push_back(ParseEvent::Text {
2291            text: "x".to_string(),
2292            span: None,
2293        });
2294        assert!(matches!(
2295            src.next_event().unwrap(),
2296            Some(ParseEvent::Text { ref text, .. }) if text == "x"
2297        ));
2298        assert!(src.next_event().unwrap().is_none());
2299    }
2300
2301    #[test]
2302    fn pop_open_element_truncates_stacks_at_last_match() {
2303        let mut src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "");
2304        src.open_elements = vec![
2305            "a".to_string(),
2306            "b".to_string(),
2307            "B".to_string(),
2308            "c".to_string(),
2309        ];
2310        src.open_namespaces = vec![
2311            HtmlNamespace::Html,
2312            HtmlNamespace::Html,
2313            HtmlNamespace::Svg,
2314            HtmlNamespace::Math,
2315        ];
2316        src.pop_open_element("b");
2317        assert_eq!(src.open_elements, vec!["a".to_string(), "b".to_string()]);
2318        assert_eq!(
2319            src.open_namespaces,
2320            vec![HtmlNamespace::Html, HtmlNamespace::Html]
2321        );
2322
2323        let mut src = SimpleHtmlEventSource::from_str("t", InputFormat::Xhtml, "");
2324        src.open_elements = vec![
2325            "a".to_string(),
2326            "b".to_string(),
2327            "B".to_string(),
2328            "c".to_string(),
2329        ];
2330        src.open_namespaces = vec![
2331            HtmlNamespace::Html,
2332            HtmlNamespace::Html,
2333            HtmlNamespace::Svg,
2334            HtmlNamespace::Math,
2335        ];
2336        src.pop_open_element("b");
2337        assert_eq!(src.open_elements, vec!["a".to_string()]);
2338        assert_eq!(src.open_namespaces, vec![HtmlNamespace::Html]);
2339    }
2340
2341    #[test]
2342    fn pop_open_element_is_noop_when_name_is_missing() {
2343        let mut src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "");
2344        src.open_elements = vec!["a".to_string(), "b".to_string()];
2345        src.open_namespaces = vec![HtmlNamespace::Html, HtmlNamespace::Svg];
2346        src.pop_open_element("missing");
2347        assert_eq!(src.open_elements, vec!["a".to_string(), "b".to_string()]);
2348        assert_eq!(
2349            src.open_namespaces,
2350            vec![HtmlNamespace::Html, HtmlNamespace::Svg]
2351        );
2352    }
2353
2354    fn as_start_tag(ev: &ParseEvent) -> Option<(&str, &[html_inspector::Attribute])> {
2355        match ev {
2356            ParseEvent::StartTag { name, attrs, .. } => Some((name, attrs.as_slice())),
2357            _ => None,
2358        }
2359    }
2360
2361    #[test]
2362    fn str_from_bytes_lossy_borrows_valid_utf8() {
2363        let s = str_from_bytes_lossy(b"hello");
2364        assert!(matches!(s, Cow::Borrowed(_)));
2365        assert_eq!(s.as_ref(), "hello");
2366    }
2367
2368    #[test]
2369    fn str_from_bytes_lossy_allocates_on_invalid_utf8() {
2370        let s = str_from_bytes_lossy(&[0xff, b'a']);
2371        assert!(matches!(s, Cow::Owned(_)));
2372        assert_eq!(s.as_ref(), "�a");
2373    }
2374
2375    #[test]
2376    fn doctype_name_normalizes_even_when_decoding_allocates() {
2377        let src = SimpleHtmlEventSource::from_bytes(
2378            "t",
2379            InputFormat::Html,
2380            vec![
2381                b'<', b'!', b'D', b'O', b'C', b'T', b'Y', b'P', b'E', b' ', 0xff, b'A', b'>',
2382            ],
2383        );
2384        let evs = collect(src);
2385        let name = evs.iter().find_map(|e| match e {
2386            ParseEvent::Doctype {
2387                name: Some(name), ..
2388            } => Some(name.as_str()),
2389            _ => None,
2390        });
2391        assert_eq!(name, Some("�a"));
2392    }
2393
2394    #[test]
2395    fn treats_lt_not_followed_by_tag_name_as_text() {
2396        let src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<$");
2397        let evs = collect(src);
2398        assert!(
2399            matches!(evs[0], ParseEvent::ParseError { ref code, .. } if code == "html.tokenizer.bad_char_after_lt")
2400        );
2401        assert!(matches!(evs[1], ParseEvent::Text { ref text, .. } if text == "<$"));
2402    }
2403
2404    #[test]
2405    fn lt_at_eof_emits_tokenizer_eof_after_lt_error() {
2406        let src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<");
2407        let evs = collect(src);
2408        assert_eq!(evs.len(), 1);
2409        let ParseEvent::ParseError {
2410            code,
2411            message,
2412            span,
2413        } = &evs[0]
2414        else {
2415            panic!("expected a parse error event");
2416        };
2417        assert_eq!(code, "html.tokenizer.eof_after_lt");
2418        assert_eq!(message, "End of file after “<”.");
2419        assert_eq!(span.unwrap(), Span::new(0, 1, 1, 1));
2420    }
2421
2422    #[test]
2423    fn parses_tag_end_ignoring_gt_inside_quotes() {
2424        let src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<a title=\">\">x</a>");
2425        let evs = collect(src);
2426        let (name, attrs) = as_start_tag(&evs[0]).unwrap();
2427        assert_eq!(name, "a");
2428        assert_eq!(attrs.len(), 1);
2429        assert_eq!(attrs[0].name, "title");
2430        assert_eq!(attrs[0].value.as_deref(), Some(">"));
2431    }
2432
2433    #[test]
2434    fn lt_followed_by_whitespace_is_literal_text_without_error() {
2435        let src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "< ");
2436        let evs = collect(src);
2437        assert!(matches!(evs[0], ParseEvent::Text { ref text, .. } if text == "< "));
2438    }
2439
2440    #[test]
2441    fn xhtml_processing_instruction_skips_leading_whitespace_in_target() {
2442        let src = SimpleHtmlEventSource::from_str(
2443            "t",
2444            InputFormat::Xhtml,
2445            "<?   xml-stylesheet href=\"a\"?>",
2446        );
2447        let evs = collect(src);
2448        assert!(evs.iter().any(|e| matches!(
2449            e,
2450            ParseEvent::ProcessingInstruction { target, data, .. }
2451                if target == "xml-stylesheet" && data == "href=\"a\""
2452        )));
2453    }
2454
2455    #[test]
2456    fn doctype_public_and_system_missing_ids_emit_expected_errors() {
2457        let src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<!DOCTYPE html PUBLIC>");
2458        let evs = collect(src);
2459        assert!(evs.iter().any(|e| matches!(
2460                e,
2461                ParseEvent::ParseError { code, .. } if code == "html.tokenizer.doctype.expected_public_id"
2462            )));
2463
2464        let src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<!DOCTYPE html SYSTEM>");
2465        let evs = collect(src);
2466        assert!(evs.iter().any(|e| matches!(
2467                e,
2468                ParseEvent::ParseError { code, .. } if code == "html.tokenizer.doctype.expected_system_id"
2469            )));
2470    }
2471
2472    #[test]
2473    fn doctype_allows_whitespace_after_bang_and_newlines_affect_span_tracking() {
2474        let src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<! DOCTYPE html>");
2475        let evs = collect(src);
2476        assert!(evs.iter().any(|e| matches!(e, ParseEvent::Doctype { .. })));
2477
2478        let src = SimpleHtmlEventSource::from_str(
2479            "t",
2480            InputFormat::Html,
2481            "<!DOCTYPE html PUBLIC \"a\\n>\" \"sys\">",
2482        );
2483        let evs = collect(src);
2484        assert!(evs.iter().any(|e| matches!(
2485            e,
2486            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.doctype.gt_in_public_id"
2487        )));
2488    }
2489
2490    #[test]
2491    fn parse_start_tag_allows_leading_whitespace_in_inside_buffer() {
2492        let src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "");
2493        let (name, _attrs, _sc, _errs) = parse_start_tag(&src, "   a", 0, 1, 1, 0).unwrap();
2494        assert_eq!(name, "a");
2495    }
2496
2497    #[test]
2498    fn parse_start_tag_errors_use_the_full_tag_span() {
2499        let src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<a x==y ></a>");
2500        let evs = collect(src);
2501        let span = evs.iter().find_map(|e| match e {
2502            ParseEvent::ParseError {
2503                code,
2504                span: Some(span),
2505                ..
2506            } if code == "html.tokenizer.equals_at_start_unquoted" => Some(*span),
2507            _ => None,
2508        });
2509        let span = span.expect("expected equals_at_start_unquoted error");
2510        assert_eq!(span.byte_start, 0);
2511        assert_eq!(span.byte_end, 9);
2512        assert_eq!(span.line, 1);
2513        assert_eq!(span.col, 1);
2514    }
2515
2516    #[test]
2517    fn multiple_spaces_after_attr_name_are_accepted() {
2518        let src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<a x  =\"y\"></a>");
2519        let evs = collect(src);
2520        let (_name, attrs) = as_start_tag(&evs[0]).unwrap();
2521        let x = attrs.iter().find(|a| a.name == "x").unwrap();
2522        assert_eq!(x.value.as_deref(), Some("y"));
2523    }
2524
2525    #[test]
2526    fn rawtext_end_tag_search_can_exhaust_input() {
2527        assert_eq!(
2528            find_rawtext_end_tag(b"<<<<", 0, "script", InputFormat::Html),
2529            None
2530        );
2531    }
2532
2533    #[test]
2534    fn quoted_attributes_without_space_emit_no_space_between_attrs_error() {
2535        let src =
2536            SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<a title=\"x\"id=\"y\"></a>");
2537        let evs = collect(src);
2538        assert!(evs.iter().any(|e| matches!(
2539            e,
2540            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.no_space_between_attrs"
2541        )));
2542        let (name, attrs) = evs.iter().find_map(as_start_tag).unwrap();
2543        assert_eq!(name, "a");
2544        assert!(attrs.iter().any(|a| a.name == "title"));
2545        assert!(attrs.iter().any(|a| a.name == "id"));
2546    }
2547
2548    #[test]
2549    fn classify_start_tag_eof_covers_mixed_quote_tracking() {
2550        let (code, _msg) = classify_start_tag_eof(b" a='x\"");
2551        assert_eq!(code, "html.tokenizer.eof_in_attr_value");
2552    }
2553
2554    #[test]
2555    fn classify_start_tag_eof_tracks_double_quotes() {
2556        let (code, _msg) = classify_start_tag_eof(br#" a="b"#);
2557        assert_eq!(code, "html.tokenizer.eof_in_attr_value");
2558
2559        let (code, _msg) = classify_start_tag_eof(br#" a="b" c"#);
2560        assert_eq!(code, "html.tokenizer.eof_in_attr_name");
2561    }
2562
2563    #[test]
2564    fn start_tag_helper_returns_none_for_non_start_tag() {
2565        assert!(
2566            as_start_tag(&ParseEvent::Text {
2567                text: "x".to_string(),
2568                span: None,
2569            })
2570            .is_none()
2571        );
2572    }
2573
2574    #[test]
2575    fn rawtext_script_does_not_tokenize_lt() {
2576        let src = SimpleHtmlEventSource::from_str(
2577            "t",
2578            InputFormat::Html,
2579            "<script>if (a < b) {}</script>",
2580        );
2581        let evs = collect(src);
2582        assert!(matches!(evs[0], ParseEvent::StartTag { ref name, .. } if name == "script"));
2583        assert!(matches!(evs[1], ParseEvent::Text { ref text, .. } if text == "if (a < b) {}"));
2584        assert!(matches!(evs[2], ParseEvent::EndTag { ref name, .. } if name == "script"));
2585    }
2586
2587    #[test]
2588    fn rcdata_textarea_does_not_tokenize_lt_but_decodes_entities() {
2589        let src = SimpleHtmlEventSource::from_str(
2590            "t",
2591            InputFormat::Html,
2592            "<textarea>1 < 2 &lt; 3</textarea>",
2593        );
2594        let evs = collect(src);
2595        assert!(matches!(evs[0], ParseEvent::StartTag { ref name, .. } if name == "textarea"));
2596        assert!(matches!(evs[1], ParseEvent::Text { ref text, .. } if text == "1 < 2 < 3"));
2597        assert!(matches!(evs[2], ParseEvent::EndTag { ref name, .. } if name == "textarea"));
2598    }
2599
2600    #[test]
2601    fn rcdata_title_decodes_entities() {
2602        let src =
2603            SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<title>1 &lt; 2</title>");
2604        let evs = collect(src);
2605        assert!(matches!(evs[0], ParseEvent::StartTag { ref name, .. } if name == "title"));
2606        assert!(matches!(evs[1], ParseEvent::Text { ref text, .. } if text == "1 < 2"));
2607        assert!(matches!(evs[2], ParseEvent::EndTag { ref name, .. } if name == "title"));
2608    }
2609
2610    #[test]
2611    fn plaintext_consumes_rest_of_document_as_text() {
2612        let src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<plaintext><b>hi</b>");
2613        let evs = collect(src);
2614        assert!(matches!(evs[0], ParseEvent::StartTag { ref name, .. } if name == "plaintext"));
2615        assert!(matches!(evs[1], ParseEvent::Text { ref text, .. } if text == "<b>hi</b>"));
2616        assert_eq!(evs.len(), 2);
2617    }
2618
2619    #[test]
2620    fn xhtml_cdata_emits_text() {
2621        let src = SimpleHtmlEventSource::from_str("t", InputFormat::Xhtml, "<![CDATA[<tag>]]>");
2622        let evs = collect(src);
2623        assert_eq!(
2624            evs,
2625            vec![ParseEvent::Text {
2626                text: "<tag>".to_string(),
2627                span: Some(Span::new(0, 17, 1, 1)),
2628            }]
2629        );
2630    }
2631
2632    #[test]
2633    fn html_cdata_outside_foreign_content_is_bogus_comment() {
2634        let src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<![CDATA[<tag>]]>");
2635        let evs = collect(src);
2636        assert!(evs.iter().any(|e| matches!(
2637            e,
2638            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.bogus_comment"
2639        )));
2640        assert!(evs.iter().any(|e| matches!(e, ParseEvent::Comment { .. })));
2641    }
2642
2643    #[test]
2644    fn html_cdata_inside_svg_emits_text_without_bogus_comment_error() {
2645        let src = SimpleHtmlEventSource::from_str(
2646            "t",
2647            InputFormat::Html,
2648            "<!--<!-- --><svg><script><![CDATA[if (a < b) {}]]></script></svg>",
2649        );
2650        let evs = collect(src);
2651        assert!(!evs.iter().any(|e| matches!(
2652            e,
2653            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.bogus_comment"
2654        )));
2655        assert!(evs.iter().any(|e| matches!(
2656            e,
2657            ParseEvent::Text { text, .. } if text.contains("if (a < b) {}")
2658        )));
2659    }
2660
2661    #[test]
2662    fn doctype_parses_public_and_system_ids() {
2663        let src = SimpleHtmlEventSource::from_str(
2664            "t",
2665            InputFormat::Html,
2666            "<!DOCTYPE html PUBLIC \"pub\" 'sys'><html></html>",
2667        );
2668        let evs = collect(src);
2669        let (name, public_id, system_id) = evs
2670            .iter()
2671            .find_map(|e| match e {
2672                ParseEvent::Doctype {
2673                    name,
2674                    public_id,
2675                    system_id,
2676                    ..
2677                } => Some((name.clone(), public_id.clone(), system_id.clone())),
2678                _ => None,
2679            })
2680            .expect("expected a doctype event");
2681        assert_eq!(name.as_deref(), Some("html"));
2682        assert_eq!(public_id.as_deref(), Some("pub"));
2683        assert_eq!(system_id.as_deref(), Some("sys"));
2684    }
2685
2686    #[test]
2687    fn decodes_basic_entities_in_text_and_attributes() {
2688        let src = SimpleHtmlEventSource::from_str(
2689            "t",
2690            InputFormat::Html,
2691            "<p title=\"a &lt; b\">Tom &amp; Jerry</p>",
2692        );
2693        let evs = collect(src);
2694        let (_name, attrs) = as_start_tag(&evs[0]).unwrap();
2695        let title = attrs.iter().find(|a| a.name == "title").unwrap();
2696        assert_eq!(title.value.as_deref(), Some("a < b"));
2697        assert!(matches!(evs[1], ParseEvent::Text { ref text, .. } if text == "Tom & Jerry"));
2698    }
2699
2700    #[test]
2701    fn named_char_ref_without_semicolon_emits_error_and_decodes_in_text() {
2702        let src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<p>&copy</p>");
2703        let evs = collect(src);
2704        assert!(evs.iter().any(|e| matches!(
2705            e,
2706            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.named_charref_no_semicolon"
2707        )));
2708        assert!(evs.iter().any(|e| matches!(
2709            e,
2710            ParseEvent::Text { text, .. } if text == "©"
2711        )));
2712    }
2713
2714    #[test]
2715    fn named_char_ref_without_semicolon_span_matches_ampersand_location_in_text() {
2716        let html = "<p>a &copy=1</p>";
2717        let src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, html);
2718        let evs = collect(src);
2719        let span = evs
2720            .iter()
2721            .find_map(|e| match e {
2722                ParseEvent::ParseError {
2723                    code,
2724                    span: Some(span),
2725                    ..
2726                } if code == "html.tokenizer.named_charref_no_semicolon" => Some(*span),
2727                _ => None,
2728            })
2729            .expect("expected named_charref_no_semicolon parse error");
2730
2731        let amp = html.find('&').expect("expected '&' in HTML");
2732        assert_eq!(span.byte_start, amp);
2733        assert_eq!(span.byte_end, amp + 1);
2734        assert_eq!(span.line, 1);
2735        assert_eq!(span.col, (amp + 1) as u32);
2736    }
2737
2738    #[test]
2739    fn named_char_ref_without_semicolon_span_matches_ampersand_location_in_attribute() {
2740        let html = "<p title=\"&copy.\">x</p>";
2741        let src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, html);
2742        let evs = collect(src);
2743        let span = evs
2744            .iter()
2745            .find_map(|e| match e {
2746                ParseEvent::ParseError {
2747                    code,
2748                    span: Some(span),
2749                    ..
2750                } if code == "html.tokenizer.named_charref_no_semicolon" => Some(*span),
2751                _ => None,
2752            })
2753            .expect("expected named_charref_no_semicolon parse error");
2754
2755        let amp = html.find('&').expect("expected '&' in HTML");
2756        assert_eq!(span.byte_start, amp);
2757        assert_eq!(span.byte_end, amp + 1);
2758        assert_eq!(span.line, 1);
2759        assert_eq!(span.col, (amp + 1) as u32);
2760    }
2761
2762    #[test]
2763    fn named_char_ref_without_semicolon_not_decoded_in_attribute_when_followed_by_equals() {
2764        let src = SimpleHtmlEventSource::from_str(
2765            "t",
2766            InputFormat::Html,
2767            "<!--<!-- --><a title=\"&copy=1\"></a>",
2768        );
2769        let evs = collect(src);
2770        assert!(!evs.iter().any(|e| matches!(
2771            e,
2772            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.named_charref_no_semicolon"
2773        )));
2774        let (name, attrs) = evs.iter().find_map(as_start_tag).unwrap();
2775        assert_eq!(name, "a");
2776        let title = attrs.iter().find(|a| a.name == "title").unwrap();
2777        assert_eq!(title.value.as_deref(), Some("&copy=1"));
2778    }
2779
2780    #[test]
2781    fn numeric_char_ref_without_semicolon_emits_error_and_decodes() {
2782        let src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<p>&#65</p>");
2783        let evs = collect(src);
2784        assert!(evs.iter().any(|e| matches!(
2785            e,
2786            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.charref_no_semicolon"
2787        )));
2788        assert!(evs.iter().any(|e| matches!(
2789            e,
2790            ParseEvent::Text { text, .. } if text == "A"
2791        )));
2792    }
2793
2794    #[test]
2795    fn numeric_char_ref_zero_emits_error_and_replacement_char() {
2796        let src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<p>&#0;</p>");
2797        let evs = collect(src);
2798        assert!(evs.iter().any(|e| matches!(
2799            e,
2800            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.charref_zero"
2801        )));
2802        assert!(evs.iter().any(|e| matches!(
2803            e,
2804            ParseEvent::Text { text, .. } if text == "\u{FFFD}"
2805        )));
2806    }
2807
2808    #[test]
2809    fn forbidden_code_point_in_text_emits_error() {
2810        let src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<p>\u{000B}</p>");
2811        let evs = collect(src);
2812        assert!(evs.iter().any(|e| matches!(
2813            e,
2814            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.forbidden_code_point"
2815        )));
2816    }
2817
2818    #[test]
2819    fn astral_noncharacter_in_text_emits_error() {
2820        let src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<p>\u{10FFFE}</p>");
2821        let evs = collect(src);
2822        assert!(evs.iter().any(|e| matches!(
2823            e,
2824            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.astral_noncharacter"
2825        )));
2826    }
2827
2828    #[test]
2829    fn nested_comment_emits_parse_error() {
2830        let html = "<!-- a <!-- b -->";
2831        let src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, html);
2832        let evs = collect(src);
2833        let span = evs
2834            .iter()
2835            .find_map(|e| match e {
2836                ParseEvent::ParseError {
2837                    code,
2838                    span: Some(span),
2839                    ..
2840                } if code == "html.tokenizer.nested_comment" => Some(*span),
2841                _ => None,
2842            })
2843            .expect("expected nested_comment parse error");
2844        let nested = html.rfind("<!--").expect("expected nested '<!--' in HTML");
2845        assert_eq!(span.byte_start, nested);
2846        assert_eq!(span.byte_end, nested + "<!--".len());
2847        assert_eq!(span.line, 1);
2848        assert_eq!(span.col, (nested + 1) as u32);
2849        assert!(
2850            evs.iter()
2851                .any(|e| matches!(e, ParseEvent::Comment { text, .. } if text.contains("a")))
2852        );
2853    }
2854
2855    #[test]
2856    fn html_processing_instruction_emits_error_and_comment() {
2857        let src = SimpleHtmlEventSource::from_str(
2858            "t",
2859            InputFormat::Html,
2860            "<?xml version=\"1.0\"?><p>x</p>",
2861        );
2862        let evs = collect(src);
2863        assert!(evs.iter().any(|e| matches!(
2864            e,
2865            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.processing_instruction"
2866        )));
2867        assert!(evs.iter().any(|e| matches!(
2868            e,
2869            ParseEvent::Comment { text, .. } if text.contains("xml version")
2870        )));
2871    }
2872
2873    #[test]
2874    fn xhtml_processing_instruction_is_emitted_as_processing_instruction_event() {
2875        let src = SimpleHtmlEventSource::from_str(
2876            "t",
2877            InputFormat::Xhtml,
2878            "<?xml-stylesheet href=\"a.css\" type=\"text/css\"?><root/>",
2879        );
2880        let evs = collect(src);
2881        assert!(matches!(
2882            evs[0],
2883            ParseEvent::ProcessingInstruction { ref target, ref data, .. }
2884            if target == "xml-stylesheet" && data.contains("href=\"a.css\"")
2885        ));
2886    }
2887
2888    #[test]
2889    fn doctype_missing_space_before_name_emits_error() {
2890        let src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<!DOCTYPEhtml><p>x</p>");
2891        let evs = collect(src);
2892        assert!(evs.iter().any(|e| matches!(
2893            e,
2894            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.doctype.missing_space_before_name"
2895        )));
2896    }
2897
2898    #[test]
2899    fn html_event_source_wrapper_uses_simple_backend_for_xhtml() {
2900        let mut src = HtmlEventSource::from_str("t", InputFormat::Xhtml, "<root/>").unwrap();
2901        assert_eq!(src.source_name(), "t");
2902        assert_eq!(src.format(), InputFormat::Xhtml);
2903        let mut evs = Vec::new();
2904        while let Some(ev) = src.next_event().unwrap() {
2905            evs.push(ev);
2906        }
2907        assert!(
2908            evs.iter()
2909                .any(|e| matches!(e, ParseEvent::StartTag { name, .. } if name == "root"))
2910        );
2911    }
2912
2913    #[test]
2914    fn foreignobject_in_svg_switches_insertion_namespace_to_html() {
2915        let src = SimpleHtmlEventSource::from_str(
2916            "t",
2917            InputFormat::Html,
2918            "<svg><foreignObject><p>hi</p></foreignObject></svg>",
2919        );
2920        let evs = collect(src);
2921        assert!(evs.iter().any(|e| matches!(e, ParseEvent::StartTag { name, .. } if name.eq_ignore_ascii_case("foreignobject"))));
2922        assert!(
2923            evs.iter()
2924                .any(|e| matches!(e, ParseEvent::StartTag { name, .. } if name == "p"))
2925        );
2926    }
2927
2928    #[test]
2929    fn unterminated_cdata_emits_xml_cdata_eof_error() {
2930        let src = SimpleHtmlEventSource::from_str("t", InputFormat::Xhtml, "<![CDATA[unterminated");
2931        let evs = collect(src);
2932        assert!(evs.iter().any(|e| matches!(
2933            e,
2934            ParseEvent::ParseError { code, .. } if code == "xml.cdata_eof"
2935        )));
2936    }
2937
2938    #[test]
2939    fn bogus_comment_without_gt_emits_comment_then_finishes() {
2940        let src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<?xml");
2941        let evs = collect(src);
2942        assert!(evs.iter().any(|e| matches!(
2943            e,
2944            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.processing_instruction"
2945        )));
2946        assert!(
2947            evs.iter()
2948                .any(|e| matches!(e, ParseEvent::Comment { text, .. } if text.contains("xml")))
2949        );
2950    }
2951
2952    #[test]
2953    fn unterminated_processing_instruction_emits_xml_pi_eof_error() {
2954        let src =
2955            SimpleHtmlEventSource::from_str("t", InputFormat::Xhtml, "<?xml-stylesheet href=\"a\"");
2956        let evs = collect(src);
2957        assert!(evs.iter().any(|e| matches!(
2958            e,
2959            ParseEvent::ParseError { code, .. } if code == "xml.pi_eof"
2960        )));
2961    }
2962
2963    #[test]
2964    fn garbage_after_lt_slash_at_eof_emits_error_and_finishes() {
2965        let src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "</ ");
2966        let evs = collect(src);
2967        assert!(evs.iter().any(|e| matches!(
2968            e,
2969            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.garbage_after_lt_slash"
2970        )));
2971        assert_eq!(evs.len(), 1);
2972    }
2973
2974    #[test]
2975    fn doctype_end_finder_ignores_gt_inside_quoted_identifiers() {
2976        let src = SimpleHtmlEventSource::from_str(
2977            "t",
2978            InputFormat::Html,
2979            "<!DOCTYPE html SYSTEM \"a> b\"><html></html>",
2980        );
2981        let evs = collect(src);
2982
2983        assert!(evs.iter().any(|e| matches!(
2984            e,
2985            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.doctype.gt_in_system_id"
2986        )));
2987
2988        let system_id = evs
2989            .iter()
2990            .find_map(|e| match e {
2991                ParseEvent::Doctype { system_id, .. } => system_id.as_deref(),
2992                _ => None,
2993            })
2994            .expect("expected doctype event");
2995        assert_eq!(system_id, "a> b");
2996
2997        assert!(
2998            evs.iter()
2999                .any(|e| matches!(e, ParseEvent::StartTag { name, .. } if name == "html"))
3000        );
3001    }
3002
3003    #[test]
3004    fn doctype_system_identifier_gt_and_unclosed_quote_emit_errors() {
3005        let src = SimpleHtmlEventSource::from_str(
3006            "t",
3007            InputFormat::Html,
3008            "<!DOCTYPE html SYSTEM \"a> b><p>x</p>",
3009        );
3010        let evs = collect(src);
3011        assert!(evs.iter().any(|e| matches!(
3012            e,
3013            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.doctype.gt_in_system_id"
3014        )));
3015        assert!(evs.iter().any(|e| matches!(
3016            e,
3017            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.doctype.eof_in_system_id"
3018        )));
3019    }
3020
3021    #[test]
3022    fn doctype_public_identifier_gt_and_unclosed_quote_emit_errors() {
3023        let src = SimpleHtmlEventSource::from_str(
3024            "t",
3025            InputFormat::Html,
3026            "<!DOCTYPE html PUBLIC \"pub\" \"a> b",
3027        );
3028        let evs = collect(src);
3029        assert!(evs.iter().any(|e| matches!(
3030            e,
3031            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.doctype.gt_in_system_id"
3032        )));
3033        assert!(evs.iter().any(|e| matches!(
3034            e,
3035            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.doctype.eof_in_system_id"
3036        )));
3037    }
3038
3039    #[test]
3040    fn simple_event_source_exposes_source_name_and_format() {
3041        let src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<p>x</p>");
3042        assert_eq!(src.source_name(), "t");
3043        assert_eq!(src.format(), InputFormat::Html);
3044    }
3045
3046    #[test]
3047    fn scan_next_is_noop_when_finished_is_true() {
3048        let mut src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<p>x</p>");
3049        while src.next_event().unwrap().is_some() {}
3050        assert!(src.finished);
3051        src.scan_next().unwrap();
3052    }
3053
3054    #[test]
3055    fn cdata_in_html_inside_svg_is_parsed_as_text() {
3056        let src =
3057            SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<svg><![CDATA[<tag>]]></svg>");
3058        let evs = collect(src);
3059        assert!(
3060            evs.iter()
3061                .any(|e| matches!(e, ParseEvent::Text { text, .. } if text == "<tag>"))
3062        );
3063    }
3064
3065    #[test]
3066    fn end_tag_garbage_and_non_tag_end_sequences_are_coalesced_as_text() {
3067        let src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "</ x> </1> <p>x</p>");
3068        let evs = collect(src);
3069        assert!(evs.iter().any(|e| matches!(
3070            e,
3071            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.garbage_after_lt_slash"
3072        )));
3073        assert!(
3074            evs.iter()
3075                .any(|e| matches!(e, ParseEvent::Text { text, .. } if text.contains("</1>")))
3076        );
3077    }
3078
3079    #[test]
3080    fn malformed_attribute_syntax_triggers_simple_parser_errors() {
3081        let src = SimpleHtmlEventSource::from_str(
3082            "t",
3083            InputFormat::Html,
3084            "<a href=></a><a title=\"x\"href=\"y\"></a><a /x></a><a ></a><a/>",
3085        );
3086        let evs = collect(src);
3087        assert!(evs.iter().any(|e| matches!(
3088            e,
3089            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.attr_value_missing"
3090        )));
3091        assert!(evs.iter().any(|e| matches!(
3092            e,
3093            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.no_space_between_attrs"
3094        )));
3095        assert!(evs.iter().any(|e| matches!(
3096            e,
3097            ParseEvent::StartTag { name, self_closing, .. } if name == "a" && *self_closing
3098        )));
3099    }
3100
3101    #[test]
3102    fn decode_char_refs_covers_edge_cases_directly() {
3103        assert_eq!(
3104            decode_char_refs(InputFormat::Html, "&".to_string(), false),
3105            "&"
3106        );
3107        assert_eq!(
3108            decode_char_refs(InputFormat::Html, "&#;".to_string(), false),
3109            "&#;"
3110        );
3111        assert_eq!(
3112            decode_char_refs(InputFormat::Html, "&bogus;".to_string(), false),
3113            "&bogus;"
3114        );
3115        assert_eq!(
3116            decode_char_refs(InputFormat::Html, "&#65;".to_string(), false),
3117            "A"
3118        );
3119        assert_eq!(
3120            decode_char_refs(InputFormat::Html, "&#x41;".to_string(), false),
3121            "A"
3122        );
3123        assert_eq!(
3124            decode_char_refs(InputFormat::Html, "&#X41;".to_string(), false),
3125            "A"
3126        );
3127        assert_eq!(
3128            decode_char_refs(InputFormat::Html, "&#x;".to_string(), false),
3129            "&#x;"
3130        );
3131        assert_eq!(
3132            decode_char_refs(InputFormat::Html, "&#x110000;".to_string(), false),
3133            "\u{FFFD}"
3134        );
3135
3136        // Attribute-specific behavior: do not expand missing-semicolon named refs when followed by
3137        // an alnum or '='.
3138        assert_eq!(
3139            decode_char_refs(InputFormat::Html, "&copy=1".to_string(), true),
3140            "&copy=1"
3141        );
3142        let (s, errs) = decode_char_refs_with_errors(InputFormat::Xhtml, "&copy", true, 0, 1, 1);
3143        assert_eq!(s, "&copy");
3144        assert!(errs.is_empty());
3145
3146        let (s2, errs2) = decode_char_refs_with_errors(InputFormat::Html, "&copy=1", true, 0, 1, 1);
3147        assert_eq!(s2, "&copy=1");
3148        assert!(errs2.is_empty());
3149    }
3150
3151    #[test]
3152    fn decode_char_refs_returns_input_string_when_no_refs_present() {
3153        let raw = "plain text".to_string();
3154        let ptr = raw.as_ptr();
3155        let cap = raw.capacity();
3156        let out = decode_char_refs(InputFormat::Html, raw, false);
3157        assert_eq!(out, "plain text");
3158        assert_eq!(out.as_ptr(), ptr);
3159        assert_eq!(out.capacity(), cap);
3160    }
3161
3162    #[test]
3163    fn normalize_name_lowercases_in_html_but_not_in_xhtml() {
3164        let src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "");
3165        assert_eq!(src.normalize_name("div"), "div");
3166        assert_eq!(src.normalize_name("DiV"), "div");
3167        assert_eq!(src.normalize_name("Ü"), "Ü");
3168        assert_eq!(src.normalize_name("ÜA"), "Üa");
3169
3170        let src = SimpleHtmlEventSource::from_str("t", InputFormat::Xhtml, "");
3171        assert_eq!(src.normalize_name("DiV"), "DiV");
3172        assert_eq!(src.normalize_name("ÜA"), "ÜA");
3173    }
3174
3175    #[test]
3176    fn current_text_mode_kind_applies_only_in_html_namespace() {
3177        let mut src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "");
3178        assert!(matches!(src.current_text_mode_kind(), TextModeKind::Data));
3179        src.open_elements.push("script".to_string());
3180        src.open_namespaces.push(HtmlNamespace::Html);
3181        assert!(matches!(
3182            src.current_text_mode_kind(),
3183            TextModeKind::RawText
3184        ));
3185
3186        src.open_elements.pop();
3187        src.open_namespaces.pop();
3188        src.open_elements.push("title".to_string());
3189        src.open_namespaces.push(HtmlNamespace::Html);
3190        assert!(matches!(src.current_text_mode_kind(), TextModeKind::RcData));
3191
3192        src.open_elements.pop();
3193        src.open_namespaces.pop();
3194        src.open_elements.push("plaintext".to_string());
3195        src.open_namespaces.push(HtmlNamespace::Html);
3196        assert!(matches!(
3197            src.current_text_mode_kind(),
3198            TextModeKind::Plaintext
3199        ));
3200
3201        src.open_elements.pop();
3202        src.open_namespaces.pop();
3203        src.open_elements.push("script".to_string());
3204        src.open_namespaces.push(HtmlNamespace::Svg);
3205        assert!(matches!(src.current_text_mode_kind(), TextModeKind::Data));
3206    }
3207
3208    #[test]
3209    fn internal_stack_mismatches_fall_back_to_defaults() {
3210        let mut src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "");
3211
3212        src.open_elements.push("script".to_string());
3213        assert!(matches!(src.current_text_mode_kind(), TextModeKind::Data));
3214        assert_eq!(src.current_insertion_namespace(), HtmlNamespace::Html);
3215
3216        src.open_elements.clear();
3217        src.open_namespaces.push(HtmlNamespace::Html);
3218        assert!(matches!(src.current_text_mode_kind(), TextModeKind::Data));
3219        assert_eq!(src.current_insertion_namespace(), HtmlNamespace::Html);
3220
3221        src.open_namespaces.clear();
3222        src.open_namespaces.push(HtmlNamespace::Svg);
3223        assert!(matches!(src.current_text_mode_kind(), TextModeKind::Data));
3224        assert_eq!(src.current_insertion_namespace(), HtmlNamespace::Svg);
3225    }
3226
3227    #[test]
3228    fn namespace_for_start_tag_respects_current_insertion_namespace() {
3229        let mut src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "");
3230
3231        // Default: HTML insertion namespace.
3232        assert_eq!(src.namespace_for_start_tag("div"), HtmlNamespace::Html);
3233        assert_eq!(src.namespace_for_start_tag("svg"), HtmlNamespace::Svg);
3234        assert_eq!(src.namespace_for_start_tag("math"), HtmlNamespace::Math);
3235
3236        // Inside SVG insertion mode, elements remain in SVG namespace.
3237        src.open_elements.push("svg".to_string());
3238        src.open_namespaces.push(HtmlNamespace::Svg);
3239        assert_eq!(src.current_insertion_namespace(), HtmlNamespace::Svg);
3240        assert_eq!(src.namespace_for_start_tag("div"), HtmlNamespace::Svg);
3241        assert_eq!(src.namespace_for_start_tag("math"), HtmlNamespace::Svg);
3242
3243        // Integration points inside SVG switch the insertion namespace back to HTML.
3244        for tag in ["foreignobject", "desc", "title"] {
3245            src.open_elements.pop();
3246            src.open_namespaces.pop();
3247            src.open_elements.push(tag.to_string());
3248            src.open_namespaces.push(HtmlNamespace::Svg);
3249            assert_eq!(src.current_insertion_namespace(), HtmlNamespace::Html);
3250            assert_eq!(src.namespace_for_start_tag("div"), HtmlNamespace::Html);
3251            assert_eq!(src.namespace_for_start_tag("svg"), HtmlNamespace::Svg);
3252        }
3253
3254        // Integration point tag names only matter when currently inserting into SVG.
3255        src.open_elements.pop();
3256        src.open_namespaces.pop();
3257        src.open_elements.push("foreignobject".to_string());
3258        src.open_namespaces.push(HtmlNamespace::Html);
3259        assert_eq!(src.current_insertion_namespace(), HtmlNamespace::Html);
3260
3261        // Math insertion namespace persists.
3262        src.open_elements.pop();
3263        src.open_namespaces.pop();
3264        src.open_elements.push("math".to_string());
3265        src.open_namespaces.push(HtmlNamespace::Math);
3266        assert_eq!(src.current_insertion_namespace(), HtmlNamespace::Math);
3267        assert_eq!(src.namespace_for_start_tag("div"), HtmlNamespace::Math);
3268    }
3269
3270    #[test]
3271    fn rawtext_and_eof_helpers_cover_edge_branches() {
3272        assert_eq!(
3273            find_rawtext_end_tag(b"</s", 0, "script", InputFormat::Html),
3274            None
3275        );
3276        assert_eq!(
3277            find_rawtext_end_tag(b"</scriptx>", 0, "script", InputFormat::Html),
3278            None
3279        );
3280        assert_eq!(
3281            find_rawtext_end_tag(b"</script>", 0, "script", InputFormat::Html),
3282            Some(0)
3283        );
3284
3285        let (code, _msg) = classify_start_tag_eof(b"a='b");
3286        assert_eq!(code, "html.tokenizer.eof_in_attr_value");
3287        let (code2, _msg2) = classify_start_tag_eof(b"a='b' c");
3288        assert_eq!(code2, "html.tokenizer.eof_in_attr_name");
3289    }
3290
3291    #[test]
3292    fn rcdata_without_closing_tag_decodes_entities_and_finishes() {
3293        let src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<textarea>1 &lt; 2");
3294        let evs = collect(src);
3295        assert!(
3296            evs.iter()
3297                .any(|e| matches!(e, ParseEvent::Text { text, .. } if text == "1 < 2"))
3298        );
3299    }
3300
3301    #[test]
3302    fn direct_helpers_cover_remaining_xhtml_and_edge_branches() {
3303        assert_eq!(
3304            decode_char_refs(
3305                InputFormat::Xhtml,
3306                "&lt;&gt;&amp;&quot;&apos;".to_string(),
3307                false
3308            ),
3309            "<>&\"'"
3310        );
3311
3312        let (s, errs) = decode_char_refs_with_errors(InputFormat::Html, "&", false, 0, 1, 1);
3313        assert_eq!(s, "&");
3314        assert!(errs.is_empty());
3315
3316        let (s2, errs2) = decode_char_refs_with_errors(InputFormat::Html, "&#x;", false, 0, 1, 1);
3317        assert_eq!(s2, "&#x;");
3318        assert!(errs2.iter().any(|e| matches!(
3319            e,
3320            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.charref_no_digits"
3321        )));
3322
3323        assert!(starts_with_ascii_case_insensitive(b"do", b""));
3324        assert!(starts_with_ascii_case_insensitive(b"DOCTYPE", b"doctype"));
3325        assert!(!starts_with_ascii_case_insensitive(b"do", b"doctype"));
3326
3327        let src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "");
3328        let (_name, _attrs, _self_closing, errs3) =
3329            parse_start_tag(&src, "a href=>", 0, 1, 1, 8).unwrap();
3330        assert!(errs3.iter().any(|e| matches!(
3331            e,
3332            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.attr_value_missing"
3333        )));
3334    }
3335
3336    #[test]
3337    fn text_run_decode_errors_are_emitted_before_text_event() {
3338        let evs = collect(SimpleHtmlEventSource::from_str(
3339            "t",
3340            InputFormat::Html,
3341            "&#x;",
3342        ));
3343        let err_i = evs
3344            .iter()
3345            .position(|e| matches!(e, ParseEvent::ParseError { code, .. } if code == "html.tokenizer.charref_no_digits"))
3346            .expect("expected parse error event");
3347        let text_i = evs
3348            .iter()
3349            .position(|e| matches!(e, ParseEvent::Text { text, .. } if text == "&#x;"))
3350            .expect("expected text event");
3351        assert!(err_i < text_i);
3352    }
3353
3354    #[test]
3355    fn html_event_source_covers_variant_dispatch() {
3356        let mut html = HtmlEventSource::from_str("t", InputFormat::Html, "<p>x</p>").unwrap();
3357        assert_eq!(html.source_name(), "t");
3358        assert_eq!(html.format(), InputFormat::Html);
3359        #[cfg(feature = "html5ever")]
3360        assert!(matches!(&html, HtmlEventSource::Html5Ever(_)));
3361        assert!(html.next_event().unwrap().is_some());
3362
3363        let mut xhtml =
3364            HtmlEventSource::from_str("t2", InputFormat::Xhtml, "<?xml-stylesheet href=\"a\"?>")
3365                .unwrap();
3366        assert_eq!(xhtml.source_name(), "t2");
3367        assert_eq!(xhtml.format(), InputFormat::Xhtml);
3368        assert!(matches!(&xhtml, HtmlEventSource::Simple(_)));
3369        assert!(matches!(
3370            xhtml.next_event().unwrap(),
3371            Some(ParseEvent::ProcessingInstruction { .. })
3372        ));
3373    }
3374
3375    #[test]
3376    fn newline_advances_line_and_col_for_end_tags() {
3377        let src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<p>\n</p>");
3378        let evs = collect(src);
3379        let span = evs.iter().find_map(|e| match e {
3380            ParseEvent::EndTag { name, span, .. } if name == "p" => *span,
3381            _ => None,
3382        });
3383        let span = span.expect("expected </p> span");
3384        assert_eq!(span.line, 2);
3385        assert_eq!(span.col, 1);
3386    }
3387
3388    #[test]
3389    fn math_namespace_is_tracked_across_nested_tags() {
3390        let src =
3391            SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<math><mi>x</mi></math>");
3392        let evs = collect(src);
3393        assert!(
3394            evs.iter()
3395                .any(|e| matches!(e, ParseEvent::StartTag { name, .. } if name == "math"))
3396        );
3397        assert!(
3398            evs.iter()
3399                .any(|e| matches!(e, ParseEvent::StartTag { name, .. } if name == "mi"))
3400        );
3401        assert!(
3402            evs.iter()
3403                .any(|e| matches!(e, ParseEvent::EndTag { name, .. } if name == "math"))
3404        );
3405    }
3406
3407    #[test]
3408    fn common_bad_sequences_emit_errors_and_text() {
3409        let evs = collect(SimpleHtmlEventSource::from_str(
3410            "t",
3411            InputFormat::Html,
3412            "<></>",
3413        ));
3414        assert!(evs.iter().any(|e| matches!(
3415            e,
3416            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.lt_gt"
3417        )));
3418        assert!(
3419            evs.iter()
3420                .any(|e| matches!(e, ParseEvent::Text { text, .. } if text == "<>"))
3421        );
3422        assert!(evs.iter().any(|e| matches!(
3423            e,
3424            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.lt_slash_gt"
3425        )));
3426        assert!(
3427            evs.iter()
3428                .any(|e| matches!(e, ParseEvent::Text { text, .. } if text == "</>"))
3429        );
3430    }
3431
3432    #[test]
3433    fn comment_parses_text_and_span() {
3434        let evs = collect(SimpleHtmlEventSource::from_str(
3435            "t",
3436            InputFormat::Html,
3437            "<!-- hi -->",
3438        ));
3439        let (text, span) = evs
3440            .iter()
3441            .find_map(|e| match e {
3442                ParseEvent::Comment { text, span } => Some((text.as_str(), *span)),
3443                _ => None,
3444            })
3445            .expect("expected comment event");
3446        assert_eq!(text, " hi ");
3447        assert_eq!(span, Some(Span::new(0, 11, 1, 1)));
3448    }
3449
3450    #[test]
3451    fn comment_eof_emits_expected_parse_error() {
3452        let evs = collect(SimpleHtmlEventSource::from_str(
3453            "t",
3454            InputFormat::Html,
3455            "<!--",
3456        ));
3457        assert!(evs.iter().any(|e| matches!(
3458            e,
3459            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.eof_in_comment"
3460        )));
3461    }
3462
3463    #[test]
3464    fn doctype_emits_additional_vnu_like_syntax_errors() {
3465        let evs = collect(SimpleHtmlEventSource::from_str(
3466            "t",
3467            InputFormat::Html,
3468            "<!DOCTYPEhtml>",
3469        ));
3470        assert!(evs.iter().any(|e| matches!(
3471            e,
3472            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.doctype.missing_space_before_name"
3473        )));
3474
3475        let evs = collect(SimpleHtmlEventSource::from_str(
3476            "t",
3477            InputFormat::Html,
3478            "<!DOCTYPE html PUBLIC\"a\">",
3479        ));
3480        assert!(evs.iter().any(|e| matches!(
3481            e,
3482            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.doctype.no_space_after_public"
3483        )));
3484
3485        let evs = collect(SimpleHtmlEventSource::from_str(
3486            "t",
3487            InputFormat::Html,
3488            "<!DOCTYPE html PUBLIC \"a\"\"b\">",
3489        ));
3490        assert!(evs.iter().any(|e| matches!(
3491            e,
3492            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.doctype.no_space_between_public_system"
3493        )));
3494
3495        let evs = collect(SimpleHtmlEventSource::from_str(
3496            "t",
3497            InputFormat::Html,
3498            "<!DOCTYPE html SYSTEM\"a\">",
3499        ));
3500        assert!(evs.iter().any(|e| matches!(
3501            e,
3502            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.doctype.no_space_after_system"
3503        )));
3504
3505        let evs = collect(SimpleHtmlEventSource::from_str(
3506            "t",
3507            InputFormat::Html,
3508            "<!DOCTYPE html bogus>",
3509        ));
3510        assert!(evs.iter().any(|e| matches!(
3511            e,
3512            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.doctype.bogus"
3513        )));
3514    }
3515
3516    #[test]
3517    fn parse_start_tag_emits_errors_for_malformed_attribute_syntax() {
3518        let evs = collect(SimpleHtmlEventSource::from_str(
3519            "t",
3520            InputFormat::Html,
3521            "<a =></a>",
3522        ));
3523        assert!(evs.iter().any(|e| matches!(
3524            e,
3525            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.equals_expecting_attr_name"
3526        )));
3527
3528        let evs = collect(SimpleHtmlEventSource::from_str(
3529            "t",
3530            InputFormat::Html,
3531            "<a <x=1></a>",
3532        ));
3533        assert!(evs.iter().any(|e| matches!(
3534            e,
3535            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.lt_expecting_attr_name"
3536        )));
3537
3538        let evs = collect(SimpleHtmlEventSource::from_str(
3539            "t",
3540            InputFormat::Html,
3541            "<a x\"y=1></a>",
3542        ));
3543        assert!(evs.iter().any(|e| matches!(
3544            e,
3545            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.quote_in_attr_name"
3546        )));
3547
3548        let evs = collect(SimpleHtmlEventSource::from_str(
3549            "t",
3550            InputFormat::Html,
3551            "<a x<y=1></a>",
3552        ));
3553        assert!(evs.iter().any(|e| matches!(
3554            e,
3555            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.lt_in_attr_name"
3556        )));
3557
3558        let evs = collect(SimpleHtmlEventSource::from_str(
3559            "t",
3560            InputFormat::Html,
3561            "<a id='a' id='b'></a>",
3562        ));
3563        assert!(evs.iter().any(|e| matches!(
3564            e,
3565            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.duplicate_attribute"
3566        )));
3567
3568        let evs = collect(SimpleHtmlEventSource::from_str(
3569            "t",
3570            InputFormat::Html,
3571            "<a ID='a' id='b'></a>",
3572        ));
3573        assert!(evs.iter().any(|e| matches!(
3574            e,
3575            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.duplicate_attribute"
3576        )));
3577    }
3578
3579    #[test]
3580    fn html_normalizes_tag_names_to_ascii_lowercase_but_xhtml_preserves_case() {
3581        let evs = collect(SimpleHtmlEventSource::from_str(
3582            "t",
3583            InputFormat::Html,
3584            "<DIV></DIV>",
3585        ));
3586        assert!(matches!(evs[0], ParseEvent::StartTag { ref name, .. } if name == "div"));
3587        assert!(matches!(evs[1], ParseEvent::EndTag { ref name, .. } if name == "div"));
3588
3589        let evs = collect(SimpleHtmlEventSource::from_str(
3590            "t",
3591            InputFormat::Xhtml,
3592            "<DIV></DIV>",
3593        ));
3594        assert!(matches!(evs[0], ParseEvent::StartTag { ref name, .. } if name == "DIV"));
3595        assert!(matches!(evs[1], ParseEvent::EndTag { ref name, .. } if name == "DIV"));
3596    }
3597
3598    #[test]
3599    fn unquoted_attribute_values_emit_expected_parse_errors() {
3600        let evs = collect(SimpleHtmlEventSource::from_str(
3601            "t",
3602            InputFormat::Html,
3603            "<a x=`y ></a>",
3604        ));
3605        assert!(evs.iter().any(|e| matches!(
3606            e,
3607            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.backtick_at_start_unquoted"
3608        )));
3609
3610        let evs = collect(SimpleHtmlEventSource::from_str(
3611            "t",
3612            InputFormat::Html,
3613            "<a x=y`z ></a>",
3614        ));
3615        assert!(evs.iter().any(|e| matches!(
3616            e,
3617            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.backtick_in_unquoted"
3618        )));
3619
3620        let evs = collect(SimpleHtmlEventSource::from_str(
3621            "t",
3622            InputFormat::Html,
3623            "<a x=<y ></a>",
3624        ));
3625        assert!(evs.iter().any(|e| matches!(
3626            e,
3627            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.lt_at_start_unquoted"
3628        )));
3629
3630        let evs = collect(SimpleHtmlEventSource::from_str(
3631            "t",
3632            InputFormat::Html,
3633            "<a x=y<z ></a>",
3634        ));
3635        assert!(evs.iter().any(|e| matches!(
3636            e,
3637            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.lt_in_unquoted"
3638        )));
3639
3640        let evs = collect(SimpleHtmlEventSource::from_str(
3641            "t",
3642            InputFormat::Html,
3643            "<a x==y ></a>",
3644        ));
3645        assert!(evs.iter().any(|e| matches!(
3646            e,
3647            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.equals_at_start_unquoted"
3648        )));
3649
3650        let evs = collect(SimpleHtmlEventSource::from_str(
3651            "t",
3652            InputFormat::Html,
3653            "<a x=y\"z ></a>",
3654        ));
3655        assert!(evs.iter().any(|e| matches!(
3656            e,
3657            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.quote_in_unquoted"
3658        )));
3659    }
3660
3661    #[test]
3662    fn slash_not_immediately_followed_by_gt_emits_error() {
3663        let evs = collect(SimpleHtmlEventSource::from_str(
3664            "t",
3665            InputFormat::Html,
3666            "<a / ></a>",
3667        ));
3668        assert!(evs.iter().any(|e| matches!(
3669            e,
3670            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.slash_not_immediately_followed_by_gt"
3671        )));
3672    }
3673
3674    #[test]
3675    fn slash_immediately_followed_by_gt_does_not_emit_error() {
3676        let evs = collect(SimpleHtmlEventSource::from_str(
3677            "t",
3678            InputFormat::Html,
3679            "<a /></a>",
3680        ));
3681        assert!(!evs.iter().any(|e| matches!(
3682            e,
3683            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.slash_not_immediately_followed_by_gt"
3684        )));
3685    }
3686
3687    #[test]
3688    fn image_start_tag_emits_error() {
3689        let evs = collect(SimpleHtmlEventSource::from_str(
3690            "t",
3691            InputFormat::Html,
3692            "<image></image>",
3693        ));
3694        assert!(evs.iter().any(|e| matches!(
3695            e,
3696            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.image_start_tag"
3697        )));
3698    }
3699
3700    #[test]
3701    fn rawtext_end_tag_search_is_case_sensitive_in_xhtml() {
3702        assert_eq!(
3703            find_rawtext_end_tag(b"</script>", 0, "script", InputFormat::Xhtml),
3704            Some(0)
3705        );
3706        assert_eq!(
3707            find_rawtext_end_tag(b"</SCRIPT>", 0, "script", InputFormat::Xhtml),
3708            None
3709        );
3710    }
3711
3712    #[test]
3713    fn xhtml_end_tag_matching_is_case_sensitive() {
3714        let src = SimpleHtmlEventSource::from_str("t", InputFormat::Xhtml, "<A></A>");
3715        let evs = collect(src);
3716        assert!(
3717            evs.iter()
3718                .any(|e| matches!(e, ParseEvent::StartTag { name, .. } if name == "A"))
3719        );
3720        assert!(
3721            evs.iter()
3722                .any(|e| matches!(e, ParseEvent::EndTag { name, .. } if name == "A"))
3723        );
3724    }
3725
3726    #[test]
3727    fn numeric_character_references_emit_expected_parse_errors() {
3728        let cases = [
3729            ("&#0;", "html.tokenizer.charref_zero"),
3730            ("&#x110000;", "html.tokenizer.charref_outside_range"),
3731            ("&#xD800;", "html.tokenizer.charref_surrogate"),
3732            ("&#13;", "html.tokenizer.charref_cr"),
3733            ("&#x80;", "html.tokenizer.charref_c1_controls"),
3734            ("&#x1FFFE;", "html.tokenizer.charref_astral_noncharacter"),
3735            ("&#xFFFE;", "html.tokenizer.charref_noncharacter"),
3736            ("&#xFDD0;", "html.tokenizer.charref_unassigned"),
3737            ("&#11;", "html.tokenizer.charref_control"),
3738        ];
3739
3740        for (input, expected_code) in cases {
3741            let (_s, errs) = decode_char_refs_with_errors(InputFormat::Html, input, false, 0, 1, 1);
3742            assert!(
3743                errs.iter().any(|e| matches!(
3744                    e,
3745                    ParseEvent::ParseError { code, .. } if code == expected_code
3746                )),
3747                "missing {expected_code} for {input}"
3748            );
3749        }
3750    }
3751
3752    #[test]
3753    fn decode_char_refs_with_errors_reports_unrecognized_named_refs_as_literal_ampersand() {
3754        let (s, errs) = decode_char_refs_with_errors(InputFormat::Html, "&zzzzzz;", false, 0, 1, 1);
3755        assert_eq!(s, "&zzzzzz;");
3756        assert!(errs.is_empty());
3757    }
3758
3759    #[test]
3760    fn decode_char_refs_with_errors_preserves_stream_errors_when_decoding_refs() {
3761        let input = "a\u{000B}&amp;";
3762        let (s, errs) = decode_char_refs_with_errors(InputFormat::Html, input, false, 0, 1, 1);
3763        assert_eq!(s, "a\u{000B}&");
3764        assert!(errs.iter().any(|e| matches!(
3765            e,
3766            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.forbidden_code_point"
3767        )));
3768    }
3769
3770    #[test]
3771    fn end_tag_syntax_errors_are_reported() {
3772        let evs = collect(SimpleHtmlEventSource::from_str(
3773            "t",
3774            InputFormat::Html,
3775            "</p/>",
3776        ));
3777        assert!(evs.iter().any(|e| matches!(
3778            e,
3779            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.end_tag_stray_slash"
3780        )));
3781
3782        let evs = collect(SimpleHtmlEventSource::from_str(
3783            "t",
3784            InputFormat::Html,
3785            "</p class=x>",
3786        ));
3787        assert!(evs.iter().any(|e| matches!(
3788            e,
3789            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.end_tag_with_attrs"
3790        )));
3791
3792        let evs = collect(SimpleHtmlEventSource::from_str(
3793            "t",
3794            InputFormat::Html,
3795            "</p",
3796        ));
3797        assert!(evs.iter().any(|e| matches!(
3798            e,
3799            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.eof_in_end_tag"
3800        )));
3801    }
3802
3803    #[test]
3804    fn end_tag_with_invalid_utf8_is_lossy_and_emits_end_tag() {
3805        let mut bytes = b"<p></p".to_vec();
3806        bytes.push(0xFF);
3807        bytes.extend_from_slice(b">");
3808
3809        let evs = collect(SimpleHtmlEventSource::from_bytes(
3810            "t",
3811            InputFormat::Html,
3812            bytes,
3813        ));
3814        assert!(evs.iter().any(|e| matches!(
3815            e,
3816            ParseEvent::EndTag { name, .. } if name == "p\u{FFFD}"
3817        )));
3818    }
3819
3820    #[test]
3821    fn text_with_invalid_utf8_is_lossy_and_decodes_char_refs() {
3822        let mut bytes = b"<p>".to_vec();
3823        bytes.push(0xFF);
3824        bytes.extend_from_slice(b"&amp;</p>");
3825        let evs = collect(SimpleHtmlEventSource::from_bytes(
3826            "t",
3827            InputFormat::Html,
3828            bytes,
3829        ));
3830        let texts: Vec<_> = evs
3831            .iter()
3832            .filter_map(|e| match e {
3833                ParseEvent::Text { text, .. } => Some(text.clone()),
3834                _ => None,
3835            })
3836            .collect();
3837        assert_eq!(texts, vec![format!("\u{FFFD}&")]);
3838
3839        let mut bytes = Vec::new();
3840        bytes.push(0xFF);
3841        bytes.extend_from_slice(b"&amp;");
3842        let evs = collect(SimpleHtmlEventSource::from_bytes(
3843            "t",
3844            InputFormat::Html,
3845            bytes,
3846        ));
3847        let texts: Vec<_> = evs
3848            .iter()
3849            .filter_map(|e| match e {
3850                ParseEvent::Text { text, .. } => Some(text.clone()),
3851                _ => None,
3852            })
3853            .collect();
3854        assert_eq!(texts, vec![format!("\u{FFFD}&")]);
3855    }
3856
3857    #[test]
3858    fn start_tag_eof_is_reported_when_tag_close_is_missing() {
3859        let evs = collect(SimpleHtmlEventSource::from_str(
3860            "t",
3861            InputFormat::Html,
3862            "<a href='x'",
3863        ));
3864        assert!(
3865            evs.iter()
3866                .any(|e| matches!(e, ParseEvent::ParseError { .. }))
3867        );
3868    }
3869
3870    #[test]
3871    fn doctype_identifier_edge_cases_cover_public_eof_and_almost_standards_message() {
3872        let evs = collect(SimpleHtmlEventSource::from_str(
3873            "t",
3874            InputFormat::Html,
3875            "<!DOCTYPE html PUBLIC \"a",
3876        ));
3877        assert!(evs.iter().any(|e| matches!(
3878            e,
3879            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.doctype.eof_in_public_id"
3880        )));
3881
3882        let evs = collect(SimpleHtmlEventSource::from_str(
3883            "t",
3884            InputFormat::Html,
3885            "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">",
3886        ));
3887        assert!(evs.iter().any(|e| matches!(
3888            e,
3889            ParseEvent::ParseError { code, message, .. }
3890                if code == "html.parser.doctype.not_html5"
3891                    && message == "Almost standards mode doctype. Expected “<!DOCTYPE html>”."
3892        )));
3893    }
3894
3895    #[test]
3896    fn doctype_name_is_compared_case_insensitively_for_html5_conformance() {
3897        let evs = collect(SimpleHtmlEventSource::from_str(
3898            "t",
3899            InputFormat::Xhtml,
3900            "<!DOCTYPE HTML>",
3901        ));
3902        assert!(!evs.iter().any(|e| matches!(
3903            e,
3904            ParseEvent::ParseError { code, .. } if code == "html.parser.doctype.not_html5"
3905        )));
3906        assert!(evs.iter().any(|e| matches!(
3907            e,
3908            ParseEvent::Doctype { name: Some(name), .. } if name == "HTML"
3909        )));
3910    }
3911
3912    #[test]
3913    fn find_tag_close_ignores_gt_inside_quotes() {
3914        let bytes = b"<a x='>' y=z>";
3915        assert_eq!(find_tag_close(bytes, 1), Some(bytes.len() - 1));
3916
3917        let bytes = b"<a x=\"a>b\" y=z>";
3918        assert_eq!(find_tag_close(bytes, 1), Some(bytes.len() - 1));
3919
3920        let bytes = b"<a x='>'";
3921        assert_eq!(find_tag_close(bytes, 1), None);
3922    }
3923
3924    #[test]
3925    fn find_tag_close_state_machine_covers_additional_transitions() {
3926        let src = SimpleHtmlEventSource::from_str(
3927            "t",
3928            InputFormat::Html,
3929            "<a x ></a><a x y=z></a><a x=  y></a>",
3930        );
3931        let evs = collect(src);
3932        let mut seen_first = false;
3933        let mut seen_second = false;
3934        let mut seen_third = false;
3935        for ev in &evs {
3936            if let Some((name, attrs)) = as_start_tag(ev) {
3937                if name == "a"
3938                    && attrs.iter().any(|a| a.name == "x")
3939                    && attrs.len() == 1
3940                    && !seen_first
3941                {
3942                    seen_first = true;
3943                } else if name == "a"
3944                    && attrs.iter().any(|a| a.name == "x" && a.value.is_none())
3945                    && attrs
3946                        .iter()
3947                        .any(|a| a.name == "y" && a.value.as_deref() == Some("z"))
3948                {
3949                    seen_second = true;
3950                } else if name == "a"
3951                    && attrs
3952                        .iter()
3953                        .any(|a| a.name == "x" && a.value.as_deref() == Some("y"))
3954                {
3955                    seen_third = true;
3956                }
3957            }
3958        }
3959        assert!(seen_first);
3960        assert!(seen_second);
3961        assert!(seen_third);
3962    }
3963
3964    #[test]
3965    fn end_tag_prefix_at_eof_emits_eof_after_lt() {
3966        let evs = collect(SimpleHtmlEventSource::from_str(
3967            "t",
3968            InputFormat::Html,
3969            "</",
3970        ));
3971        assert_eq!(evs.len(), 1);
3972        let ParseEvent::ParseError {
3973            code,
3974            message,
3975            span,
3976        } = &evs[0]
3977        else {
3978            panic!("expected a parse error event");
3979        };
3980        assert_eq!(code, "html.tokenizer.eof_after_lt");
3981        assert_eq!(message, "End of file after “<”.");
3982        assert_eq!(span.unwrap(), Span::new(0, 2, 1, 1));
3983    }
3984
3985    #[test]
3986    fn end_tag_garbage_after_lt_slash_emits_error_and_comment() {
3987        let evs = collect(SimpleHtmlEventSource::from_str(
3988            "t",
3989            InputFormat::Html,
3990            "</ x>",
3991        ));
3992        assert_eq!(evs.len(), 2);
3993        assert!(matches!(
3994            &evs[0],
3995            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.garbage_after_lt_slash"
3996        ));
3997        assert!(matches!(&evs[1], ParseEvent::Comment { text, .. } if text == " x"));
3998    }
3999}
html_inspector_html/lib.rs

html_inspector_html/
lib.rs